Sentiment Analysis: Shopee App Reviews (SG)¶
%%capture
!pip install emoji
!pip install wordcloud
!pip install imblearn
!pip install tensorflow
Setup Base Directory¶
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
BaseDir = '/content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/'
Library¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')
nltk.download('stopwords')
pd.set_option('display.max_colwidth', None)
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
plt.figure(figsize=(8, 4))
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Function¶
Text Preprocessing¶
def cleaningText(text):
    text = text.lower()                                   # lowercase the text
    text = re.sub(r'@\w+', '', text)                      # remove mentions
    text = re.sub(r'#\w+', '', text)                      # remove hashtags
    text = re.sub(r'https?://\S+|www\.\S+', '', text)     # remove URLs
    text = re.sub(r'<.*?>', '', text)                      # remove HTML tags
    text = re.sub(r'\d+', '', text)                        # remove digits
    text = re.sub(r'[^\x00-\x7F]+', '', text)              # remove non-ASCII characters
    text = emoji.replace_emoji(text, '')                   # remove emoji
    text = re.sub(r'[\u00B2\u00B3\u00B9\u2070-\u2079]', '', text)  # remove superscript characters
    text = text.replace('\n', ' ')                          # replace newlines with spaces
    text = re.sub(r'[{}]'.format(re.escape(string.punctuation)), ' ', text)  # replace punctuation with spaces
    text = text.strip()                                     # trim leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text)                        # collapse repeated whitespace
    return text
# Split a text string into a list of word tokens
def tokenizingText(text):
    return word_tokenize(text)
# Remove English stopwords from a list of tokens
def filteringText(text):
    listStopwords = set(stopwords.words('english'))
    filtered = [word for word in text if word not in listStopwords]
    return filtered
# Reduce each word to its root form with Porter stemming
def stemmingText(text):
    stemmer = PorterStemmer()
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text
# Join a list of tokens back into a single sentence string
def toSentence(list_words):
    sentence = ' '.join(list_words)
    return sentence
Exploratory Data Analysis (EDA)¶
def graph_freq(df, text_column, label_column=None, label_value=None):
    # Optionally filter rows by a label column; accept a single value or a list of values
    if label_column is not None and label_value is not None:
        if isinstance(label_value, (list, tuple, set)):
            filtered_df = df[df[label_column].isin(label_value)]
        else:
            filtered_df = df[df[label_column] == label_value]
    else:
        filtered_df = df
    cv = CountVectorizer()
    data = filtered_df[text_column]
    words = cv.fit_transform(data)
    sum_words = words.sum(axis=0)
    words_freq = [(word, sum_words[0, i]) for word, i in cv.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])
    # Plotting
    frequency.head(50).plot(x='word', y='freq', kind='bar', figsize=(15, 5), cmap='magma')
    plt.title("Most Frequently Occurring Words - Top 50")
    plt.show()
def wordcloud(df, text_column):
result_wordcloud = WordCloud(background_color="white", width=8000, height=4000, max_words=1000).generate(' '.join(df[text_column]))
plt.imshow(result_wordcloud)
plt.axis("off")
plt.show()
Labelling¶
# Use the score (rating) to assign a label
def score_label(rating):
    rating = int(rating)
    if rating < 2:
        return 'negative'
    elif rating > 3:
        return 'positive'
    else:
        return 'neutral'
# Use specific keywords to assign a label
# These keywords come from the word-frequency analysis by score below
positive_keywords = ['great', 'excellent', 'good', 'easy', 'friendly', 'love', 'fast', 'best', 'nice']
negative_keywords = ['bad', 'unable','terrible', 'poor', 'slow', 'lagging', 'lag', 'issue']
def Label(content):
# content = content.lower()
for word in positive_keywords:
if word in content:
return 'positive'
for word in negative_keywords:
if word in content:
return 'negative'
return 'neutral'
# Encoding Label
label_mapping = {'negative': 0, 'positive': 1, 'neutral': 2}
def map_labels(label):
return label_mapping[label]
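A few illustrative calls (on made-up cleaned strings) show how the keyword labeller behaves; note that matching is by substring and the positive list is checked first, so a review containing both kinds of keywords ends up labelled positive.
print(Label('fast delivery love the app'))        # 'positive'
print(Label('app keeps lagging unable to pay'))   # 'negative'
print(Label('good app but slow checkout'))        # 'positive' (positive keywords win)
print(Label('just installed it'))                 # 'neutral'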
Load Dataset¶
df = pd.read_csv(BaseDir + 'dataset/shopee-en-sg-all.csv')
df.head(1)
reviewId | userName | userImage | content | score | thumbsUpCount | reviewCreatedVersion | at | replyContent | repliedAt | appVersion | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | c5689c61-60cc-4e9b-b426-5c1982580aea | A Google user | https://play-lh.googleusercontent.com/EGemoI2NTXmTsBVtJqk8jxF9rh8ApRWfsIMQSt2uE4OcpQqbFu7f7NbTK05lx80nuSijCz7sc3a277R67g | I wouldn't use this app again, at least for a long time until they catch up on both the technological and human performances. The pages load extremely slowly or cannot load, they don't integrate with foreign credit cards, and there are errors everywhere ( from shipping options to stock availability). The welcome gift disappeared after they collected my credentials and they still expect me to rate it! Furthermore, they really should train their sellers to be more friendly. | 1 | 108 | 2.54.16 | 2020-04-30 10:39:46 | Thank you for your review. We are sorry to hear that your experience with Shopee was less than perfect.\n\nAt Shopee, we aim to provide you with the best buying and selling experience. We hope you will give us another chance to earn that 5 stars from you. | 2020-04-30 12:25:06 | 2.54.16 |
Drop the columns that are not needed
df = df.drop(columns=['reviewId', 'userName', 'userImage', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion'])
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41680 entries, 0 to 41679
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   content  41680 non-null  object
 1   score    41680 non-null  int64
dtypes: int64(1), object(1)
memory usage: 651.4+ KB
> Check Missing Values¶
missing_info = pd.DataFrame({
'missing_values': df.isnull().sum(),
'percent_missing': df.isnull().mean() * 100
})
missing_info
missing_values | percent_missing | |
---|---|---|
content | 0 | 0.0 |
score | 0 | 0.0 |
✅ No missing values
> Check Duplicate¶
duplicates = df[df.duplicated()]
duplicates
content | score | |
---|---|---|
3306 | Easy to use | 5 |
3310 | Easy to use | 5 |
3315 | Easy to use | 5 |
5590 | Easy to use | 5 |
6786 | Easy to use | 5 |
... | ... | ... |
41672 | 👍👍👍 | 5 |
41673 | 👍👍👍👍👍👍👍👍👍 | 5 |
41675 | 👍 👍 👍 | 5 |
41677 | 👍 | 1 |
41679 | ❤️❤️ | 5 |
8572 rows × 2 columns
⚠️ There are duplicate rows that need to be removed.
df['content'][3306]
'Easy to use'
df.drop_duplicates(subset='content', inplace=True)
df.duplicated().sum()
0
✅ No duplicate rows remain
Text Preprocessing¶
df['text_clean'] = df['content'].apply(cleaningText)
df['text_tokenizingText'] = df['text_clean'].apply(tokenizingText)
df['text_stopword'] = df['text_tokenizingText'].apply(filteringText)
df['content_clean'] = df['text_stopword'].apply(toSentence)
df['stemmingText'] = df['content_clean'].apply(stemmingText)
df.head(1)
content | score | text_clean | text_tokenizingText | text_stopword | content_clean | stemmingText | |
---|---|---|---|---|---|---|---|
0 | I wouldn't use this app again, at least for a long time until they catch up on both the technological and human performances. The pages load extremely slowly or cannot load, they don't integrate with foreign credit cards, and there are errors everywhere ( from shipping options to stock availability). The welcome gift disappeared after they collected my credentials and they still expect me to rate it! Furthermore, they really should train their sellers to be more friendly. | 1 | i wouldn t use this app again at least for a long time until they catch up on both the technological and human performances the pages load extremely slowly or cannot load they don t integrate with foreign credit cards and there are errors everywhere from shipping options to stock availability the welcome gift disappeared after they collected my credentials and they still expect me to rate it furthermore they really should train their sellers to be more friendly | [i, wouldn, t, use, this, app, again, at, least, for, a, long, time, until, they, catch, up, on, both, the, technological, and, human, performances, the, pages, load, extremely, slowly, or, can, not, load, they, don, t, integrate, with, foreign, credit, cards, and, there, are, errors, everywhere, from, shipping, options, to, stock, availability, the, welcome, gift, disappeared, after, they, collected, my, credentials, and, they, still, expect, me, to, rate, it, furthermore, they, really, should, train, their, sellers, to, be, more, friendly] | [use, app, least, long, time, catch, technological, human, performances, pages, load, extremely, slowly, load, integrate, foreign, credit, cards, errors, everywhere, shipping, options, stock, availability, welcome, gift, disappeared, collected, credentials, still, expect, rate, furthermore, really, train, sellers, friendly] | use app least long time catch technological human performances pages load extremely slowly load integrate foreign credit cards errors everywhere shipping options stock availability welcome gift disappeared collected credentials still expect rate furthermore really train sellers friendly | use app least long time catch technolog human perform page load extrem slowli load integr foreign credit card error everywher ship option stock avail welcom gift disappear collect credenti still expect rate furthermor realli train seller friendli |
Recheck after Preprocessing¶
This step is repeated to make sure preprocessing did not leave any missing values or duplicates behind.
> Missing values¶
missing_info = pd.DataFrame({
'missing_values': df.isnull().sum(),
'percent_missing': df.isnull().mean() * 100
})
missing_info
missing_values | percent_missing | |
---|---|---|
content | 0 | 0.0 |
score | 0 | 0.0 |
text_clean | 0 | 0.0 |
text_tokenizingText | 0 | 0.0 |
text_stopword | 0 | 0.0 |
content_clean | 0 | 0.0 |
stemmingText | 0 | 0.0 |
> Duplicates¶
duplicates = df[df['content_clean'].duplicated()]
duplicates
content | score | text_clean | text_tokenizingText | text_stopword | content_clean | stemmingText | |
---|---|---|---|---|---|---|---|
1723 | fast and convenient | 5 | fast and convenient | [fast, and, convenient] | [fast, convenient] | fast convenient | fast conveni |
1815 | Good and fast delivery | 5 | good and fast delivery | [good, and, fast, delivery] | [good, fast, delivery] | good fast delivery | good fast deliveri |
3302 | Very good service 👍👍👍 | 5 | very good service | [very, good, service] | [good, service] | good service | good servic |
6422 | Easy to use. | 5 | easy to use | [easy, to, use] | [easy, use] | easy use | easi use |
6718 | Good shopping experience. | 5 | good shopping experience | [good, shopping, experience] | [good, shopping, experience] | good shopping experience | good shop experi |
... | ... | ... | ... | ... | ... | ... | ... |
41667 | 😍😍😍😍😍😍😍😍 | 5 | [] | [] | |||
41670 | 🤗🤗🤗 | 5 | [] | [] | |||
41674 | ... ... | 5 | [] | [] | |||
41676 | ok.... | 5 | ok | [ok] | [ok] | ok | ok |
41678 | 😓 | 1 | [] | [] |
3251 rows × 7 columns
⚠️ After preprocessing there are still duplicate rows (reviews that become identical once cleaned), and they need to be removed.
df.drop_duplicates(subset='content_clean', inplace=True)
df['content_clean'].duplicated().sum()
0
✅ Okay, no duplicate rows remain.
df.head(1)
content | score | text_clean | text_tokenizingText | text_stopword | content_clean | stemmingText | |
---|---|---|---|---|---|---|---|
0 | I wouldn't use this app again, at least for a long time until they catch up on both the technological and human performances. The pages load extremely slowly or cannot load, they don't integrate with foreign credit cards, and there are errors everywhere ( from shipping options to stock availability). The welcome gift disappeared after they collected my credentials and they still expect me to rate it! Furthermore, they really should train their sellers to be more friendly. | 1 | i wouldn t use this app again at least for a long time until they catch up on both the technological and human performances the pages load extremely slowly or cannot load they don t integrate with foreign credit cards and there are errors everywhere from shipping options to stock availability the welcome gift disappeared after they collected my credentials and they still expect me to rate it furthermore they really should train their sellers to be more friendly | [i, wouldn, t, use, this, app, again, at, least, for, a, long, time, until, they, catch, up, on, both, the, technological, and, human, performances, the, pages, load, extremely, slowly, or, can, not, load, they, don, t, integrate, with, foreign, credit, cards, and, there, are, errors, everywhere, from, shipping, options, to, stock, availability, the, welcome, gift, disappeared, after, they, collected, my, credentials, and, they, still, expect, me, to, rate, it, furthermore, they, really, should, train, their, sellers, to, be, more, friendly] | [use, app, least, long, time, catch, technological, human, performances, pages, load, extremely, slowly, load, integrate, foreign, credit, cards, errors, everywhere, shipping, options, stock, availability, welcome, gift, disappeared, collected, credentials, still, expect, rate, furthermore, really, train, sellers, friendly] | use app least long time catch technological human performances pages load extremely slowly load integrate foreign credit cards errors everywhere shipping options stock availability welcome gift disappeared collected credentials still expect rate furthermore really train sellers friendly | use app least long time catch technolog human perform page load extrem slowli load integr foreign credit card error everywher ship option stock avail welcom gift disappear collect credenti still expect rate furthermor realli train seller friendli |
Distribution content based score¶
sns.countplot(x='score', data=df, hue='score', palette=HAPPY_COLORS_PALETTE, dodge=False)
plt.title('Distribution of Review Scores')
plt.xlabel('Review Score')
plt.legend([],[], frameon=False)
plt.show()
Reviews of the Shopee app in Singapore appear to be dominated by five-star ratings (score 5), indicating that the majority of users are very satisfied with the service.
Wordcloud¶
wordcloud(df, 'content_clean')
graph_freq(df, 'content_clean')
Word Frequency by Score¶
Scores 4 and 5¶
graph_freq(df, 'content_clean', 'score', [4, 5])
From the chart above, we can pick out keywords such as good, easy, great, fast, and other relevant words to use for the positive label.
Scores 1 and 2¶
graph_freq(df, 'content_clean', 'score', [1, 2])
From the chart above, we can pick out keywords such as bad, error, unable, issue, and other relevant words to use for the negative label.
Labelling¶
Assign labels using the functions defined earlier
df['score_label'] = df['score'].apply(score_label)
df['label'] = df['content_clean'].apply(Label)
Distribution Reviews¶
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
# Based Keyword Sentiment
sns.countplot(x='label', data=df, dodge=False, ax=axs[0])
axs[0].set_title('Distribution of Review based Keyword Sentiment')
axs[0].set_xlabel('Label')
# Based Score
sns.countplot(x='score_label', data=df, dodge=False, ax=axs[1])
axs[1].set_title('Distribution of Review based Score Label')
axs[1].set_xlabel('Score Label')
plt.tight_layout()
plt.show()
The two distributions above differ considerably. Earlier we saw that the raw score distribution is dominated by score 5.
Labelling was then done in two ways: based on sentiment keywords and based on the score.
The keyword-based distribution looks more reasonable than the score-based one, because it is consistent with the earlier finding that five-star reviews dominate.
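As an optional cross-check (a small sketch using the two label columns created above), a contingency table makes the disagreement between the two labelling schemes explicit:
# Compare the keyword-based labels against the score-based labels
comparison = pd.crosstab(df['score_label'], df['label'],
                         rownames=['score_label'], colnames=['keyword_label'])
print(comparison)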
Word frequency for Positive Reviews¶
graph_freq(df, 'content_clean', 'label', 'positive')
Word frequency for Neutral Reviews¶
graph_freq(df, 'content_clean', 'label', 'neutral')
Word frequency for Negative Reviews¶
graph_freq(df, 'content_clean', 'label', 'negative')
Export dataset after Preprocessing¶
df.to_csv(BaseDir + 'dataset/shopee-en-sg-all-preprocessed.csv', index=False)
Modelling¶
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imPipeline
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import HTML
import joblib
df = pd.read_csv(BaseDir + 'dataset/shopee-en-sg-all-preprocessed.csv')
df.head(1)
content | score | text_clean | text_tokenizingText | text_stopword | content_clean | stemmingText | score_label | label | |
---|---|---|---|---|---|---|---|---|---|
0 | I wouldn't use this app again, at least for a long time until they catch up on both the technological and human performances. The pages load extremely slowly or cannot load, they don't integrate with foreign credit cards, and there are errors everywhere ( from shipping options to stock availability). The welcome gift disappeared after they collected my credentials and they still expect me to rate it! Furthermore, they really should train their sellers to be more friendly. | 1 | i wouldn t use this app again at least for a long time until they catch up on both the technological and human performances the pages load extremely slowly or cannot load they don t integrate with foreign credit cards and there are errors everywhere from shipping options to stock availability the welcome gift disappeared after they collected my credentials and they still expect me to rate it furthermore they really should train their sellers to be more friendly | ['i', 'wouldn', 't', 'use', 'this', 'app', 'again', 'at', 'least', 'for', 'a', 'long', 'time', 'until', 'they', 'catch', 'up', 'on', 'both', 'the', 'technological', 'and', 'human', 'performances', 'the', 'pages', 'load', 'extremely', 'slowly', 'or', 'can', 'not', 'load', 'they', 'don', 't', 'integrate', 'with', 'foreign', 'credit', 'cards', 'and', 'there', 'are', 'errors', 'everywhere', 'from', 'shipping', 'options', 'to', 'stock', 'availability', 'the', 'welcome', 'gift', 'disappeared', 'after', 'they', 'collected', 'my', 'credentials', 'and', 'they', 'still', 'expect', 'me', 'to', 'rate', 'it', 'furthermore', 'they', 'really', 'should', 'train', 'their', 'sellers', 'to', 'be', 'more', 'friendly'] | ['use', 'app', 'least', 'long', 'time', 'catch', 'technological', 'human', 'performances', 'pages', 'load', 'extremely', 'slowly', 'load', 'integrate', 'foreign', 'credit', 'cards', 'errors', 'everywhere', 'shipping', 'options', 'stock', 'availability', 'welcome', 'gift', 'disappeared', 'collected', 'credentials', 'still', 'expect', 'rate', 'furthermore', 'really', 'train', 'sellers', 'friendly'] | use app least long time catch technological human performances pages load extremely slowly load integrate foreign credit cards errors everywhere shipping options stock availability welcome gift disappeared collected credentials still expect rate furthermore really train sellers friendly | use app least long time catch technolog human perform page load extrem slowli load integr foreign credit card error everywher ship option stock avail welcom gift disappear collect credenti still expect rate furthermor realli train seller friendli | positive | positive |
print(f'There are {df.shape[0]} review samples')
There are 29556 review samples
df.dropna(inplace=True)
df['label'].value_counts()
label
positive    16310
neutral     11314
negative     1931
Name: count, dtype: int64
Encoding label:¶
- 'negative' : 0
- 'positive' : 1
- 'neutral' : 2
df['label'] = df['label'].apply(map_labels)
df['score_label'] = df['score_label'].apply(map_labels)
Experiment 1 (ML without GridSearch) - Keyword Labels¶
- Labels produced with the keyword-sentiment method
- Pipeline with several machine-learning models
- TF-IDF features
- Sampling variants: SMOTE, random under-sampling, and no sampling
- Without GridSearch
X = df['content_clean']
y = df['label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Sampling
smote = SMOTE(random_state=42)
under_sampler = RandomUnderSampler(random_state=42)
models = {
'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0, solver='liblinear', multi_class='ovr', random_state=42),
'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
# 'Naive Bayes': BernoulliNB(alpha=1.0),
'Decision Tree': DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=42),
# 'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
'SVM': SVC(kernel='linear', C=1.0, decision_function_shape='ovr', random_state=42)
}
results = []
sampling_methods = {
'None': None,
'SMOTE': smote,
'Random Under Sampling': under_sampler
}
for model_name, model in models.items():
for sampling_name, sampler in sampling_methods.items():
steps = [('tfidf', TfidfVectorizer(max_features=200, min_df=17, max_df=0.8))]
if sampler:
steps.append(('sampling', sampler))
steps.append(('classifier', model))
pipeline = imPipeline(steps)
# Train the model
pipeline.fit(X_train, y_train)
# Save the fitted pipeline
model_filename = BaseDir + 'models/' + f"{model_name.replace(' ', '_')}_{sampling_name.replace(' ', '_')}.pkl"
joblib.dump(pipeline, model_filename)
print(f"Model saved to {model_filename}")
# Predict
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)
# Evaluate
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
# Store the results
results.append({
'Model': model_name,
'Sampling Method': sampling_name,
'Accuracy Train': accuracy_train,
'Accuracy Test': accuracy_test
})
results_df = pd.DataFrame(results)
def highlight_best_3(df):
# Top 3 models by 'Accuracy Test'
top_3_indices = df.nlargest(3, 'Accuracy Test').index
def highlight(row):
return ['background-color: lightgreen' if i in top_3_indices else '' for i in row.index]
return df.style.apply(highlight, axis=1)
# Display the results with highlighting
styled_results_df = highlight_best_3(results_df)
display(styled_results_df)
Number of classes: 3
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Logistic_Regression_None.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Logistic_Regression_SMOTE.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Logistic_Regression_Random_Under_Sampling.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Random_Forest_None.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Random_Forest_SMOTE.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Random_Forest_Random_Under_Sampling.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Decision_Tree_None.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Decision_Tree_SMOTE.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Decision_Tree_Random_Under_Sampling.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/SVM_None.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/SVM_SMOTE.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/SVM_Random_Under_Sampling.pkl
Model | Sampling Method | Accuracy Train | Accuracy Test | |
---|---|---|---|---|
0 | Logistic Regression | None | 0.972044 | 0.972593 |
1 | Logistic Regression | SMOTE | 0.958467 | 0.955676 |
2 | Logistic Regression | Random Under Sampling | 0.940661 | 0.938927 |
3 | Random Forest | None | 0.958975 | 0.955168 |
4 | Random Forest | SMOTE | 0.960751 | 0.955676 |
5 | Random Forest | Random Under Sampling | 0.951785 | 0.949586 |
6 | Decision Tree | None | 0.941338 | 0.942311 |
7 | Decision Tree | SMOTE | 0.908941 | 0.907799 |
8 | Decision Tree | Random Under Sampling | 0.906995 | 0.905261 |
9 | SVM | None | 0.974158 | 0.975300 |
10 | SVM | SMOTE | 0.972509 | 0.973778 |
11 | SVM | Random Under Sampling | 0.955676 | 0.956353 |
Experiment 2 (ML without GridSearch) - Score Labels¶
- Labels derived from the score (review rating)
- Pipeline with several machine-learning models
- TF-IDF features
- Sampling variants: SMOTE, random under-sampling, and no sampling
- Without GridSearch
X = df['content_clean']
y = df['score_label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Sampling
smote = SMOTE(random_state=42)
under_sampler = RandomUnderSampler(random_state=42)
models = {
'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0, solver='liblinear', multi_class='ovr', random_state=42),
'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
# 'Naive Bayes': BernoulliNB(alpha=1.0),
'Decision Tree': DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=42),
# 'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
'SVM': SVC(kernel='linear', C=1.0, decision_function_shape='ovr', random_state=42)
}
results = []
sampling_methods = {
'None': None,
'SMOTE': smote,
'Random Under Sampling': under_sampler
}
for model_name, model in models.items():
for sampling_name, sampler in sampling_methods.items():
steps = [('tfidf', TfidfVectorizer(max_features=200, min_df=17, max_df=0.8))]
if sampler:
steps.append(('sampling', sampler)) # apply the sampling step only when a sampler is set
steps.append(('classifier', model))
pipeline = imPipeline(steps)
# Train the model
pipeline.fit(X_train, y_train)
# Save the fitted pipeline
model_filename = f"{model_name.replace(' ', '_')}_{sampling_name.replace(' ', '_')}.pkl"
joblib.dump(pipeline, model_filename)
print(f"Model saved to {model_filename}")
# Predict
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)
# Evaluate
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
# Store the results
results.append({
'Model': model_name,
'Sampling Method': sampling_name,
'Accuracy Train': accuracy_train,
'Accuracy Test': accuracy_test
})
results_df = pd.DataFrame(results)
def highlight_best_3(df):
# Top 3 models by 'Accuracy Test'
top_3_indices = df.nlargest(3, 'Accuracy Test').index
def highlight(row):
return ['background-color: lightgreen' if i in top_3_indices else '' for i in row.index]
return df.style.apply(highlight, axis=1)
styled_results_df = highlight_best_3(results_df)
display(styled_results_df)
Number of classes: 3
Model | Sampling Method | Accuracy Train | Accuracy Test | |
---|---|---|---|---|
0 | Logistic Regression | None | 0.861952 | 0.862460 |
1 | Logistic Regression | SMOTE | 0.780663 | 0.779902 |
2 | Logistic Regression | Random Under Sampling | 0.768609 | 0.768398 |
3 | Random Forest | None | 0.817205 | 0.817459 |
4 | Random Forest | SMOTE | 0.731010 | 0.709525 |
5 | Random Forest | Random Under Sampling | 0.711766 | 0.688547 |
6 | Decision Tree | None | 0.815725 | 0.812722 |
7 | Decision Tree | SMOTE | 0.605481 | 0.596853 |
8 | Decision Tree | Random Under Sampling | 0.629800 | 0.622230 |
9 | SVM | None | 0.860176 | 0.863982 |
10 | SVM | SMOTE | 0.771697 | 0.770597 |
11 | SVM | Random Under Sampling | 0.749366 | 0.744713 |
Using the score-label data, accuracy still does not reach 90%. This is likely because the labels were derived from the rating rather than from the review text itself; the results are very different from those obtained with keyword-based labelling.
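To see where the score-label models struggle, a per-class breakdown is more informative than overall accuracy. The sketch below reuses whichever pipeline was fitted last in the loop above (SVM with random under-sampling) together with the current X_test / y_test split:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the score-label task (sketch)
print(classification_report(y_test, pipeline.predict(X_test),
                            target_names=['negative', 'positive', 'neutral']))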
Experiment 3 (ML with GridSearch) - Keyword Labels¶
- Labels produced with the keyword-sentiment method
- Pipeline with several machine-learning models
- TF-IDF features
- Sampling variants: SMOTE, random under-sampling, and no sampling
- With GridSearch
X = df['content_clean']
y = df['label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Sampling
smote = SMOTE(random_state=42)
under_sampler = RandomUnderSampler(random_state=42)
models = {
'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0, solver='liblinear', multi_class='ovr', random_state=42),
'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
# 'Naive Bayes': BernoulliNB(alpha=1.0),
'Decision Tree': DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=42),
# 'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
'SVM': SVC(kernel='linear', C=1.0, decision_function_shape='ovr', random_state=42)
}
results = []
# Parameter grid
param_grids = {
'Logistic Regression': {
'classifier__C': [0.1, 1.0, 10.0],
'classifier__solver': ['liblinear', 'lbfgs']
},
'Random Forest': {
'classifier__n_estimators': [50, 100, 200],
'classifier__max_depth': [10, 20, None]
},
# 'Naive Bayes': {
# 'classifier__alpha': [0.5, 1.0, 2.0]
# },
'Decision Tree': {
'classifier__max_depth': [10, 20, None],
'classifier__criterion': ['gini', 'entropy']
},
# 'Gradient Boosting': {
# 'classifier__n_estimators': [50, 100, 200],
# 'classifier__learning_rate': [0.01, 0.1, 1.0],
# 'classifier__max_depth': [3, 5, 7]
# },
'SVM': {
'classifier__C': [0.1, 1.0, 10.0],
'classifier__kernel': ['linear', 'rbf']
}
}
sampling_methods = {
'None': None,
'SMOTE': smote,
'Random Under Sampling': under_sampler
}
for model_name, model in models.items():
for sampling_name, sampler in sampling_methods.items():
steps = [('tfidf', TfidfVectorizer(max_features=200, min_df=17, max_df=0.8))]
if sampler:
steps.append(('sampling', sampler))
steps.append(('classifier', model))
pipeline = imPipeline(steps)
# Setup GridSearchCV
param_grid = param_grids.get(model_name, {})
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
# Train the model via grid search
grid_search.fit(X_train, y_train)
# Save the best model
best_model_filename = BaseDir + 'models/' + f"{model_name.replace(' ', '_')}_{sampling_name.replace(' ', '_')}_best_model.pkl"
joblib.dump(grid_search.best_estimator_, best_model_filename)
print(f"Best model saved to {best_model_filename}")
# Predict
y_pred_train = grid_search.predict(X_train)
y_pred_test = grid_search.predict(X_test)
# Evaluate
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
# Store the results
results.append({
'Model': model_name,
'Sampling Method': sampling_name,
'Best Params': grid_search.best_params_,
'Accuracy Train': accuracy_train,
'Accuracy Test': accuracy_test
})
results_df = pd.DataFrame(results)
def highlight_best_3(df):
# Top 3 models by 'Accuracy Test'
top_3_indices = df.nlargest(3, 'Accuracy Test').index
def highlight(row):
return ['background-color: lightgreen' if i in top_3_indices else '' for i in row.index]
return df.style.apply(highlight, axis=1)
styled_results_df = highlight_best_3(results_df)
display(styled_results_df)
Number of classes: 3
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Logistic_Regression_None_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Logistic_Regression_SMOTE_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Logistic_Regression_Random_Under_Sampling_best_model.pkl
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best model saved to Random_Forest_None_best_model.pkl
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best model saved to Random_Forest_SMOTE_best_model.pkl
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best model saved to Random_Forest_Random_Under_Sampling_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Decision_Tree_None_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Decision_Tree_SMOTE_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Decision_Tree_Random_Under_Sampling_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to SVM_None_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to SVM_SMOTE_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to SVM_Random_Under_Sampling_best_model.pkl
Model | Sampling Method | Best Params | Accuracy Train | Accuracy Test | |
---|---|---|---|---|---|
0 | Logistic Regression | None | {'classifier__C': 10.0, 'classifier__solver': 'liblinear'} | 0.975004 | 0.975808 |
1 | Logistic Regression | SMOTE | {'classifier__C': 10.0, 'classifier__solver': 'liblinear'} | 0.962993 | 0.960244 |
2 | Logistic Regression | Random Under Sampling | {'classifier__C': 10.0, 'classifier__solver': 'lbfgs'} | 0.952969 | 0.947386 |
3 | Random Forest | None | {'classifier__max_depth': 20, 'classifier__n_estimators': 200} | 0.976146 | 0.976485 |
4 | Random Forest | SMOTE | {'classifier__max_depth': None, 'classifier__n_estimators': 200} | 0.993825 | 0.972763 |
5 | Random Forest | Random Under Sampling | {'classifier__max_depth': 20, 'classifier__n_estimators': 200} | 0.961386 | 0.957875 |
6 | Decision Tree | None | {'classifier__criterion': 'entropy', 'classifier__max_depth': 20} | 0.977288 | 0.974962 |
7 | Decision Tree | SMOTE | {'classifier__criterion': 'entropy', 'classifier__max_depth': 20} | 0.976104 | 0.972424 |
8 | Decision Tree | Random Under Sampling | {'classifier__criterion': 'entropy', 'classifier__max_depth': 20} | 0.965234 | 0.961935 |
9 | SVM | None | {'classifier__C': 10.0, 'classifier__kernel': 'linear'} | 0.975512 | 0.976654 |
10 | SVM | SMOTE | {'classifier__C': 10.0, 'classifier__kernel': 'linear'} | 0.972297 | 0.973778 |
11 | SVM | Random Under Sampling | {'classifier__C': 1.0, 'classifier__kernel': 'linear'} | 0.955676 | 0.956353 |
Import Library Deep Learning¶
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, SimpleRNN, Embedding, SpatialDropout1D, GRU
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import math
Experiment 4 (DL - LSTM)¶
X = df['content_clean']
y = df['label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# Convert the text to integer sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
# Padding sequences
max_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')
# One-hot encode the labels
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)
# Create LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Callback
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Train the model
history = model.fit(X_train_padded, y_train_categorical, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test_categorical), callbacks=[early_stop])
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test_categorical, verbose=2)
print(f'Test Accuracy: {accuracy:.4f}')
# Save the model
model.save(BaseDir + 'lstm_model.h5')
print("Model saved: 'lstm_model.h5'")
Number of classes: 3
Epoch 1/10
739/739 [==============================] - 222s 294ms/step - loss: 0.8861 - accuracy: 0.5465 - val_loss: 0.8673 - val_accuracy: 0.5552
Epoch 2/10
739/739 [==============================] - 201s 272ms/step - loss: 0.8804 - accuracy: 0.5498 - val_loss: 0.8675 - val_accuracy: 0.5552
Epoch 3/10
739/739 [==============================] - 173s 234ms/step - loss: 0.8786 - accuracy: 0.5511 - val_loss: 0.8672 - val_accuracy: 0.5552
Epoch 4/10
739/739 [==============================] - 160s 217ms/step - loss: 0.8779 - accuracy: 0.5511 - val_loss: 0.8677 - val_accuracy: 0.5552
Epoch 5/10
739/739 [==============================] - 172s 233ms/step - loss: 0.8774 - accuracy: 0.5511 - val_loss: 0.8689 - val_accuracy: 0.5552
Epoch 6/10
739/739 [==============================] - 180s 243ms/step - loss: 0.8782 - accuracy: 0.5511 - val_loss: 0.8702 - val_accuracy: 0.5552
185/185 - 12s - loss: 0.8672 - accuracy: 0.5552 - 12s/epoch - 67ms/step
Test Accuracy: 0.5552
Model saved: 'lstm_model.h5'
The LSTM experiment did not reach the expected accuracy, topping out at only about 55%. Notably, this is close to the share of the majority (positive) class in the data, which suggests the model collapsed to predicting a single class.
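A quick sanity check (a sketch, not part of the original run) is to compare the distribution of predicted classes with the true distribution; if predictions concentrate on one class, the LSTM has fallen back to the majority class rather than learned the task:
# Distribution of predicted classes vs. true classes for the LSTM above
pred_classes = np.argmax(model.predict(X_test_padded), axis=1)
print(pd.Series(pred_classes).value_counts(normalize=True))
print(y_test.value_counts(normalize=True))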
Experiment 5 (DL - BiLSTM)¶
X = df['content_clean']
y = df['label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# Convert the text to integer sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
# Padding sequences
max_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)
# Model BILSTM
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Callback
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Train the model
history = model.fit(X_train_padded, y_train_categorical, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test_categorical), callbacks=[early_stop])
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test_categorical, verbose=2)
print(f'Test Accuracy: {accuracy:.4f}')
# Save the model
model.save(BaseDir + 'bilstm_model.keras')
print("Model saved: 'bilstm_model.keras'")
Number of classes: 3
Epoch 1/10
739/739 [==============================] - 344s 460ms/step - loss: 0.1577 - accuracy: 0.9407 - val_loss: 0.0317 - val_accuracy: 0.9939
Epoch 2/10
739/739 [==============================] - 342s 463ms/step - loss: 0.0199 - accuracy: 0.9964 - val_loss: 0.0240 - val_accuracy: 0.9961
Epoch 3/10
739/739 [==============================] - 317s 429ms/step - loss: 0.0153 - accuracy: 0.9971 - val_loss: 0.0230 - val_accuracy: 0.9966
Epoch 4/10
739/739 [==============================] - 317s 429ms/step - loss: 0.0103 - accuracy: 0.9980 - val_loss: 0.0258 - val_accuracy: 0.9966
Epoch 5/10
739/739 [==============================] - 307s 416ms/step - loss: 0.0207 - accuracy: 0.9965 - val_loss: 0.0305 - val_accuracy: 0.9944
Epoch 6/10
739/739 [==============================] - 324s 438ms/step - loss: 0.0081 - accuracy: 0.9983 - val_loss: 0.0270 - val_accuracy: 0.9959
185/185 - 23s - loss: 0.0230 - accuracy: 0.9966 - 23s/epoch - 124ms/step
Test Accuracy: 0.9966
Model saved: 'bilstm_model.keras'
Experiment 6 (DL - GRU)¶
X = df['content_clean']
y = df['label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# Convert the text to integer sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
# Padding sequences
max_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')
# One-hot encode the labels
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)
# GRU model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(Bidirectional(GRU(128, dropout=0.2, return_sequences=True)))
model.add(Bidirectional(GRU(128)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Callback
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Train the model
history = model.fit(X_train_padded, y_train_categorical, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test_categorical), callbacks=[early_stop])
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test_categorical, verbose=2)
print(f'Test Accuracy: {accuracy:.4f}')
# Save the model
model.save(BaseDir + 'gru_model.h5')
print("Model saved: 'gru_model.h5'")
Number of classes: 3
Epoch 1/10
739/739 [==============================] - 603s 801ms/step - loss: 0.1238 - accuracy: 0.9537 - val_loss: 0.0260 - val_accuracy: 0.9956
Epoch 2/10
739/739 [==============================] - 570s 771ms/step - loss: 0.0158 - accuracy: 0.9969 - val_loss: 0.0230 - val_accuracy: 0.9966
Epoch 3/10
739/739 [==============================] - 552s 748ms/step - loss: 0.0108 - accuracy: 0.9979 - val_loss: 0.0228 - val_accuracy: 0.9961
Epoch 4/10
739/739 [==============================] - 570s 771ms/step - loss: 0.0122 - accuracy: 0.9976 - val_loss: 0.0204 - val_accuracy: 0.9966
Epoch 5/10
739/739 [==============================] - 580s 786ms/step - loss: 0.0099 - accuracy: 0.9975 - val_loss: 0.0354 - val_accuracy: 0.9966
Epoch 6/10
739/739 [==============================] - 561s 760ms/step - loss: 0.0091 - accuracy: 0.9981 - val_loss: 0.0284 - val_accuracy: 0.9948
Epoch 7/10
739/739 [==============================] - 560s 757ms/step - loss: 0.0089 - accuracy: 0.9981 - val_loss: 0.0218 - val_accuracy: 0.9976
185/185 - 31s - loss: 0.0204 - accuracy: 0.9966 - 31s/epoch - 166ms/step
Test Accuracy: 0.9966
Model saved: 'gru_model.h5'
Conclusion¶
Based on the analysis, the Shopee app in Singapore predominantly receives five-star scores, which indicates that users there are very satisfied with the service.
Across the experiments, the best machine-learning model was:
- Model: Random Forest with SMOTE (from the GridSearch experiment)
- Training accuracy: ≈ 99.4%
- Validation (test) accuracy: ≈ 97.3%
The best deep-learning model was:
- Model: BiLSTM
- Training accuracy: ≈ 99.8%
- Validation (test) accuracy: ≈ 99.7%
Going forward, these models can be used to predict the sentiment of new reviews.
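As a sketch of how the saved artifacts could be used later for inference (the file name is one of the pipelines saved in Experiment 1, and the sample review is made up), new text just needs to go through the same preprocessing chain before being passed to the pipeline:
import joblib

# Load one of the saved pipelines (TF-IDF + classifier) and score a new review
pipeline = joblib.load(BaseDir + 'models/SVM_None.pkl')
id_to_label = {0: 'negative', 1: 'positive', 2: 'neutral'}

new_review = "App keeps lagging and I am unable to check out"   # hypothetical review
cleaned = toSentence(filteringText(tokenizingText(cleaningText(new_review))))
predicted = pipeline.predict([cleaned])[0]
print(id_to_label[predicted])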
~ Thank you !