Sentiment Analysis: Shopee App Reviews (SG)¶
%%capture
!pip install emoji
!pip install wordcloud
!pip install imblearn
!pip install tensorflow
Setup Base Directory¶
from google.colab import drive
drive.mount('/content/drive')
Mounted at /content/drive
BaseDir = '/content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/'
Library¶
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import emoji
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from wordcloud import WordCloud
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
nltk.download('punkt')
nltk.download('stopwords')
pd.set_option('display.max_colwidth', None)
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
plt.figure(figsize=(8, 4))
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
Function¶
Text Preprocessing¶
def cleaningText(text):
    text = text.lower()                                   # lowercase the text
    text = re.sub(r'@\w+', '', text)                      # remove mentions
    text = re.sub(r'#\w+', '', text)                      # remove hashtags
    text = re.sub(r'https?://\S+|www\.\S+', '', text)     # remove URLs
    text = re.sub(r'<.*?>', '', text)                      # remove HTML tags
    text = re.sub(r'\d+', '', text)                        # remove digits
    text = re.sub(r'[^\x00-\x7F]+', '', text)              # remove non-ASCII characters
    text = emoji.replace_emoji(text, '')                   # remove emoji
    text = re.sub(r'[\u00B2\u00B3\u00B9\u2070-\u2079]', '', text)  # remove superscript characters
    text = text.replace('\n', ' ')                          # replace newlines with spaces
    text = re.sub(r'[{}]'.format(re.escape(string.punctuation)), ' ', text)  # replace punctuation with spaces
    text = text.strip()                                     # trim leading/trailing whitespace
    text = re.sub(r'\s+', ' ', text)                        # collapse repeated whitespace
    return text
# Split a text string into a list of word tokens
def tokenizingText(text):
    return word_tokenize(text)
# Remove English stopwords from a list of tokens
def filteringText(text):
    listStopwords = set(stopwords.words('english'))
    filtered = [word for word in text if word not in listStopwords]
    return filtered
# Reduce each word to its root form with Porter stemming
def stemmingText(text):
    stemmer = PorterStemmer()
    words = text.split()
    stemmed_words = [stemmer.stem(word) for word in words]
    stemmed_text = ' '.join(stemmed_words)
    return stemmed_text
# Join a list of tokens back into a single sentence string
def toSentence(list_words):
    sentence = ' '.join(list_words)
    return sentence
Exploratory Data Analysis (EDA)¶
def graph_freq(df, text_column, label_column=None, label_value=None):
    # Optionally filter rows by a label column; accept a single value or a list of values
    if label_column is not None and label_value is not None:
        if isinstance(label_value, (list, tuple, set)):
            filtered_df = df[df[label_column].isin(label_value)]
        else:
            filtered_df = df[df[label_column] == label_value]
    else:
        filtered_df = df
    cv = CountVectorizer()
    data = filtered_df[text_column]
    words = cv.fit_transform(data)
    sum_words = words.sum(axis=0)
    words_freq = [(word, sum_words[0, i]) for word, i in cv.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    frequency = pd.DataFrame(words_freq, columns=['word', 'freq'])
    # Plotting
    frequency.head(50).plot(x='word', y='freq', kind='bar', figsize=(15, 5), cmap='magma')
    plt.title("Most Frequently Occurring Words - Top 50")
    plt.show()
def wordcloud(df, text_column):
result_wordcloud = WordCloud(background_color="white", width=8000, height=4000, max_words=1000).generate(' '.join(df[text_column]))
plt.imshow(result_wordcloud)
plt.axis("off")
plt.show()
Labelling¶
# Use the score (rating) to assign a label
def score_label(rating):
    rating = int(rating)
    if rating < 2:
        return 'negative'
    elif rating > 3:
        return 'positive'
    else:
        return 'neutral'
# Use specific keywords to assign a label
# These keywords come from the word-frequency analysis by score below
positive_keywords = ['great', 'excellent', 'good', 'easy', 'friendly', 'love', 'fast', 'best', 'nice']
negative_keywords = ['bad', 'unable','terrible', 'poor', 'slow', 'lagging', 'lag', 'issue']
def Label(content):
# content = content.lower()
for word in positive_keywords:
if word in content:
return 'positive'
for word in negative_keywords:
if word in content:
return 'negative'
return 'neutral'
# Encoding Label
label_mapping = {'negative': 0, 'positive': 1, 'neutral': 2}
def map_labels(label):
return label_mapping[label]
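A few illustrative calls (on made-up cleaned strings) show how the keyword labeller behaves; note that matching is by substring and the positive list is checked first, so a review containing both kinds of keywords ends up labelled positive.
print(Label('fast delivery love the app'))        # 'positive'
print(Label('app keeps lagging unable to pay'))   # 'negative'
print(Label('good app but slow checkout'))        # 'positive' (positive keywords win)
print(Label('just installed it'))                 # 'neutral'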
Load Dataset¶
df = pd.read_csv(BaseDir + 'dataset/shopee-en-sg-all.csv')
df.head(1)
reviewId | userName | userImage | content | score | thumbsUpCount | reviewCreatedVersion | at | replyContent | repliedAt | appVersion | |
---|---|---|---|---|---|---|---|---|---|---|---|
0 | c5689c61-60cc-4e9b-b426-5c1982580aea | A Google user | https://play-lh.googleusercontent.com/EGemoI2NTXmTsBVtJqk8jxF9rh8ApRWfsIMQSt2uE4OcpQqbFu7f7NbTK05lx80nuSijCz7sc3a277R67g | I wouldn't use this app again, at least for a long time until they catch up on both the technological and human performances. The pages load extremely slowly or cannot load, they don't integrate with foreign credit cards, and there are errors everywhere ( from shipping options to stock availability). The welcome gift disappeared after they collected my credentials and they still expect me to rate it! Furthermore, they really should train their sellers to be more friendly. | 1 | 108 | 2.54.16 | 2020-04-30 10:39:46 | Thank you for your review. We are sorry to hear that your experience with Shopee was less than perfect.\n\nAt Shopee, we aim to provide you with the best buying and selling experience. We hope you will give us another chance to earn that 5 stars from you. | 2020-04-30 12:25:06 | 2.54.16 |
Drop the columns that are not needed
df = df.drop(columns=['reviewId', 'userName', 'userImage', 'thumbsUpCount', 'reviewCreatedVersion', 'at', 'replyContent', 'repliedAt', 'appVersion'])
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41680 entries, 0 to 41679
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype
---  ------   --------------  -----
 0   content  41680 non-null  object
 1   score    41680 non-null  int64
dtypes: int64(1), object(1)
memory usage: 651.4+ KB
> Check Missing Values¶
missing_info = pd.DataFrame({
'missing_values': df.isnull().sum(),
'percent_missing': df.isnull().mean() * 100
})
missing_info
missing_values | percent_missing | |
---|---|---|
content | 0 | 0.0 |
score | 0 | 0.0 |
✅ No missing values
> Check Duplicate¶
duplicates = df[df.duplicated()]
duplicates
content | score | |
---|---|---|
3306 | Easy to use | 5 |
3310 | Easy to use | 5 |
3315 | Easy to use | 5 |
5590 | Easy to use | 5 |
6786 | Easy to use | 5 |
... | ... | ... |
41672 | 👍👍👍 | 5 |
41673 | 👍👍👍👍👍👍👍👍👍 | 5 |
41675 | 👍 👍 👍 | 5 |
41677 | 👍 | 1 |
41679 | ❤️❤️ | 5 |
8572 rows × 2 columns
⚠️ There are duplicate rows that need to be removed.
df['content'][3306]
'Easy to use'
df.drop_duplicates(subset='content', inplace=True)
df.duplicated().sum()
0
✅ No duplicate rows remain
Text Preprocessing¶
df['text_clean'] = df['content'].apply(cleaningText)
df['text_tokenizingText'] = df['text_clean'].apply(tokenizingText)
df['text_stopword'] = df['text_tokenizingText'].apply(filteringText)
df['content_clean'] = df['text_stopword'].apply(toSentence)
df['stemmingText'] = df['content_clean'].apply(stemmingText)
df.head(1)
content | score | text_clean | text_tokenizingText | text_stopword | content_clean | stemmingText | |
---|---|---|---|---|---|---|---|
0 | I wouldn't use this app again, at least for a long time until they catch up on both the technological and human performances. The pages load extremely slowly or cannot load, they don't integrate with foreign credit cards, and there are errors everywhere ( from shipping options to stock availability). The welcome gift disappeared after they collected my credentials and they still expect me to rate it! Furthermore, they really should train their sellers to be more friendly. | 1 | i wouldn t use this app again at least for a long time until they catch up on both the technological and human performances the pages load extremely slowly or cannot load they don t integrate with foreign credit cards and there are errors everywhere from shipping options to stock availability the welcome gift disappeared after they collected my credentials and they still expect me to rate it furthermore they really should train their sellers to be more friendly | [i, wouldn, t, use, this, app, again, at, least, for, a, long, time, until, they, catch, up, on, both, the, technological, and, human, performances, the, pages, load, extremely, slowly, or, can, not, load, they, don, t, integrate, with, foreign, credit, cards, and, there, are, errors, everywhere, from, shipping, options, to, stock, availability, the, welcome, gift, disappeared, after, they, collected, my, credentials, and, they, still, expect, me, to, rate, it, furthermore, they, really, should, train, their, sellers, to, be, more, friendly] | [use, app, least, long, time, catch, technological, human, performances, pages, load, extremely, slowly, load, integrate, foreign, credit, cards, errors, everywhere, shipping, options, stock, availability, welcome, gift, disappeared, collected, credentials, still, expect, rate, furthermore, really, train, sellers, friendly] | use app least long time catch technological human performances pages load extremely slowly load integrate foreign credit cards errors everywhere shipping options stock availability welcome gift disappeared collected credentials still expect rate furthermore really train sellers friendly | use app least long time catch technolog human perform page load extrem slowli load integr foreign credit card error everywher ship option stock avail welcom gift disappear collect credenti still expect rate furthermor realli train seller friendli |
Recheck after Preprocessing¶
This step is repeated to make sure preprocessing did not leave any missing values or duplicates behind.
> Missing values¶
missing_info = pd.DataFrame({
'missing_values': df.isnull().sum(),
'percent_missing': df.isnull().mean() * 100
})
missing_info
missing_values | percent_missing | |
---|---|---|
content | 0 | 0.0 |
score | 0 | 0.0 |
text_clean | 0 | 0.0 |
text_tokenizingText | 0 | 0.0 |
text_stopword | 0 | 0.0 |
content_clean | 0 | 0.0 |
stemmingText | 0 | 0.0 |
> Duplicates¶
duplicates = df[df['content_clean'].duplicated()]
duplicates
content | score | text_clean | text_tokenizingText | text_stopword | content_clean | stemmingText | |
---|---|---|---|---|---|---|---|
1723 | fast and convenient | 5 | fast and convenient | [fast, and, convenient] | [fast, convenient] | fast convenient | fast conveni |
1815 | Good and fast delivery | 5 | good and fast delivery | [good, and, fast, delivery] | [good, fast, delivery] | good fast delivery | good fast deliveri |
3302 | Very good service 👍👍👍 | 5 | very good service | [very, good, service] | [good, service] | good service | good servic |
6422 | Easy to use. | 5 | easy to use | [easy, to, use] | [easy, use] | easy use | easi use |
6718 | Good shopping experience. | 5 | good shopping experience | [good, shopping, experience] | [good, shopping, experience] | good shopping experience | good shop experi |
... | ... | ... | ... | ... | ... | ... | ... |
41667 | 😍😍😍😍😍😍😍😍 | 5 | [] | [] | |||
41670 | 🤗🤗🤗 | 5 | [] | [] | |||
41674 | ... ... | 5 | [] | [] | |||
41676 | ok.... | 5 | ok | [ok] | [ok] | ok | ok |
41678 | 😓 | 1 | [] | [] |
3251 rows × 7 columns
⚠️ After preprocessing there are still duplicate rows (reviews that become identical once cleaned), and they need to be removed.
df.drop_duplicates(subset='content_clean', inplace=True)
df['content_clean'].duplicated().sum()
0
✅ Okay, no duplicate rows remain.
df.head(1)
content | score | text_clean | text_tokenizingText | text_stopword | content_clean | stemmingText | |
---|---|---|---|---|---|---|---|
0 | I wouldn't use this app again, at least for a long time until they catch up on both the technological and human performances. The pages load extremely slowly or cannot load, they don't integrate with foreign credit cards, and there are errors everywhere ( from shipping options to stock availability). The welcome gift disappeared after they collected my credentials and they still expect me to rate it! Furthermore, they really should train their sellers to be more friendly. | 1 | i wouldn t use this app again at least for a long time until they catch up on both the technological and human performances the pages load extremely slowly or cannot load they don t integrate with foreign credit cards and there are errors everywhere from shipping options to stock availability the welcome gift disappeared after they collected my credentials and they still expect me to rate it furthermore they really should train their sellers to be more friendly | [i, wouldn, t, use, this, app, again, at, least, for, a, long, time, until, they, catch, up, on, both, the, technological, and, human, performances, the, pages, load, extremely, slowly, or, can, not, load, they, don, t, integrate, with, foreign, credit, cards, and, there, are, errors, everywhere, from, shipping, options, to, stock, availability, the, welcome, gift, disappeared, after, they, collected, my, credentials, and, they, still, expect, me, to, rate, it, furthermore, they, really, should, train, their, sellers, to, be, more, friendly] | [use, app, least, long, time, catch, technological, human, performances, pages, load, extremely, slowly, load, integrate, foreign, credit, cards, errors, everywhere, shipping, options, stock, availability, welcome, gift, disappeared, collected, credentials, still, expect, rate, furthermore, really, train, sellers, friendly] | use app least long time catch technological human performances pages load extremely slowly load integrate foreign credit cards errors everywhere shipping options stock availability welcome gift disappeared collected credentials still expect rate furthermore really train sellers friendly | use app least long time catch technolog human perform page load extrem slowli load integr foreign credit card error everywher ship option stock avail welcom gift disappear collect credenti still expect rate furthermor realli train seller friendli |
Distribution content based score¶
sns.countplot(x='score', data=df, hue='score', palette=HAPPY_COLORS_PALETTE, dodge=False)
plt.title('Distribution of Review Scores')
plt.xlabel('Review Score')
plt.legend([],[], frameon=False)
plt.show()
Reviews of the Shopee app in Singapore appear to be dominated by five-star ratings (score 5), indicating that the majority of users are very satisfied with the service.
Wordcloud¶
wordcloud(df, 'content_clean')
graph_freq(df, 'content_clean')
Word Frequency by Score¶
Scores 4 and 5¶
graph_freq(df, 'content_clean', 'score', [4, 5])
From the chart above, we can pick out keywords such as good, easy, great, fast, and other relevant words to use for the positive label.
Scores 1 and 2¶
graph_freq(df, 'content_clean', 'score', [1, 2])
From the chart above, we can pick out keywords such as bad, error, unable, issue, and other relevant words to use for the negative label.
Labelling¶
Assign labels using the functions defined earlier
df['score_label'] = df['score'].apply(score_label)
df['label'] = df['content_clean'].apply(Label)
Distribution Reviews¶
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
# Based Keyword Sentiment
sns.countplot(x='label', data=df, dodge=False, ax=axs[0])
axs[0].set_title('Distribution of Review based Keyword Sentiment')
axs[0].set_xlabel('Label')
# Based Score
sns.countplot(x='score_label', data=df, dodge=False, ax=axs[1])
axs[1].set_title('Distribution of Review based Score Label')
axs[1].set_xlabel('Score Label')
plt.tight_layout()
plt.show()
The two distributions above differ considerably. Earlier we saw that the raw score distribution is dominated by score 5.
Labelling was then done in two ways: based on sentiment keywords and based on the score.
The keyword-based distribution looks more reasonable than the score-based one, because it is consistent with the earlier finding that five-star reviews dominate.
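As an optional cross-check (a small sketch using the two label columns created above), a contingency table makes the disagreement between the two labelling schemes explicit:
# Compare the keyword-based labels against the score-based labels
comparison = pd.crosstab(df['score_label'], df['label'],
                         rownames=['score_label'], colnames=['keyword_label'])
print(comparison)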
Word frequency for Positive Reviews¶
graph_freq(df, 'content_clean', 'label', 'positive')
Word frequency for Neutral Reviews¶
graph_freq(df, 'content_clean', 'label', 'neutral')
Word frequency for Negative Reviews¶
graph_freq(df, 'content_clean', 'label', 'negative')
Export dataset after Preprocessing¶
df.to_csv(BaseDir + 'dataset/shopee-en-sg-all-preprocessed.csv', index=False)
Modelling¶
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import BernoulliNB
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier
from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imPipeline
from sklearn.feature_extraction.text import CountVectorizer
from IPython.display import HTML
import joblib
df = pd.read_csv(BaseDir + 'dataset/shopee-en-sg-all-preprocessed.csv')
df.head(1)
content | score | text_clean | text_tokenizingText | text_stopword | content_clean | stemmingText | score_label | label | |
---|---|---|---|---|---|---|---|---|---|
0 | I wouldn't use this app again, at least for a long time until they catch up on both the technological and human performances. The pages load extremely slowly or cannot load, they don't integrate with foreign credit cards, and there are errors everywhere ( from shipping options to stock availability). The welcome gift disappeared after they collected my credentials and they still expect me to rate it! Furthermore, they really should train their sellers to be more friendly. | 1 | i wouldn t use this app again at least for a long time until they catch up on both the technological and human performances the pages load extremely slowly or cannot load they don t integrate with foreign credit cards and there are errors everywhere from shipping options to stock availability the welcome gift disappeared after they collected my credentials and they still expect me to rate it furthermore they really should train their sellers to be more friendly | ['i', 'wouldn', 't', 'use', 'this', 'app', 'again', 'at', 'least', 'for', 'a', 'long', 'time', 'until', 'they', 'catch', 'up', 'on', 'both', 'the', 'technological', 'and', 'human', 'performances', 'the', 'pages', 'load', 'extremely', 'slowly', 'or', 'can', 'not', 'load', 'they', 'don', 't', 'integrate', 'with', 'foreign', 'credit', 'cards', 'and', 'there', 'are', 'errors', 'everywhere', 'from', 'shipping', 'options', 'to', 'stock', 'availability', 'the', 'welcome', 'gift', 'disappeared', 'after', 'they', 'collected', 'my', 'credentials', 'and', 'they', 'still', 'expect', 'me', 'to', 'rate', 'it', 'furthermore', 'they', 'really', 'should', 'train', 'their', 'sellers', 'to', 'be', 'more', 'friendly'] | ['use', 'app', 'least', 'long', 'time', 'catch', 'technological', 'human', 'performances', 'pages', 'load', 'extremely', 'slowly', 'load', 'integrate', 'foreign', 'credit', 'cards', 'errors', 'everywhere', 'shipping', 'options', 'stock', 'availability', 'welcome', 'gift', 'disappeared', 'collected', 'credentials', 'still', 'expect', 'rate', 'furthermore', 'really', 'train', 'sellers', 'friendly'] | use app least long time catch technological human performances pages load extremely slowly load integrate foreign credit cards errors everywhere shipping options stock availability welcome gift disappeared collected credentials still expect rate furthermore really train sellers friendly | use app least long time catch technolog human perform page load extrem slowli load integr foreign credit card error everywher ship option stock avail welcom gift disappear collect credenti still expect rate furthermor realli train seller friendli | positive | positive |
print(f'There are {df.shape[0]} review samples')
There are 29556 review samples
df.dropna(inplace=True)
df['label'].value_counts()
label
positive    16310
neutral     11314
negative     1931
Name: count, dtype: int64
Encoding label:¶
- 'negative' : 0
- 'positive' : 1
- 'neutral' : 2
df['label'] = df['label'].apply(map_labels)
df['score_label'] = df['score_label'].apply(map_labels)
Experiment 1 (ML without GridSearch) - Keyword Labels¶
- Labels produced with the keyword-sentiment method
- Pipeline with several machine-learning models
- TF-IDF features
- Sampling variants: SMOTE, random under-sampling, and no sampling
- Without GridSearch
X = df['content_clean']
y = df['label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Sampling
smote = SMOTE(random_state=42)
under_sampler = RandomUnderSampler(random_state=42)
models = {
'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0, solver='liblinear', multi_class='ovr', random_state=42),
'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
# 'Naive Bayes': BernoulliNB(alpha=1.0),
'Decision Tree': DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=42),
# 'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
'SVM': SVC(kernel='linear', C=1.0, decision_function_shape='ovr', random_state=42)
}
results = []
sampling_methods = {
'None': None,
'SMOTE': smote,
'Random Under Sampling': under_sampler
}
for model_name, model in models.items():
for sampling_name, sampler in sampling_methods.items():
steps = [('tfidf', TfidfVectorizer(max_features=200, min_df=17, max_df=0.8))]
if sampler:
steps.append(('sampling', sampler))
steps.append(('classifier', model))
pipeline = imPipeline(steps)
# Train the model
pipeline.fit(X_train, y_train)
# Save the fitted pipeline
model_filename = BaseDir + 'models/' + f"{model_name.replace(' ', '_')}_{sampling_name.replace(' ', '_')}.pkl"
joblib.dump(pipeline, model_filename)
print(f"Model saved to {model_filename}")
# Predict
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)
# Evaluate
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
# Store the results
results.append({
'Model': model_name,
'Sampling Method': sampling_name,
'Accuracy Train': accuracy_train,
'Accuracy Test': accuracy_test
})
results_df = pd.DataFrame(results)
def highlight_best_3(df):
# Top 3 models by 'Accuracy Test'
top_3_indices = df.nlargest(3, 'Accuracy Test').index
def highlight(row):
return ['background-color: lightgreen' if i in top_3_indices else '' for i in row.index]
return df.style.apply(highlight, axis=1)
# Display the results with highlighting
styled_results_df = highlight_best_3(results_df)
display(styled_results_df)
Number of classes: 3
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Logistic_Regression_None.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Logistic_Regression_SMOTE.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Logistic_Regression_Random_Under_Sampling.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Random_Forest_None.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Random_Forest_SMOTE.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Random_Forest_Random_Under_Sampling.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Decision_Tree_None.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Decision_Tree_SMOTE.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/Decision_Tree_Random_Under_Sampling.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/SVM_None.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/SVM_SMOTE.pkl
Model saved to /content/drive/MyDrive/Colab Notebooks/Sentiment-Analysis/models/SVM_Random_Under_Sampling.pkl
Model | Sampling Method | Accuracy Train | Accuracy Test | |
---|---|---|---|---|
0 | Logistic Regression | None | 0.972044 | 0.972593 |
1 | Logistic Regression | SMOTE | 0.958467 | 0.955676 |
2 | Logistic Regression | Random Under Sampling | 0.940661 | 0.938927 |
3 | Random Forest | None | 0.958975 | 0.955168 |
4 | Random Forest | SMOTE | 0.960751 | 0.955676 |
5 | Random Forest | Random Under Sampling | 0.951785 | 0.949586 |
6 | Decision Tree | None | 0.941338 | 0.942311 |
7 | Decision Tree | SMOTE | 0.908941 | 0.907799 |
8 | Decision Tree | Random Under Sampling | 0.906995 | 0.905261 |
9 | SVM | None | 0.974158 | 0.975300 |
10 | SVM | SMOTE | 0.972509 | 0.973778 |
11 | SVM | Random Under Sampling | 0.955676 | 0.956353 |
Experiment 2 (ML without GridSearch) - Score Labels¶
- Labels derived from the score (review rating)
- Pipeline with several machine-learning models
- TF-IDF features
- Sampling variants: SMOTE, random under-sampling, and no sampling
- Without GridSearch
X = df['content_clean']
y = df['score_label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Sampling
smote = SMOTE(random_state=42)
under_sampler = RandomUnderSampler(random_state=42)
models = {
'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0, solver='liblinear', multi_class='ovr', random_state=42),
'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
# 'Naive Bayes': BernoulliNB(alpha=1.0),
'Decision Tree': DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=42),
# 'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
'SVM': SVC(kernel='linear', C=1.0, decision_function_shape='ovr', random_state=42)
}
results = []
sampling_methods = {
'None': None,
'SMOTE': smote,
'Random Under Sampling': under_sampler
}
for model_name, model in models.items():
for sampling_name, sampler in sampling_methods.items():
steps = [('tfidf', TfidfVectorizer(max_features=200, min_df=17, max_df=0.8))]
if sampler:
steps.append(('sampling', sampler)) # apply the sampling step only when a sampler is set
steps.append(('classifier', model))
pipeline = imPipeline(steps)
# Train the model
pipeline.fit(X_train, y_train)
# Save the fitted pipeline
model_filename = f"{model_name.replace(' ', '_')}_{sampling_name.replace(' ', '_')}.pkl"
joblib.dump(pipeline, model_filename)
print(f"Model saved to {model_filename}")
# Predict
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)
# Evaluate
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
# Store the results
results.append({
'Model': model_name,
'Sampling Method': sampling_name,
'Accuracy Train': accuracy_train,
'Accuracy Test': accuracy_test
})
results_df = pd.DataFrame(results)
def highlight_best_3(df):
# Top 3 models by 'Accuracy Test'
top_3_indices = df.nlargest(3, 'Accuracy Test').index
def highlight(row):
return ['background-color: lightgreen' if i in top_3_indices else '' for i in row.index]
return df.style.apply(highlight, axis=1)
styled_results_df = highlight_best_3(results_df)
display(styled_results_df)
Number of classes: 3
Model | Sampling Method | Accuracy Train | Accuracy Test | |
---|---|---|---|---|
0 | Logistic Regression | None | 0.861952 | 0.862460 |
1 | Logistic Regression | SMOTE | 0.780663 | 0.779902 |
2 | Logistic Regression | Random Under Sampling | 0.768609 | 0.768398 |
3 | Random Forest | None | 0.817205 | 0.817459 |
4 | Random Forest | SMOTE | 0.731010 | 0.709525 |
5 | Random Forest | Random Under Sampling | 0.711766 | 0.688547 |
6 | Decision Tree | None | 0.815725 | 0.812722 |
7 | Decision Tree | SMOTE | 0.605481 | 0.596853 |
8 | Decision Tree | Random Under Sampling | 0.629800 | 0.622230 |
9 | SVM | None | 0.860176 | 0.863982 |
10 | SVM | SMOTE | 0.771697 | 0.770597 |
11 | SVM | Random Under Sampling | 0.749366 | 0.744713 |
Using the score-label data, accuracy still does not reach 90%. This is likely because the labels were derived from the rating rather than from the review text itself; the results are very different from those obtained with keyword-based labelling.
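To see where the score-label models struggle, a per-class breakdown is more informative than overall accuracy. The sketch below reuses whichever pipeline was fitted last in the loop above (SVM with random under-sampling) together with the current X_test / y_test split:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 for the score-label task (sketch)
print(classification_report(y_test, pipeline.predict(X_test),
                            target_names=['negative', 'positive', 'neutral']))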
Experiment 3 (ML with GridSearch) - Keyword Labels¶
- Labels produced with the keyword-sentiment method
- Pipeline with several machine-learning models
- TF-IDF features
- Sampling variants: SMOTE, random under-sampling, and no sampling
- With GridSearch
X = df['content_clean']
y = df['label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Sampling
smote = SMOTE(random_state=42)
under_sampler = RandomUnderSampler(random_state=42)
models = {
'Logistic Regression': LogisticRegression(max_iter=1000, C=1.0, solver='liblinear', multi_class='ovr', random_state=42),
'Random Forest': RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42),
# 'Naive Bayes': BernoulliNB(alpha=1.0),
'Decision Tree': DecisionTreeClassifier(max_depth=10, criterion='entropy', random_state=42),
# 'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42),
'SVM': SVC(kernel='linear', C=1.0, decision_function_shape='ovr', random_state=42)
}
results = []
# Parameter grid
param_grids = {
'Logistic Regression': {
'classifier__C': [0.1, 1.0, 10.0],
'classifier__solver': ['liblinear', 'lbfgs']
},
'Random Forest': {
'classifier__n_estimators': [50, 100, 200],
'classifier__max_depth': [10, 20, None]
},
# 'Naive Bayes': {
# 'classifier__alpha': [0.5, 1.0, 2.0]
# },
'Decision Tree': {
'classifier__max_depth': [10, 20, None],
'classifier__criterion': ['gini', 'entropy']
},
# 'Gradient Boosting': {
# 'classifier__n_estimators': [50, 100, 200],
# 'classifier__learning_rate': [0.01, 0.1, 1.0],
# 'classifier__max_depth': [3, 5, 7]
# },
'SVM': {
'classifier__C': [0.1, 1.0, 10.0],
'classifier__kernel': ['linear', 'rbf']
}
}
sampling_methods = {
'None': None,
'SMOTE': smote,
'Random Under Sampling': under_sampler
}
for model_name, model in models.items():
for sampling_name, sampler in sampling_methods.items():
steps = [('tfidf', TfidfVectorizer(max_features=200, min_df=17, max_df=0.8))]
if sampler:
steps.append(('sampling', sampler))
steps.append(('classifier', model))
pipeline = imPipeline(steps)
# Setup GridSearchCV
param_grid = param_grids.get(model_name, {})
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='accuracy', n_jobs=-1, verbose=1)
# Train the model via grid search
grid_search.fit(X_train, y_train)
# Save the best model
best_model_filename = BaseDir + 'models/' + f"{model_name.replace(' ', '_')}_{sampling_name.replace(' ', '_')}_best_model.pkl"
joblib.dump(grid_search.best_estimator_, best_model_filename)
print(f"Best model saved to {best_model_filename}")
# Predict
y_pred_train = grid_search.predict(X_train)
y_pred_test = grid_search.predict(X_test)
# Evaluate
accuracy_train = accuracy_score(y_train, y_pred_train)
accuracy_test = accuracy_score(y_test, y_pred_test)
# Store the results
results.append({
'Model': model_name,
'Sampling Method': sampling_name,
'Best Params': grid_search.best_params_,
'Accuracy Train': accuracy_train,
'Accuracy Test': accuracy_test
})
results_df = pd.DataFrame(results)
def highlight_best_3(df):
# Top 3 models by 'Accuracy Test'
top_3_indices = df.nlargest(3, 'Accuracy Test').index
def highlight(row):
return ['background-color: lightgreen' if i in top_3_indices else '' for i in row.index]
return df.style.apply(highlight, axis=1)
styled_results_df = highlight_best_3(results_df)
display(styled_results_df)
Number of classes: 3
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Logistic_Regression_None_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Logistic_Regression_SMOTE_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Logistic_Regression_Random_Under_Sampling_best_model.pkl
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best model saved to Random_Forest_None_best_model.pkl
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best model saved to Random_Forest_SMOTE_best_model.pkl
Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best model saved to Random_Forest_Random_Under_Sampling_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Decision_Tree_None_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Decision_Tree_SMOTE_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to Decision_Tree_Random_Under_Sampling_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to SVM_None_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to SVM_SMOTE_best_model.pkl
Fitting 5 folds for each of 6 candidates, totalling 30 fits
Best model saved to SVM_Random_Under_Sampling_best_model.pkl
Model | Sampling Method | Best Params | Accuracy Train | Accuracy Test | |
---|---|---|---|---|---|
0 | Logistic Regression | None | {'classifier__C': 10.0, 'classifier__solver': 'liblinear'} | 0.975004 | 0.975808 |
1 | Logistic Regression | SMOTE | {'classifier__C': 10.0, 'classifier__solver': 'liblinear'} | 0.962993 | 0.960244 |
2 | Logistic Regression | Random Under Sampling | {'classifier__C': 10.0, 'classifier__solver': 'lbfgs'} | 0.952969 | 0.947386 |
3 | Random Forest | None | {'classifier__max_depth': 20, 'classifier__n_estimators': 200} | 0.976146 | 0.976485 |
4 | Random Forest | SMOTE | {'classifier__max_depth': None, 'classifier__n_estimators': 200} | 0.993825 | 0.972763 |
5 | Random Forest | Random Under Sampling | {'classifier__max_depth': 20, 'classifier__n_estimators': 200} | 0.961386 | 0.957875 |
6 | Decision Tree | None | {'classifier__criterion': 'entropy', 'classifier__max_depth': 20} | 0.977288 | 0.974962 |
7 | Decision Tree | SMOTE | {'classifier__criterion': 'entropy', 'classifier__max_depth': 20} | 0.976104 | 0.972424 |
8 | Decision Tree | Random Under Sampling | {'classifier__criterion': 'entropy', 'classifier__max_depth': 20} | 0.965234 | 0.961935 |
9 | SVM | None | {'classifier__C': 10.0, 'classifier__kernel': 'linear'} | 0.975512 | 0.976654 |
10 | SVM | SMOTE | {'classifier__C': 10.0, 'classifier__kernel': 'linear'} | 0.972297 | 0.973778 |
11 | SVM | Random Under Sampling | {'classifier__C': 1.0, 'classifier__kernel': 'linear'} | 0.955676 | 0.956353 |
Import Library Deep Learning¶
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Bidirectional, SimpleRNN, Embedding, SpatialDropout1D, GRU
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau
import math
Experiment 4 (DL - LSTM)¶
X = df['content_clean']
y = df['label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# Convert the text to integer sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
# Padding sequences
max_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')
# One-hot encode the labels
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)
# Create LSTM model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(LSTM(128))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Callback
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Train the model
history = model.fit(X_train_padded, y_train_categorical, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test_categorical), callbacks=[early_stop])
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test_categorical, verbose=2)
print(f'Test Accuracy: {accuracy:.4f}')
# Save the model
model.save(BaseDir + 'lstm_model.h5')
print("Model saved: 'lstm_model.h5'")
Number of classes: 3
Epoch 1/10
739/739 [==============================] - 222s 294ms/step - loss: 0.8861 - accuracy: 0.5465 - val_loss: 0.8673 - val_accuracy: 0.5552
Epoch 2/10
739/739 [==============================] - 201s 272ms/step - loss: 0.8804 - accuracy: 0.5498 - val_loss: 0.8675 - val_accuracy: 0.5552
Epoch 3/10
739/739 [==============================] - 173s 234ms/step - loss: 0.8786 - accuracy: 0.5511 - val_loss: 0.8672 - val_accuracy: 0.5552
Epoch 4/10
739/739 [==============================] - 160s 217ms/step - loss: 0.8779 - accuracy: 0.5511 - val_loss: 0.8677 - val_accuracy: 0.5552
Epoch 5/10
739/739 [==============================] - 172s 233ms/step - loss: 0.8774 - accuracy: 0.5511 - val_loss: 0.8689 - val_accuracy: 0.5552
Epoch 6/10
739/739 [==============================] - 180s 243ms/step - loss: 0.8782 - accuracy: 0.5511 - val_loss: 0.8702 - val_accuracy: 0.5552
185/185 - 12s - loss: 0.8672 - accuracy: 0.5552 - 12s/epoch - 67ms/step
Test Accuracy: 0.5552
Model saved: 'lstm_model.h5'
The LSTM experiment did not reach the expected accuracy, topping out at only about 55%. Notably, this is close to the share of the majority (positive) class in the data, which suggests the model collapsed to predicting a single class.
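A quick sanity check (a sketch, not part of the original run) is to compare the distribution of predicted classes with the true distribution; if predictions concentrate on one class, the LSTM has fallen back to the majority class rather than learned the task:
# Distribution of predicted classes vs. true classes for the LSTM above
pred_classes = np.argmax(model.predict(X_test_padded), axis=1)
print(pd.Series(pred_classes).value_counts(normalize=True))
print(y_test.value_counts(normalize=True))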
Experiment 5 (DL - BiLSTM)¶
X = df['content_clean']
y = df['label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# Convert the text to integer sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
# Padding sequences
max_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)
# Model BILSTM
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(Bidirectional(LSTM(128)))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Callback
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Train the model
history = model.fit(X_train_padded, y_train_categorical, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test_categorical), callbacks=[early_stop])
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test_categorical, verbose=2)
print(f'Test Accuracy: {accuracy:.4f}')
# Save the model
model.save(BaseDir + 'bilstm_model.keras')
print("Model saved: 'bilstm_model.keras'")
Number of classes: 3
Epoch 1/10
739/739 [==============================] - 344s 460ms/step - loss: 0.1577 - accuracy: 0.9407 - val_loss: 0.0317 - val_accuracy: 0.9939
Epoch 2/10
739/739 [==============================] - 342s 463ms/step - loss: 0.0199 - accuracy: 0.9964 - val_loss: 0.0240 - val_accuracy: 0.9961
Epoch 3/10
739/739 [==============================] - 317s 429ms/step - loss: 0.0153 - accuracy: 0.9971 - val_loss: 0.0230 - val_accuracy: 0.9966
Epoch 4/10
739/739 [==============================] - 317s 429ms/step - loss: 0.0103 - accuracy: 0.9980 - val_loss: 0.0258 - val_accuracy: 0.9966
Epoch 5/10
739/739 [==============================] - 307s 416ms/step - loss: 0.0207 - accuracy: 0.9965 - val_loss: 0.0305 - val_accuracy: 0.9944
Epoch 6/10
739/739 [==============================] - 324s 438ms/step - loss: 0.0081 - accuracy: 0.9983 - val_loss: 0.0270 - val_accuracy: 0.9959
185/185 - 23s - loss: 0.0230 - accuracy: 0.9966 - 23s/epoch - 124ms/step
Test Accuracy: 0.9966
Model saved: 'bilstm_model.keras'
Experiment 6 (DL - GRU)¶
X = df['content_clean']
y = df['label']
num_classes = len(y.unique())
print(f'Number of classes: {num_classes}')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Tokenize the text
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
# Convert the text to integer sequences
X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_test_sequences = tokenizer.texts_to_sequences(X_test)
# Padding sequences
max_length = 100
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')
# One-hot encode the labels
y_train_categorical = to_categorical(y_train, num_classes=num_classes)
y_test_categorical = to_categorical(y_test, num_classes=num_classes)
# GRU model
model = Sequential()
model.add(Embedding(input_dim=5000, output_dim=128, input_length=max_length))
model.add(Bidirectional(GRU(128, dropout=0.2, return_sequences=True)))
model.add(Bidirectional(GRU(128)))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Callback
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# Train the model
history = model.fit(X_train_padded, y_train_categorical, epochs=10, batch_size=32, validation_data=(X_test_padded, y_test_categorical), callbacks=[early_stop])
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test_categorical, verbose=2)
print(f'Test Accuracy: {accuracy:.4f}')
# Save the model
model.save(BaseDir + 'gru_model.h5')
print("Model saved: 'gru_model.h5'")
Number of classes: 3
Epoch 1/10
739/739 [==============================] - 603s 801ms/step - loss: 0.1238 - accuracy: 0.9537 - val_loss: 0.0260 - val_accuracy: 0.9956
Epoch 2/10
739/739 [==============================] - 570s 771ms/step - loss: 0.0158 - accuracy: 0.9969 - val_loss: 0.0230 - val_accuracy: 0.9966
Epoch 3/10
739/739 [==============================] - 552s 748ms/step - loss: 0.0108 - accuracy: 0.9979 - val_loss: 0.0228 - val_accuracy: 0.9961
Epoch 4/10
739/739 [==============================] - 570s 771ms/step - loss: 0.0122 - accuracy: 0.9976 - val_loss: 0.0204 - val_accuracy: 0.9966
Epoch 5/10
739/739 [==============================] - 580s 786ms/step - loss: 0.0099 - accuracy: 0.9975 - val_loss: 0.0354 - val_accuracy: 0.9966
Epoch 6/10
739/739 [==============================] - 561s 760ms/step - loss: 0.0091 - accuracy: 0.9981 - val_loss: 0.0284 - val_accuracy: 0.9948
Epoch 7/10
739/739 [==============================] - 560s 757ms/step - loss: 0.0089 - accuracy: 0.9981 - val_loss: 0.0218 - val_accuracy: 0.9976
185/185 - 31s - loss: 0.0204 - accuracy: 0.9966 - 31s/epoch - 166ms/step
Test Accuracy: 0.9966
Model saved: 'gru_model.h5'
Conclusion¶
Based on the analysis, the Shopee app in Singapore predominantly receives five-star scores, which indicates that users there are very satisfied with the service.
Across the experiments, the best machine-learning model was:
- Model: Random Forest with SMOTE (from the GridSearch experiment)
- Training accuracy: ≈ 99.4%
- Validation (test) accuracy: ≈ 97.3%
The best deep-learning model was:
- Model: BiLSTM
- Training accuracy: ≈ 99.8%
- Validation (test) accuracy: ≈ 99.7%
Going forward, these models can be used to predict the sentiment of new reviews.
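As a sketch of how the saved artifacts could be used later for inference (the file name is one of the pipelines saved in Experiment 1, and the sample review is made up), new text just needs to go through the same preprocessing chain before being passed to the pipeline:
import joblib

# Load one of the saved pipelines (TF-IDF + classifier) and score a new review
pipeline = joblib.load(BaseDir + 'models/SVM_None.pkl')
id_to_label = {0: 'negative', 1: 'positive', 2: 'neutral'}

new_review = "App keeps lagging and I am unable to check out"   # hypothetical review
cleaned = toSentence(filteringText(tokenizingText(cleaningText(new_review))))
predicted = pipeline.predict([cleaned])[0]
print(id_to_label[predicted])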
~ Thank you !