import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix

# Load dataset
# Reads the labelled dataset from "dataset-label.csv" in the working directory
# into a DataFrame; the `Cluster_PCA` column is later used as the target.
df = pd.read_csv("dataset-label.csv")

# Preview the first two rows (rendered only when run as a notebook cell).
df.head(2)

# Print column names, non-null counts and dtypes to confirm the schema.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7551 entries, 0 to 7550
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Order_ID       7551 non-null   object 
 1   Customer_ID    7551 non-null   object 
 2   Customer_Type  7551 non-null   object 
 3   Product        7551 non-null   object 
 4   Category       7551 non-null   object 
 5   Unit_Price     7551 non-null   float64
 6   Quantity       7551 non-null   float64
 7   Discount       7551 non-null   float64
 8   Total_Price    7551 non-null   float64
 9   Region         7551 non-null   object 
 10  Order_Date     7551 non-null   object 
 11  Cluster_PCA    7551 non-null   int64  
dtypes: float64(4), int64(1), object(7)
memory usage: 708.0+ KB

# Drop the columns that are not used as model features
df = df.drop(columns=['Order_ID', 'Customer_ID', 'Order_Date'])

# Separate the target label from the feature columns
y = df['Cluster_PCA']
X = df.drop(columns='Cluster_PCA')

# Stratified train-test split: 20% of the rows are held out for testing,
# with a fixed seed so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, stratify=y, test_size=0.2)

# Report the shape of each split
for split_name, split in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}:", split.shape)

Shape of X_train: (6040, 8)
Shape of X_test: (1511, 8)
Shape of y_train: (6040,)
Shape of y_test: (1511,)

# Preprocessing
# Derive the column groups from the training features. `numerical_cols` and
# `categorical_cols` are not assigned anywhere in the visible source, so the
# ColumnTransformer below would otherwise fail with a NameError.
numerical_cols = X_train.select_dtypes(include='number').columns.tolist()
categorical_cols = X_train.select_dtypes(exclude='number').columns.tolist()

# Standardize the numeric features and one-hot encode the categorical ones.
# The first level of each categorical column is dropped, and categories not
# seen during fit are ignored at transform time instead of raising.
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), categorical_cols)
])

# Candidate classifiers, keyed by the display name used in the result tables
# and plots. Every estimator gets the same fixed seed for reproducibility.
models = {}
models['Random Forest'] = RandomForestClassifier(random_state=42)
models['Logistic Regression'] = LogisticRegression(max_iter=1000, random_state=42)
models['Gradient Boosting'] = GradientBoostingClassifier(random_state=42)

# Build Model & Training
def _weighted_scores(split, y_true, y_pred):
    """Return accuracy and weighted precision/recall/F1 for one data split.

    Args:
        split: Prefix for the metric names, e.g. ``'Train'`` or ``'Test'``.
        y_true: Ground-truth labels.
        y_pred: Labels predicted by the model.

    Returns:
        Dict mapping ``'<split> Accuracy'``, ``'<split> Precision'``,
        ``'<split> Recall'`` and ``'<split> F1-Score'`` to their values.
    """
    # zero_division=0 scores a class with no predicted samples as 0 rather
    # than emitting a warning.
    weighted = {'average': 'weighted', 'zero_division': 0}
    return {
        f'{split} Accuracy': accuracy_score(y_true, y_pred),
        f'{split} Precision': precision_score(y_true, y_pred, **weighted),
        f'{split} Recall': recall_score(y_true, y_pred, **weighted),
        f'{split} F1-Score': f1_score(y_true, y_pred, **weighted),
    }


train_results = []
test_results = []
trained_models = {}
y_preds_test = {}

for model_name, model in models.items():
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)

    # Keep the fitted pipeline and its test predictions for the confusion
    # matrix plots
    trained_models[model_name] = pipeline
    y_preds_test[model_name] = pipeline.predict(X_test)

    # Metrics on the training split
    y_pred_train = pipeline.predict(X_train)
    train_results.append({
        'Model': model_name,
        **_weighted_scores('Train', y_train, y_pred_train),
    })

    # Metrics on the held-out test split; these feed `test_results_df`,
    # which the results section displays
    test_results.append({
        'Model': model_name,
        **_weighted_scores('Test', y_test, y_preds_test[model_name]),
    })

# Result tables, one row per model
train_results_df = pd.DataFrame(train_results)
test_results_df = pd.DataFrame(test_results)
train_results_df

# Confusion matrix
def plot_confusion_matrix(model_name):
    """Plot the test-set confusion matrix of one trained model as a heatmap.

    Args:
        model_name: Key into ``y_preds_test`` identifying the model.

    Returns:
        None. If ``model_name`` has no stored predictions, a message is
        printed and nothing is plotted.
    """
    try:
        predictions = y_preds_test[model_name]
    except KeyError:
        print(f"Model '{model_name}' tidak ditemukan.")
        return

    matrix = confusion_matrix(y_test, predictions)

    # Draw on an explicit axes object instead of the implicit current figure
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues', cbar=False, ax=ax)
    ax.set_title(f'Confusion Matrix - {model_name}')
    ax.set_xlabel('Predicted Label')
    ax.set_ylabel('True Label')
    fig.tight_layout()
    plt.show()

# Plot the confusion matrix of every trained model, one figure per model
for name in ("Random Forest", "Gradient Boosting", "Logistic Regression"):
    plot_confusion_matrix(name)

# Hyperparameter tuning was skipped because the baseline models already perform well.

# Post-tuning evaluation was skipped because no tuning was done; the baseline models already perform well.

# Show the per-model training metrics table. The bare expression on the last
# line is rendered as a table only when run as a notebook cell.
print("Hasil Training:")
train_results_df

Hasil Training:

# Show the per-model test metrics table. `test_results_df` must already exist
# when this runs; the bare expression on the last line is rendered as a table
# only when run as a notebook cell.
print("Hasil Testing:")
test_results_df

Hasil Testing:

	Order_ID	Customer_ID	Customer_Type	Product	Category	Unit_Price	Quantity	Discount	Total_Price	Region	Order_Date	Cluster_PCA
0	ORD100001	CUS1112	B2C	Mango Juice	Juices	3.12	1.0	0.0	3.12	Sachsen	2021-03-16	4
1	ORD1000934	CUS9934	B2B	Cranberry Juice	Juices	3.41	26.0	0.1	79.79	Nordrhein-Westfalen	2021-11-11	3

	Model	Train Accuracy	Train Precision	Train Recall	Train F1-Score
0	Random Forest	1.0	1.0	1.0	1.0
1	Logistic Regression	1.0	1.0	1.0	1.0
2	Gradient Boosting	1.0	1.0	1.0	1.0

	Model	Test Accuracy	Test Precision	Test Recall	Test F1-Score
0	Random Forest	1.0	1.0	1.0	1.0
1	Logistic Regression	1.0	1.0	1.0	1.0
2	Gradient Boosting	1.0	1.0	1.0	1.0

	Model	Train Accuracy	Train Precision	Train Recall	Train F1-Score
0	Random Forest	1.0	1.0	1.0	1.0
1	Logistic Regression	1.0	1.0	1.0	1.0
2	Gradient Boosting	1.0	1.0	1.0	1.0

	Model	Test Accuracy	Test Precision	Test Recall	Test F1-Score
0	Random Forest	1.0	1.0	1.0	1.0
1	Logistic Regression	1.0	1.0	1.0	1.0
2	Gradient Boosting	1.0	1.0	1.0	1.0

1. Import Library

2. Memuat Dataset dari Hasil Clustering

3. Data Splitting

4. Membangun Model Klasifikasi

a. Membangun Model Klasifikasi

Tentang model:

b. Evaluasi Model Klasifikasi

c. Tuning Model Klasifikasi (Optional)

d. Evaluasi Model Klasifikasi setelah Tuning (Optional)

e. Analisis Hasil Evaluasi Model Klasifikasi