import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re

# Load the raw car listings from the local CSV file.
df = pd.read_csv('dataset/Car details v3.csv')

# Preview the first rows of the loaded table.
df.head()

# Show the (rows, columns) size of the loaded table.
df.shape

(8128, 13)

# Summary statistics of the numeric columns, shown one column per row.
df.describe().transpose()

# Discard the torque column; nothing below reads it.
df = df.drop(columns=['torque'])
df.head()

# Number of rows that repeat an earlier row exactly.
df.duplicated().sum()

1202

# Remove the duplicated rows counted above, then check the new size.
df = df.drop_duplicates()
df.shape

(6926, 12)

# Count the missing values in each column.
df.isnull().sum()

name               0
year               0
selling_price      0
km_driven          0
fuel               0
seller_type        0
transmission       0
owner              0
mileage          208
engine           208
max_power        205
seats            208
dtype: int64

# Discard every row that has at least one missing value.
df = df.dropna(axis='index')

df.shape

(6718, 12)

# Replace the model year with the car's age measured from 2024.
df['age'] = 2024 - df.pop('year')

# The first word of the listing name becomes the brand, placed as the
# leading column; the full name itself is not kept.
brand = df.pop('name').str.split(' ').str[0]
df.insert(0, 'brand', brand)

df.head()

def remove_unit_and_convert(df, col_name, to_type=float):
    """Strip the trailing unit from a column and convert it to numbers.

    Each value is turned into text, everything from the first space on is
    discarded (``"23.4 kmpl"`` -> ``"23.4"``) and the remainder is parsed
    as a number. Text that cannot be parsed becomes NaN.

    Args:
        df: DataFrame holding the column; it is modified in place.
        col_name: Name of the column to clean.
        to_type: Target dtype for the cleaned column.

    Returns:
        The same DataFrame, with ``col_name`` replaced by numeric values.
        When ``to_type`` cannot hold the parsed values (for example an
        integer type while some entries are NaN), the column keeps the
        parsed floating-point values so the caller can drop or impute the
        gaps instead of hitting a conversion error.
    """
    leading_token = df[col_name].astype(str).str.split(' ').str[0]
    numeric = pd.to_numeric(leading_token, errors='coerce')
    try:
        df[col_name] = numeric.astype(to_type)
    except ValueError:
        # NaN has no integer representation; keep the float column.
        df[col_name] = numeric
    return df

# Target dtype for each text column that carries a unit suffix.
unit_columns = {'mileage': float, 'engine': int, 'max_power': float}
for unit_column, target_type in unit_columns.items():
    df = remove_unit_and_convert(df, unit_column, target_type)

df.head()

df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6718 entries, 0 to 8125
Data columns (total 12 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   brand          6718 non-null   object 
 1   selling_price  6718 non-null   int64  
 2   km_driven      6718 non-null   int64  
 3   fuel           6718 non-null   object 
 4   seller_type    6718 non-null   object 
 5   transmission   6718 non-null   object 
 6   owner          6718 non-null   object 
 7   mileage        6718 non-null   float64
 8   engine         6718 non-null   int32  
 9   max_power      6717 non-null   float64
 10  seats          6718 non-null   float64
 11  age            6718 non-null   int64  
dtypes: float64(3), int32(1), int64(3), object(5)
memory usage: 656.1+ KB

# Rows whose value could not be parsed during unit removal are NaN now;
# drop them before filtering.
df = df.dropna(axis='index')

# Keep only listings inside the accepted ranges; CNG and LPG cars are
# excluded as well.
within_limits = (
    (df['selling_price'] < 2500000)
    & (df['km_driven'] < 300000)
    & ~df['fuel'].isin(['CNG', 'LPG'])
    & df['mileage'].between(5, 35)
    & (df['max_power'] < 300)
)
df_clean = df.loc[within_limits].copy()

# Logarithmic transformation of the price, power and age columns.
for log_column in ('selling_price', 'max_power', 'age'):
    df_clean[log_column] = np.log(df_clean[log_column])

df_clean.head()

fig, axes = plt.subplots(2, 3, figsize=(18, 10))

# Plot 1: the five brands with the most vehicles.
top_5_brands = df['brand'].value_counts().nlargest(5)
brand_panel = axes[0, 0]
sns.barplot(x=top_5_brands.index, y=top_5_brands.values, ax=brand_panel)
brand_panel.set_title('5 Brand dengan Jumlah Kendaraan Terbanyak')
brand_panel.set_xlabel('Brand')
brand_panel.set_ylabel('Jumlah Kendaraan')

# Plots 2-5: one count plot per categorical column, described as
# (column, target axes, title, x-axis label).
count_plots = [
    ('fuel', axes[0, 1], 'Distribusi Jenis Bahan Bakar', 'Jenis Bahan Bakar'),
    ('transmission', axes[0, 2], 'Distribusi Jenis Transmisi', 'Jenis Transmisi'),
    ('seller_type', axes[1, 0], 'Distribusi Seller Type', 'Jenis Seller Type'),
    ('owner', axes[1, 1], 'Distribusi Owner', 'Jenis Owner'),
]
for feature, panel, panel_title, panel_xlabel in count_plots:
    sns.countplot(x=feature, data=df, ax=panel)
    panel.set_title(panel_title)
    panel.set_xlabel(panel_xlabel)
    panel.set_ylabel('Jumlah Kendaraan')

# Tilt the owner tick labels.
axes[1, 1].tick_params(axis='x', rotation=25)

# The sixth slot of the 2x3 grid is unused.
fig.delaxes(axes[1, 2])
plt.tight_layout()

plt.show()

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

columns = ['selling_price', 'km_driven', 'mileage', 'engine', 'max_power', 'age']
titles = ["Selling Price", "Total KM Driven", "Fuel Efficiency in KM per litre",
          "Engine CC", "Brake Horse Power(BHP)", "Age of Car"]

# axes.flat walks the 2x3 grid row by row, so each column gets the next
# free panel in reading order.
for panel, column, title in zip(axes.flat, columns, titles):
    panel.hist(df[column], bins=20, color='blue', edgecolor='black')
    panel.set_title(title)
    panel.set_xlabel(column.replace('_', ' ').title())
    panel.set_ylabel('Frequency')

plt.tight_layout()

# Placed slightly above the grid (y > 1) after the layout is tightened.
fig.suptitle("Distribution of Numerical Data", fontsize=16, y=1.02)

plt.show()

fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))

columns = ['selling_price', 'km_driven', 'mileage', 'engine', 'max_power', 'age']
titles = ["Selling Price", "Total KM Driven", "Fuel Efficiency in KM per litre",
          "Engine CC", "BHP", "Age of Car"]

# One box plot per numeric column, filling the 2x3 grid row by row.
for panel, column, title in zip(axes.flat, columns, titles):
    panel.boxplot(df[column].dropna())
    panel.set_title(title)
    panel.set_xlabel(column.replace('_', ' ').title())

plt.tight_layout()

# Placed slightly above the grid (y > 1) after the layout is tightened.
fig.suptitle("Distribution of Numerical Data", fontsize=16, y=1.02)
plt.show()

fig, ax = plt.subplots(figsize=(10, 6))
correlation_matrix = df.corr(numeric_only=True)

# Mask the upper triangle (diagonal included) so each pair of columns is
# annotated only once.
mask = np.triu(np.ones(correlation_matrix.shape, dtype=bool))

sns.heatmap(
    correlation_matrix,
    mask=mask,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,
    ax=ax,
)
ax.set_title("Correlation Heatmap")
plt.show()

# Pairwise relationships between the columns of df.
sns.pairplot(df)

<seaborn.axisgrid.PairGrid at 0x18417325290>

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV

# Features and target; selling_price in df_clean is already log-transformed.
X = df_clean.drop('selling_price', axis=1)
y = df_clean['selling_price']

# Select numeric columns by kind rather than by exact width. 'engine' can
# be stored as int32 (see the df.info() output), which an
# ['int64', 'float64'] filter skips; a column that is in neither list is
# dropped by the ColumnTransformer and never reaches the model.
numerical_features = X.select_dtypes(include='number').columns.tolist()
categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

# Hold out 20% of the rows for evaluation; the seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Numeric columns: median imputation followed by standardisation.
# Categorical columns: fill gaps with the literal 'missing', then map each
# category to an integer code; categories unseen during fit are encoded as -1.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='median')),
            ('scaler', StandardScaler())
        ]), numerical_features),
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
            ('ordinal', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
        ]), categorical_features)
    ])

def create_pipeline(model):
    """Build a preprocessing + regression pipeline around ``model``.

    Args:
        model: Estimator used as the final 'regressor' step.

    Returns:
        A ``Pipeline`` with a 'preprocessor' step followed by a
        'regressor' step.
    """
    from sklearn.base import clone

    # Give each pipeline its own unfitted copy of the module-level
    # preprocessor. Sharing the single instance would mean that fitting
    # one pipeline refits the transformer inside every pipeline built
    # earlier.
    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('regressor', model)
    ])

# Seed the tree ensembles (same seed as the train/test split) so the scores
# below are reproducible and any later refit of these models yields the
# same predictions.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

# Fit every model with its default settings and score it on the test split.
results = []
for model_name, model in models.items():
    pipeline = create_pipeline(model)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # y is the log of the selling price, so MSE is in squared log units.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append({
        'Model': model_name,
        'MSE': round(mse, 2),
        'R^2': round(r2, 2)
    })

results_df = pd.DataFrame(results)
results_df

# Test-set predictions of every baseline model, keyed by model name.
predictions = {}
for model_name, model in models.items():
    pipeline = create_pipeline(model)
    pipeline.fit(X_train, y_train)
    predictions[model_name] = pipeline.predict(X_test)

plt.figure(figsize=(15, 10))

# Create both panels once and draw on them explicitly instead of
# re-selecting them with plt.subplot() on every loop iteration.
scatter_ax = plt.subplot(2, 2, 1)
residual_ax = plt.subplot(2, 2, 2)

for model_name, y_pred in predictions.items():
    scatter_ax.scatter(y_test, y_pred, label=model_name, alpha=0.5)
    residuals = y_test - y_pred
    sns.histplot(residuals, kde=True, label=model_name, alpha=0.5, ax=residual_ax)

# The reference line, labels and legends do not depend on the model, so
# they are drawn once after all models are plotted.
value_range = [y_test.min(), y_test.max()]
scatter_ax.plot(value_range, value_range, 'k--', lw=2)
scatter_ax.set_xlabel('Actual Selling Price')
scatter_ax.set_ylabel('Predicted Selling Price')
scatter_ax.set_title('Actual vs Predicted Selling Price')
scatter_ax.legend()

residual_ax.set_xlabel('Residuals')
residual_ax.set_ylabel('Frequency')
residual_ax.set_title('Residuals Distribution')
residual_ax.legend()

plt.tight_layout()
plt.show()

# Hyper-parameter grid per model. Keys use the 'regressor__' prefix because
# the model is the 'regressor' step of the pipeline. Linear Regression has
# nothing to tune, so its grid is empty.
param_grids = {
    'Linear Regression': {},
    'Random Forest': {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__max_depth': [None, 10, 20, 30],
        'regressor__min_samples_split': [2, 5, 10],
    },
    'Gradient Boosting': {
        'regressor__n_estimators': [50, 100, 200],
        'regressor__learning_rate': [0.01, 0.1, 0.2],
        'regressor__max_depth': [3, 5, 7],
    }
}

# Fresh estimators for the search. The ensembles are seeded (same seed as
# the train/test split) so the chosen parameters and scores are repeatable.
models = {
    'Linear Regression': LinearRegression(),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

results = []
best_estimators = {}

for model_name, model in models.items():
    # 5-fold cross-validated search over this model's parameter grid.
    search = GridSearchCV(
        create_pipeline(model),
        param_grid=param_grids[model_name],
        cv=5,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # Keep the best pipeline found for this model.
    best_estimators[model_name] = search.best_estimator_

    # Score the best pipeline on the held-out test split.
    test_pred = search.predict(X_test)
    results.append({
        'Model': model_name,
        'Best Params': search.best_params_,
        'MSE': round(mean_squared_error(y_test, test_pred), 2),
        'R^2': round(r2_score(y_test, test_pred), 2),
    })

results_df = pd.DataFrame(results)
results_df

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# Linear Regression is skipped; only the two ensemble models are plotted.
ensemble_models = [
    (name, estimator)
    for name, estimator in best_estimators.items()
    if name in ('Random Forest', 'Gradient Boosting')
]

# zip stops when the two panels are used up, so at most two models are drawn.
for panel, (model_name, model) in zip(axes, ensemble_models):
    importances = model.named_steps['regressor'].feature_importances_

    # Feature names in the order the preprocessor emits them: numeric
    # columns first, then the ordinal-encoded categorical columns.
    ordinal_encoder = (
        model.named_steps['preprocessor'].transformers_[1][1].named_steps['ordinal']
    )
    feature_names = np.array(
        numerical_features
        + list(ordinal_encoder.get_feature_names_out(categorical_features))
    )

    # Ascending order, so the most important feature ends up at the top.
    ranking = np.argsort(importances)
    panel.barh(feature_names[ranking], importances[ranking])
    panel.set_xlabel('Feature Importance')
    panel.set_title(f'Feature Importance for {model_name}')

plt.tight_layout()
plt.show()

# Reuse the tuned pipelines kept in best_estimators by the grid search
# above instead of running the whole search a second time. The plots then
# show exactly the models reported in results_df.
# NOTE(review): the removed second search ranked candidates by
# 'neg_mean_squared_error', whereas the first search uses GridSearchCV's
# default scoring. If MSE-based selection is required, set `scoring` on
# the first search rather than repeating it here.
predictions = {
    model_name: best_model.predict(X_test)
    for model_name, best_model in best_estimators.items()
}

# Plotting
plt.figure(figsize=(15, 10))

# Create both panels once and draw on them explicitly instead of
# re-selecting them with plt.subplot() on every loop iteration.
scatter_ax = plt.subplot(2, 2, 1)
residual_ax = plt.subplot(2, 2, 2)

for model_name, y_pred in predictions.items():
    # Actual vs predicted selling price
    scatter_ax.scatter(y_test, y_pred, label=model_name, alpha=0.5)

    # Residuals distribution
    residuals = y_test - y_pred
    sns.histplot(residuals, kde=True, label=model_name, alpha=0.5, ax=residual_ax)

# The reference line, labels and legends do not depend on the model, so
# they are drawn once after all models are plotted.
value_range = [y_test.min(), y_test.max()]
scatter_ax.plot(value_range, value_range, 'k--', lw=2)
scatter_ax.set_xlabel('Actual Selling Price')
scatter_ax.set_ylabel('Predicted Selling Price')
scatter_ax.set_title('Actual vs Predicted Selling Price')
scatter_ax.legend()

residual_ax.set_xlabel('Residuals')
residual_ax.set_ylabel('Frequency')
residual_ax.set_title('Residuals Distribution')
residual_ax.legend()

plt.tight_layout()
plt.show()

	name	year	selling_price	km_driven	fuel	seller_type	transmission	owner	mileage	engine	max_power	torque	seats
0	Maruti Swift Dzire VDI	2014	450000	145500	Diesel	Individual	Manual	First Owner	23.4 kmpl	1248 CC	74 bhp	190Nm@ 2000rpm	5.0
1	Skoda Rapid 1.5 TDI Ambition	2014	370000	120000	Diesel	Individual	Manual	Second Owner	21.14 kmpl	1498 CC	103.52 bhp	250Nm@ 1500-2500rpm	5.0
2	Honda City 2017-2020 EXi	2006	158000	140000	Petrol	Individual	Manual	Third Owner	17.7 kmpl	1497 CC	78 bhp	12.7@ 2,700(kgm@ rpm)	5.0
3	Hyundai i20 Sportz Diesel	2010	225000	127000	Diesel	Individual	Manual	First Owner	23.0 kmpl	1396 CC	90 bhp	22.4 kgm at 1750-2750rpm	5.0
4	Maruti Swift VXI BSIII	2007	130000	120000	Petrol	Individual	Manual	First Owner	16.1 kmpl	1298 CC	88.2 bhp	11.5@ 4,500(kgm@ rpm)	5.0

	count	mean	std	min	25%	50%	75%	max
year	8128.0	2013.804011	4.044249	1983.0	2011.0	2015.0	2017.0	2020.0
selling_price	8128.0	638271.807702	806253.403508	29999.0	254999.0	450000.0	675000.0	10000000.0
km_driven	8128.0	69819.510827	56550.554958	1.0	35000.0	60000.0	98000.0	2360457.0
seats	7907.0	5.416719	0.959588	2.0	5.0	5.0	5.0	14.0

	name	year	selling_price	km_driven	fuel	seller_type	transmission	owner	mileage	engine	max_power	seats
0	Maruti Swift Dzire VDI	2014	450000	145500	Diesel	Individual	Manual	First Owner	23.4 kmpl	1248 CC	74 bhp	5.0
1	Skoda Rapid 1.5 TDI Ambition	2014	370000	120000	Diesel	Individual	Manual	Second Owner	21.14 kmpl	1498 CC	103.52 bhp	5.0
2	Honda City 2017-2020 EXi	2006	158000	140000	Petrol	Individual	Manual	Third Owner	17.7 kmpl	1497 CC	78 bhp	5.0
3	Hyundai i20 Sportz Diesel	2010	225000	127000	Diesel	Individual	Manual	First Owner	23.0 kmpl	1396 CC	90 bhp	5.0
4	Maruti Swift VXI BSIII	2007	130000	120000	Petrol	Individual	Manual	First Owner	16.1 kmpl	1298 CC	88.2 bhp	5.0

	brand	selling_price	km_driven	fuel	seller_type	transmission	owner	mileage	engine	max_power	seats	age
0	Maruti	450000	145500	Diesel	Individual	Manual	First Owner	23.4 kmpl	1248 CC	74 bhp	5.0	10
1	Skoda	370000	120000	Diesel	Individual	Manual	Second Owner	21.14 kmpl	1498 CC	103.52 bhp	5.0	10
2	Honda	158000	140000	Petrol	Individual	Manual	Third Owner	17.7 kmpl	1497 CC	78 bhp	5.0	18
3	Hyundai	225000	127000	Diesel	Individual	Manual	First Owner	23.0 kmpl	1396 CC	90 bhp	5.0	14
4	Maruti	130000	120000	Petrol	Individual	Manual	First Owner	16.1 kmpl	1298 CC	88.2 bhp	5.0	17

	brand	selling_price	km_driven	fuel	seller_type	transmission	owner	mileage	engine	max_power	seats	age
0	Maruti	450000	145500	Diesel	Individual	Manual	First Owner	23.40	1248	74.00	5.0	10
1	Skoda	370000	120000	Diesel	Individual	Manual	Second Owner	21.14	1498	103.52	5.0	10
2	Honda	158000	140000	Petrol	Individual	Manual	Third Owner	17.70	1497	78.00	5.0	18
3	Hyundai	225000	127000	Diesel	Individual	Manual	First Owner	23.00	1396	90.00	5.0	14
4	Maruti	130000	120000	Petrol	Individual	Manual	First Owner	16.10	1298	88.20	5.0	17

Car Price Prediction¶

Import Library¶

Data Loading¶

> Delete Column¶

> Duplicate Data¶

> Missing Values¶

Data Preprocessing¶

Fitur baru¶

Remove unit¶

Menghapus outlier (Filter)¶

EDA¶

Modeling¶

Tanpa Grid Search¶

Plot Model Tanpa Gridsearch¶

Dengan Grid Search¶

Feature Importance¶

Plot Model - Gridsearch¶

Kesimpulan:¶

	brand	selling_price	km_driven	fuel	seller_type	transmission	owner	mileage	engine	max_power	seats	age
0	Maruti	13.017003	145500	Diesel	Individual	Manual	First Owner	23.40	1248	4.304065	5.0	2.302585
1	Skoda	12.821258	120000	Diesel	Individual	Manual	Second Owner	21.14	1498	4.639765	5.0	2.302585
2	Honda	11.970350	140000	Petrol	Individual	Manual	Third Owner	17.70	1497	4.356709	5.0	2.890372
3	Hyundai	12.323856	127000	Diesel	Individual	Manual	First Owner	23.00	1396	4.499810	5.0	2.639057
4	Maruti	11.775290	120000	Petrol	Individual	Manual	First Owner	16.10	1298	4.479607	5.0	2.833213

	Model	MSE	R^2
0	Linear Regression	0.10	0.81
1	Random Forest	0.05	0.91
2	Gradient Boosting	0.05	0.91