import os
# For Processing/Manupulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px 
import seaborn as sns

import warnings
warnings.filterwarnings("ignore")
from sys import displayhook

# For show all inside dataframe
pd.set_option("display.max_columns",None)
pd.set_option("display.max_rows",None)

def get_all_cols(df):
    all_col = list(df.columns)
    return all_col

def get_cat_cols(df):
    cat=list(df.columns[df.dtypes=='object'])
    return cat

def get_num_cols(df):
    num=list(df.columns[df.dtypes=='int64'])+list(df.columns[df.dtypes=='int32'])+list(df.columns[df.dtypes=='float64'])
    return num

def get_head(df):
  print(f'Shape : {df.shape}')
  displayhook(df.head())

def check_missing(df, percentage=0):
    percent=df.isnull().sum()/df.shape[0]*100
    freq=df.isnull().sum()
    types=df.dtypes
    df_miss=pd.DataFrame({'percentage':percent, 'frequency':freq, 'var_type':types})
    df_miss.sort_values(by='frequency',ascending= False, inplace=True)
    # return df_miss[df_miss['percentage']>percentage]
    return df_miss[df_miss['percentage']>percentage]

def get_missing(df):
    null = pd.DataFrame(df.isnull().sum(),columns=["Null Values"])
    null["% Missing Values"] = (df.isna().sum()/len(df)*100)
    null = null[null["% Missing Values"] > 0]
    most_null = null.sort_values('% Missing Values', ascending=False)
    most_null.style.background_gradient(cmap='viridis',low = 0.2,high = 0.1)
    return most_null

df = pd.read_csv('dataset/application_train.csv')

# Memilih kolom
df = df[['TARGET','NAME_CONTRACT_TYPE','CODE_GENDER','DAYS_BIRTH','FLAG_OWN_CAR', 'FLAG_OWN_REALTY',
        'CNT_CHILDREN','AMT_INCOME_TOTAL','AMT_CREDIT','AMT_ANNUITY','AMT_GOODS_PRICE','NAME_TYPE_SUITE',
        'NAME_INCOME_TYPE','NAME_EDUCATION_TYPE','NAME_FAMILY_STATUS','NAME_HOUSING_TYPE','REGION_POPULATION_RELATIVE',
        'OWN_CAR_AGE','FLAG_MOBIL','FLAG_EMP_PHONE','FLAG_WORK_PHONE','FLAG_CONT_MOBILE','FLAG_PHONE','FLAG_EMAIL',
        'OCCUPATION_TYPE','CNT_FAM_MEMBERS','REGION_RATING_CLIENT','REGION_RATING_CLIENT_W_CITY',
        'WEEKDAY_APPR_PROCESS_START','HOUR_APPR_PROCESS_START','REG_REGION_NOT_LIVE_REGION','REG_REGION_NOT_WORK_REGION',
        'LIVE_REGION_NOT_WORK_REGION','ORGANIZATION_TYPE','OBS_30_CNT_SOCIAL_CIRCLE','DEF_30_CNT_SOCIAL_CIRCLE',
        'OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE','DAYS_LAST_PHONE_CHANGE','AMT_REQ_CREDIT_BUREAU_HOUR',
        'AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_WEEK','AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_QRT',
        'AMT_REQ_CREDIT_BUREAU_YEAR'
        ]]

get_head(df)

Shape : (307511, 45)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 307511 entries, 0 to 307510
Data columns (total 45 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   TARGET                       307511 non-null  int64  
 1   NAME_CONTRACT_TYPE           307511 non-null  object 
 2   CODE_GENDER                  307511 non-null  object 
 3   DAYS_BIRTH                   307511 non-null  int64  
 4   FLAG_OWN_CAR                 307511 non-null  object 
 5   FLAG_OWN_REALTY              307511 non-null  object 
 6   CNT_CHILDREN                 307511 non-null  int64  
 7   AMT_INCOME_TOTAL             307511 non-null  float64
 8   AMT_CREDIT                   307511 non-null  float64
 9   AMT_ANNUITY                  307499 non-null  float64
 10  AMT_GOODS_PRICE              307233 non-null  float64
 11  NAME_TYPE_SUITE              306219 non-null  object 
 12  NAME_INCOME_TYPE             307511 non-null  object 
 13  NAME_EDUCATION_TYPE          307511 non-null  object 
 14  NAME_FAMILY_STATUS           307511 non-null  object 
 15  NAME_HOUSING_TYPE            307511 non-null  object 
 16  REGION_POPULATION_RELATIVE   307511 non-null  float64
 17  OWN_CAR_AGE                  104582 non-null  float64
 18  FLAG_MOBIL                   307511 non-null  int64  
 19  FLAG_EMP_PHONE               307511 non-null  int64  
 20  FLAG_WORK_PHONE              307511 non-null  int64  
 21  FLAG_CONT_MOBILE             307511 non-null  int64  
 22  FLAG_PHONE                   307511 non-null  int64  
 23  FLAG_EMAIL                   307511 non-null  int64  
 24  OCCUPATION_TYPE              211120 non-null  object 
 25  CNT_FAM_MEMBERS              307509 non-null  float64
 26  REGION_RATING_CLIENT         307511 non-null  int64  
 27  REGION_RATING_CLIENT_W_CITY  307511 non-null  int64  
 28  WEEKDAY_APPR_PROCESS_START   307511 non-null  object 
 29  HOUR_APPR_PROCESS_START      307511 non-null  int64  
 30  REG_REGION_NOT_LIVE_REGION   307511 non-null  int64  
 31  REG_REGION_NOT_WORK_REGION   307511 non-null  int64  
 32  LIVE_REGION_NOT_WORK_REGION  307511 non-null  int64  
 33  ORGANIZATION_TYPE            307511 non-null  object 
 34  OBS_30_CNT_SOCIAL_CIRCLE     306490 non-null  float64
 35  DEF_30_CNT_SOCIAL_CIRCLE     306490 non-null  float64
 36  OBS_60_CNT_SOCIAL_CIRCLE     306490 non-null  float64
 37  DEF_60_CNT_SOCIAL_CIRCLE     306490 non-null  float64
 38  DAYS_LAST_PHONE_CHANGE       307510 non-null  float64
 39  AMT_REQ_CREDIT_BUREAU_HOUR   265992 non-null  float64
 40  AMT_REQ_CREDIT_BUREAU_DAY    265992 non-null  float64
 41  AMT_REQ_CREDIT_BUREAU_WEEK   265992 non-null  float64
 42  AMT_REQ_CREDIT_BUREAU_MON    265992 non-null  float64
 43  AMT_REQ_CREDIT_BUREAU_QRT    265992 non-null  float64
 44  AMT_REQ_CREDIT_BUREAU_YEAR   265992 non-null  float64
dtypes: float64(18), int64(15), object(12)
memory usage: 105.6+ MB

df.describe()

df.describe(include ='O').T.sort_values(by='unique')

df_obj = pd.DataFrame(df.select_dtypes('object').nunique()).reset_index()
df_obj.columns = ['Column','N Unique Values']

## ekstrak unique value
df_obj['Unique Values'] = df_obj['Column'].apply(lambda x: df[x].unique())
df_obj.sort_values(by='N Unique Values')

# Unique value for categorical columns
for col in df.select_dtypes('object'):
    print(df[col].unique().tolist())

['Cash loans', 'Revolving loans']
['M', 'F', 'XNA']
['N', 'Y']
['Y', 'N']
['Unaccompanied', 'Family', 'Spouse, partner', 'Children', 'Other_A', nan, 'Other_B', 'Group of people']
['Working', 'State servant', 'Commercial associate', 'Pensioner', 'Unemployed', 'Student', 'Businessman', 'Maternity leave']
['Secondary / secondary special', 'Higher education', 'Incomplete higher', 'Lower secondary', 'Academic degree']
['Single / not married', 'Married', 'Civil marriage', 'Widow', 'Separated', 'Unknown']
['House / apartment', 'Rented apartment', 'With parents', 'Municipal apartment', 'Office apartment', 'Co-op apartment']
['Laborers', 'Core staff', 'Accountants', 'Managers', nan, 'Drivers', 'Sales staff', 'Cleaning staff', 'Cooking staff', 'Private service staff', 'Medicine staff', 'Security staff', 'High skill tech staff', 'Waiters/barmen staff', 'Low-skill Laborers', 'Realty agents', 'Secretaries', 'IT staff', 'HR staff']
['WEDNESDAY', 'MONDAY', 'THURSDAY', 'SUNDAY', 'SATURDAY', 'FRIDAY', 'TUESDAY']
['Business Entity Type 3', 'School', 'Government', 'Religion', 'Other', 'XNA', 'Electricity', 'Medicine', 'Business Entity Type 2', 'Self-employed', 'Transport: type 2', 'Construction', 'Housing', 'Kindergarten', 'Trade: type 7', 'Industry: type 11', 'Military', 'Services', 'Security Ministries', 'Transport: type 4', 'Industry: type 1', 'Emergency', 'Security', 'Trade: type 2', 'University', 'Transport: type 3', 'Police', 'Business Entity Type 1', 'Postal', 'Industry: type 4', 'Agriculture', 'Restaurant', 'Culture', 'Hotel', 'Industry: type 7', 'Trade: type 3', 'Industry: type 3', 'Bank', 'Industry: type 9', 'Insurance', 'Trade: type 6', 'Industry: type 2', 'Transport: type 1', 'Industry: type 12', 'Mobile', 'Trade: type 1', 'Industry: type 5', 'Industry: type 10', 'Legal Services', 'Advertising', 'Trade: type 5', 'Cleaning', 'Industry: type 13', 'Trade: type 4', 'Telecom', 'Industry: type 8', 'Realtor', 'Industry: type 6']

check_missing(df)

# Handling missing value
for i in df.columns:
    data = df[i]
    types = str(data.dtype)
    if types == 'object':
        df[i].fillna(df[i].mode()[0], inplace=True)
    else:
        df[i].fillna(df[i].median(), inplace=True)

check_missing(df)

#check duplicate train

duplicate = df[df.duplicated()]
print("Shape:", duplicate.shape)
duplicate

Shape: (11, 45)

df.drop_duplicates(inplace=True)
print(df.duplicated().sum())

0

# define target variable
target_variable = 'TARGET'

labels = df[target_variable].value_counts().keys().tolist()
values = df[target_variable].value_counts().values.tolist()
print(df.shape)
print(values)
fig1, ax1 = plt.subplots(figsize=(5,5))
ax1.pie(values, 
        labels=labels, 
#         explode=(0,0.1), 
        autopct='%1.1f%%',
        colors=['lightgreen', 'red'])
plt.legend()
plt.title('Grafik Customer loan (no/yes)')
plt.show()

(307500, 45)
[282676, 24824]

ms_acc = df[df['TARGET'] == 0].groupby(['NAME_FAMILY_STATUS'])['CODE_GENDER'].count().reset_index()
ms_acc.rename(columns = {'CODE_GENDER' : 'Customers'}, inplace=True)

ms_acc.sort_values(by='Customers', ascending=False)

ct = df.groupby(['TARGET', 'NAME_CONTRACT_TYPE'])['CODE_GENDER'].count().reset_index().sort_values(by = 'NAME_CONTRACT_TYPE')
ct.rename(columns = {'CODE_GENDER' : 'Customers'}, inplace=True)
        
name_of_ct = list(ct['NAME_CONTRACT_TYPE'].unique())
percentage = []

for i in name_of_ct:
    data = ct[ct['NAME_CONTRACT_TYPE'] == i]
    total = data['Customers'].sum()
    for x,y in enumerate(data['Customers']):
        pct = round(y / total, 2)
        percentage.append(pct)
        
ct['Percent'] = percentage
ct

ot = df.groupby(['TARGET', 'OCCUPATION_TYPE'])['CODE_GENDER'].count().reset_index().sort_values(by=['OCCUPATION_TYPE', 'TARGET'])
ot.rename(columns = {'CODE_GENDER' : 'Customers'}, inplace=True)

name_of_ot = list(ot['OCCUPATION_TYPE'].unique())
percentage = []

for i in name_of_ot:
    data = ot[ot['OCCUPATION_TYPE'] == i]
    total = data['Customers'].sum()
    for x,y in enumerate(data['Customers']):
        pct = round(y / total, 2)
        percentage.append(pct)

ot['percentage'] = percentage
ot

df['Age'] = np.floor(df['DAYS_BIRTH'] / -365).astype(int)
df.drop(columns= 'DAYS_BIRTH', inplace=True)

# Mendefinisakan categori umur applicant

age_cat = []
for i in df['Age']:
    if i < 36:
        age_cat.append('Young Adult')
    elif i < 51:
        age_cat.append('Adult')
    else:
        age_cat.append('Older')

df['Age_category'] = age_cat

g_age = df.groupby(['TARGET', 'Age_category'])['CODE_GENDER'].count().reset_index().sort_values('Age_category').reset_index()
g_age.rename(columns = {'CODE_GENDER' : 'Customers'}, inplace=True)
g_age.drop(columns = 'index', inplace=True)

name_of_ot = list(g_age['Age_category'].unique())
percentage = []

for i in name_of_ot:
    data = g_age[g_age['Age_category'] == i]
    total = data['Customers'].sum()
    for x,y in enumerate(data['Customers']):
        pct = round(y / total, 2)
        percentage.append(pct)

g_age['percentage'] = percentage

mapping_target = {
    1 : 'Have Problem',
    0 : 'Dont Have Problem'
}

g_age['TARGET'] = g_age['TARGET'].map(mapping_target)
g_age.sort_values(by='Customers', ascending=False)

fig = px.histogram(g_age, x='Age_category', y='Customers', width=500, height=400, color=target_variable, barmode="group")
fig.show()

itc = df.groupby(['TARGET', 'NAME_INCOME_TYPE', 'NAME_CONTRACT_TYPE'])['CODE_GENDER'].count().reset_index().sort_values(['NAME_INCOME_TYPE', 'NAME_CONTRACT_TYPE']).reset_index()
itc.rename(columns = {'CODE_GENDER' : 'Customers'}, inplace=True)
itc.drop(columns = 'index', inplace=True)

income_type   = list(itc['NAME_INCOME_TYPE'].unique())
percentage  = []

for i in income_type:
    data = itc[itc.NAME_INCOME_TYPE == i]
    contract_type = list(data['NAME_CONTRACT_TYPE'].unique())
    for x in contract_type:
        data2 = data[data.NAME_CONTRACT_TYPE == x]
        total = data2['Customers'].sum()
        for y,z in enumerate(data2['Customers']):
            pct = round(z / total, 2)
            percentage.append(pct)
            
itc['percentage'] = percentage
itc

fig = px.histogram(itc, x='NAME_INCOME_TYPE', y='Customers', width=500, height=400, color=target_variable, barmode="group")
fig.show()

df.drop(columns= ['Age', 'Age_category'], inplace=True)

nums = get_num_cols(df)

nums

['TARGET',
 'CNT_CHILDREN',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'OWN_CAR_AGE',
 'CNT_FAM_MEMBERS',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR']

# fig = px.histogram(df, x='Age', marginal='box', width=800, height=600, color=target_variable)
# fig.show()

fig = px.histogram(df, x='CNT_CHILDREN', marginal='box', width=800, height=600, color=target_variable)
fig.show()

fig = px.histogram(df, x='AMT_INCOME_TOTAL', marginal='box', width=800, height=600, color=target_variable)
fig.show()

fig = px.histogram(df, x='AMT_CREDIT', marginal='box', width=800, height=600, color=target_variable)
fig.show()

plt.figure(figsize=(12,8))
sns.heatmap(df.corr(),annot=True)
plt.show()

cats = get_cat_cols(df)

cats

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE']

# for i in cat_col:
#   fig = px.histogram(df, x=i, width=500, height=400, color=target_variable, barmode="group")
#   fig.show()

fig = px.histogram(df, x='NAME_CONTRACT_TYPE',  width=800, height=600, color=target_variable, barmode="group")
fig.show()

fig = px.histogram(df, x='CODE_GENDER', width=800, height=600, color=target_variable, barmode="group")
fig.show()

fig = px.histogram(df, x='NAME_EDUCATION_TYPE', width=800, height=600, color=target_variable, barmode="group")
fig.show()

fig = px.histogram(df, x='NAME_FAMILY_STATUS', width=800, height=600, color=target_variable, barmode="group")
fig.show()

X = df.drop([target_variable], axis=1)
y = df[[target_variable]]

X.head()

y.head()

# For Modelling
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn import *
from imblearn import *
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import recall_score, precision_score, roc_auc_score
from imblearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

num_col = get_num_cols(X)
num_col

['CNT_CHILDREN',
 'FLAG_MOBIL',
 'FLAG_EMP_PHONE',
 'FLAG_WORK_PHONE',
 'FLAG_CONT_MOBILE',
 'FLAG_PHONE',
 'FLAG_EMAIL',
 'REGION_RATING_CLIENT',
 'REGION_RATING_CLIENT_W_CITY',
 'HOUR_APPR_PROCESS_START',
 'REG_REGION_NOT_LIVE_REGION',
 'REG_REGION_NOT_WORK_REGION',
 'LIVE_REGION_NOT_WORK_REGION',
 'AMT_INCOME_TOTAL',
 'AMT_CREDIT',
 'AMT_ANNUITY',
 'AMT_GOODS_PRICE',
 'REGION_POPULATION_RELATIVE',
 'OWN_CAR_AGE',
 'CNT_FAM_MEMBERS',
 'OBS_30_CNT_SOCIAL_CIRCLE',
 'DEF_30_CNT_SOCIAL_CIRCLE',
 'OBS_60_CNT_SOCIAL_CIRCLE',
 'DEF_60_CNT_SOCIAL_CIRCLE',
 'DAYS_LAST_PHONE_CHANGE',
 'AMT_REQ_CREDIT_BUREAU_HOUR',
 'AMT_REQ_CREDIT_BUREAU_DAY',
 'AMT_REQ_CREDIT_BUREAU_WEEK',
 'AMT_REQ_CREDIT_BUREAU_MON',
 'AMT_REQ_CREDIT_BUREAU_QRT',
 'AMT_REQ_CREDIT_BUREAU_YEAR']

scaler = MinMaxScaler()
X_transform = scaler.fit_transform(X[num_col])

X_transform = pd.DataFrame(X_transform,columns = num_col)
print(X_transform.shape)
X_transform.head()

(307500, 31)

cat_col = get_cat_cols(df)
cat_col

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE']

print(X_transform.shape)
X_transform.head()

(307500, 31)

print(y.shape)
y.head()

(307500, 1)

new_all = pd.concat([y, X[cat_col], X_transform], axis=1) #Data Train
print(new_all.shape)

(307511, 44)

new_all.head()

cat_col

['NAME_CONTRACT_TYPE',
 'CODE_GENDER',
 'FLAG_OWN_CAR',
 'FLAG_OWN_REALTY',
 'NAME_TYPE_SUITE',
 'NAME_INCOME_TYPE',
 'NAME_EDUCATION_TYPE',
 'NAME_FAMILY_STATUS',
 'NAME_HOUSING_TYPE',
 'OCCUPATION_TYPE',
 'WEEKDAY_APPR_PROCESS_START',
 'ORGANIZATION_TYPE']

labelencoder=LabelEncoder()

# for i in cat_col:
#     new_all[i] = labelencoder.fit_transform(new_all[i])

new_all['NAME_CONTRACT_TYPE'] = labelencoder.fit_transform(new_all['NAME_CONTRACT_TYPE'])
new_all['CODE_GENDER'] = labelencoder.fit_transform(new_all['CODE_GENDER'])
new_all['FLAG_OWN_CAR'] = labelencoder.fit_transform(new_all['FLAG_OWN_CAR'])
new_all['FLAG_OWN_REALTY'] = labelencoder.fit_transform(new_all['FLAG_OWN_REALTY'])
new_all['NAME_TYPE_SUITE'] = labelencoder.fit_transform(new_all['NAME_TYPE_SUITE'])
new_all['NAME_INCOME_TYPE'] = labelencoder.fit_transform(new_all['NAME_INCOME_TYPE'])
new_all['NAME_EDUCATION_TYPE'] = labelencoder.fit_transform(new_all['NAME_EDUCATION_TYPE'])
new_all['NAME_FAMILY_STATUS'] = labelencoder.fit_transform(new_all['NAME_FAMILY_STATUS'])
new_all['NAME_HOUSING_TYPE'] = labelencoder.fit_transform(new_all['NAME_HOUSING_TYPE'])
new_all['OCCUPATION_TYPE'] = labelencoder.fit_transform(new_all['OCCUPATION_TYPE'])
new_all['WEEKDAY_APPR_PROCESS_START'] = labelencoder.fit_transform(new_all['WEEKDAY_APPR_PROCESS_START'])
new_all['ORGANIZATION_TYPE'] = labelencoder.fit_transform(new_all['ORGANIZATION_TYPE'])


new_all.head()

check_missing(new_all)

for i in new_all.columns:
    data = df[i]
    types = str(data.dtype)
    if types == 'object':
        new_all[i].fillna(new_all[i].mode()[0], inplace=True)
    else:
        new_all[i].fillna(new_all[i].median(), inplace=True)

X = new_all.drop(target_variable, axis=1)
y = new_all[[target_variable]]

get_head(X)

Shape : (307511, 43)

get_head(y)

Shape : (307511, 1)

X_train,X_test,y_train,y_test = train_test_split(X, y, test_size = 0.3, random_state = 11, stratify=y)
print('Shape of X_train dan y_train: ', X_train.shape, y_train.shape)
print('Shape of X_test dan y_test: ', X_test.shape, y_test.shape)

Shape of X_train dan y_train:  (215257, 43) (215257, 1)
Shape of X_test dan y_test:  (92254, 43) (92254, 1)

y.value_counts(normalize=True)

TARGET
0.0       0.919274
1.0       0.080726
dtype: float64

y_train.value_counts(normalize=True)

TARGET
0.0       0.919273
1.0       0.080727
dtype: float64

y_test.value_counts(normalize=True)

TARGET
0.0       0.919277
1.0       0.080723
dtype: float64

y_train.value_counts()

TARGET
0.0       197880
1.0        17377
dtype: int64

y_train.value_counts(normalize=True)

TARGET
0.0       0.919273
1.0       0.080727
dtype: float64

X_rus, y_rus = RandomUnderSampler().fit_resample(X_train, y_train)

y_rus.value_counts()

TARGET
0.0       17377
1.0       17377
dtype: int64

y_rus.value_counts(normalize=True)

TARGET
0.0       0.5
1.0       0.5
dtype: float64

# membuat fungsi evaluasi model
def evaluasi_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    return roc_auc_score(y_test, y_pred)

dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

DecisionTreeClassifier()

DecisionTreeClassifier()

evaluasi_model(dtc, X_test, y_test)

0.5072372323069139

evaluasi_model(dtc,  X_rus, y_rus)

1.0

xgb = XGBClassifier()
xgb.fit(X_train, y_train)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, gpu_id=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=None, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=None, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=None, num_parallel_tree=None,
              predictor=None, random_state=None, ...)

evaluasi_model(xgb, X_test, y_test)

0.5000140794730105

evaluasi_model(xgb,  X_rus, y_rus)

0.5030212349657593

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

RandomForestClassifier()

RandomForestClassifier()

evaluasi_model(rf, X_test, y_test)

0.5

evaluasi_model(rf,  X_rus, y_rus)

0.9996834896702538

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

# Accuracy Train
print('Decision Tree Classifier')
print(' ')
print('Accuracy Train')
dtree_Train_pred = dtree.predict(X_train)
print(classification_report(y_train, dtree_Train_pred))

print('===================================================')
# Accuracy Test
print('Accuracy Test')
dtree_Test_pred = dtree.predict(X_test)
print(classification_report(y_test, dtree_Test_pred))

Decision Tree Classifier
 
Accuracy Train
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    197880
         1.0       1.00      1.00      1.00     17377

    accuracy                           1.00    215257
   macro avg       1.00      1.00      1.00    215257
weighted avg       1.00      1.00      1.00    215257

===================================================
Accuracy Test
              precision    recall  f1-score   support

         0.0       0.92      0.90      0.91     84807
         1.0       0.09      0.11      0.10      7447

    accuracy                           0.84     92254
   macro avg       0.51      0.51      0.51     92254
weighted avg       0.85      0.84      0.85     92254

dtree = DecisionTreeClassifier()
dtree.fit(X_train, y_train)

# Accuracy Train
print('Decision Tree Classifier')
print(' ')
print('Accuracy Train')
dtree_Train_pred = dtree.predict(X_rus)
print(classification_report(y_rus, dtree_Train_pred))

print('===================================================')
# Accuracy Test
print('Accuracy Test')
dtree_Test_pred = dtree.predict(X_rus)
print(classification_report(y_rus, dtree_Test_pred))

Decision Tree Classifier
 
Accuracy Train
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17377
         1.0       1.00      1.00      1.00     17377

    accuracy                           1.00     34754
   macro avg       1.00      1.00      1.00     34754
weighted avg       1.00      1.00      1.00     34754

===================================================
Accuracy Test
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17377
         1.0       1.00      1.00      1.00     17377

    accuracy                           1.00     34754
   macro avg       1.00      1.00      1.00     34754
weighted avg       1.00      1.00      1.00     34754

xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Accuracy Train
print('XGBClassifier')
print(' ')
print('Accuracy Train')
xgb_Train_pred = xgb.predict(X_train)
print(classification_report(y_train,xgb_Train_pred))

print('===================================================')
# Accuracy Test
print('Accuracy Test')
xgb_Test_pred = xgb.predict(X_test)
print(classification_report(y_test,xgb_Test_pred))

XGBClassifier
 
Accuracy Train
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96    197880
         1.0       1.00      0.01      0.01     17377

    accuracy                           0.92    215257
   macro avg       0.96      0.50      0.49    215257
weighted avg       0.93      0.92      0.88    215257

===================================================
Accuracy Test
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96     84807
         1.0       0.10      0.00      0.00      7447

    accuracy                           0.92     92254
   macro avg       0.51      0.50      0.48     92254
weighted avg       0.85      0.92      0.88     92254

xgb = XGBClassifier()
xgb.fit(X_train, y_train)

# Accuracy Train
print('XGBClassifier')
print(' ')
print('Accuracy Train')
xgb_Train_pred = xgb.predict(X_rus)
print(classification_report(y_rus, xgb_Train_pred))

print('===================================================')
# Accuracy Test
print('Accuracy Test')
xgb_Test_pred = xgb.predict(X_rus)
print(classification_report(y_rus, xgb_Test_pred))

XGBClassifier
 
Accuracy Train
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67     17377
         1.0       1.00      0.01      0.01     17377

    accuracy                           0.50     34754
   macro avg       0.75      0.50      0.34     34754
weighted avg       0.75      0.50      0.34     34754

===================================================
Accuracy Test
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67     17377
         1.0       1.00      0.01      0.01     17377

    accuracy                           0.50     34754
   macro avg       0.75      0.50      0.34     34754
weighted avg       0.75      0.50      0.34     34754

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Accuracy Train
print('Random Forest Classifier')
print(' ')
print('Accuracy Train')
rf_Train_pred = rf.predict(X_train)
print(classification_report(y_train,rf_Train_pred))

print('===================================================')
# Accuracy Test
print('Accuracy Test')
rf_Test_pred = rf.predict(X_test)
print(classification_report(y_test, rf_Test_pred))

Random Forest Classifier
 
Accuracy Train
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00    197880
         1.0       1.00      1.00      1.00     17377

    accuracy                           1.00    215257
   macro avg       1.00      1.00      1.00    215257
weighted avg       1.00      1.00      1.00    215257

===================================================
Accuracy Test
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96     84807
         1.0       0.00      0.00      0.00      7447

    accuracy                           0.92     92254
   macro avg       0.46      0.50      0.48     92254
weighted avg       0.85      0.92      0.88     92254

rf = RandomForestClassifier()
rf.fit(X_train, y_train)

# Accuracy Train
print('RandomForestClassifier')
print(' ')
print('Accuracy Train')
rf_Train_pred=rf.predict(X_rus)
print(classification_report(y_rus, rf_Train_pred))

print('===================================================')
# Accuracy Test
print('Accuracy Test')
rf_Test_pred=rf.predict(X_rus)
print(classification_report(y_rus, rf_Test_pred))

RandomForestClassifier
 
Accuracy Train
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17377
         1.0       1.00      1.00      1.00     17377

    accuracy                           1.00     34754
   macro avg       1.00      1.00      1.00     34754
weighted avg       1.00      1.00      1.00     34754

===================================================
Accuracy Test
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00     17377
         1.0       1.00      1.00      1.00     17377

    accuracy                           1.00     34754
   macro avg       1.00      1.00      1.00     34754
weighted avg       1.00      1.00      1.00     34754

lr = LogisticRegression()
lr.fit(X_train, y_train)

# Accuracy Train
print('LogisticRegression')
print(' ')
print('Accuracy Train')
lr_Train_pred = lr.predict(X_train)
print(classification_report(y_train,lr_Train_pred))

print('===================================================')
# Accuracy Test
print('Accuracy Test')
lr_Test_pred = lr.predict(X_test)
print(classification_report(y_test, lr_Test_pred))

LogisticRegression
 
Accuracy Train
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96    197880
         1.0       0.00      0.00      0.00     17377

    accuracy                           0.92    215257
   macro avg       0.46      0.50      0.48    215257
weighted avg       0.85      0.92      0.88    215257

===================================================
Accuracy Test
              precision    recall  f1-score   support

         0.0       0.92      1.00      0.96     84807
         1.0       0.00      0.00      0.00      7447

    accuracy                           0.92     92254
   macro avg       0.46      0.50      0.48     92254
weighted avg       0.85      0.92      0.88     92254

lr = LogisticRegression()
lr.fit(X_train, y_train)

# Accuracy Train
print('LogisticRegression')
print(' ')
print('Accuracy Train')
lr_Train_pred=lr.predict(X_rus)
print(classification_report(y_rus, lr_Train_pred))

print('===================================================')
# Accuracy Test
print('Accuracy Test')
lr_Test_pred=lr.predict(X_rus)
print(classification_report(y_rus, lr_Test_pred))

LogisticRegression
 
Accuracy Train
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67     17377
         1.0       0.00      0.00      0.00     17377

    accuracy                           0.50     34754
   macro avg       0.25      0.50      0.33     34754
weighted avg       0.25      0.50      0.33     34754

===================================================
Accuracy Test
              precision    recall  f1-score   support

         0.0       0.50      1.00      0.67     17377
         1.0       0.00      0.00      0.00     17377

    accuracy                           0.50     34754
   macro avg       0.25      0.50      0.33     34754
weighted avg       0.25      0.50      0.33     34754

No	Row	Description
1	SK_ID_CURR	ID of loan in our sample
2	TARGET	Target variable (1 - client with payment difficulties: he/she had late payment more than X days on at least one of the first Y installments of the loan in our sample, 0 - all other cases)
5	NAME_CONTRACT_TYPE	Identification if loan is cash or revolving
6	CODE_GENDER	Gender of the client
7	FLAG_OWN_CAR	Flag if the client owns a car
8	FLAG_OWN_REALTY	Flag if client owns a house or flat
9	CNT_CHILDREN	Number of children the client has
10	AMT_INCOME_TOTAL	Income of the client
11	AMT_CREDIT	Credit amount of the loan
12	AMT_ANNUITY	Loan annuity
13	AMT_GOODS_PRICE	For consumer loans it is the price of the goods for which the loan is given
14	NAME_TYPE_SUITE	Who was accompanying client when he was applying for the loan
15	NAME_INCOME_TYPE	Clients income type (businessman, working, maternity leave,…)
16	NAME_EDUCATION_TYPE	Level of highest education the client achieved
17	NAME_FAMILY_STATUS	Family status of the client
18	NAME_HOUSING_TYPE	What is the housing situation of the client (renting, living with parents, ...)
19	REGION_POPULATION_RELATIVE	Normalized population of region where client lives (higher number means the client lives in more populated region)
20	DAYS_BIRTH	Client's age in days at the time of application
21	DAYS_EMPLOYED	How many days before the application the person started current employment
22	DAYS_REGISTRATION	How many days before the application did client change his registration
23	DAYS_ID_PUBLISH	How many days before the application did client change the identity document with which he applied for the loan
24	OWN_CAR_AGE	Age of client's car
25	FLAG_MOBIL	Did client provide mobile phone (1=YES, 0=NO)
26	FLAG_EMP_PHONE	Did client provide work phone (1=YES, 0=NO)
27	FLAG_WORK_PHONE	Did client provide home phone (1=YES, 0=NO)
28	FLAG_CONT_MOBILE	Was mobile phone reachable (1=YES, 0=NO)
29	FLAG_PHONE	Did client provide home phone (1=YES, 0=NO)
30	FLAG_EMAIL	Did client provide email (1=YES, 0=NO)
31	OCCUPATION_TYPE	What kind of occupation does the client have
32	CNT_FAM_MEMBERS	How many family members does client have
33	REGION_RATING_CLIENT	Our rating of the region where client lives (1,2,3)
34	REGION_RATING_CLIENT_W_CITY	Our rating of the region where client lives with taking city into account (1,2,3)
35	WEEKDAY_APPR_PROCESS_START	On which day of the week did the client apply for the loan
36	HOUR_APPR_PROCESS_START	Approximately at what hour did the client apply for the loan
37	REG_REGION_NOT_LIVE_REGION	Flag if client's permanent address does not match contact address (1=different, 0=same, at region level)
38	REG_REGION_NOT_WORK_REGION	Flag if client's permanent address does not match work address (1=different, 0=same, at region level)
39	LIVE_REGION_NOT_WORK_REGION	Flag if client's contact address does not match work address (1=different, 0=same, at region level)
40	REG_CITY_NOT_LIVE_CITY	Flag if client's permanent address does not match contact address (1=different, 0=same, at city level)
41	REG_CITY_NOT_WORK_CITY	Flag if client's permanent address does not match work address (1=different, 0=same, at city level)
42	LIVE_CITY_NOT_WORK_CITY	Flag if client's contact address does not match work address (1=different, 0=same, at city level)
43	ORGANIZATION_TYPE	Type of organization where client works
44	EXT_SOURCE_1	Normalized score from external data source
45	EXT_SOURCE_2	Normalized score from external data source
46	EXT_SOURCE_3	Normalized score from external data source
47	APARTMENTS_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
48	BASEMENTAREA_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
49	YEARS_BEGINEXPLUATATION_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
50	YEARS_BUILD_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
51	COMMONAREA_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
52	ELEVATORS_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
53	ENTRANCES_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
54	FLOORSMAX_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
55	FLOORSMIN_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
56	LANDAREA_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
57	LIVINGAPARTMENTS_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
58	LIVINGAREA_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
59	NONLIVINGAPARTMENTS_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
60	NONLIVINGAREA_AVG	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
61	APARTMENTS_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
62	BASEMENTAREA_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
63	YEARS_BEGINEXPLUATATION_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
64	YEARS_BUILD_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
65	COMMONAREA_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
66	ELEVATORS_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
67	ENTRANCES_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
68	FLOORSMAX_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
69	FLOORSMIN_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
70	LANDAREA_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
71	LIVINGAPARTMENTS_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
72	LIVINGAREA_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
73	NONLIVINGAPARTMENTS_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
74	NONLIVINGAREA_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
75	APARTMENTS_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
76	BASEMENTAREA_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
77	YEARS_BEGINEXPLUATATION_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
78	YEARS_BUILD_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
79	COMMONAREA_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
80	ELEVATORS_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
81	ENTRANCES_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
82	FLOORSMAX_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
83	FLOORSMIN_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
84	LANDAREA_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
85	LIVINGAPARTMENTS_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
86	LIVINGAREA_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
87	NONLIVINGAPARTMENTS_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
88	NONLIVINGAREA_MEDI	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
89	FONDKAPREMONT_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
90	HOUSETYPE_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
91	TOTALAREA_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
92	WALLSMATERIAL_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
93	EMERGENCYSTATE_MODE	Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor
94	OBS_30_CNT_SOCIAL_CIRCLE	How many observation of client's social surroundings with observable 30 DPD (days past due) default
95	DEF_30_CNT_SOCIAL_CIRCLE	How many observation of client's social surroundings defaulted on 30 DPD (days past due)
96	OBS_60_CNT_SOCIAL_CIRCLE	How many observation of client's social surroundings with observable 60 DPD (days past due) default
97	DEF_60_CNT_SOCIAL_CIRCLE	How many observation of client's social surroundings defaulted on 60 (days past due) DPD
98	DAYS_LAST_PHONE_CHANGE	How many days before application did client change phone
99	FLAG_DOCUMENT_2	Did client provide document 2
100	FLAG_DOCUMENT_3	Did client provide document 3
101	FLAG_DOCUMENT_4	Did client provide document 4
102	FLAG_DOCUMENT_5	Did client provide document 5
103	FLAG_DOCUMENT_6	Did client provide document 6
104	FLAG_DOCUMENT_7	Did client provide document 7
105	FLAG_DOCUMENT_8	Did client provide document 8
106	FLAG_DOCUMENT_9	Did client provide document 9
107	FLAG_DOCUMENT_10	Did client provide document 10
108	FLAG_DOCUMENT_11	Did client provide document 11
109	FLAG_DOCUMENT_12	Did client provide document 12
110	FLAG_DOCUMENT_13	Did client provide document 13
111	FLAG_DOCUMENT_14	Did client provide document 14
112	FLAG_DOCUMENT_15	Did client provide document 15
113	FLAG_DOCUMENT_16	Did client provide document 16
114	FLAG_DOCUMENT_17	Did client provide document 17
115	FLAG_DOCUMENT_18	Did client provide document 18
116	FLAG_DOCUMENT_19	Did client provide document 19
117	FLAG_DOCUMENT_20	Did client provide document 20
118	FLAG_DOCUMENT_21	Did client provide document 21
119	AMT_REQ_CREDIT_BUREAU_HOUR	Number of enquiries to Credit Bureau about the client one hour before application
120	AMT_REQ_CREDIT_BUREAU_DAY	Number of enquiries to Credit Bureau about the client one day before application (excluding one hour before application)
121	AMT_REQ_CREDIT_BUREAU_WEEK	Number of enquiries to Credit Bureau about the client one week before application (excluding one day before application)
122	AMT_REQ_CREDIT_BUREAU_MON	Number of enquiries to Credit Bureau about the client one month before application (excluding one week before application)
123	AMT_REQ_CREDIT_BUREAU_QRT	Number of enquiries to Credit Bureau about the client 3 month before application (excluding one month before application)
124	AMT_REQ_CREDIT_BUREAU_YEAR	Number of enquiries to Credit Bureau about the client one day year (excluding last 3 months before application)

	TARGET	DAYS_BIRTH	CNT_CHILDREN	AMT_INCOME_TOTAL	AMT_CREDIT	AMT_ANNUITY	AMT_GOODS_PRICE	REGION_POPULATION_RELATIVE	OWN_CAR_AGE	FLAG_MOBIL	FLAG_EMP_PHONE	FLAG_WORK_PHONE	FLAG_CONT_MOBILE	FLAG_PHONE	FLAG_EMAIL	CNT_FAM_MEMBERS	REGION_RATING_CLIENT	REGION_RATING_CLIENT_W_CITY	HOUR_APPR_PROCESS_START	REG_REGION_NOT_LIVE_REGION	REG_REGION_NOT_WORK_REGION	LIVE_REGION_NOT_WORK_REGION	OBS_30_CNT_SOCIAL_CIRCLE	DEF_30_CNT_SOCIAL_CIRCLE	OBS_60_CNT_SOCIAL_CIRCLE	DEF_60_CNT_SOCIAL_CIRCLE	DAYS_LAST_PHONE_CHANGE	AMT_REQ_CREDIT_BUREAU_HOUR	AMT_REQ_CREDIT_BUREAU_DAY	AMT_REQ_CREDIT_BUREAU_WEEK	AMT_REQ_CREDIT_BUREAU_MON	AMT_REQ_CREDIT_BUREAU_QRT	AMT_REQ_CREDIT_BUREAU_YEAR
count	307511.000000	307511.000000	307511.000000	3.075110e+05	3.075110e+05	307499.000000	3.072330e+05	307511.000000	104582.000000	307511.000000	307511.000000	307511.000000	307511.000000	307511.000000	307511.000000	307509.000000	307511.000000	307511.000000	307511.000000	307511.000000	307511.000000	307511.000000	306490.000000	306490.000000	306490.000000	306490.000000	307510.000000	265992.000000	265992.000000	265992.000000	265992.000000	265992.000000	265992.000000
mean	0.080729	-16036.995067	0.417052	1.687979e+05	5.990260e+05	27108.573909	5.383962e+05	0.020868	12.061091	0.999997	0.819889	0.199368	0.998133	0.281066	0.056720	2.152665	2.052463	2.031521	12.063419	0.015144	0.050769	0.040659	1.422245	0.143421	1.405292	0.100049	-962.858788	0.006402	0.007000	0.034362	0.267395	0.265474	1.899974
std	0.272419	4363.988632	0.722121	2.371231e+05	4.024908e+05	14493.737315	3.694465e+05	0.013831	11.944812	0.001803	0.384280	0.399526	0.043164	0.449521	0.231307	0.910682	0.509034	0.502737	3.265832	0.122126	0.219526	0.197499	2.400989	0.446698	2.379803	0.362291	826.808487	0.083849	0.110757	0.204685	0.916002	0.794056	1.869295
min	0.000000	-25229.000000	0.000000	2.565000e+04	4.500000e+04	1615.500000	4.050000e+04	0.000290	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000	1.000000	1.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-4292.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
25%	0.000000	-19682.000000	0.000000	1.125000e+05	2.700000e+05	16524.000000	2.385000e+05	0.010006	5.000000	1.000000	1.000000	0.000000	1.000000	0.000000	0.000000	2.000000	2.000000	2.000000	10.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-1570.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000
50%	0.000000	-15750.000000	0.000000	1.471500e+05	5.135310e+05	24903.000000	4.500000e+05	0.018850	9.000000	1.000000	1.000000	0.000000	1.000000	0.000000	0.000000	2.000000	2.000000	2.000000	12.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	-757.000000	0.000000	0.000000	0.000000	0.000000	0.000000	1.000000
75%	0.000000	-12413.000000	1.000000	2.025000e+05	8.086500e+05	34596.000000	6.795000e+05	0.028663	15.000000	1.000000	1.000000	0.000000	1.000000	1.000000	0.000000	3.000000	2.000000	2.000000	14.000000	0.000000	0.000000	0.000000	2.000000	0.000000	2.000000	0.000000	-274.000000	0.000000	0.000000	0.000000	0.000000	0.000000	3.000000
max	1.000000	-7489.000000	19.000000	1.170000e+08	4.050000e+06	258025.500000	4.050000e+06	0.072508	91.000000	1.000000	1.000000	1.000000	1.000000	1.000000	1.000000	20.000000	3.000000	3.000000	23.000000	1.000000	1.000000	1.000000	348.000000	34.000000	344.000000	24.000000	0.000000	4.000000	9.000000	8.000000	27.000000	261.000000	25.000000

PROJECT BASED INTERN: DATA SCIENTIST¶

HOME CREDIT INDONESIA¶

Goals¶

Dataset¶

Process Flow¶

Step by Step¶

Import Library¶

Function¶

Import Dataset¶

Descriptive Statistics¶

Data Quality Checking¶

Target Variable Distribution¶

Numerical Feature Distribution¶

Categorical Feature Distribution¶

Data Prepocessing¶

Normalization - Numerical Feature¶

One Hot Encoder - Categorical Feature¶

Train-test Split¶

Resampling¶

Random Under Sampling¶

Modeling¶

Decision Tree¶

XGBoost¶

Random Forest¶

Decision Tree Classifier¶

XGBClassifier¶

Random Forest¶

Logistic Regression¶

Kesimpulan¶

	TARGET	NAME_CONTRACT_TYPE	CODE_GENDER	DAYS_BIRTH	FLAG_OWN_CAR	FLAG_OWN_REALTY	AMT_INCOME_TOTAL	AMT_CREDIT	AMT_ANNUITY	AMT_GOODS_PRICE	NAME_TYPE_SUITE	NAME_INCOME_TYPE	NAME_EDUCATION_TYPE	NAME_FAMILY_STATUS	NAME_HOUSING_TYPE	REGION_POPULATION_RELATIVE	OWN_CAR_AGE	FLAG_MOBIL	FLAG_EMP_PHONE	FLAG_WORK_PHONE	FLAG_CONT_MOBILE	FLAG_PHONE	OCCUPATION_TYPE	CNT_FAM_MEMBERS	REGION_RATING_CLIENT	REGION_RATING_CLIENT_W_CITY	WEEKDAY_APPR_PROCESS_START	HOUR_APPR_PROCESS_START	ORGANIZATION_TYPE	OBS_30_CNT_SOCIAL_CIRCLE	DEF_30_CNT_SOCIAL_CIRCLE	OBS_60_CNT_SOCIAL_CIRCLE	DEF_60_CNT_SOCIAL_CIRCLE	DAYS_LAST_PHONE_CHANGE	AMT_REQ_CREDIT_BUREAU_HOUR	AMT_REQ_CREDIT_BUREAU_DAY	AMT_REQ_CREDIT_BUREAU_WEEK	AMT_REQ_CREDIT_BUREAU_MON	AMT_REQ_CREDIT_BUREAU_QRT	AMT_REQ_CREDIT_BUREAU_YEAR
0	1	Cash loans	M	-9461	N	Y	202500.0	406597.5	24700.5	351000.0	Unaccompanied	Working	Secondary / secondary special	Single / not married	House / apartment	0.018801	NaN	1	1	0	1	1	Laborers	1.0	2	2	WEDNESDAY	10	Business Entity Type 3	2.0	2.0	2.0	2.0	-1134.0	0.0	0.0	0.0	0.0	0.0	1.0
1	0	Cash loans	F	-16765	N	N	270000.0	1293502.5	35698.5	1129500.0	Family	State servant	Higher education	Married	House / apartment	0.003541	NaN	1	1	0	1	1	Core staff	2.0	1	1	MONDAY	11	School	1.0	0.0	1.0	0.0	-828.0	0.0	0.0	0.0	0.0	0.0	0.0
2	0	Revolving loans	M	-19046	Y	Y	67500.0	135000.0	6750.0	135000.0	Unaccompanied	Working	Secondary / secondary special	Single / not married	House / apartment	0.010032	26.0	1	1	1	1	1	Laborers	1.0	2	2	MONDAY	9	Government	0.0	0.0	0.0	0.0	-815.0	0.0	0.0	0.0	0.0	0.0	0.0
3	0	Cash loans	F	-19005	N	Y	135000.0	312682.5	29686.5	297000.0	Unaccompanied	Working	Secondary / secondary special	Civil marriage	House / apartment	0.008019	NaN	1	1	0	1	0	Laborers	2.0	2	2	WEDNESDAY	17	Business Entity Type 3	2.0	0.0	2.0	0.0	-617.0	NaN	NaN	NaN	NaN	NaN	NaN
4	0	Cash loans	M	-19932	N	Y	121500.0	513000.0	21865.5	513000.0	Unaccompanied	Working	Secondary / secondary special	Single / not married	House / apartment	0.028663	NaN	1	1	0	1	0	Core staff	1.0	2	2	THURSDAY	11	Religion	0.0	0.0	0.0	0.0	-1106.0	0.0	0.0	0.0	0.0	0.0	0.0

	count	unique	top	freq
NAME_CONTRACT_TYPE	307511	2	Cash loans	278232
FLAG_OWN_CAR	307511	2	N	202924
FLAG_OWN_REALTY	307511	2	Y	213312
CODE_GENDER	307511	3	F	202448
NAME_EDUCATION_TYPE	307511	5	Secondary / secondary special	218391
NAME_FAMILY_STATUS	307511	6	Married	196432
NAME_HOUSING_TYPE	307511	6	House / apartment	272868
NAME_TYPE_SUITE	306219	7	Unaccompanied	248526
WEEKDAY_APPR_PROCESS_START	307511	7	TUESDAY	53901
NAME_INCOME_TYPE	307511	8	Working	158774
OCCUPATION_TYPE	211120	18	Laborers	55186
ORGANIZATION_TYPE	307511	58	Business Entity Type 3	67992

	Column	N Unique Values	Unique Values
0	NAME_CONTRACT_TYPE	2	[Cash loans, Revolving loans]
2	FLAG_OWN_CAR	2	[N, Y]
3	FLAG_OWN_REALTY	2	[Y, N]
1	CODE_GENDER	3	[M, F, XNA]
6	NAME_EDUCATION_TYPE	5	[Secondary / secondary special, Higher educati...
7	NAME_FAMILY_STATUS	6	[Single / not married, Married, Civil marriage...
8	NAME_HOUSING_TYPE	6	[House / apartment, Rented apartment, With par...
4	NAME_TYPE_SUITE	7	[Unaccompanied, Family, Spouse, partner, Child...
10	WEEKDAY_APPR_PROCESS_START	7	[WEDNESDAY, MONDAY, THURSDAY, SUNDAY, SATURDAY...
5	NAME_INCOME_TYPE	8	[Working, State servant, Commercial associate,...
9	OCCUPATION_TYPE	18	[Laborers, Core staff, Accountants, Managers, ...
11	ORGANIZATION_TYPE	58	[Business Entity Type 3, School, Government, R...

	percentage	frequency	var_type
OWN_CAR_AGE	65.990810	202929	float64
OCCUPATION_TYPE	31.345545	96391	object
AMT_REQ_CREDIT_BUREAU_DAY	13.501631	41519	float64
AMT_REQ_CREDIT_BUREAU_HOUR	13.501631	41519	float64
AMT_REQ_CREDIT_BUREAU_YEAR	13.501631	41519	float64
AMT_REQ_CREDIT_BUREAU_WEEK	13.501631	41519	float64
AMT_REQ_CREDIT_BUREAU_QRT	13.501631	41519	float64
AMT_REQ_CREDIT_BUREAU_MON	13.501631	41519	float64
NAME_TYPE_SUITE	0.420148	1292	object
OBS_30_CNT_SOCIAL_CIRCLE	0.332021	1021	float64
DEF_30_CNT_SOCIAL_CIRCLE	0.332021	1021	float64
OBS_60_CNT_SOCIAL_CIRCLE	0.332021	1021	float64
DEF_60_CNT_SOCIAL_CIRCLE	0.332021	1021	float64
AMT_GOODS_PRICE	0.090403	278	float64
AMT_ANNUITY	0.003902	12	float64
CNT_FAM_MEMBERS	0.000650	2	float64
DAYS_LAST_PHONE_CHANGE	0.000325	1	float64

	TARGET	NAME_CONTRACT_TYPE	CODE_GENDER	DAYS_BIRTH	FLAG_OWN_CAR	FLAG_OWN_REALTY	CNT_CHILDREN	AMT_INCOME_TOTAL	AMT_CREDIT	AMT_ANNUITY	AMT_GOODS_PRICE	NAME_TYPE_SUITE	NAME_INCOME_TYPE	NAME_EDUCATION_TYPE	NAME_FAMILY_STATUS	NAME_HOUSING_TYPE	REGION_POPULATION_RELATIVE	OWN_CAR_AGE	FLAG_MOBIL	FLAG_EMP_PHONE	FLAG_CONT_MOBILE	FLAG_PHONE	OCCUPATION_TYPE	CNT_FAM_MEMBERS	REGION_RATING_CLIENT	REGION_RATING_CLIENT_W_CITY	WEEKDAY_APPR_PROCESS_START	HOUR_APPR_PROCESS_START	ORGANIZATION_TYPE	OBS_30_CNT_SOCIAL_CIRCLE	DEF_30_CNT_SOCIAL_CIRCLE	OBS_60_CNT_SOCIAL_CIRCLE	DEF_60_CNT_SOCIAL_CIRCLE	DAYS_LAST_PHONE_CHANGE	AMT_REQ_CREDIT_BUREAU_WEEK	AMT_REQ_CREDIT_BUREAU_MON	AMT_REQ_CREDIT_BUREAU_YEAR
29287	0	Cash loans	M	-24161	Y	N	0	112500.0	269550.0	11547.0	225000.0	Unaccompanied	Pensioner	Secondary / secondary special	Married	House / apartment	0.025164	14.0	1	0	1	0	Laborers	2.0	2	2	TUESDAY	8	XNA	2.0	0.0	2.0	0.0	-1664.0	0.0	0.0	1.0
82410	0	Cash loans	M	-13245	Y	Y	2	360000.0	450000.0	47880.0	450000.0	Unaccompanied	Working	Higher education	Married	House / apartment	0.026392	3.0	1	1	1	0	Managers	4.0	2	2	FRIDAY	12	Business Entity Type 3	1.0	0.0	1.0	0.0	-2552.0	0.0	2.0	6.0
83790	0	Cash loans	F	-24223	N	Y	0	157500.0	284400.0	15880.5	225000.0	Unaccompanied	State servant	Secondary / secondary special	Married	House / apartment	0.072508	9.0	1	1	1	1	Medicine staff	2.0	1	1	FRIDAY	10	Government	0.0	0.0	0.0	0.0	-828.0	0.0	0.0	4.0
127310	0	Cash loans	M	-9597	Y	N	0	202500.0	482530.5	26176.5	482530.5	Unaccompanied	State servant	Higher education	Single / not married	With parents	0.026392	64.0	1	1	1	0	Core staff	1.0	2	2	WEDNESDAY	11	Security Ministries	1.0	0.0	1.0	0.0	-516.0	0.0	0.0	0.0
131412	0	Cash loans	F	-24784	N	Y	0	81000.0	239850.0	23494.5	225000.0	Unaccompanied	Pensioner	Higher education	Married	House / apartment	0.007114	9.0	1	0	1	0	Laborers	2.0	2	2	TUESDAY	15	XNA	1.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0
171859	0	Cash loans	F	-24813	N	Y	0	117000.0	254700.0	24939.0	225000.0	Family	Pensioner	Secondary / secondary special	Married	House / apartment	0.030755	9.0	1	0	1	0	Laborers	2.0	2	2	THURSDAY	9	XNA	0.0	0.0	0.0	0.0	-651.0	0.0	0.0	4.0
172053	0	Cash loans	F	-24475	N	Y	0	126000.0	284400.0	16011.0	225000.0	Unaccompanied	Pensioner	Secondary / secondary special	Widow	House / apartment	0.035792	9.0	1	0	1	0	Laborers	1.0	2	2	THURSDAY	16	XNA	2.0	0.0	2.0	0.0	-1672.0	1.0	0.0	0.0
261724	0	Cash loans	F	-8433	N	Y	1	112500.0	225000.0	11074.5	225000.0	Unaccompanied	Working	Incomplete higher	Civil marriage	House / apartment	0.026392	9.0	1	1	1	0	Waiters/barmen staff	3.0	2	2	THURSDAY	13	Hotel	0.0	0.0	0.0	0.0	-283.0	0.0	0.0	1.0
297684	1	Cash loans	M	-15145	Y	Y	0	180000.0	592560.0	31153.5	450000.0	Unaccompanied	Commercial associate	Higher education	Married	House / apartment	0.007020	6.0	1	1	1	0	Drivers	2.0	2	2	FRIDAY	17	Self-employed	5.0	3.0	5.0	3.0	-569.0	0.0	0.0	4.0
302204	0	Cash loans	F	-24455	Y	N	0	90000.0	254700.0	14350.5	225000.0	Unaccompanied	Pensioner	Secondary / secondary special	Single / not married	House / apartment	0.007120	19.0	1	0	1	0	Laborers	1.0	2	2	THURSDAY	17	XNA	0.0	0.0	0.0	0.0	-2595.0	0.0	0.0	0.0
303911	0	Cash loans	M	-24624	N	Y	0	135000.0	254700.0	14350.5	225000.0	Unaccompanied	Pensioner	Secondary / secondary special	Single / not married	House / apartment	0.007120	9.0	1	0	1	0	Laborers	1.0	2	2	TUESDAY	17	XNA	0.0	0.0	0.0	0.0	-1442.0	0.0	1.0	1.0

	NAME_FAMILY_STATUS	Customers
1	Married	181577
3	Single / not married	40984
0	Civil marriage	26813
2	Separated	18150
5	Widow	15150
4	Unknown	2

	TARGET	NAME_CONTRACT_TYPE	Customers	Percent
0	0	Cash loans	255001	0.92
2	1	Cash loans	23220	0.08
1	0	Revolving loans	27675	0.95
3	1	Revolving loans	1604	0.05

	TARGET	OCCUPATION_TYPE	Customers	percentage
0	0	Accountants	9339	0.95
18	1	Accountants	474	0.05
1	0	Cleaning staff	4206	0.90
19	1	Cleaning staff	447	0.10
2	0	Cooking staff	5325	0.90
20	1	Cooking staff	621	0.10
3	0	Core staff	25831	0.94
21	1	Core staff	1738	0.06
4	0	Drivers	16496	0.89
22	1	Drivers	2106	0.11
5	0	HR staff	527	0.94
23	1	HR staff	36	0.06
6	0	High skill tech staff	10679	0.94
24	1	High skill tech staff	701	0.06
7	0	IT staff	492	0.94
25	1	IT staff	34	0.06
8	0	Laborers	139455	0.92
26	1	Laborers	12116	0.08
9	0	Low-skill Laborers	1734	0.83
27	1	Low-skill Laborers	359	0.17
10	0	Managers	20042	0.94
28	1	Managers	1328	0.06
11	0	Medicine staff	7964	0.93
29	1	Medicine staff	572	0.07
12	0	Private service staff	2477	0.93
30	1	Private service staff	175	0.07
13	0	Realty agents	692	0.92
31	1	Realty agents	59	0.08
14	0	Sales staff	29010	0.90
32	1	Sales staff	3092	0.10
15	0	Secretaries	1213	0.93
33	1	Secretaries	92	0.07
16	0	Security staff	5999	0.89
34	1	Security staff	722	0.11
17	0	Waiters/barmen staff	1195	0.89
35	1	Waiters/barmen staff	152	0.11

	TARGET	Age_category	Customers	percentage
0	Dont Have Problem	Adult	108677	0.92
2	Dont Have Problem	Older	91711	0.94
4	Dont Have Problem	Young Adult	82288	0.89
5	Have Problem	Young Adult	9953	0.11
1	Have Problem	Adult	9402	0.08
3	Have Problem	Older	5469	0.06

	TARGET	NAME_INCOME_TYPE	NAME_CONTRACT_TYPE	Customers	percentage
0	0	Businessman	Revolving loans	10	1.00
1	0	Commercial associate	Cash loans	58662	0.92
2	1	Commercial associate	Cash loans	4989	0.08
3	0	Commercial associate	Revolving loans	7595	0.95
4	1	Commercial associate	Revolving loans	370	0.05
5	1	Maternity leave	Cash loans	2	1.00
6	0	Maternity leave	Revolving loans	3	1.00
7	0	Pensioner	Cash loans	49145	0.95
8	1	Pensioner	Cash loans	2842	0.05
9	0	Pensioner	Revolving loans	3229	0.96
10	1	Pensioner	Revolving loans	140	0.04
11	0	State servant	Cash loans	18646	0.94
12	1	State servant	Cash loans	1188	0.06
13	0	State servant	Revolving loans	1806	0.97
14	1	State servant	Revolving loans	61	0.03
15	0	Student	Cash loans	15	1.00
16	0	Student	Revolving loans	3	1.00
17	0	Unemployed	Cash loans	7	0.47
18	1	Unemployed	Cash loans	8	0.53
19	0	Unemployed	Revolving loans	7	1.00
20	0	Working	Cash loans	128526	0.90
21	1	Working	Cash loans	14191	0.10
22	0	Working	Revolving loans	15022	0.94
23	1	Working	Revolving loans	1033	0.06

	FLAG_MOBIL	FLAG_EMP_PHONE	FLAG_WORK_PHONE	FLAG_CONT_MOBILE	FLAG_PHONE	REGION_RATING_CLIENT	REGION_RATING_CLIENT_W_CITY	HOUR_APPR_PROCESS_START	AMT_INCOME_TOTAL	AMT_CREDIT	AMT_ANNUITY	AMT_GOODS_PRICE	REGION_POPULATION_RELATIVE	OWN_CAR_AGE	CNT_FAM_MEMBERS	OBS_30_CNT_SOCIAL_CIRCLE	DEF_30_CNT_SOCIAL_CIRCLE	OBS_60_CNT_SOCIAL_CIRCLE	DEF_60_CNT_SOCIAL_CIRCLE	DAYS_LAST_PHONE_CHANGE	AMT_REQ_CREDIT_BUREAU_YEAR
0	1.0	1.0	0.0	1.0	1.0	0.5	0.5	0.434783	0.001512	0.090287	0.090032	0.077441	0.256321	0.098901	0.000000	0.005747	0.058824	0.005814	0.083333	0.735788	0.04
1	1.0	1.0	0.0	1.0	1.0	0.0	0.0	0.478261	0.002089	0.311736	0.132924	0.271605	0.045016	0.098901	0.052632	0.002874	0.000000	0.002907	0.000000	0.807083	0.00
2	1.0	1.0	1.0	1.0	1.0	0.5	0.5	0.391304	0.000358	0.022472	0.020025	0.023569	0.134897	0.285714	0.000000	0.000000	0.000000	0.000000	0.000000	0.810112	0.00
3	1.0	1.0	0.0	1.0	0.0	0.5	0.5	0.739130	0.000935	0.066837	0.109477	0.063973	0.107023	0.098901	0.052632	0.005747	0.000000	0.005814	0.000000	0.856244	0.04
4	1.0	1.0	0.0	1.0	0.0	0.5	0.5	0.478261	0.000819	0.116854	0.078975	0.117845	0.392880	0.098901	0.000000	0.000000	0.000000	0.000000	0.000000	0.742311	0.00

	percentage	frequency	var_type
TARGET	0.003577	11	float64
OBS_30_CNT_SOCIAL_CIRCLE	0.003577	11	float64
LIVE_REGION_NOT_WORK_REGION	0.003577	11	float64
AMT_INCOME_TOTAL	0.003577	11	float64
AMT_CREDIT	0.003577	11	float64
AMT_ANNUITY	0.003577	11	float64
AMT_GOODS_PRICE	0.003577	11	float64
REGION_POPULATION_RELATIVE	0.003577	11	float64
OWN_CAR_AGE	0.003577	11	float64
CNT_FAM_MEMBERS	0.003577	11	float64
DEF_30_CNT_SOCIAL_CIRCLE	0.003577	11	float64
REG_REGION_NOT_LIVE_REGION	0.003577	11	float64
OBS_60_CNT_SOCIAL_CIRCLE	0.003577	11	float64
DEF_60_CNT_SOCIAL_CIRCLE	0.003577	11	float64
DAYS_LAST_PHONE_CHANGE	0.003577	11	float64
AMT_REQ_CREDIT_BUREAU_HOUR	0.003577	11	float64
AMT_REQ_CREDIT_BUREAU_DAY	0.003577	11	float64
AMT_REQ_CREDIT_BUREAU_WEEK	0.003577	11	float64
AMT_REQ_CREDIT_BUREAU_MON	0.003577	11	float64
AMT_REQ_CREDIT_BUREAU_QRT	0.003577	11	float64
REG_REGION_NOT_WORK_REGION	0.003577	11	float64
HOUR_APPR_PROCESS_START	0.003577	11	float64
AMT_REQ_CREDIT_BUREAU_YEAR	0.003577	11	float64
REGION_RATING_CLIENT	0.003577	11	float64
FLAG_EMAIL	0.003577	11	float64
FLAG_PHONE	0.003577	11	float64
FLAG_CONT_MOBILE	0.003577	11	float64
FLAG_WORK_PHONE	0.003577	11	float64
FLAG_EMP_PHONE	0.003577	11	float64
FLAG_MOBIL	0.003577	11	float64
CNT_CHILDREN	0.003577	11	float64
REGION_RATING_CLIENT_W_CITY	0.003577	11	float64