This is a classification model to determine whether a customer application for a credit card should be approved or rejected.
Credit scorecards are a common risk-control method in the financial industry. They use personal information and data submitted by credit card applicants to predict the probability of future defaults and credit card borrowing, so the bank can decide whether to issue a credit card to the applicant. Credit scores objectively quantify the magnitude of risk.
Generally speaking, credit scorecards are built from historical data, so large economic fluctuations can cause past models to lose their original predictive power. The logistic model is a common method for credit scoring because it is suitable for binary classification tasks and produces an interpretable coefficient for each feature.
This credit card dataset consists of Application Record and Credit Record.
In this classification model, my plan is to explore and clean both records, define an approval label from each applicant's payment history, engineer and select features, and train classification models on the resulting dataset.
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import math
import warnings
warnings.filterwarnings("ignore")
appl_df = pd.read_csv('application_record.csv')
appl_df.head()
ID | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | DAYS_BIRTH | DAYS_EMPLOYED | FLAG_MOBIL | FLAG_WORK_PHONE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5008804 | M | Y | Y | 0 | 427500.0 | Working | Higher education | Civil marriage | Rented apartment | -12005 | -4542 | 1 | 1 | 0 | 0 | NaN | 2.0 |
1 | 5008805 | M | Y | Y | 0 | 427500.0 | Working | Higher education | Civil marriage | Rented apartment | -12005 | -4542 | 1 | 1 | 0 | 0 | NaN | 2.0 |
2 | 5008806 | M | Y | Y | 0 | 112500.0 | Working | Secondary / secondary special | Married | House / apartment | -21474 | -1134 | 1 | 0 | 0 | 0 | Security staff | 2.0 |
3 | 5008808 | F | N | Y | 0 | 270000.0 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.0 |
4 | 5008809 | F | N | Y | 0 | 270000.0 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | -19110 | -3051 | 1 | 0 | 1 | 1 | Sales staff | 1.0 |
appl_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438557 entries, 0 to 438556
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype
---  ------               --------------   -----
 0   ID                   438557 non-null  int64
 1   CODE_GENDER          438557 non-null  object
 2   FLAG_OWN_CAR         438557 non-null  object
 3   FLAG_OWN_REALTY      438557 non-null  object
 4   CNT_CHILDREN         438557 non-null  int64
 5   AMT_INCOME_TOTAL     438557 non-null  float64
 6   NAME_INCOME_TYPE     438557 non-null  object
 7   NAME_EDUCATION_TYPE  438557 non-null  object
 8   NAME_FAMILY_STATUS   438557 non-null  object
 9   NAME_HOUSING_TYPE    438557 non-null  object
 10  DAYS_BIRTH           438557 non-null  int64
 11  DAYS_EMPLOYED        438557 non-null  int64
 12  FLAG_MOBIL           438557 non-null  int64
 13  FLAG_WORK_PHONE      438557 non-null  int64
 14  FLAG_PHONE           438557 non-null  int64
 15  FLAG_EMAIL           438557 non-null  int64
 16  OCCUPATION_TYPE      304354 non-null  object
 17  CNT_FAM_MEMBERS      438557 non-null  float64
dtypes: float64(2), int64(8), object(8)
memory usage: 60.2+ MB
# Check for duplicates
appl_df.duplicated().sum()
0
# Check for null values
appl_df.isna().sum()/appl_df.shape[0]*100
ID                      0.000000
CODE_GENDER             0.000000
FLAG_OWN_CAR            0.000000
FLAG_OWN_REALTY         0.000000
CNT_CHILDREN            0.000000
AMT_INCOME_TOTAL        0.000000
NAME_INCOME_TYPE        0.000000
NAME_EDUCATION_TYPE     0.000000
NAME_FAMILY_STATUS      0.000000
NAME_HOUSING_TYPE       0.000000
DAYS_BIRTH              0.000000
DAYS_EMPLOYED           0.000000
FLAG_MOBIL              0.000000
FLAG_WORK_PHONE         0.000000
FLAG_PHONE              0.000000
FLAG_EMAIL              0.000000
OCCUPATION_TYPE        30.601039
CNT_FAM_MEMBERS         0.000000
dtype: float64
About 31% of OCCUPATION_TYPE values are missing, which is significant.
Let's look into the income type for this group.
plt.figure(figsize=(10,4))
sns.countplot(x=appl_df[appl_df["OCCUPATION_TYPE"].isna()]["NAME_INCOME_TYPE"], palette="dark")
plt.title("Data Distribution of Null Occupation Type by Income Type")
plt.show()
Most of the applicants with a missing occupation type are pensioners, which makes sense: pensioners usually receive monthly pension payments rather than a salary.
It is also reasonable for students not to have an occupation type.
The remaining groups should have an occupation type, so those rows with null values can be dropped from the dataset later.
# label occupation type for pensioners and students
appl_df.loc[appl_df["NAME_INCOME_TYPE"]=="Pensioner", "OCCUPATION_TYPE"] = "Pensioner"
appl_df.loc[appl_df["NAME_INCOME_TYPE"]=="Student", "OCCUPATION_TYPE"] = "Student"
# Explore Family Status
famstatus_val = appl_df.NAME_FAMILY_STATUS.value_counts(normalize = True)
famstatus_val
NAME_FAMILY_STATUS
Married                 0.683669
Single / not married    0.126029
Civil marriage          0.083300
Separated               0.062138
Widow                   0.044863
Name: proportion, dtype: float64
famstatus_val.plot.barh()
plt.show()
housing_val = appl_df.NAME_HOUSING_TYPE.value_counts(normalize = True)
housing_val
NAME_HOUSING_TYPE
House / apartment      0.898016
With parents           0.043499
Municipal apartment    0.032411
Rented apartment       0.013622
Office apartment       0.008943
Co-op apartment        0.003509
Name: proportion, dtype: float64
housing_val.plot.barh()
plt.show()
Application Record consists of unique IDs and some personal details of the applicants.
However, it lacks relevant financial data (e.g., debt, mortgage, expenses) that can be very important
for a credit card approval decision.
credit_df = pd.read_csv('credit_record.csv')
credit_df.head()
ID | MONTHS_BALANCE | STATUS | |
---|---|---|---|
0 | 5001711 | 0 | X |
1 | 5001711 | -1 | 0 |
2 | 5001711 | -2 | 0 |
3 | 5001711 | -3 | 0 |
4 | 5001712 | 0 | C |
Each row in Credit Record shows the monthly payment status for a unique ID.
credit_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 3 columns):
 #   Column          Non-Null Count    Dtype
---  ------          --------------    -----
 0   ID              1048575 non-null  int64
 1   MONTHS_BALANCE  1048575 non-null  int64
 2   STATUS          1048575 non-null  object
dtypes: int64(2), object(1)
memory usage: 24.0+ MB
# Check for duplicates
credit_df.duplicated().sum()
0
# Check for null values
credit_df.isna().sum()/credit_df.shape[0]*100
ID                0.0
MONTHS_BALANCE    0.0
STATUS            0.0
dtype: float64
credit_df.STATUS.value_counts().plot.bar()
plt.show()
There are 8 possible status values for each monthly record:
X: No loan
C: paid off that month (on time)
0: 1-29 days past due
1: 30-59 days past due
2: 60-89 days past due
3: 90-119 days past due
4: 120-149 days past due
5: 150+ days past due
The data only provides the credit record (status), not a decision on whether a credit card should be approved, so we need to define the payment behaviour we are willing to approve.
We consider a customer with at least one payment more than 30 days past due to be a high-risk customer, and such customers should not be approved.
print('For application record, number of unique customer ID is', appl_df['ID'].nunique())
print('For the credit record, number of unique customer ID is', credit_df['ID'].nunique())
print('Number of unique customer ID that appear in both records is:', appl_df[appl_df['ID'].isin(credit_df['ID'])]['ID'].nunique())
For application record, number of unique customer ID is 438510
For the credit record, number of unique customer ID is 45985
Number of unique customer ID that appear in both records is: 36457
We have 36457 unique IDs that appear in both datasets. These are the samples we can use to train our model.
# Filter to get only usable samples
appl_df = appl_df[appl_df['ID'].isin(credit_df['ID'])]
credit_df = credit_df[credit_df['ID'].isin(appl_df['ID'])]
# Label the payments later than 30 days past due.
def highrisk(row):
    # STATUS values 1-5 mean the payment was 30 or more days past due
    if row["STATUS"] in ['1', '2', '3', '4', '5']:  # high risk
        return 1
    else:
        return None
# Create a series for High Risk in credit_df
credit_df["High Risk"] = credit_df.apply(highrisk, axis=1)
# Group data by ID and count the occurrences of high-risk payments (count() tallies only non-null High Risk months)
count = credit_df.groupby('ID').count()
count.head()
MONTHS_BALANCE | STATUS | High Risk | |
---|---|---|---|
ID | |||
5008804 | 16 | 16 | 1 |
5008805 | 15 | 15 | 1 |
5008806 | 30 | 30 | 0 |
5008808 | 5 | 5 | 0 |
5008809 | 5 | 5 | 0 |
# Label the customers that we want to approve
def determine_approval(row):
    # Approve only customers with zero high-risk months
    if row["High Risk"] == 0:
        return 1
    else:
        return 0
# Create a series for Approved in credit_df
count["Approved"] = count.apply(determine_approval, axis=1)
count.head()
MONTHS_BALANCE | STATUS | High Risk | Approved | |
---|---|---|---|---|
ID | ||||
5008804 | 16 | 16 | 1 | 0 |
5008805 | 15 | 15 | 1 | 0 |
5008806 | 30 | 30 | 0 | 1 |
5008808 | 5 | 5 | 0 | 1 |
5008809 | 5 | 5 | 0 | 1 |
# Check data distribution on our approval set
plt.figure(figsize=(4,4))
sns.countplot(x=count["Approved"], palette="dark")
plt.title("Data Distribution of Approval")
plt.show()
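To put a number on the split shown in the plot, a quick optional check on the count frame built above:
print(count["Approved"].value_counts(normalize=True))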
It is clear that the classes are highly imbalanced, so this will be an imbalanced classification problem.
Hence, the Synthetic Minority Oversampling Technique (SMOTE) is useful here.
SMOTE is a type of data augmentation for tabular data: it synthesizes new examples from the minority class, in this case the declined set.
It works by selecting examples that are close in feature space, drawing a line between them, and creating a new sample at a point along that line:
A random example from the minority class is first chosen.
Then k of its nearest neighbors are found (typically k=5).
A randomly selected neighbor is chosen and a synthetic example is created at a randomly chosen point between the two examples in feature space, as sketched below.
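A minimal NumPy sketch of that interpolation step (illustrative only; the function name and variables here are hypothetical, and the actual resampling later in this notebook uses imblearn's SMOTE):
def smote_one_sample(minority, k=5, rng=np.random.default_rng(0)):
    # minority: 2-D array of minority-class rows; returns one synthetic row
    i = rng.integers(len(minority))                        # pick a random minority example
    dists = np.linalg.norm(minority - minority[i], axis=1)
    neighbors = np.argsort(dists)[1:k + 1]                 # its k nearest neighbours (excluding itself)
    j = rng.choice(neighbors)                              # pick one neighbour at random
    gap = rng.random()                                     # random position along the connecting line
    return minority[i] + gap * (minority[j] - minority[i])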
# Merge application record with count dataframe
merged_df = pd.merge(appl_df, count, on='ID', how='inner')
merged_df.head()
ID | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | ... | FLAG_MOBIL | FLAG_WORK_PHONE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | MONTHS_BALANCE | STATUS | High Risk | Approved | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 5008804 | M | Y | Y | 0 | 427500.0 | Working | Higher education | Civil marriage | Rented apartment | ... | 1 | 1 | 0 | 0 | NaN | 2.0 | 16 | 16 | 1 | 0 |
1 | 5008805 | M | Y | Y | 0 | 427500.0 | Working | Higher education | Civil marriage | Rented apartment | ... | 1 | 1 | 0 | 0 | NaN | 2.0 | 15 | 15 | 1 | 0 |
2 | 5008806 | M | Y | Y | 0 | 112500.0 | Working | Secondary / secondary special | Married | House / apartment | ... | 1 | 0 | 0 | 0 | Security staff | 2.0 | 30 | 30 | 0 | 1 |
3 | 5008808 | F | N | Y | 0 | 270000.0 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | ... | 1 | 0 | 1 | 1 | Sales staff | 1.0 | 5 | 5 | 0 | 1 |
4 | 5008809 | F | N | Y | 0 | 270000.0 | Commercial associate | Secondary / secondary special | Single / not married | House / apartment | ... | 1 | 0 | 1 | 1 | Sales staff | 1.0 | 5 | 5 | 0 | 1 |
5 rows × 22 columns
merged_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36457 entries, 0 to 36456
Data columns (total 22 columns):
 #   Column               Non-Null Count  Dtype
---  ------               --------------  -----
 0   ID                   36457 non-null  int64
 1   CODE_GENDER          36457 non-null  object
 2   FLAG_OWN_CAR         36457 non-null  object
 3   FLAG_OWN_REALTY      36457 non-null  object
 4   CNT_CHILDREN         36457 non-null  int64
 5   AMT_INCOME_TOTAL     36457 non-null  float64
 6   NAME_INCOME_TYPE     36457 non-null  object
 7   NAME_EDUCATION_TYPE  36457 non-null  object
 8   NAME_FAMILY_STATUS   36457 non-null  object
 9   NAME_HOUSING_TYPE    36457 non-null  object
 10  DAYS_BIRTH           36457 non-null  int64
 11  DAYS_EMPLOYED        36457 non-null  int64
 12  FLAG_MOBIL           36457 non-null  int64
 13  FLAG_WORK_PHONE      36457 non-null  int64
 14  FLAG_PHONE           36457 non-null  int64
 15  FLAG_EMAIL           36457 non-null  int64
 16  OCCUPATION_TYPE      31274 non-null  object
 17  CNT_FAM_MEMBERS      36457 non-null  float64
 18  MONTHS_BALANCE       36457 non-null  int64
 19  STATUS               36457 non-null  int64
 20  High Risk            36457 non-null  int64
 21  Approved             36457 non-null  int64
dtypes: float64(2), int64(12), object(8)
memory usage: 6.1+ MB
# Convert DAYS_BIRTH and DAYS_EMPLOYED into Age and Working Years
merged_df['AGE']=-(merged_df['DAYS_BIRTH'])//365
merged_df['WORKING_YEARS']=-(merged_df['DAYS_EMPLOYED'])//365
merged_df.describe()
ID | CNT_CHILDREN | AMT_INCOME_TOTAL | DAYS_BIRTH | DAYS_EMPLOYED | FLAG_MOBIL | FLAG_WORK_PHONE | FLAG_PHONE | FLAG_EMAIL | CNT_FAM_MEMBERS | MONTHS_BALANCE | STATUS | High Risk | Approved | AGE | WORKING_YEARS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 3.645700e+04 | 36457.000000 | 3.645700e+04 | 36457.000000 | 36457.000000 | 36457.0 | 36457.000000 | 36457.000000 | 36457.000000 | 36457.000000 | 36457.000000 | 36457.000000 | 36457.000000 | 36457.000000 | 36457.000000 | 36457.000000 |
mean | 5.078227e+06 | 0.430315 | 1.866857e+05 | -15975.173382 | 59262.935568 | 1.0 | 0.225526 | 0.294813 | 0.089722 | 2.198453 | 21.332392 | 21.332392 | 0.317497 | 0.882300 | 43.260334 | -162.834161 |
std | 4.187524e+04 | 0.742367 | 1.017892e+05 | 4200.549944 | 137651.334859 | 0.0 | 0.417934 | 0.455965 | 0.285787 | 0.911686 | 14.911849 | 14.911849 | 1.531811 | 0.322257 | 11.510414 | 377.066049 |
min | 5.008804e+06 | 0.000000 | 2.700000e+04 | -25152.000000 | -15713.000000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 | 20.000000 | -1001.000000 |
25% | 5.042028e+06 | 0.000000 | 1.215000e+05 | -19438.000000 | -3153.000000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 9.000000 | 9.000000 | 0.000000 | 1.000000 | 34.000000 | 1.000000 |
50% | 5.074614e+06 | 0.000000 | 1.575000e+05 | -15563.000000 | -1552.000000 | 1.0 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 18.000000 | 18.000000 | 0.000000 | 1.000000 | 42.000000 | 4.000000 |
75% | 5.115396e+06 | 1.000000 | 2.250000e+05 | -12462.000000 | -408.000000 | 1.0 | 0.000000 | 1.000000 | 0.000000 | 3.000000 | 31.000000 | 31.000000 | 0.000000 | 1.000000 | 53.000000 | 8.000000 |
max | 5.150487e+06 | 19.000000 | 1.575000e+06 | -7489.000000 | 365243.000000 | 1.0 | 1.000000 | 1.000000 | 1.000000 | 20.000000 | 61.000000 | 61.000000 | 49.000000 | 1.000000 | 68.000000 | 43.000000 |
FLAG_MOBIL has zero standard deviation (every value is 1), hence it adds no value to the model. This column can be dropped.
# Drop FLAG_MOBIL
merged_df = merged_df.drop('FLAG_MOBIL', axis=1)
# Check for customers with negative WORKING_YEARS
merged_df[merged_df['WORKING_YEARS'] < 0]
ID | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | ... | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | MONTHS_BALANCE | STATUS | High Risk | Approved | AGE | WORKING_YEARS | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
7 | 5008812 | F | N | Y | 0 | 283500.0 | Pensioner | Higher education | Separated | House / apartment | ... | 0 | 0 | Pensioner | 1.0 | 17 | 17 | 0 | 1 | 61 | -1001 |
8 | 5008813 | F | N | Y | 0 | 283500.0 | Pensioner | Higher education | Separated | House / apartment | ... | 0 | 0 | Pensioner | 1.0 | 17 | 17 | 0 | 1 | 61 | -1001 |
9 | 5008814 | F | N | Y | 0 | 283500.0 | Pensioner | Higher education | Separated | House / apartment | ... | 0 | 0 | Pensioner | 1.0 | 17 | 17 | 0 | 1 | 61 | -1001 |
69 | 5008884 | F | N | Y | 0 | 315000.0 | Pensioner | Secondary / secondary special | Widow | House / apartment | ... | 0 | 0 | Pensioner | 1.0 | 41 | 41 | 0 | 1 | 55 | -1001 |
150 | 5008974 | F | N | Y | 0 | 112500.0 | Pensioner | Secondary / secondary special | Married | House / apartment | ... | 0 | 0 | Pensioner | 2.0 | 3 | 3 | 0 | 1 | 61 | -1001 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
36408 | 5126278 | F | N | N | 0 | 162000.0 | Pensioner | Secondary / secondary special | Married | House / apartment | ... | 0 | 0 | Pensioner | 2.0 | 29 | 29 | 9 | 0 | 58 | -1001 |
36418 | 5139446 | F | N | Y | 0 | 112500.0 | Pensioner | Secondary / secondary special | Widow | House / apartment | ... | 1 | 0 | Pensioner | 1.0 | 17 | 17 | 4 | 0 | 58 | -1001 |
36432 | 5145690 | F | N | Y | 0 | 306000.0 | Pensioner | Higher education | Married | House / apartment | ... | 0 | 0 | Pensioner | 2.0 | 18 | 18 | 13 | 0 | 59 | -1001 |
36434 | 5145817 | F | N | Y | 0 | 90000.0 | Pensioner | Secondary / secondary special | Married | House / apartment | ... | 0 | 0 | Pensioner | 2.0 | 40 | 40 | 1 | 0 | 60 | -1001 |
36439 | 5148602 | M | N | Y | 0 | 225000.0 | Pensioner | Secondary / secondary special | Married | House / apartment | ... | 0 | 0 | Pensioner | 2.0 | 16 | 16 | 2 | 0 | 62 | -1001 |
6135 rows × 23 columns
Customers with negative WORKING_YEARS are pensioners: their DAYS_EMPLOYED holds the placeholder value 365243 (visible as the maximum in the describe() output above), which converts to -1001 years. These rows represent valid data.
# Replace WORKING_YEARS for all pensioners to be -1
merged_df.loc[merged_df["NAME_INCOME_TYPE"]=="Pensioner", "WORKING_YEARS"] = -1
# Drop rows with null values
merged_df = merged_df.dropna(axis=0)
# Drop columns we no longer use
merged_df = merged_df.drop('ID', axis=1)
merged_df = merged_df.drop('MONTHS_BALANCE', axis=1)
merged_df = merged_df.drop('STATUS', axis=1)
merged_df = merged_df.drop('High Risk', axis=1)
merged_df = merged_df.drop('DAYS_BIRTH', axis=1)
merged_df = merged_df.drop('DAYS_EMPLOYED', axis=1)
# Drop CODE_GENDER to prevent gender discrimination in the model
merged_df = merged_df.drop('CODE_GENDER', axis=1)
# Scale income to thousands for readability
merged_df['AMT_INCOME_TOTAL'] = merged_df['AMT_INCOME_TOTAL'] / 1000
print(merged_df['AMT_INCOME_TOTAL'].value_counts(bins=10,sort=False))
merged_df['AMT_INCOME_TOTAL'].plot(kind='hist',bins=50,density=True)
AMT_INCOME_TOTAL
(25.451, 181.8]     19356
(181.8, 336.6]       9751
(336.6, 491.4]       1769
(491.4, 646.2]        193
(646.2, 801.0]        134
(801.0, 955.8]         50
(955.8, 1110.6]         4
(1110.6, 1265.4]        3
(1265.4, 1420.2]        6
(1420.2, 1575.0]        8
Name: count, dtype: int64
<Axes: ylabel='Frequency'>
print(merged_df['AGE'].value_counts(bins=10,normalize=True,sort=False))
merged_df['AGE'].plot(kind='hist',bins=20,density=True)
AGE
(19.951, 24.8]    0.020304
(24.8, 29.6]      0.108557
(29.6, 34.4]      0.136887
(34.4, 39.2]      0.142451
(39.2, 44.0]      0.137878
(44.0, 48.8]      0.091833
(48.8, 53.6]      0.107469
(53.6, 58.4]      0.111882
(58.4, 63.2]      0.101746
(63.2, 68.0]      0.040993
Name: proportion, dtype: float64
<Axes: ylabel='Frequency'>
print(merged_df['WORKING_YEARS'].value_counts(bins=10,sort=False))
merged_df['WORKING_YEARS'].plot(kind='hist',bins=10,density=True)
WORKING_YEARS
(-1.045, 3.4]    15680
(3.4, 7.8]        7206
(7.8, 12.2]       4667
(12.2, 16.6]      1617
(16.6, 21.0]      1064
(21.0, 25.4]       525
(25.4, 29.8]       274
(29.8, 34.2]       151
(34.2, 38.6]        58
(38.6, 43.0]        32
Name: count, dtype: int64
<Axes: ylabel='Frequency'>
pvt_tbl = pd.pivot_table(data = merged_df,
index = ['OCCUPATION_TYPE'],
columns = ['NAME_FAMILY_STATUS'],
values = 'Approved',
aggfunc = sum, fill_value = 0)
plt.figure(figsize=[10,10])
hm = sns.heatmap(data = pvt_tbl,
annot = True,
fmt='.0f',
linewidths=.2,
center = 1600)
bottom, top = hm.get_ylim()
hm.set_ylim(bottom + 0.5, top - 0.5)
plt.title("Approved Customers Count by Occupation Type and Family Status")
plt.show()
pvt_tbl = pd.pivot_table(data = merged_df,
index = ['OCCUPATION_TYPE'],
columns = ['NAME_HOUSING_TYPE'],
values = 'Approved',
aggfunc = sum, fill_value = 0)
plt.figure(figsize=[10,10])
hm = sns.heatmap(data = pvt_tbl,
annot = True,
fmt='.0f',
linewidths=.2,
center = 1600)
bottom, top = hm.get_ylim()
hm.set_ylim(bottom + 0.5, top - 0.5)
plt.title("Approved Customers Count by Occupation Type and Housing Type")
plt.show()
Weight of Evidence(WOE):
$$\mathrm{WoE}_i = \ln \frac{P_{yi}}{P_{ni}} = \ln \frac{y_i / y_s}{n_i / n_s}$$
$\mathrm{WoE}_i$ is the WOE value for category $i$.
$P_{yi}$ is the proportion of the positive samples in category $i$ ($y_i$) to all positive samples ($y_s$).
$P_{ni}$ is the proportion of negative samples in category $i$ ($n_i$) to all negative samples ($n_s$).
Information Value (IV):
$$IV_i = \mathrm{WoE}_i \times (P_{yi} - P_{ni})$$
$$IV = \sum\limits_i^n IV_i$$
The IV of a category is its WOE value multiplied by the difference between the conditional positive rate and the conditional negative rate.
The total IV of a variable can then be understood as a weighted sum of those differences across its categories.
IV is an excellent feature-selection method for a binary logistic regression classification model, because the conditional log-odds predicted by the model are closely related to the Weight of Evidence (WOE).
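As a hypothetical worked example (numbers invented for illustration): if a category contains $y_i = 100$ of $y_s = 1000$ positive samples and $n_i = 50$ of $n_s = 200$ negative samples, then
$$\mathrm{WoE}_i = \ln \frac{100/1000}{50/200} = \ln \frac{0.10}{0.25} \approx -0.916, \qquad IV_i \approx -0.916 \times (0.10 - 0.25) \approx 0.137$$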
Relationship between IV and the variable's predictive power
IV | Ability to predict |
---|---|
<0.02 | Almost no predictive power |
0.02~0.1 | Weak predictive power |
0.1~0.3 | Moderate predictive power |
0.3~0.5 | Strong predictive power |
>0.5 | Predictive power is too strong, need to check variables |
def calc_woe_iv(feature):
    # Compute Weight of Evidence (WOE) and Information Value (IV) for one feature
    # against the binary 'Approved' target in merged_df.
    df = pd.DataFrame(columns=['values', 'total', 'good', 'bad', 'good_rate', 'bad_rate',
                               'dist_good', 'dist_bad', 'WOE', 'IV'])
    df['values'] = merged_df[feature].unique()
    df.set_index('values', inplace=True)
    values = merged_df[feature].unique()
    total_dict = dict(merged_df.groupby(feature).size())
    col_approved_dict = dict(merged_df.groupby([feature, 'Approved']).size())
    approved_count = dict(merged_df.groupby(['Approved']).size())
    for value in values:
        # Use .loc[row, col] (not chained indexing) so the assignment always takes effect
        df.loc[value, 'total'] = total_dict[value]
        df.loc[value, 'good'] = col_approved_dict.get((value, 1), 0)
        df.loc[value, 'bad'] = col_approved_dict.get((value, 0), 0)
        # Drop categories with no bad samples to avoid division by zero in the WOE
        if df.loc[value, 'bad'] == 0:
            df = df.drop([value])
    df['good_rate'] = df['good'] / df['total']
    df['bad_rate'] = df['bad'] / df['total']
    df['dist_good'] = df['good'] / approved_count[1]
    df['dist_bad'] = df['bad'] / approved_count[0]
    df['WOE'] = np.log(df.dist_good.astype('float64') / df.dist_bad.astype('float64'))
    df['IV'] = df['WOE'] * (df['dist_good'] - df['dist_bad'])
    iv = df['IV'].sum()
    print("Information value (IV):", iv)
    return df
iv_table = pd.DataFrame(columns = ['feature','iv'])
iv_table['feature'] = merged_df.columns
iv_table.set_index(['feature'],inplace = True)
iv_table.drop(['Approved'],inplace = True)
iv_table
iv | |
---|---|
feature | |
FLAG_OWN_CAR | NaN |
FLAG_OWN_REALTY | NaN |
CNT_CHILDREN | NaN |
AMT_INCOME_TOTAL | NaN |
NAME_INCOME_TYPE | NaN |
NAME_EDUCATION_TYPE | NaN |
NAME_FAMILY_STATUS | NaN |
NAME_HOUSING_TYPE | NaN |
FLAG_WORK_PHONE | NaN |
FLAG_PHONE | NaN |
FLAG_EMAIL | NaN |
OCCUPATION_TYPE | NaN |
CNT_FAM_MEMBERS | NaN |
AGE | NaN |
WORKING_YEARS | NaN |
merged_df['FLAG_OWN_CAR'] = merged_df['FLAG_OWN_CAR'].replace(['N','Y'], [0,1])
FLAG_OWN_CAR_df = calc_woe_iv('FLAG_OWN_CAR')
iv_table.loc['FLAG_OWN_CAR'] = FLAG_OWN_CAR_df.IV.sum()
FLAG_OWN_CAR_df
Information value (IV): 0.00047516760905406334
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
1 | 11805 | 10435 | 1370 | 0.883947 | 0.116053 | 0.378725 | 0.368181 | 0.028236 | 0.000298 |
0 | 19469 | 17118 | 2351 | 0.879244 | 0.120756 | 0.621275 | 0.631819 | -0.016829 | 0.000177 |
merged_df['FLAG_OWN_REALTY'] = merged_df['FLAG_OWN_REALTY'].replace(['N','Y'], [0,1])
FLAG_OWN_REALTY_df = calc_woe_iv('FLAG_OWN_REALTY')
iv_table.loc['FLAG_OWN_REALTY'] = FLAG_OWN_REALTY_df.IV.sum()
FLAG_OWN_REALTY_df
Information value (IV): 0.010526936966177698
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
1 | 21183 | 18822 | 2361 | 0.888543 | 0.111457 | 0.68312 | 0.634507 | 0.073822 | 0.003589 |
0 | 10091 | 8731 | 1360 | 0.865226 | 0.134774 | 0.31688 | 0.365493 | -0.142724 | 0.006938 |
FLAG_PHONE_df = calc_woe_iv('FLAG_PHONE')
iv_table.loc['FLAG_PHONE'] = FLAG_PHONE_df.IV.sum()
FLAG_PHONE_df
Information value (IV): 0.0017100207770481316
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
0 | 22145 | 19449 | 2696 | 0.878257 | 0.121743 | 0.705876 | 0.724536 | -0.026093 | 0.000487 |
1 | 9129 | 8104 | 1025 | 0.88772 | 0.11228 | 0.294124 | 0.275464 | 0.065546 | 0.001223 |
FLAG_WORK_PHONE_df = calc_woe_iv('FLAG_WORK_PHONE')
iv_table.loc['FLAG_WORK_PHONE'] = FLAG_WORK_PHONE_df.IV.sum()
FLAG_WORK_PHONE_df
Information value (IV): 4.3508959059651395e-05
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
0 | 24391 | 21480 | 2911 | 0.880653 | 0.119347 | 0.779588 | 0.782317 | -0.003493 | 0.00001 |
1 | 6883 | 6073 | 810 | 0.882319 | 0.117681 | 0.220412 | 0.217683 | 0.012455 | 0.000034 |
FLAG_EMAIL_df = calc_woe_iv('FLAG_EMAIL')
iv_table.loc['FLAG_EMAIL'] = FLAG_EMAIL_df.IV.sum()
FLAG_EMAIL_df
Information value (IV): 0.003934518023302461
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
0 | 28527 | 25193 | 3334 | 0.883128 | 0.116872 | 0.914347 | 0.895996 | 0.020274 | 0.000372 |
1 | 2747 | 2360 | 387 | 0.859119 | 0.140881 | 0.085653 | 0.104004 | -0.194127 | 0.003562 |
merged_df['cnt_child_bin'] = merged_df.CNT_CHILDREN.apply(lambda x : '2+' if x>= 2 else str(x))
CNT_CHILDREN_df = calc_woe_iv('cnt_child_bin')
iv_table.loc['CNT_CHILDREN'] = CNT_CHILDREN_df.IV.sum()
CNT_CHILDREN_df
Information value (IV): 0.0005624845340762583
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
0 | 21819 | 19222 | 2597 | 0.880975 | 0.119025 | 0.697637 | 0.697931 | -0.000420 | 0.0 |
2+ | 3169 | 2772 | 397 | 0.874724 | 0.125276 | 0.100606 | 0.106692 | -0.058731 | 0.000357 |
1 | 6286 | 5559 | 727 | 0.884346 | 0.115654 | 0.201757 | 0.195378 | 0.032128 | 0.000205 |
bins = [0, 70, 100, 150, 200, 250, 300, 350, 1600]
labels = ['70', '100', '150', '200', '250', '300', '350', '1600']
merged_df['income_bin'] = pd.cut(merged_df['AMT_INCOME_TOTAL'], bins = bins, labels = labels)
merged_df.income_bin.value_counts()
income_bin
150     8830
200     6564
250     5865
100     3004
300     2416
1600    1936
70      1395
350     1264
Name: count, dtype: int64
AMT_INCOME_TOTAL_df = calc_woe_iv('income_bin')
iv_table.loc['AMT_INCOME_TOTAL'] = AMT_INCOME_TOTAL_df.IV.sum()
AMT_INCOME_TOTAL_df.sort_values(by = 'WOE', inplace = True)
AMT_INCOME_TOTAL_df
Information value (IV): 0.014692899629109128
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
250 | 5865 | 5047 | 818 | 0.860529 | 0.139471 | 0.183174 | 0.219833 | -0.182432 | 0.006688 |
350 | 1264 | 1098 | 166 | 0.868671 | 0.131329 | 0.03985 | 0.044612 | -0.112861 | 0.000537 |
1600 | 1936 | 1689 | 247 | 0.872417 | 0.127583 | 0.0613 | 0.06638 | -0.079615 | 0.000404 |
300 | 2416 | 2117 | 299 | 0.876242 | 0.123758 | 0.076834 | 0.080355 | -0.044807 | 0.000158 |
100 | 3004 | 2647 | 357 | 0.881158 | 0.118842 | 0.096069 | 0.095942 | 0.001327 | 0.0 |
150 | 8830 | 7816 | 1014 | 0.885164 | 0.114836 | 0.283671 | 0.272507 | 0.040151 | 0.000448 |
200 | 6564 | 5880 | 684 | 0.895795 | 0.104205 | 0.213407 | 0.183822 | 0.149235 | 0.004415 |
70 | 1395 | 1259 | 136 | 0.902509 | 0.097491 | 0.045694 | 0.036549 | 0.223299 | 0.002042 |
merged_df.CNT_FAM_MEMBERS.value_counts()
CNT_FAM_MEMBERS
2.0     16774
1.0      6100
3.0      5383
4.0      2625
5.0       317
6.0        51
7.0        18
15.0        3
9.0         2
20.0        1
Name: count, dtype: int64
merged_df['cnt_family_bin'] = merged_df.CNT_FAM_MEMBERS.apply(lambda x : '4+' if x>= 4 else str(x))
CNT_FAM_MEMBERS_df = calc_woe_iv('cnt_family_bin')
iv_table.loc['CNT_FAM_MEMBERS'] = CNT_FAM_MEMBERS_df.IV.sum()
CNT_FAM_MEMBERS_df.sort_values(by = 'WOE', inplace = True)
CNT_FAM_MEMBERS_df
Information value (IV): 0.001482738217308712
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
4+ | 3017 | 2629 | 388 | 0.871395 | 0.128605 | 0.095416 | 0.104273 | -0.088765 | 0.000786 |
1.0 | 6100 | 5353 | 747 | 0.877541 | 0.122459 | 0.19428 | 0.200752 | -0.032772 | 0.000212 |
2.0 | 16774 | 14802 | 1972 | 0.882437 | 0.117563 | 0.537219 | 0.529965 | 0.013595 | 0.000099 |
3.0 | 5383 | 4769 | 614 | 0.885937 | 0.114063 | 0.173085 | 0.165009 | 0.047778 | 0.000386 |
bins = [19, 27, 30, 35, 40, 45, 50, 55, 62, 70]
labels = ['27','30','35','40','45','50','55','62','70']
merged_df['age_bin'] = pd.cut(merged_df['AGE'], bins = bins, labels = labels)
merged_df.age_bin.value_counts()
age_bin
62    4871
40    4590
35    4246
45    4004
50    3566
55    3302
30    2552
27    2332
70    1811
Name: count, dtype: int64
AGE_df = calc_woe_iv('age_bin')
iv_table.loc['AGE'] = AGE_df.IV.sum()
AGE_df.sort_values(by = 'WOE', inplace = True)
AGE_df
Information value (IV): 0.01428216840831179
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
27 | 2332 | 2004 | 328 | 0.859348 | 0.140652 | 0.072733 | 0.088148 | -0.192232 | 0.002963 |
30 | 2552 | 2198 | 354 | 0.861285 | 0.138715 | 0.079774 | 0.095136 | -0.176113 | 0.002705 |
35 | 4246 | 3677 | 569 | 0.865992 | 0.134008 | 0.133452 | 0.152916 | -0.136147 | 0.00265 |
55 | 3302 | 2907 | 395 | 0.880376 | 0.119624 | 0.105506 | 0.106154 | -0.006128 | 0.000004 |
50 | 3566 | 3149 | 417 | 0.883062 | 0.116938 | 0.114289 | 0.112067 | 0.019635 | 0.000044 |
45 | 4004 | 3550 | 454 | 0.886613 | 0.113387 | 0.128843 | 0.12201 | 0.054487 | 0.000372 |
62 | 4871 | 4338 | 533 | 0.890577 | 0.109423 | 0.157442 | 0.143241 | 0.094528 | 0.001342 |
40 | 4590 | 4099 | 491 | 0.893028 | 0.106972 | 0.148768 | 0.131954 | 0.119935 | 0.002017 |
70 | 1811 | 1631 | 180 | 0.900607 | 0.099393 | 0.059195 | 0.048374 | 0.201873 | 0.002184 |
bins = [-2, -1, 5, 10, 15, 20, 50]
labels = ['retired','5','10','15','20','20+']
merged_df['wkg_years_bin'] = pd.cut(merged_df['WORKING_YEARS'], bins = bins, labels = labels)
merged_df.wkg_years_bin.value_counts()
wkg_years_bin
5          13694
10          6515
retired     6152
15          2600
20+         1179
20          1134
Name: count, dtype: int64
WORKING_YEARS_df = calc_woe_iv('wkg_years_bin')
iv_table.loc['WORKING_YEARS'] = WORKING_YEARS_df.IV.sum()
WORKING_YEARS_df.sort_values(by = 'WOE',inplace = True)
WORKING_YEARS_df
Information value (IV): 0.007172907442583332
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
15 | 2600 | 2268 | 332 | 0.872308 | 0.127692 | 0.082314 | 0.089223 | -0.080600 | 0.000557 |
20 | 1134 | 991 | 143 | 0.873898 | 0.126102 | 0.035967 | 0.038431 | -0.066249 | 0.000163 |
5 | 13694 | 11978 | 1716 | 0.87469 | 0.12531 | 0.434726 | 0.461166 | -0.059043 | 0.001561 |
10 | 6515 | 5750 | 765 | 0.882579 | 0.117421 | 0.208689 | 0.20559 | 0.014960 | 0.000046 |
retired | 6152 | 5508 | 644 | 0.895319 | 0.104681 | 0.199906 | 0.173072 | 0.144139 | 0.003868 |
20+ | 1179 | 1058 | 121 | 0.897371 | 0.102629 | 0.038399 | 0.032518 | 0.166226 | 0.000978 |
NAME_INCOME_TYPE_df = calc_woe_iv('NAME_INCOME_TYPE')
iv_table.loc['NAME_INCOME_TYPE'] = NAME_INCOME_TYPE_df.IV.sum()
NAME_INCOME_TYPE_df
Information value (IV): 0.006580480165306965
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
Working | 15622 | 13770 | 1852 | 0.881449 | 0.118551 | 0.499764 | 0.497716 | 0.004107 | 0.000008 |
Commercial associate | 7052 | 6144 | 908 | 0.871242 | 0.128758 | 0.222988 | 0.24402 | -0.090132 | 0.001896 |
Pensioner | 6152 | 5508 | 644 | 0.895319 | 0.104681 | 0.199906 | 0.173072 | 0.144139 | 0.003868 |
State servant | 2437 | 2121 | 316 | 0.870332 | 0.129668 | 0.076979 | 0.084923 | -0.098218 | 0.00078 |
Student | 11 | 10 | 1 | 0.909091 | 0.090909 | 0.000363 | 0.000269 | 0.300466 | 0.000028 |
NAME_EDUCATION_TYPE_df = calc_woe_iv('NAME_EDUCATION_TYPE')
iv_table.loc['NAME_EDUCATION_TYPE'] = NAME_EDUCATION_TYPE_df.IV.sum()
NAME_EDUCATION_TYPE_df
Information value (IV): 0.003972847939147731
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
Secondary / secondary special | 21720 | 19144 | 2576 | 0.8814 | 0.1186 | 0.694806 | 0.692287 | 0.003633 | 0.000009 |
Higher education | 8141 | 7182 | 959 | 0.882201 | 0.117799 | 0.260661 | 0.257726 | 0.011323 | 0.000033 |
Incomplete higher | 1051 | 904 | 147 | 0.860133 | 0.139867 | 0.032809 | 0.039506 | -0.185722 | 0.001244 |
Lower secondary | 347 | 314 | 33 | 0.904899 | 0.095101 | 0.011396 | 0.008869 | 0.250766 | 0.000634 |
Academic degree | 15 | 9 | 6 | 0.6 | 0.4 | 0.000327 | 0.001612 | -1.596654 | 0.002053 |
NAME_FAMILY_STATUS_df = calc_woe_iv('NAME_FAMILY_STATUS')
iv_table.loc['NAME_FAMILY_STATUS'] = NAME_FAMILY_STATUS_df.IV.sum()
NAME_FAMILY_STATUS_df
Information value (IV): 0.003208161449406242
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
Married | 21424 | 18911 | 2513 | 0.882702 | 0.117298 | 0.68635 | 0.675356 | 0.016148 | 0.000178 |
Single / not married | 4127 | 3586 | 541 | 0.868912 | 0.131088 | 0.130149 | 0.145391 | -0.110746 | 0.001688 |
Separated | 1812 | 1609 | 203 | 0.887969 | 0.112031 | 0.058397 | 0.054555 | 0.068043 | 0.000261 |
Civil marriage | 2503 | 2188 | 315 | 0.874151 | 0.125849 | 0.079411 | 0.084655 | -0.063948 | 0.000335 |
Widow | 1408 | 1259 | 149 | 0.894176 | 0.105824 | 0.045694 | 0.040043 | 0.132008 | 0.000746 |
NAME_HOUSING_TYPE_df = calc_woe_iv('NAME_HOUSING_TYPE')
iv_table.loc['NAME_HOUSING_TYPE'] = NAME_HOUSING_TYPE_df.IV.sum()
NAME_HOUSING_TYPE_df
Information value (IV): 0.005002140385334694
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
House / apartment | 28000 | 24724 | 3276 | 0.883 | 0.117 | 0.897325 | 0.880408 | 0.019032 | 0.000322 |
Rented apartment | 469 | 399 | 70 | 0.850746 | 0.149254 | 0.014481 | 0.018812 | -0.261653 | 0.001133 |
Municipal apartment | 989 | 851 | 138 | 0.860465 | 0.139535 | 0.030886 | 0.037087 | -0.182961 | 0.001135 |
With parents | 1437 | 1243 | 194 | 0.864997 | 0.135003 | 0.045113 | 0.052137 | -0.144694 | 0.001016 |
Co-op apartment | 160 | 148 | 12 | 0.925 | 0.075 | 0.005371 | 0.003225 | 0.510187 | 0.001095 |
Office apartment | 219 | 188 | 31 | 0.858447 | 0.141553 | 0.006823 | 0.008331 | -0.199664 | 0.000301 |
OCCUPATION_TYPE_df = calc_woe_iv('OCCUPATION_TYPE')
iv_table.loc['OCCUPATION_TYPE'] = OCCUPATION_TYPE_df.IV.sum()
OCCUPATION_TYPE_df
Information value (IV): 0.017190146841521217
total | good | bad | good_rate | bad_rate | dist_good | dist_bad | WOE | IV | |
---|---|---|---|---|---|---|---|---|---|
values | |||||||||
Security staff | 592 | 501 | 91 | 0.846284 | 0.153716 | 0.018183 | 0.024456 | -0.296372 | 0.001859 |
Sales staff | 3485 | 3096 | 389 | 0.888379 | 0.111621 | 0.112365 | 0.104542 | 0.072168 | 0.000565 |
Pensioner | 6152 | 5508 | 644 | 0.895319 | 0.104681 | 0.199906 | 0.173072 | 0.144139 | 0.003868 |
Accountants | 1240 | 1094 | 146 | 0.882258 | 0.117742 | 0.039705 | 0.039237 | 0.011870 | 0.000006 |
Laborers | 6206 | 5479 | 727 | 0.882855 | 0.117145 | 0.198853 | 0.195378 | 0.017632 | 0.000061 |
Managers | 3011 | 2622 | 389 | 0.870807 | 0.129193 | 0.095162 | 0.104542 | -0.094006 | 0.000882 |
Drivers | 2137 | 1874 | 263 | 0.87693 | 0.12307 | 0.068014 | 0.07068 | -0.038443 | 0.000102 |
Core staff | 3578 | 3120 | 458 | 0.871996 | 0.128004 | 0.113236 | 0.123085 | -0.083400 | 0.000821 |
High skill tech staff | 1383 | 1202 | 181 | 0.869125 | 0.130875 | 0.043625 | 0.048643 | -0.108874 | 0.000546 |
Cleaning staff | 551 | 488 | 63 | 0.885662 | 0.114338 | 0.017711 | 0.016931 | 0.045062 | 0.000035 |
Private service staff | 344 | 322 | 22 | 0.936047 | 0.063953 | 0.011687 | 0.005912 | 0.681390 | 0.003934 |
Cooking staff | 655 | 569 | 86 | 0.868702 | 0.131298 | 0.020651 | 0.023112 | -0.112586 | 0.000277 |
Low-skill Laborers | 174 | 142 | 32 | 0.816092 | 0.183908 | 0.005154 | 0.0086 | -0.512028 | 0.001765 |
Medicine staff | 1206 | 1044 | 162 | 0.865672 | 0.134328 | 0.037891 | 0.043537 | -0.138901 | 0.000784 |
Secretaries | 151 | 138 | 13 | 0.913907 | 0.086093 | 0.005009 | 0.003494 | 0.360185 | 0.000546 |
Student | 11 | 10 | 1 | 0.909091 | 0.090909 | 0.000363 | 0.000269 | 0.300466 | 0.000028 |
Waiters/barmen staff | 174 | 155 | 19 | 0.890805 | 0.109195 | 0.005626 | 0.005106 | 0.096867 | 0.00005 |
HR staff | 85 | 71 | 14 | 0.835294 | 0.164706 | 0.002577 | 0.003762 | -0.378496 | 0.000449 |
Realty agents | 79 | 69 | 10 | 0.873418 | 0.126582 | 0.002504 | 0.002687 | -0.070598 | 0.000013 |
IT staff | 60 | 49 | 11 | 0.816667 | 0.183333 | 0.001778 | 0.002956 | -0.508194 | 0.000599 |
iv_table.sort_values(by = 'iv', ascending=False, inplace = True)
iv_table
iv | |
---|---|
feature | |
OCCUPATION_TYPE | 0.01719 |
AMT_INCOME_TOTAL | 0.014693 |
AGE | 0.014282 |
FLAG_OWN_REALTY | 0.010527 |
WORKING_YEARS | 0.007173 |
NAME_INCOME_TYPE | 0.00658 |
NAME_HOUSING_TYPE | 0.005002 |
NAME_EDUCATION_TYPE | 0.003973 |
FLAG_EMAIL | 0.003935 |
NAME_FAMILY_STATUS | 0.003208 |
FLAG_PHONE | 0.00171 |
CNT_FAM_MEMBERS | 0.001483 |
CNT_CHILDREN | 0.000562 |
FLAG_OWN_CAR | 0.000475 |
FLAG_WORK_PHONE | 0.000044 |
from imblearn.over_sampling import SMOTE
import itertools
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
# Define Confusion Matrix function
def plot_confusion_matrix(cm, classes,
normalize=False,
title='Confusion Matrix',
cmap=plt.cm.Blues):
if normalize:
cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
print(cm)
plt.imshow(cm, interpolation='nearest', cmap=cmap)
plt.title(title)
plt.colorbar()
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes)
plt.yticks(tick_marks, classes)
fmt = '.2f' if normalize else 'd'
thresh = cm.max() / 2.
for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
plt.text(j, i, format(cm[i, j], fmt),
horizontalalignment="center",
color="white" if cm[i, j] > thresh else "black")
plt.tight_layout()
plt.ylabel('True label')
plt.xlabel('Predicted label')
merged_df.columns
Index(['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'CNT_CHILDREN', 'AMT_INCOME_TOTAL', 'NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE', 'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE', 'CNT_FAM_MEMBERS', 'Approved', 'AGE', 'WORKING_YEARS', 'cnt_child_bin', 'income_bin', 'cnt_family_bin', 'age_bin', 'wkg_years_bin'], dtype='object')
df_train = merged_df[['FLAG_OWN_CAR', 'FLAG_OWN_REALTY', 'NAME_INCOME_TYPE',
'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS', 'NAME_HOUSING_TYPE',
'FLAG_WORK_PHONE', 'FLAG_PHONE', 'FLAG_EMAIL', 'OCCUPATION_TYPE',
'cnt_child_bin', 'income_bin', 'cnt_family_bin', 'age_bin', 'wkg_years_bin', 'Approved']]
def convert_dummy(df, columns) :
# Creating a dummy variable for some of the categorical variables and dropping the first one.
dummy1 = pd.get_dummies(df[columns], drop_first=True)
# Adding the results to the master dataframe
df1 = pd.concat([df, dummy1], axis=1)
#Dropping the initial column
df1.drop(columns, axis = 1, inplace = True)
return df1
df_train = convert_dummy(df_train, ['NAME_INCOME_TYPE', 'NAME_EDUCATION_TYPE', 'NAME_FAMILY_STATUS','NAME_HOUSING_TYPE',
'OCCUPATION_TYPE', 'cnt_child_bin', 'income_bin', 'cnt_family_bin', 'age_bin',
'wkg_years_bin'])
Y = df_train['Approved']
X = df_train.drop(['Approved'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(X, Y,
stratify = Y,
test_size = 0.3,
random_state = 10086)
Apply the Synthetic Minority Over-sampling Technique (SMOTE) to overcome the class imbalance problem.
smote= SMOTE(sampling_strategy='minority', random_state=43)
x_train_smote, y_train_smote = smote.fit_resample(x_train, y_train)
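As an optional sanity check, the resampled training labels should now be balanced:
print(y_train_smote.value_counts())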
model = LogisticRegression(C=0.8,
random_state=0,
solver='lbfgs')
model.fit(x_train_smote, y_train_smote)
y_predict = model.predict(x_test)
print('Accuracy Score: {:.5}'.format(accuracy_score(y_test, y_predict)))
print('Precision Score: {:.5}'.format(precision_score(y_test, y_predict)))
print('Recall Score: {:.5}'.format(recall_score(y_test, y_predict)))
print('F1 Score: {:.5}'.format(f1_score(y_test, y_predict)))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))
sns.set_style('white')
class_names = ['0','1']
plot_confusion_matrix(confusion_matrix(y_test,y_predict),
classes= class_names, normalize = True,
title='Normalized Confusion Matrix: Logistic Regression')
Accuracy Score: 0.76116
Precision Score: 0.88285
Recall Score: 0.84045
F1 Score: 0.86113
      0     1
0   194   922
1  1319  6948
[[0.17383513 0.82616487]
 [0.15955002 0.84044998]]
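Since one stated advantage of the logistic model is that it yields a coefficient per feature, the fitted coefficients can be inspected as sketched below (the coef_table name is just for illustration; it pairs each dummy column in X with its learned weight):
# Pair each feature with its fitted coefficient and sort by absolute magnitude
coef_table = pd.DataFrame({'feature': X.columns, 'coefficient': model.coef_[0]})
print(coef_table.sort_values('coefficient', key=abs, ascending=False).head(10))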
model = DecisionTreeClassifier(max_depth=12,
min_samples_split=8,
random_state=1024)
model.fit(x_train_smote, y_train_smote)
y_predict = model.predict(x_test)
print('Accuracy Score: {:.5}'.format(accuracy_score(y_test, y_predict)))
print('Precision Score: {:.5}'.format(precision_score(y_test, y_predict)))
print('Recall Score: {:.5}'.format(recall_score(y_test, y_predict)))
print('F1 Score: {:.5}'.format(f1_score(y_test, y_predict)))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))
plot_confusion_matrix(confusion_matrix(y_test,y_predict),
classes=class_names, normalize = True,
title='Normalized Confusion Matrix: CART')
Accuracy Score: 0.58681
Precision Score: 0.91619
Recall Score: 0.58449
F1 Score: 0.71368
      0     1
0   674   442
1  3435  4832
[[0.60394265 0.39605735]
 [0.41550744 0.58449256]]
model = RandomForestClassifier(n_estimators=250,
max_depth=12,
min_samples_leaf=16
)
model.fit(x_train_smote, y_train_smote)
y_predict = model.predict(x_test)
print('Accuracy Score: {:.5}'.format(accuracy_score(y_test, y_predict)))
print('Precision Score: {:.5}'.format(precision_score(y_test, y_predict)))
print('Recall Score: {:.5}'.format(recall_score(y_test, y_predict)))
print('F1 Score: {:.5}'.format(f1_score(y_test, y_predict)))
print(pd.DataFrame(confusion_matrix(y_test,y_predict)))
plot_confusion_matrix(confusion_matrix(y_test,y_predict),
classes=class_names, normalize = True,
title='Normalized Confusion Matrix: Random Forest')
Accuracy Score: 0.70564
Precision Score: 0.90484
Recall Score: 0.74416
F1 Score: 0.81667
      0     1
0   469   647
1  2115  6152
[[0.4202509  0.5797491 ]
 [0.25583646 0.74416354]]
$$\text{precision} = \frac{\text{true positives}}{\text{true positives} + \text{false positives}}$$
$$\text{recall} = \frac{\text{true positives}}{\text{true positives} + \text{false negatives}}$$
$$F1 = 2 \times \frac{\text{precision} \times \text{recall}}{\text{precision} + \text{recall}}$$
To simplify, we compare the accuracy scores and F1 scores of our models (the Random Forest has no fixed random_state, so its numbers vary slightly between runs; the values below are from the run shown above).
Accuracy scores
Logistic Regression: 0.76116
Decision Tree: 0.58681
Random Forest: 0.70564
F1 scores
Logistic Regression: 0.86113
Decision Tree: 0.71368
Random Forest: 0.81667
Here we see that the Logistic Regression classifier performs best, with an accuracy of 76.12% and an F1 score of 86.11%.
These results could likely be improved further with better feature engineering and hyperparameter tuning.