Default of Credit Card Clients: Analytics and Machine Learning
Personal Motivation
My main goal with this project is to understand the variables that interact in a financial dataset and how to use them to develop a machine learning model.
Brief introduction
The first steps of the project focus on graphically illustrating how the variables are distributed, using the pandas, seaborn and matplotlib libraries. Then I will apply some well-known techniques (the chi-squared test, random forests, and others) with the scipy and scikit-learn libraries.
Loading dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
Reading Data
df = pd.read_csv("UCI_Credit_Card.csv")
df = df.rename(columns={"default.payment.next.month": "target"})  # shorter name for the label column
df = df.rename(columns={"PAY_0": "PAY_1"})  # make the repayment columns consistently PAY_1..PAY_6
df
df.describe().T
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 ID 30000 non-null int64
1 LIMIT_BAL 30000 non-null float64
2 SEX 30000 non-null int64
3 EDUCATION 30000 non-null int64
4 MARRIAGE 30000 non-null int64
5 AGE 30000 non-null int64
6 PAY_1 30000 non-null int64
7 PAY_2 30000 non-null int64
8 PAY_3 30000 non-null int64
9 PAY_4 30000 non-null int64
10 PAY_5 30000 non-null int64
11 PAY_6 30000 non-null int64
12 BILL_AMT1 30000 non-null float64
13 BILL_AMT2 30000 non-null float64
14 BILL_AMT3 30000 non-null float64
15 BILL_AMT4 30000 non-null float64
16 BILL_AMT5 30000 non-null float64
17 BILL_AMT6 30000 non-null float64
18 PAY_AMT1 30000 non-null float64
19 PAY_AMT2 30000 non-null float64
20 PAY_AMT3 30000 non-null float64
21 PAY_AMT4 30000 non-null float64
22 PAY_AMT5 30000 non-null float64
23 PAY_AMT6 30000 non-null float64
24 target 30000 non-null int64
dtypes: float64(13), int64(12)
memory usage: 5.7 MB
df.isnull().sum(axis=0)
ID 0
LIMIT_BAL 0
SEX 0
EDUCATION 0
MARRIAGE 0
AGE 0
PAY_1 0
PAY_2 0
PAY_3 0
PAY_4 0
PAY_5 0
PAY_6 0
BILL_AMT1 0
BILL_AMT2 0
BILL_AMT3 0
BILL_AMT4 0
BILL_AMT5 0
BILL_AMT6 0
PAY_AMT1 0
PAY_AMT2 0
PAY_AMT3 0
PAY_AMT4 0
PAY_AMT5 0
PAY_AMT6 0
target 0
dtype: int64
# Check unique values in columns
print('SEX' + str(sorted(df['SEX'].unique())))
print('EDUCATION' + str(sorted(df['EDUCATION'].unique())))
print('MARRIAGE' + str(sorted(df['MARRIAGE'].unique())))
print('PAY_1' + str(sorted(df['PAY_1'].unique())))
print('target' + str(sorted(df['target'].unique())))
SEX[1, 2]
EDUCATION[0, 1, 2, 3, 4, 5, 6]
MARRIAGE[0, 1, 2, 3]
PAY_1[-2, -1, 0, 1, 2, 3, 4, 5, 6, 7, 8]
target[0, 1]
plt.style.use('fivethirtyeight')
plt.figure(figsize=(22,8))
sns.countplot(x="target", data=df, linewidth = 1)
plt.xticks(np.arange(2),('No', "Yes"), size = 16)
plt.xlabel("Default Payment" , alpha=0.9, size=22)
plt.savefig('Default_Payment')  # save before plt.show(), which clears the current figure
plt.show()
plt.subplots(figsize=(20,5))
plt.subplot(121)
sns.distplot(df.LIMIT_BAL)
plt.subplot(122)
sns.distplot(df.AGE)
plt.show()
bins = [20,30,40,50,60,70,80]
names = ['21-30','31-40','41-50','51-60','61-70','71-80']
df['AGE_BIN'] = pd.cut(x=df.AGE, bins=bins, labels=names, right=True)
age_cnt = df.AGE_BIN.value_counts()
# sort by age bin so the bars and text labels line up with `names`
age_0 = df.AGE_BIN[df['target'] == 0].value_counts().sort_index()
age_1 = df.AGE_BIN[df['target'] == 1].value_counts().sort_index()
plt.subplots(figsize=(10,7))
plt.bar(age_0.index, age_0.values, label='0')
plt.bar(age_1.index, age_1.values, label='1')
for x, y in zip(names, age_0):
    plt.text(x, y, y, fontsize=12)
for x, y in zip(names, age_1):
    plt.text(x, y, y, fontsize=12)
plt.xticks(fontsize=12)
plt.yticks(fontsize=12)
plt.title("Number of clients in each age group", fontsize=15)
plt.legend(loc='upper right', fontsize=15)
plt.show()
Exploratory data analysis
output = 'target'
# Let's do a little EDA
cols = list(df.columns)
cols.remove("ID")
cols.remove("AGE_BIN")
cols.remove(output)
f = pd.melt( df, id_vars=output, value_vars=cols)
g = sns.FacetGrid( f, hue=output, col="variable", col_wrap=5, sharex=False, sharey=False )
g = g.map( sns.distplot, "value", kde=True)
Average credit limit and Age
limit_mean = df.groupby('AGE')['LIMIT_BAL'].mean()
plt.figure(figsize=(16,8))
plt.style.use("ggplot")
plt.grid(True)
plt.plot(limit_mean.index, limit_mean.values, color="purple", alpha=0.7)  # plot against the actual ages rather than a hardcoded range
plt.title("Average credit limit and Age", color="red", size=22)
plt.xlabel("Age", color="black", alpha=0.8, size=16)
plt.xlim(20,80)
plt.show()
Education Distribution
plt.figure(figsize=(12,6))
sns.countplot(x="EDUCATION", data=df)
plt.xticks(np.arange(7), ["other", "graduate school", "university", "high school",
                          "other1", "other2", "other3"])
plt.xlabel("Education", size=22, alpha=0.7)
plt.show()
Chi-squared test
A chi-squared (χ²) statistic is a measure of the difference between the observed and expected frequencies of the outcomes of a set of events or variables. Chi-squared is useful for analyzing such differences in categorical variables, especially those nominal in nature.
A categorical variable (also called a qualitative variable) is a variable that can take on one of a limited, and usually fixed, number of possible values, assigning each individual or other unit of observation to a particular group or nominal category on the basis of some qualitative property.
A nominal category or nominal group is a group of objects or ideas that can be collectively grouped on the basis of a particular characteristic. For example, SEX in this dataset is nominal: the codes 1 and 2 label groups with no intrinsic order.
With the above concepts in mind, I decided to walk through the chi-squared test using the SEX variable.
The first thing we need is to tabulate the observed values and their totals; the easiest way to do this is with a contingency table.
Then we compute the expected value for each cell: the overall proportion of default (or no default) multiplied by the total count for each sex.
The chi-squared statistic is then the sum over all cells of (observed - expected)^2 / expected.
Degrees of freedom
Degrees of freedom refers to the maximum number of logically independent values, that is, values that are free to vary, in the data sample. For a contingency table it is (number of rows - 1) * (number of columns - 1).
Is a 0.05 P-Value Significant?
A p-value less than 0.05 is typically considered to be statistically significant, in which case the null hypothesis should be rejected. A p-value greater than 0.05 means that deviation from the null hypothesis is not statistically significant, and the null hypothesis is not rejected.
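Before the hand-rolled implementation below, here is a minimal sketch of the same test using pandas and scipy: pd.crosstab builds the observed contingency table, and stats.chi2_contingency returns the statistic, p-value, degrees of freedom and expected table in one call (the full function below also references this routine in a commented-out check).
# Observed contingency table of SEX vs target.
observed_table = pd.crosstab(df['SEX'], df['target'])
# Expected counts under independence are the target ratios times each class size:
# SEX == 1: 0.7788*11888 ≈ 9258.4 and 0.2212*11888 ≈ 2629.6
# SEX == 2: 0.7788*18112 ≈ 14105.6 and 0.2212*18112 ≈ 4006.4
chi2, p, dof, expected_table = stats.chi2_contingency(observed_table, correction=False)
print(observed_table)
print("chi2=%.2f, p=%.2e, dof=%d" % (chi2, p, dof))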
Code
def ChiSquaredTestOfIndependence( df, inputVar, Outcome_Category ):
    # Useful to have this wrapped in a function.
    # The chi-squared test of independence has as null hypothesis:
    # the Outcome_Category is independent of the inputVar.
    # So we create a test statistic which is a measure of the difference between
    # "expected" i.e. what we WOULD observe if the Outcome_Category WERE independent of the inputVar
    # "observed" i.e. what the data actually shows
    # The p-value returned is the probability of seeing this test statistic if the null hypothesis is true.
    Outcome_Category_Table = df.groupby( Outcome_Category )[ Outcome_Category ].count().values
    # Outcome_Category_Table holds the row count for each value of the target;
    # we only have 0 and 1, so:
    # target
    # 0    23364
    # 1     6636
    Outcome_Category_Ratios = Outcome_Category_Table / sum( Outcome_Category_Table )
    # Outcome_Category_Ratios divides each count by the total number of rows (30000):
    # target        Outcome_Category_Ratios
    # 0    23364 |  0.7788
    # 1     6636 |  0.2212
    possibleVals = df[inputVar].unique()
    # e.g. for inputVar = "SEX", possibleVals = [1, 2]
    observed = []
    expected = []
    for possible in possibleVals:
        countsInCategories = df[ df[ inputVar ] == possible ].groupby( Outcome_Category )[Outcome_Category].count().values
        # For each of the two classes of SEX we count how many rows have target 0 or 1:
        # SEX == 1           SEX == 2
        # 0    9015          0    14349
        # 1    2873          1     3763
        if( len(countsInCategories) != len( Outcome_Category_Ratios ) ):
            # Each class of the input variable must contain every value of the target (here both 0 and 1).
            print("Error! The class " + str( possible ) + " of '" + inputVar + "' does not contain all values of '" + Outcome_Category + "'" )
            return
        elif( min(countsInCategories) < 5 ):
            # The chi-squared approximation needs at least 5 observations per cell.
            print("Chi Squared Test needs at least 5 observations in each cell!")
            print( inputVar + "=" + str(possible) + " has insufficient data")
            print( countsInCategories )
            return
        else:
            observed.append( countsInCategories )
            expected.append( Outcome_Category_Ratios * len( df[df[ inputVar ] == possible ]))
            # The expected counts are the overall target ratios (23364/30000 and 6636/30000)
            # multiplied by the size of each class (SEX == 1: 11888 rows, SEX == 2: 18112 rows).
    observed = np.array( observed )
    expected = np.array( expected )
    chi_squared_stat = ((observed - expected)**2 / expected).sum().sum()
    # The chi-squared statistic is the sum over all cells of (observed - expected)^2 / expected.
    degOfF = (observed.shape[0] - 1 ) * (observed.shape[1] - 1 )
    # Degrees of freedom: (number of rows - 1) * (number of columns - 1) of the contingency table.
    # crit = stats.chi2.ppf(q = 0.95, df = degOfF)
    p_value = 1 - stats.chi2.cdf(x=chi_squared_stat, df=degOfF)
    print("Calculated test-statistic is %.2f" % chi_squared_stat )
    print("If " + Outcome_Category + " is indep of " + inputVar + ", this has prob %.2e of occurring" % p_value )
    # t_stat, p_val, doF, expArray = stats.chi2_contingency(observed=observed, correction=False)
    # print("Using built-in stats test: outputs")
    # print("test-statistic=%.2f, p-value=%.2f, degsOfFreedom=%d" % ( t_stat, p_val, doF ) )
ChiSquaredTestOfIndependence( df, "SEX", output )
Calculated test-statistic is 47.91
If target is indep of SEX, this has prob 4.47e-12 of occurring
# Ok. So "default" is not independent of "SEX".
ChiSquaredTestOfIndependence( df, "EDUCATION", output )
Error! The class 0 of 'EDUCATION' does not contain all values of 'target'
The problem with the EDUCATION variable is that for the class EDUCATION == 0, every row has target equal to 0, so that class does not contain both values of the target; one possible workaround is sketched after the table below.
education_class_0 = df[(df["EDUCATION"] == 0)]
education_class_0
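One possible workaround (an assumption on my part, not part of the original analysis) is to fold the undocumented EDUCATION classes 0, 5 and 6 into the catch-all class 4 on a copy of the data, so that every remaining class contains both target values, and then re-run the test:
# Hypothetical workaround: merge the undocumented EDUCATION classes (0, 5, 6)
# into the catch-all class 4 on a copy of the dataframe, so each remaining
# class has both target values, then re-run the test.
df_edu = df.copy()
df_edu['EDUCATION'] = df_edu['EDUCATION'].replace({0: 4, 5: 4, 6: 4})
ChiSquaredTestOfIndependence( df_edu, "EDUCATION", output )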
# The quantitative vars:
quant = ["LIMIT_BAL", "AGE"]
# The qualitative but "Encoded" variables (ie most of them)
qual_Enc = cols.copy()  # copy so the removals below do not mutate cols
qual_Enc.remove("LIMIT_BAL")
qual_Enc.remove("AGE")
logged = []
for ii in range(1,7):
    qual_Enc.remove("PAY_AMT" + str( ii ))
    # log1p compresses the heavy right tail of the payment amounts;
    # the x > 0 guard leaves zero (and any non-positive) amounts at 0
    df[ "log_PAY_AMT" + str( ii )] = df["PAY_AMT" + str( ii )].apply( lambda x: np.log1p(x) if (x>0) else 0 )
    logged.append("log_PAY_AMT" + str( ii ) )
for ii in range(1,7):
    qual_Enc.remove("BILL_AMT" + str( ii ))
    # BILL_AMT can be negative (credit balances), so the same guard maps those to 0
    df[ "log_BILL_AMT" + str( ii )] = df["BILL_AMT" + str( ii )].apply( lambda x: np.log1p(x) if (x>0) else 0 )
    logged.append("log_BILL_AMT" + str( ii ) )
f = pd.melt( df, id_vars=output, value_vars=logged)
g = sns.FacetGrid( f, hue=output, col="variable", col_wrap=3, sharex=False, sharey=False )
g = g.map( sns.distplot, "value", kde=True)
features = quant + qual_Enc + logged
X = df[features].values
y = df[ output ].values
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=0.2)
from sklearn.preprocessing import StandardScaler
scX = StandardScaler()
X_train = scX.fit_transform( X_train )
X_test = scX.transform( X_test )
# We'll need some metrics to evaluate our models
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
#--------------
# Random Forest
#--------------
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators=10)
classifier.fit( X_train, y_train )
y_pred = classifier.predict( X_test )
cm = confusion_matrix( y_test, y_pred )
print("Accuracy on Test Set for RandomForest = %.2f" % ((cm[0,0] + cm[1,1] )/len(X_test)))
scoresRF = cross_val_score( classifier, X_train, y_train, cv=10)
print("Mean RandomForest CrossVal Accuracy on Train Set %.2f, with std=%.2f" % (scoresRF.mean(), scoresRF.std() ))
#--------------
# kernel SVM
#--------------
from sklearn.svm import SVC
classifier1 = SVC(kernel="rbf")
classifier1.fit( X_train, y_train )
y_pred = classifier1.predict( X_test )
cm = confusion_matrix( y_test, y_pred )
print("Accuracy on Test Set for kernel-SVM = %.2f" % ((cm[0,0] + cm[1,1] )/len(X_test)))
scoresSVC = cross_val_score( classifier1, X_train, y_train, cv=10)
print("Mean kernel-SVM CrossVal Accuracy on Train Set %.2f, with std=%.2f" % (scoresSVC.mean(), scoresSVC.std() ))
Accuracy on Test Set for RandomForest = 0.81
Mean RandomForest CrossVal Accuracy on Train Set 0.81, with std=0.00
Accuracy on Test Set for kernel-SVM = 0.83
Mean kernel-SVM CrossVal Accuracy on Train Set 0.82, with std=0.00
We’ll check some of the other classifiers - but we don’t expect they will do better
#--------------
# Logistic Regression
#--------------
from sklearn.linear_model import LogisticRegression
classifier2 = LogisticRegression()
classifier2.fit( X_train, y_train )
y_pred = classifier2.predict( X_test )
cm = confusion_matrix( y_test, y_pred )
print("Accuracy on Test Set for LogReg = %.2f" % ((cm[0,0] + cm[1,1] )/len(X_test)))
scoresLR = cross_val_score( classifier2, X_train, y_train, cv=10)
print("Mean LogReg CrossVal Accuracy on Train Set %.2f, with std=%.2f" % (scoresLR.mean(), scoresLR.std() ))
#--------------
# Naive Bayes
#--------------
from sklearn.naive_bayes import GaussianNB
classifier3 = GaussianNB()
classifier3.fit( X_train, y_train )
y_pred = classifier3.predict( X_test )
cm = confusion_matrix( y_test, y_pred )
print("Accuracy on Test Set for NBClassifier = %.2f" % ((cm[0,0] + cm[1,1] )/len(X_test)))
scoresNB = cross_val_score( classifier3, X_train, y_train, cv=10)
print("Mean NaiveBayes CrossVal Accuracy on Train Set %.2f, with std=%.2f" % (scoresNB.mean(), scoresNB.std() ))
#--------------
# K-NEIGHBOURS
#--------------
from sklearn.neighbors import KNeighborsClassifier
classifier4 = KNeighborsClassifier(n_neighbors=5)
classifier4.fit( X_train, y_train )
y_pred = classifier4.predict( X_test )
cm = confusion_matrix( y_test, y_pred )
print("Accuracy on Test Set for KNeighborsClassifier = %.2f" % ((cm[0,0] + cm[1,1] )/len(X_test)))
scoresKN = cross_val_score( classifier4, X_train, y_train, cv=10)
print("Mean KN CrossVal Accuracy on Train Set %.2f, with std=%.2f" % (scoresKN.mean(), scoresKN.std() ))
Accuracy on Test Set for LogReg = 0.81
Mean LogReg CrossVal Accuracy on Train Set 0.81, with std=0.01
Accuracy on Test Set for NBClassifier = 0.77
Mean NaiveBayes CrossVal Accuracy on Train Set 0.76, with std=0.01
Accuracy on Test Set for KNeighborsClassifier = 0.80
Mean KN CrossVal Accuracy on Train Set 0.76, with std=0.01
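One caveat when reading these accuracies: the classes are imbalanced. Roughly 78% of clients do not default (the target ratios computed earlier), so a model that always predicts "no default" already reaches about 0.78 accuracy. Here is a minimal sketch of that baseline, plus per-class metrics for the random forest fitted above (accuracy_score and classification_report are standard scikit-learn metrics; exact numbers will vary with the random train/test split):
# Majority-class baseline: always predict "no default" (target = 0).
# With ~78% of clients not defaulting, this alone scores ~0.78, so the
# ~0.81 accuracies above are only modestly better.
from sklearn.metrics import accuracy_score, classification_report
y_majority = np.zeros_like(y_test)
print("Majority-class baseline accuracy = %.2f" % accuracy_score(y_test, y_majority))
# Per-class precision/recall for the random forest fitted above;
# accuracy alone hides how well the minority (default) class is caught.
print(classification_report(y_test, classifier.predict(X_test)))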