The goal of this page is to explore how different machine algorithms work for predicting the default probability of a credit position (loan).
The datasets that are being used are
The following machine learning algorithm are used:
Before building the model, please note the following limitations
library(sqldf)
library(Information)
library(gridExtra)
library(kableExtra)
library(ggplot2)
library(reshape2)
library(pROC)
library(scorecard)
library(rpart)
library(gbm)
library(plyr)
The bar-plot below shows the percentage of defaults (label=1) compared to the percentage of non-defaults (label=0).
par(mfrow=c(1,2))
barplot(prop.table(table(tran[,"TARGET"])), main = "default flag ratio")
# Pie Chart with Percentages
target_pct <- sqldf("select target, count(*) as target_count from tran group by 1 order by 1 desc")
slices <- target_pct[,'target_count']
lbls <- target_pct[,'TARGET']
pct <- round(slices/sum(slices)*100)
lbls <- paste(c("deft.","perform"), pct) # add percents to labels
lbls <- paste(lbls,"%",sep="") # ad % to labels
pie(slices,labels = lbls, col=rainbow(length(lbls)),
main="Default percentage")
Since the algorithm selected are not sensitive to outliers and missing values. There is not too much emphasis on the data treatment on these issues. The missing value will be analyzed in the data exploration step.
# 365243 should be considered as NA
tran[which(tran[,'DAYS_EMPLOYED']==365243),'DAYS_EMPLOYED'] <- NA
tran[which(tran[,'DAYS_BIRTH']==365243),'DAYS_BIRTH'] <- NA
#quantile(tran[,"AMT_INCOME_TOTAL"], probs = c(0.9,0.99))
mean_income <- mean(tran[,"AMT_INCOME_TOTAL"], na.rm = TRUE)
sd_income <- sd(tran[,"AMT_INCOME_TOTAL"], na.rm = TRUE)
## winsoring
upper_bound <- mean_income+3*sd_income
outlier <- nrow((tran[which(tran[,"AMT_INCOME_TOTAL"]>upper_bound),]))/nrow(tran)
tran$AMT_INCOME_TOTAL[tran$AMT_INCOME_TOTAL>upper_bound] <- upper_bound
cat(paste0("income average: ", round(mean_income), "\n", "income standard deviation: ", round(sd_income),"\n","percentage of income are winsorized: ", outlier,"\n"))
## income average: 164856
## income standard deviation: 79216
## percentage of income are winsorized: 0.0260543525272267
tran$VTL <- tran$AMT_GOODS_PRICE/tran$AMT_CREDIT
tran$ITL <- tran$AMT_INCOME_TOTAL/tran$AMT_CREDIT
tran$ITA <- tran$AMT_INCOME_TOTAL/tran$AMT_ANNUITY
Derive features from credit bureau data
library(sqldf)
bureau_groupbyID <- sqldf("
select SK_ID_CURR
, sum(case when DAYS_CREDIT > -365 then 1 else 0 end) as b_CreditInquiry12M
, sum(case when DAYS_CREDIT > -730 then 1 else 0 end) as b_CreditInquiry24M
, count(*) as b_CreditInquiryAll
, sum(case when CREDIT_ACTIVE='Active' then AMT_CREDIT_SUM else 0 end) as b_AMT_CREDIT_SUM
, sum(case when CREDIT_ACTIVE='Active' then AMT_CREDIT_SUM_DEBT else 0 end) as b_AMT_CREDIT_SUM_DEBT
, sum(case when CREDIT_ACTIVE='Active' then AMT_ANNUITY else 0 end) as b_AMT_ANNUITY
, sum(case when CREDIT_ACTIVE='Active' then AMT_CREDIT_SUM_LIMIT else 0 end) as b_AMT_CREDIT_SUM_LIMIT
from bureau
group by 1")
Derive features from credit card balance
library(sqldf)
## combined card information if 1 customer has more than 1 credit card
CardPerPerson <- sqldf("
select SK_ID_CURR, MONTHS_BALANCE
, sum(AMT_BALANCE) as AMT_BALANCE
, sum(AMT_CREDIT_LIMIT_ACTUAL) as AMT_CREDIT_LIMIT_ACTUAL
, max(SK_DPD) as SK_DPD
from card
group by 1,2")
CardFeatures <- sqldf("
select SK_ID_CURR
, avg(AMT_BALANCE) as c_AMT_BALANCE_mean
, stdev(AMT_BALANCE) as c_AMT_BALANCE_sd
, avg(AMT_CREDIT_LIMIT_ACTUAL) as c_AMT_LIMIT_mean
, stdev(AMT_CREDIT_LIMIT_ACTUAL) as c_AMT_LIMIT_sd
, avg(AMT_BALANCE/AMT_CREDIT_LIMIT_ACTUAL) c_utilization_mean
, stdev(AMT_BALANCE/AMT_CREDIT_LIMIT_ACTUAL) c_utilization_sd
, max(SK_DPD) as c_SK_DPD_max
, max(MONTHS_BALANCE) as last_month
from CardPerPerson
group by 1
")
CardFeatures <- sqldf("
select
a.*
, (b.AMT_BALANCE/b.AMT_CREDIT_LIMIT_ACTUAL-a.c_utilization_mean)/a.c_utilization_sd as c_last_utl_zscore
from CardFeatures a
left join CardPerPerson b
on a.last_month = b.MONTHS_BALANCE
and a.SK_ID_CURR = b.SK_ID_CURR
")
Combine the features of credit bureau & credit card to the applicatino dataset
tran_ext <- sqldf("
select
a.*
, b.b_CreditInquiry12M
, b.b_CreditInquiry24M
, b.b_CreditInquiryAll
, b.b_AMT_CREDIT_SUM
, b.b_AMT_CREDIT_SUM_DEBT
, b.b_AMT_ANNUITY
, b.b_AMT_CREDIT_SUM_LIMIT
, c.c_AMT_BALANCE_mean
, c.c_AMT_BALANCE_sd
, c.c_AMT_LIMIT_mean
, c.c_AMT_LIMIT_sd
, c.c_utilization_mean
, c.c_utilization_sd
, c.c_last_utl_zscore
, c.c_SK_DPD_max
from tran a
left join bureau_groupbyID b
on a.SK_ID_CURR=b.SK_ID_CURR
left join CardFeatures c
on a.SK_ID_CURR=c.SK_ID_CURR")
Derive additional features
tran_ext$CreditIncreaseMult <- tran_ext$b_AMT_CREDIT_SUM/tran_ext$AMT_CREDIT
tran_ext$PaymentStress <- (tran_ext$b_AMT_ANNUITY + tran_ext$AMT_ANNUITY)/tran_ext$AMT_INCOME_TOTAL
tran_ext$InquiryRatio12M <- tran_ext$b_CreditInquiry12M/tran_ext$b_CreditInquiryAll
tran_ext$DaysEmployedRatio <- tran_ext$DAYS_EMPLOYED/tran_ext$DAYS_BIRTH
tran_ext$IncomeToDaysEmployed <- tran_ext$AMT_INCOME_TOTAL/tran_ext$DAYS_EMPLOYED
tran_ext$IncomePerPerson <- tran_ext$AMT_INCOME_TOTAL/tran_ext$CNT_FAM_MEMBERS
Data treatment again
tran_ext$c_utilization_mean[tran_ext$c_utilization_mean>1] <- 1
missing_analysis <- as.data.frame(sapply(tran_ext[tran_train,], function(x) sum(is.na(x))))
missing_analysis$nrows <- nrow(tran_ext[tran_train,])
colnames(missing_analysis) <- c("missing","nrows")
missing_analysis$missing_rate <- missing_analysis$missing/missing_analysis$nrows
missing_analysis[which(missing_analysis$missing_rate>0.5),]
## missing nrows missing_rate
## OWN_CAR_AGE 135213 205007 0.6595531
## EXT_SOURCE_1 115771 205007 0.5647173
## APARTMENTS_AVG 104085 205007 0.5077144
## BASEMENTAREA_AVG 119930 205007 0.5850044
## YEARS_BUILD_AVG 136372 205007 0.6652066
## COMMONAREA_AVG 143216 205007 0.6985908
## ELEVATORS_AVG 109222 205007 0.5327721
## ENTRANCES_AVG 103259 205007 0.5036852
## FLOORSMIN_AVG 139125 205007 0.6786354
## LANDAREA_AVG 121763 205007 0.5939456
## LIVINGAPARTMENTS_AVG 140222 205007 0.6839864
## LIVINGAREA_AVG 102902 205007 0.5019438
## NONLIVINGAPARTMENTS_AVG 142395 205007 0.6945860
## NONLIVINGAREA_AVG 113129 205007 0.5518299
## APARTMENTS_MODE 104085 205007 0.5077144
## BASEMENTAREA_MODE 119930 205007 0.5850044
## YEARS_BUILD_MODE 136372 205007 0.6652066
## COMMONAREA_MODE 143216 205007 0.6985908
## ELEVATORS_MODE 109222 205007 0.5327721
## ENTRANCES_MODE 103259 205007 0.5036852
## FLOORSMIN_MODE 139125 205007 0.6786354
## LANDAREA_MODE 121763 205007 0.5939456
## LIVINGAPARTMENTS_MODE 140222 205007 0.6839864
## LIVINGAREA_MODE 102902 205007 0.5019438
## NONLIVINGAPARTMENTS_MODE 142395 205007 0.6945860
## NONLIVINGAREA_MODE 113129 205007 0.5518299
## APARTMENTS_MEDI 104085 205007 0.5077144
## BASEMENTAREA_MEDI 119930 205007 0.5850044
## YEARS_BUILD_MEDI 136372 205007 0.6652066
## COMMONAREA_MEDI 143216 205007 0.6985908
## ELEVATORS_MEDI 109222 205007 0.5327721
## ENTRANCES_MEDI 103259 205007 0.5036852
## FLOORSMIN_MEDI 139125 205007 0.6786354
## LANDAREA_MEDI 121763 205007 0.5939456
## LIVINGAPARTMENTS_MEDI 140222 205007 0.6839864
## LIVINGAREA_MEDI 102902 205007 0.5019438
## NONLIVINGAPARTMENTS_MEDI 142395 205007 0.6945860
## NONLIVINGAREA_MEDI 113129 205007 0.5518299
## AMT_INCOME_TOTAL" 204703 205007 0.9985171
## c_AMT_BALANCE_mean 147291 205007 0.7184681
## c_AMT_BALANCE_sd 147706 205007 0.7204925
## c_AMT_LIMIT_mean 147291 205007 0.7184681
## c_AMT_LIMIT_sd 147706 205007 0.7204925
## c_utilization_mean 147856 205007 0.7212242
## c_utilization_sd 148291 205007 0.7233460
## c_last_utl_zscore 175594 205007 0.8565269
## c_SK_DPD_max 147291 205007 0.7184681
barplot(prop.table(table(tran[tran_train,"OCCUPATION_TYPE"])), horiz=T , las=1, main='Occupation type',cex.names=.5)
barplot_IncomeType <- ggplot(tran_ext[tran_train,], aes(x=factor(NAME_INCOME_TYPE)),)+
geom_bar(aes(y = (..count..)/sum(..count..)))+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1))
barplot_EduType <- ggplot(tran_ext[tran_train,], aes(x=factor(NAME_EDUCATION_TYPE)),)+
geom_bar(aes(y = (..count..)/sum(..count..)))+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1))
grid.arrange(barplot_IncomeType, barplot_EduType, nrow = 1)
par(mfrow=c(1,2))
hist(tran[tran_train,"DAYS_EMPLOYED"], main='Days employed', xlab = 'Days employed')
hist(tran[tran_train,"DAYS_BIRTH"], main='Days birth', xlab = 'Days birth')
par(mfrow=c(2,2))
hist(tran[tran_train,"EXT_SOURCE_1"], main='External source 1')
hist(tran[tran_train,"EXT_SOURCE_2"], main='External source 2')
hist(tran[tran_train,"EXT_SOURCE_3"], main='External source 3')
tran[which(tran[,"VTL"]>2),"VTL"] <- NA
par(mfrow=c(2,2))
hist(tran[tran_train,"AMT_INCOME_TOTAL"], main='Income')
hist(tran[tran_train,"AMT_CREDIT"], main='Loan amount')
hist(tran[tran_train,"AMT_GOODS_PRICE"], main="Goods price")
hist(tran[tran_train,"VTL"], main="Goods price to loan amount ratio", xlim=c(0.5,1), breaks = 35)
Variable | IV |
---|---|
EXT_SOURCE_3 | 0.3323778 |
EXT_SOURCE_2 | 0.3110848 |
EXT_SOURCE_1 | 0.1541127 |
DAYS_EMPLOYED | 0.1104089 |
InquiryRatio12M | 0.0977675 |
DaysEmployedRatio | 0.0890180 |
b_CreditInquiry12M | 0.0862857 |
OCCUPATION_TYPE | 0.0831829 |
DAYS_BIRTH | 0.0823114 |
IncomeToDaysEmployed | 0.0766828 |
Variable | IV | |
---|---|---|
11 | VTL | 0.0762401 |
12 | ORGANIZATION_TYPE | 0.0727611 |
13 | b_CreditInquiry24M | 0.0672627 |
14 | c_utilization_mean | 0.0627193 |
15 | NAME_INCOME_TYPE | 0.0547894 |
16 | AMT_GOODS_PRICE | 0.0546221 |
17 | NAME_EDUCATION_TYPE | 0.0521986 |
18 | REGION_RATING_CLIENT_W_CITY | 0.0508815 |
19 | REGION_RATING_CLIENT | 0.0481477 |
20 | AMT_CREDIT | 0.0478144 |
library(plyr)
library(ggplot2)
library(gridExtra)
tran_ext$TARGET_char <- as.character(tran_ext$TARGET)
mu_EXT_SOURCE_3 <- ddply(tran_ext, "TARGET_char", summarise, grp.mean=mean(EXT_SOURCE_3, na.rm = TRUE))
mu_DAYS_EMPLOYED <- ddply(tran_ext, "TARGET_char", summarise, grp.mean=mean(DAYS_EMPLOYED, na.rm = TRUE))
mu_b_CreditInquiry12M <- ddply(tran_ext, "TARGET_char", summarise, grp.mean=mean(b_CreditInquiry12M, na.rm = TRUE))
mu_c_utilization_mean <- ddply(tran_ext, "TARGET_char", summarise, grp.mean=mean(c_utilization_mean, na.rm = TRUE))
p_EXT_SOURCE_3 <- ggplot(tran_ext[tran_train,], aes(x=EXT_SOURCE_3, fill=TARGET_char, color=TARGET_char)) +
geom_histogram(aes(y = ..density..), position="identity", alpha=0.5,na.rm = TRUE) +
scale_color_manual(values=c('blue','red'))+
scale_fill_manual(values=c('blue','red'))+
theme(legend.position="bottom")+
geom_vline(data=mu_EXT_SOURCE_3, aes(xintercept=grp.mean, color=TARGET_char),
linetype="dashed")
p_DAYS_EMPLOYED <- ggplot(tran_ext[tran_train,], aes(x=DAYS_EMPLOYED, fill=TARGET_char, color=TARGET_char)) +
geom_histogram(aes(y = ..density..), position="identity", alpha=0.5,na.rm = TRUE) +
xlim(c(-15000, 0))+
scale_color_manual(values=c('blue','red'))+
scale_fill_manual(values=c('blue','red'))+
theme(legend.position="bottom")+
geom_vline(data=mu_DAYS_EMPLOYED, aes(xintercept=grp.mean, color=TARGET_char),
linetype="dashed")
p_b_CreditInquiry12M <- ggplot(tran_ext[tran_train,], aes(x=b_CreditInquiry12M, fill=TARGET_char, color=TARGET_char)) +
geom_histogram(aes(y = ..density..), position="identity", alpha=0.5,na.rm = TRUE) +
xlim(c(0, 10))+
scale_color_manual(values=c('blue','red'))+
scale_fill_manual(values=c('blue','red'))+
theme(legend.position="bottom")+
geom_vline(data=mu_b_CreditInquiry12M, aes(xintercept=grp.mean, color=TARGET_char),
linetype="dashed")
p_c_utilization_mean <- ggplot(tran_ext[tran_train,], aes(x=c_utilization_mean, fill=TARGET_char, color=TARGET_char)) +
geom_histogram(aes(y = ..density..), position="identity", alpha=0.5, na.rm = TRUE) +
scale_color_manual(values=c('blue','red'))+
scale_fill_manual(values=c('blue','red'))+
theme(legend.position="bottom")+
geom_vline(data=mu_c_utilization_mean, aes(xintercept=grp.mean, color=TARGET_char),
linetype="dashed")
grid.arrange(p_EXT_SOURCE_3, p_DAYS_EMPLOYED, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
grid.arrange(p_b_CreditInquiry12M, p_c_utilization_mean, nrow = 1)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
To avoid including strongly correlated variables in the explanatory variable set, the following correlation analysis is performed.
library(ggplot2)
library(reshape2)
corX <- c('EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','DAYS_BIRTH','DAYS_EMPLOYED','IncomeToDaysEmployed','DaysEmployedRatio','InquiryRatio12M','b_CreditInquiry12M','b_CreditInquiry24M','b_AMT_CREDIT_SUM_DEBT','b_AMT_CREDIT_SUM','CreditIncreaseMult','AMT_CREDIT','AMT_GOODS_PRICE','PaymentStress','b_AMT_ANNUITY','c_utilization_mean','c_AMT_BALANCE_mean','c_last_utl_zscore')
### Get lower triangle of the correlation matrix
get_lower_tri <- function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
### Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
corX_matrix <- cor(tran_ext[tran_train,corX],method="spearman",use='pairwise.complete.obs')
upper_tri <- round(abs(get_upper_tri(corX_matrix)),2)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
### Heatmap
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
geom_text(aes(label = value), color = 'black', size = 2) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(0,1), space = "Lab",
name="Spearman\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1))+
coord_fixed()
Correlation of the building information
library(ggplot2)
library(reshape2)
corX <- c("ELEVATORS_AVG","FLOORSMAX_AVG","LIVINGAREA_AVG","APARTMENTS_MODE","ELEVATORS_MODE","FLOORSMAX_MODE","LIVINGAREA_MODE","APARTMENTS_MEDI","BASEMENTAREA_MEDI","ELEVATORS_MEDI","FLOORSMAX_MEDI","FLOORSMIN_MEDI","LIVINGAREA_MEDI","TOTALAREA_MODE","REGION_POPULATION_RELATIVE")
### Get lower triangle of the correlation matrix
get_lower_tri <- function(cormat){
cormat[upper.tri(cormat)] <- NA
return(cormat)
}
### Get upper triangle of the correlation matrix
get_upper_tri <- function(cormat){
cormat[lower.tri(cormat)]<- NA
return(cormat)
}
corX_matrix <- cor(tran_ext[tran_train,corX],method="spearman",use='pairwise.complete.obs')
upper_tri <- round(abs(get_upper_tri(corX_matrix)),2)
melted_cormat <- melt(upper_tri, na.rm = TRUE)
### Heatmap
ggplot(data = melted_cormat, aes(Var2, Var1, fill = value))+
geom_tile(color = "white")+
geom_text(aes(label = value), color = 'black', size = 2) +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(0,1), space = "Lab",
name="Spearman\nCorrelation") +
theme_minimal()+
theme(axis.text.x = element_text(angle = 45, vjust = 1,
size = 8, hjust = 1))+
coord_fixed()
Variables that are related to the loan amount (credit) or the payment (strategy) can be strategy-driven and thus are not ideal variables for predicting the default risk. For example, if the company’s lending costs or operational costs have decreased and therefore can take on more risk and increase the credit it is extending. This change does not directly provide the casual effect on the default risk of the client.
The explanatory variables are transformed to WOE before serving as the independent variables for the logistic regression. Highly correlated variables are excluded from the previous step.
library(scorecard)
vars_logistic <- c('EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3','InquiryRatio12M','DAYS_BIRTH','DAYS_EMPLOYED','OCCUPATION_TYPE','VTL','ORGANIZATION_TYPE','NAME_INCOME_TYPE','AMT_GOODS_PRICE','NAME_EDUCATION_TYPE','c_utilization_mean','c_last_utl_zscore','c_AMT_BALANCE_mean','APARTMENTS_AVG','b_CreditInquiry24M','REGION_RATING_CLIENT_W_CITY','b_AMT_CREDIT_SUM_DEBT','DAYS_LAST_PHONE_CHANGE','PaymentStress','HOUSETYPE_MODE','NAME_FAMILY_STATUS','b_AMT_ANNUITY','OWN_CAR_AGE','REG_CITY_NOT_LIVE_CITY','DAYS_REGISTRATION','CODE_GENDER','FLAG_EMP_PHONE','FLAG_DOCUMENT_3','REGION_POPULATION_RELATIVE','FLOORSMAX_AVG')
## remove insignificant var after 1st run
vars_logistic <- setdiff(vars_logistic, c('REGION_POPULATION_RELATIVE','c_AMT_BALANCE_mean','DAYS_BIRTH'))
## the binning & woe is created using training set and will later be applied to testing set to create clean out-of-sample testing
woes <- woebin(dt = tran_ext[tran_train,c('TARGET',vars_logistic)], y='TARGET')
var_woes <- woebin_ply(dt = tran_ext[tran_train,c('TARGET',vars_logistic)], bins = woes, to = "woe", no_cores = 2, print_step = 0L)
model <- glm(TARGET ~.,family=binomial(link='logit'),data=var_woes)
summary(model)
##
## Call:
## glm(formula = TARGET ~ ., family = binomial(link = "logit"),
## data = var_woes)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6299 -0.4359 -0.3131 -0.2202 3.3107
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.430564 0.008961 -271.227 < 2e-16 ***
## EXT_SOURCE_1_woe 0.479147 0.024550 19.517 < 2e-16 ***
## EXT_SOURCE_2_woe 0.728796 0.015955 45.679 < 2e-16 ***
## EXT_SOURCE_3_woe 0.770233 0.016808 45.826 < 2e-16 ***
## InquiryRatio12M_woe 0.069014 0.031846 2.167 0.030229 *
## DAYS_BIRTH_woe -0.069393 0.041428 -1.675 0.093930 .
## DAYS_EMPLOYED_woe 0.414018 0.033864 12.226 < 2e-16 ***
## OCCUPATION_TYPE_woe 0.160043 0.041346 3.871 0.000108 ***
## VTL_woe 0.600433 0.031978 18.776 < 2e-16 ***
## ORGANIZATION_TYPE_woe 0.506005 0.066550 7.603 2.88e-14 ***
## NAME_INCOME_TYPE_woe 0.302260 0.058245 5.189 2.11e-07 ***
## AMT_GOODS_PRICE_woe 0.419989 0.038684 10.857 < 2e-16 ***
## NAME_EDUCATION_TYPE_woe 0.521757 0.041901 12.452 < 2e-16 ***
## c_utilization_mean_woe 0.480150 0.040371 11.893 < 2e-16 ***
## c_last_utl_zscore_woe 0.405535 0.046363 8.747 < 2e-16 ***
## b_CreditInquiry24M_woe 0.201058 0.039688 5.066 4.06e-07 ***
## REGION_RATING_CLIENT_W_CITY_woe 0.292788 0.040141 7.294 3.01e-13 ***
## b_AMT_CREDIT_SUM_DEBT_woe 0.254989 0.055329 4.609 4.05e-06 ***
## DAYS_LAST_PHONE_CHANGE_woe 0.222899 0.043743 5.096 3.48e-07 ***
## PaymentStress_woe 0.541614 0.094237 5.747 9.07e-09 ***
## HOUSETYPE_MODE_woe 0.225099 0.061550 3.657 0.000255 ***
## NAME_FAMILY_STATUS_woe 0.311129 0.056541 5.503 3.74e-08 ***
## b_AMT_ANNUITY_woe -0.485916 0.087837 -5.532 3.17e-08 ***
## OWN_CAR_AGE_woe 0.657851 0.061458 10.704 < 2e-16 ***
## REG_CITY_NOT_LIVE_CITY_woe 0.202742 0.059082 3.432 0.000600 ***
## DAYS_REGISTRATION_woe 0.291276 0.055589 5.240 1.61e-07 ***
## CODE_GENDER_woe 0.663793 0.045206 14.684 < 2e-16 ***
## FLAG_EMP_PHONE_woe -0.844449 0.105900 -7.974 1.54e-15 ***
## FLAG_DOCUMENT_3_woe 0.538970 0.057870 9.313 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 115339 on 205006 degrees of freedom
## Residual deviance: 102409 on 204978 degrees of freedom
## AIC: 102467
##
## Number of Fisher Scoring iterations: 6
auc_logistic_train <- auc(tran_ext[tran_train,"TARGET"], predicted_logistic)
auc_logistic_test <- auc(tran_ext[tran_test,"TARGET"], predicted_logistic_test)
cat(paste0("Training AUC: ", auc_logistic_train,"\n", "Testing AUC: ", auc_logistic_test,"\n"))
## Training AUC: 0.748192595705134
## Testing AUC: 0.746169061745828
library(rpart)
predicted_tree_train <- predict(mytree, tran[tran_train,], type ="prob")
auc_tree_train <- auc(tran[tran_train,"TARGET"], predicted_tree_train[,'1'])
predicted_tree_test <- predict(mytree, tran[tran_test,], type ="prob")
auc_tree_test <- auc(tran[tran_test,"TARGET"], predicted_tree_test[,'1'])
cat(paste0("Training AUC: ", auc_tree_train,"\n", "Testing AUC: ", auc_tree_test,"\n"))
## Training AUC: 0.68065833438218
## Testing AUC: 0.669119187117814
library(gbm)
gbm_tree <- gbm.fit <- gbm(
formula = TARGET ~ .,
distribution = "bernoulli",
data = tran_ext[tran_train,],
n.minobsinnode=30,
n.trees = 1000,
interaction.depth = 2,
shrinkage = 0.005, # learning rate
n.cores = NULL, # will use all cores by default
verbose = TRUE
)
library(gbm)
predicted_gbm_train <- predict(gbm_tree, n.trees = gbm_tree$n.trees, tran_ext[tran_train,], type="response")
auc_gbm_train <- auc(tran_ext[tran_train,"TARGET"], predicted_gbm_train)
predicted_gbm_test <- predict(gbm_tree, n.trees = gbm_tree$n.trees, tran_ext[tran_test,], type="response")
auc_gbm_test <- auc(tran_ext[tran_test,"TARGET"], predicted_gbm_test)
cat(paste0("Training AUC: ", auc_gbm_train,"\n", "Testing AUC: ", auc_gbm_test,"\n"))
## Training AUC: 0.74399412015478
## Testing AUC: 0.738960864584719
to be continued…