Chapter 4 Build Final Scorecard Model

Data Binning and WOE calculation

comb <- comb_hard # Select which data set you want to use from above techniques #

set.seed(12345)
train_id <- sample(seq_len(nrow(comb)), size = floor(0.7*nrow(comb)))

train <- comb[train_id, ]
test <- comb[-train_id, ]

## categorical variable -> level<10, or
col_unique<-lapply(lapply(train,unique),length)
catag_variable<-names(col_unique[col_unique<=10])

#2. type=character 
chara_type<-lapply(train,typeof)
chara_names<-names(chara_type[chara_type=="character"])
catag_variable<-unique(c(chara_names,catag_variable))
catag_variable<-subset(catag_variable,!(catag_variable%in%c("good")))

#continuous variable (not categorical)
conti_variable<-names(train)
conti_variable<-subset(conti_variable,!(conti_variable%in%catag_variable))

# factorize both train and the test
train[,catag_variable]=lapply(train[,catag_variable],as.factor)
#str(train)

test[,catag_variable]=lapply(test[,catag_variable],as.factor)
#str(test)

# Binning continuous variable 
result_con <- list()
for(i in 1:length(conti_variable)){
  result_con[[conti_variable[i]]] <- smbinning(df = train, y = "good", x = conti_variable[i])
}

smbinning.sumiv.plot(iv_summary)

key_variable<-iv_summary$Char[iv_summary$IV>=0.1&is.na(iv_summary$IV)==FALSE]

results<-c(result_con)
result_all_sig<-results[key_variable]

for(i in c(1,4)) {
  train <- smbinning.gen(df = train, ivout = result_all_sig[[i]], chrname = paste(result_all_sig[[i]]$x, "_bin", sep = ""))
}

for (j in c(1,4)) {
  for (i in 1:nrow(train)) {
    bin_name <- paste(result_all_sig[[j]]$x, "_bin", sep = "")
    bin <- substr(train[[bin_name]][i], 2, 2)
    
    woe_name <- paste(result_all_sig[[j]]$x, "_WOE", sep = "")
    
    if(bin == 0) {
      bin <- dim(result_all_sig[[j]]$ivtable)[1] - 1
      train[[woe_name]][i] <- result_all_sig[[j]]$ivtable[bin, "WoE"]
    } else {
      train[[woe_name]][i] <- result_all_sig[[j]]$ivtable[bin, "WoE"]
    }
  }
}

#Below is useful for checking data cleaning process
#lapply(lapply(train[,key_variable[c(2,3,5)]],is.na),sum)

# calculate the WOE
train$good<-as.factor(train$good)
woemodel <- woe(good~., data = train, zeroadj=0.005, applyontrain = TRUE)

## At least one empty cell (class x level) does exists. Zero adjustment applied!
## At least one empty cell (class x level) does exists. Zero adjustment applied!
## At least one empty cell (class x level) does exists. Zero adjustment applied!
## At least one empty cell (class x level) does exists. Zero adjustment applied!
## At least one empty cell (class x level) does exists. Zero adjustment applied!
## At least one empty cell (class x level) does exists. Zero adjustment applied!
## At least one empty cell (class x level) does exists. Zero adjustment applied!

## apply woes 
traindata <- predict(woemodel, train, replace = TRUE)

## No woe model for variable(s): good

#str(traindata)
train=cbind(train,traindata[,c("woe_CARDS","woe_PERS_H","woe_EC_CARD")])

############################## mapling table for the categorical woe ##############################

cate1=unique(train[,c("CARDS","woe_CARDS")])
cate2=unique(train[,c("PERS_H","woe_PERS_H")])
cate3=unique(train[,c("EC_CARD","woe_EC_CARD")])

####################################################################################################

train$weight_ar<-as.numeric(as.character(train$weight_ar))

4.1 Build the logistic regression and variable selection

initial_score <- glm(data = train, GB ~  
                       TMJOB1_WOE + INCOME_WOE+
                        woe_CARDS+woe_PERS_H+woe_EC_CARD
                     , weights =weight_ar,family = "binomial")

summary(initial_score)

## 
## Call:
## glm(formula = GB ~ TMJOB1_WOE + INCOME_WOE + woe_CARDS + woe_PERS_H + 
##     woe_EC_CARD, family = "binomial", data = train, weights = weight_ar)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -5.115  -1.151   1.997   2.662   4.427  
## 
## Coefficients:
##             Estimate Std. Error  z value             Pr(>|z|)    
## (Intercept) -3.19686    0.02395 -133.491 < 0.0000000000000002 ***
## TMJOB1_WOE  -0.87136    0.03355  -25.970 < 0.0000000000000002 ***
## INCOME_WOE  -0.20747    0.05761   -3.601             0.000317 ***
## woe_CARDS    1.04236    0.12078    8.630 < 0.0000000000000002 ***
## woe_PERS_H   0.79047    0.03708   21.318 < 0.0000000000000002 ***
## woe_EC_CARD -0.18214    0.13127   -1.388             0.165270    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19394  on 3149  degrees of freedom
## Residual deviance: 17138  on 3144  degrees of freedom
## AIC: 20286
## 
## Number of Fisher Scoring iterations: 6

Variable Selected Logistic Regression

initial_score_red <- glm(data = train, GB ~  
                           TMJOB1_WOE + 
                           woe_CARDS+woe_PERS_H
                         , weights =weight_ar,family = "binomial")

summary(initial_score_red)

## 
## Call:
## glm(formula = GB ~ TMJOB1_WOE + woe_CARDS + woe_PERS_H, family = "binomial", 
##     data = train, weights = weight_ar)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.956  -1.162   1.992   2.663   4.365  
## 
## Coefficients:
##             Estimate Std. Error z value            Pr(>|z|)    
## (Intercept) -3.19627    0.02391 -133.66 <0.0000000000000002 ***
## TMJOB1_WOE  -0.89123    0.03301  -27.00 <0.0000000000000002 ***
## woe_CARDS    1.02451    0.04100   24.99 <0.0000000000000002 ***
## woe_PERS_H   0.80044    0.03724   21.50 <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 19394  on 3149  degrees of freedom
## Residual deviance: 17151  on 3146  degrees of freedom
## AIC: 20293
## 
## Number of Fisher Scoring iterations: 6

4.2 Evaluate the Initial Model

Training Data
1. KS - > best cut off 0.04327352
2. ROC

train$pred=predict(initial_score_red,data=train,type = "response")

#train[is.na(train$weight_ar),]

train$GB<-as.numeric(as.character(train$GB))
train$good<-as.numeric(as.character(train$good))
smbinning.metrics(dataset = train, prediction = "pred", actualclass = "GB", report = 1)

## 
##   Overall Performance Metrics 
##   -------------------------------------------------- 
##                     KS : 0.4908 (Good)
##                    AUC : 0.7948 (Fair)
## 
##   Classification Matrix 
##   -------------------------------------------------- 
##            Cutoff (>=) : 0.0433 (Optimal)
##    True Positives (TP) : 1345
##   False Positives (FP) : 380
##   False Negatives (FN) : 415
##    True Negatives (TN) : 1010
##    Total Positives (P) : 1760
##    Total Negatives (N) : 1390
## 
##   Business/Performance Metrics 
##   -------------------------------------------------- 
##       %Records>=Cutoff : 0.5476
##              Good Rate : 0.7797 (Vs 0.5587 Overall)
##               Bad Rate : 0.2203 (Vs 0.4413 Overall)
##         Accuracy (ACC) : 0.7476
##      Sensitivity (TPR) : 0.7642
##  False Neg. Rate (FNR) : 0.2358
##  False Pos. Rate (FPR) : 0.2734
##      Specificity (TNR) : 0.7266
##        Precision (PPV) : 0.7797
##   False Discovery Rate : 0.2203
##     False Omision Rate : 0.2912
##   Inv. Precision (NPV) : 0.7088
## 
##   Note: 0 rows deleted due to missing data.

smbinning.metrics(dataset = train[train$pred<=0.4,], prediction = "pred", actualclass = "GB", report = 0, plot = "ks")

smbinning.metrics(dataset = train, prediction = "pred", actualclass = "GB", report = 0, plot = "auc")

pred<-prediction(fitted(initial_score_red),factor(train$GB))
perf<-performance(pred,measure="tpr",x.measure="fpr")
plot(perf,lwd=3,colorsize=TRUE,colorkey=TRUE,colorsize.palette=rev(gray.colors(256)))

KS<-max(perf@y.values[[1]]-perf@x.values[[1]])
cutoffAtKS<-unlist(perf@alpha.values)[which.max(perf@y.values[[1]]-perf@x.values[[1]])]
print(c(KS,cutoffAtKS))

## [1] 0.49082325 0.04327352

Testing Data

test <- comb[-train_id, ]
test[,catag_variable]=lapply(test[,catag_variable],as.factor)
str(test)

## 'data.frame':    1350 obs. of  21 variables:
##  $ PERS_H   : Factor w/ 9 levels "1","10","2","3",..: 3 3 1 3 3 3 3 3 3 1 ...
##  $ TMADD    : int  3 60 72 168 192 240 264 288 360 999 ...
##  $ TMJOB1   : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ TEL      : Factor w/ 3 levels "0","1","2": 3 3 3 3 2 2 3 3 3 3 ...
##  $ NMBLOAN  : Factor w/ 3 levels "0","1","2": 1 3 3 1 1 1 3 1 1 3 ...
##  $ FINLOAN  : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 2 2 ...
##  $ INCOME   : int  1000 2900 2300 0 0 2100 0 0 3000 0 ...
##  $ EC_CARD  : Factor w/ 2 levels "0","1": 2 1 1 1 2 1 2 2 1 2 ...
##  $ BUREAU   : Factor w/ 3 levels "1","2","3": 1 1 1 1 3 1 1 1 3 1 ...
##  $ LOCATION : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ LOANS    : Factor w/ 11 levels "0","1","10","2",..: 4 2 2 2 1 2 2 2 1 2 ...
##  $ REGN     : Factor w/ 9 levels "0","2","3","4",..: 2 2 1 1 1 1 2 1 3 1 ...
##  $ CASH     : int  1300 900 1100 1900 1100 8000 1400 800 7000 3000 ...
##  $ PRODUCT  : Factor w/ 9 levels "","Cars","Dept. Store or Mail",..: 5 5 8 3 5 2 8 3 5 5 ...
##  $ RESID    : Factor w/ 3 levels "","Lease","Owner": 2 3 2 2 2 2 2 2 3 2 ...
##  $ PROF     : Factor w/ 14 levels "","Chemical Industr",..: 8 8 8 8 3 8 3 8 8 8 ...
##  $ CAR      : Factor w/ 4 levels "Car","Car and Motor bi",..: 1 1 4 1 1 1 1 1 1 4 ...
##  $ CARDS    : Factor w/ 6 levels "Cheque card",..: 1 3 3 2 1 3 1 1 3 1 ...
##  $ GB       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 1 1 1 1 ...
##  $ good     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ weight_ar: Factor w/ 4 levels "1","1.5","29.9597523219814",..: 4 4 4 4 4 4 4 4 4 4 ...

for(i in 1:1) {
  test <- smbinning.gen(df = test, ivout = result_all_sig[[i]], chrname = paste(result_all_sig[[i]]$x, "_bin", sep = ""))
}

for (j in 1:1) {
  for (i in 1:nrow(test)) {
    bin_name <- paste(result_all_sig[[j]]$x, "_bin", sep = "")
    bin <- substr(test[[bin_name]][i], 2, 2)
    
    woe_name <- paste(result_all_sig[[j]]$x, "_WOE", sep = "")
    
    if(bin == 0) {
      bin <- dim(result_all_sig[[j]]$ivtable)[1] - 1
      test[[woe_name]][i] <- result_all_sig[[j]]$ivtable[bin, "WoE"]
    } else {
      test[[woe_name]][i] <- result_all_sig[[j]]$ivtable[bin, "WoE"]
    }
  }
}

test$good<-as.factor(test$good)

########## categorical ####################################
test<-merge(test,cate1,by="CARDS",all.x = TRUE)
test<-merge(test,cate2,by="PERS_H",all.x = TRUE)
########## categorical ####################################

test$good=as.numeric(as.character(test$good))
test$GB=as.numeric(as.character(test$GB))
test$pred <- predict(initial_score_red, newdata=test, type='response')
smbinning.metrics(dataset = test, prediction = "pred", actualclass = "GB", report = 1)

## 
##   Overall Performance Metrics 
##   -------------------------------------------------- 
##                     KS : 0.4880 (Good)
##                    AUC : 0.7752 (Fair)
## 
##   Classification Matrix 
##   -------------------------------------------------- 
##            Cutoff (>=) : 0.0333 (Optimal)
##    True Positives (TP) : 608
##   False Positives (FP) : 181
##   False Negatives (FN) : 155
##    True Negatives (TN) : 405
##    Total Positives (P) : 763
##    Total Negatives (N) : 586
## 
##   Business/Performance Metrics 
##   -------------------------------------------------- 
##       %Records>=Cutoff : 0.5849
##              Good Rate : 0.7706 (Vs 0.5656 Overall)
##               Bad Rate : 0.2294 (Vs 0.4344 Overall)
##         Accuracy (ACC) : 0.7509
##      Sensitivity (TPR) : 0.7969
##  False Neg. Rate (FNR) : 0.2031
##  False Pos. Rate (FPR) : 0.3089
##      Specificity (TNR) : 0.6911
##        Precision (PPV) : 0.7706
##   False Discovery Rate : 0.2294
##     False Omision Rate : 0.2768
##   Inv. Precision (NPV) : 0.7232
## 
##   Note: 1 rows deleted due to missing data.

smbinning.metrics(dataset = test[test$pred<=0.4,], prediction = "pred", actualclass = "GB", report = 1, plot = "ks")

## 
##   Overall Performance Metrics 
##   -------------------------------------------------- 
##                     KS : 0.4901 (Good)
##                    AUC : 0.7770 (Fair)
## 
##   Classification Matrix 
##   -------------------------------------------------- 
##            Cutoff (>=) : 0.0333 (Optimal)
##    True Positives (TP) : 607
##   False Positives (FP) : 179
##   False Negatives (FN) : 155
##    True Negatives (TN) : 405
##    Total Positives (P) : 762
##    Total Negatives (N) : 584
## 
##   Business/Performance Metrics 
##   -------------------------------------------------- 
##       %Records>=Cutoff : 0.5840
##              Good Rate : 0.7723 (Vs 0.5661 Overall)
##               Bad Rate : 0.2277 (Vs 0.4339 Overall)
##         Accuracy (ACC) : 0.7519
##      Sensitivity (TPR) : 0.7966
##  False Neg. Rate (FNR) : 0.2034
##  False Pos. Rate (FPR) : 0.3065
##      Specificity (TNR) : 0.6935
##        Precision (PPV) : 0.7723
##   False Discovery Rate : 0.2277
##     False Omision Rate : 0.2768
##   Inv. Precision (NPV) : 0.7232
## 
##   Note: 1 rows deleted due to missing data.

smbinning.metrics(dataset = test, prediction = "pred", actualclass = "GB", report = 0, plot = "auc")

4.3 Final Scorecard

final_score<-initial_score_red

pdo <- 20
score <- 500
odds <- 50
fact <- pdo/log(2)
os <- score - fact*log(odds)
var_names <- names(final_score$coefficients[-1])

for(i in var_names) {
  beta <- final_score$coefficients[i]
  beta0 <- final_score$coefficients["(Intercept)"]
  nvar <- length(var_names)
  WOE_var <- train[[i]]
  points_name <- paste(str_sub(i, end = -4), "points", sep="")
  
  train[[points_name]] <- -(WOE_var*(beta) + (beta0/nvar))*fact + os/nvar
}

colini <- (ncol(train)-nvar + 1)
colend <- ncol(train)
train$Score <- rowSums(train[, colini:colend])
hist(train$Score, xlim=range(400,600), breaks = 30, main = "Distribution of Scores", xlab = "Score")

for(i in var_names) {
  beta <- final_score$coefficients[i]
  beta0 <- final_score$coefficients["(Intercept)"]
  nvar <- length(var_names)
  WOE_var <- test[[i]]
  points_name <- paste(str_sub(i, end = -4), "points", sep="")
  
  test[[points_name]] <- -(WOE_var*(beta) + (beta0/nvar))*fact + os/nvar
}

colini <- (ncol(test)-nvar + 1)
colend <- ncol(test)
test$Score <- rowSums(test[, colini:colend])

hist(test$Score, xlim=range(400,600), breaks = 30, main = "Distribution of Test Scores", xlab = "Score")

accepts_scored_comb <- rbind(train[,names(test)], test)
hist(accepts_scored_comb$Score,xlim=range(400,600), breaks = 30, main = "Distribution of Scores", xlab = "Score")

################# Score Card ###################

PERS_H_Score=unique(train[,c("PERS_H","woe_PERpoints")])
names(PERS_H_Score)=c("PERS_H","Point")
CARDS_Score=unique(train[,c("CARDS","woe_CApoints")])
names(CARDS_Score)=c("CARDS","Point")
TMJOB1_Score=unique(train[,c("TMJOB1_bin","TMJOB1_points")])
names(TMJOB1_Score)=c("TMJOB1","Point")
################# Score Card ###################

Scorecard

4.4 Score distribution

cutpoints <- unique(quantile(accepts_scored_comb$Score, probs = seq(0,1,0.1),na.rm=TRUE))
accepts_scored_comb$Score.QBin <- cut(accepts_scored_comb$Score, breaks=cutpoints, include.lowest=TRUE)
Default.QBin.pop <- round(table(accepts_scored_comb$Score.QBin, accepts_scored_comb$GB)[,2]/(table(accepts_scored_comb$Score.QBin, accepts_scored_comb$GB)[,2] + table(accepts_scored_comb$Score.QBin, accepts_scored_comb$GB)[,1]*weight_ag)*100,2)

#print(Default.QBin.pop)

barplot(Default.QBin.pop, 
        main = "Default Decile Plot", 
        xlab = "Deciles of Scorecard",
        ylab = "Default Rate (%)", ylim = c(0,20),
        col = saturation(heat.colors, scalefac(0.8))(10))
abline(h = 3.23, lwd = 2, lty = "dashed")
text(9, 4.3, "Current = 3.23%")

4.5 Plotting Default, Acceptance, & Profit By Score

def <- NULL
acc <- NULL
prof <- NULL
score <- NULL

cost <- 52000
profit <- 2000
for(i in min(floor(train$Score)):max(floor(train$Score))){
  score[i - min(floor(train$Score)) + 1] <- i
  def[i - min(floor(train$Score)) + 1] <- 100*sum(train$GB[which(train$Score >= i)])/(length(train$GB[which(train$Score >= i & train$GB == 1)]) + weight_ag*length(train$GB[which(train$Score >= i & train$GB == 0)]))
  acc[i - min(floor(train$Score)) + 1] <- 100*(length(train$GB[which(train$Score >= i & train$GB == 1)]) + weight_ag*length(train$GB[which(train$Score >= i & train$GB == 0)]))/(length(train$GB[which(train$GB == 1)]) + weight_ag*length(train$GB[which(train$GB == 0)]))
  prof[i - min(floor(train$Score)) + 1] <- length(train$GB[which(train$Score >= i & train$GB == 1)])*(-cost) + weight_ag*length(train$GB[which(train$Score >= i & train$GB == 0)])*profit
}

plot_data <- data.frame(def, acc, prof, score)

def_plot <- xyplot(def ~ score, plot_data, 
                   type = "l" , lwd=2, col="red",
                   ylab = "Default Rate (%)",
                   xlab = "Score",
                   xlim=c(400:600),
                   main = "Default Rate by Acceptance Across Score",
                   panel = function(x, y,...) {
                     panel.xyplot(x, y, ...)
                     panel.abline(h = 3.23, col = "red")
                   })
acc_plot <- xyplot(acc ~ score, plot_data, 
                   type = "l", lwd=2, col="blue",
                   ylab = "Acceptance Rate (%)",
                   xlim=c(400:600),
                   panel = function(x, y,...) {
                     panel.xyplot(x, y, ...)
                     panel.abline(h = 75, col = "blue")
                   })
prof_plot <- xyplot(prof/1000 ~ score, plot_data, 
                    type = "l" , lwd=2, col="green",
                    ylab = "Profit (Thousands $)",
                    xlab = "Score",
                    xlim=c(400:600),
                    main = "Profit by Acceptance Across Score"
)

doubleYScale(def_plot, acc_plot, add.ylab2 = TRUE, use.style=FALSE)

doubleYScale(prof_plot, acc_plot, add.ylab2 = TRUE, use.style=FALSE)

as.data.frame(lapply(plot_data[abs(plot_data$acc-75)<=4,],mean))

##         def      acc     prof score
## 1 0.9197601 71.56439 69093825   478

as.data.frame(plot_data[plot_data$score==472,])

##          def      acc     prof score
## 233 1.327788 80.91123 66672232   472

as.data.frame(lapply(plot_data[abs(plot_data$def-3.32)<=0.03,],mean))

##   def acc prof score
## 1 NaN NaN  NaN   NaN

as.data.frame(plot_data[plot_data$score==441,])

##          def      acc     prof score
## 202 2.221145 96.10569 49415842   441