Data Cleaning
rejects<-read.csv("/Users/mingming/Documents/Financial Analysis/Homework1_FA/rejected_customers.csv")
accepts$good <- abs(accepts$GB - 1)
catag_variable_new=names(rejects)[names(rejects)%in%catag_variable]
rejects[,catag_variable_new]=lapply(rejects[,catag_variable_new],as.factor)
#################
unique(train[,c("TMJOB1_bin","TMJOB1_WOE")])
## TMJOB1_bin TMJOB1_WOE
## 2190 02 <= 144 -0.0402
## 51 03 > 144 1.0757
## 2712 01 <= 15 -0.5484
rejects$TMJOB1_bin<-rep("",nrow(rejects))
rejects$TMJOB1_bin[rejects$TMJOB1<=15]="<= 15"
rejects$TMJOB1_bin[rejects$TMJOB1<=144&rejects$TMJOB1>15]="<= 144"
rejects$TMJOB1_bin[rejects$TMJOB1>144]=">144"
rejects$TMJOB1_WOE<-rep(0,nrow(rejects))
rejects$TMJOB1_WOE[rejects$TMJOB1<=15]=-0.5484
rejects$TMJOB1_WOE[rejects$TMJOB1<=144&rejects$TMJOB1>15]=-0.0402
rejects$TMJOB1_WOE[rejects$TMJOB1>144]=1.0757
rejects<-merge(rejects,cate1,by="CARDS",all.x = TRUE)
rejects<-merge(rejects,cate2,by="PERS_H",all.x = TRUE)
as.data.frame(lapply(lapply(rejects,is.na),sum)>0)
## lapply(lapply(rejects, is.na), sum) > 0
## PERS_H FALSE
## CARDS FALSE
## CHILDREN FALSE
## AGE FALSE
## TMADD FALSE
## TMJOB1 FALSE
## TEL FALSE
## NMBLOAN FALSE
## FINLOAN FALSE
## INCOME FALSE
## EC_CARD FALSE
## BUREAU FALSE
## LOCATION FALSE
## LOANS FALSE
## REGN FALSE
## DIV FALSE
## CASH FALSE
## PRODUCT FALSE
## RESID FALSE
## NAT FALSE
## PROF FALSE
## CAR FALSE
## TMJOB1_bin FALSE
## TMJOB1_WOE FALSE
## woe_CARDS TRUE
## woe_PERS_H TRUE
rejects[is.na(rejects$woe_PERS_H),] ### PERS_H fail out the range of the accepted data
## PERS_H CARDS CHILDREN AGE TMADD TMJOB1 TEL NMBLOAN FINLOAN INCOME EC_CARD BUREAU LOCATION
## 1500 11 no credit cards 9 56 120 12 2 0 0 1600 0 3 1
## LOANS REGN DIV CASH PRODUCT RESID NAT PROF CAR TMJOB1_bin TMJOB1_WOE
## 1500 0 5 1 1700 Radio, TV, Hifi Turkish Others Without Vehicle <= 15 -0.5484
## woe_CARDS woe_PERS_H
## 1500 0.2268594 NA
##impute
rejects$woe_PERS_H[is.na(rejects$woe_PERS_H)]=5.3002221
rejects[is.na(rejects$woe_CARDS),] ## VISA Citibank is not one category in the accept data
## PERS_H CARDS CHILDREN AGE TMADD TMJOB1 TEL NMBLOAN FINLOAN INCOME EC_CARD BUREAU LOCATION
## 403 1 VISA Citibank 0 22 264 42 2 2 1 0 0 1 1
## 1402 4 VISA Citibank 2 36 6 216 2 0 0 0 0 1 1
## LOANS REGN DIV CASH PRODUCT RESID NAT PROF CAR TMJOB1_bin
## 403 2 3 1 1500 Radio, TV, Hifi Lease German Others Without Vehicle <= 144
## 1402 5 2 1 2500 Radio, TV, Hifi Lease German Military Service Car >144
## TMJOB1_WOE woe_CARDS woe_PERS_H
## 403 -0.0402 NA 0.4926674
## 1402 1.0757 NA -0.2655746
##impute as the value of VISA Others
rejects$woe_CARDS[is.na(rejects$woe_CARDS)]=train$woe_CARDS[train$CARDS=="VISA Others"]
as.data.frame(lapply(lapply(rejects,is.na),sum)>0)
## lapply(lapply(rejects, is.na), sum) > 0
## PERS_H FALSE
## CARDS FALSE
## CHILDREN FALSE
## AGE FALSE
## TMADD FALSE
## TMJOB1 FALSE
## TEL FALSE
## NMBLOAN FALSE
## FINLOAN FALSE
## INCOME FALSE
## EC_CARD FALSE
## BUREAU FALSE
## LOCATION FALSE
## LOANS FALSE
## REGN FALSE
## DIV FALSE
## CASH FALSE
## PRODUCT FALSE
## RESID FALSE
## NAT FALSE
## PROF FALSE
## CAR FALSE
## TMJOB1_bin FALSE
## TMJOB1_WOE FALSE
## woe_CARDS FALSE
## woe_PERS_H FALSE
Predicted Default Probability
rejects$pred <- predict(initial_score_red, newdata=rejects, type='response')
rejects$GB <- as.numeric(rejects$pred > 0.03351922)
rejects$good <- abs(rejects$GB - 1)
- Data oversampling and weight calculation
pop_g <- 9677
pop_b <- 323
sam_g <- 1500
sam_b <- 1500
pop_sam_gb_ratio <- (pop_g/pop_b)/(sam_g/sam_b)
pop_a <- 0.75
pop_r <- 0.25
sam_a <- 30
sam_r <- 15
pop_sam_ar_ratio <- (pop_a/pop_r)/(sam_a/sam_r)
weight_rb <- 1
weight_rg <- pop_sam_gb_ratio
weight_ab <- pop_sam_ar_ratio
weight_ag <- pop_sam_ar_ratio*pop_sam_gb_ratio
accepts$weight_ar <- ifelse(accepts$GB == 1, weight_ab, weight_ag)
rejects$weight_ar <- ifelse(rejects$GB == 1, weight_rb, weight_rg)
accepts=subset(accepts,select=-c(X_freq_))
comb_hard <- rbind(accepts, rejects[,names(accepts)]) # New Combined Data Set #
# Below can be used to see if there is any missing value
# lapply(lapply(accepts,is.na),sum)>0
# lapply(lapply(rejects,is.na),sum)>0
# lapply(lapply(comb_hard,is.na),sum)>0