Measuring the Out of Sample impact of features on predictive accuracy
XGBoost will report variable importance based on how much each variable improves model performance in sample. The code below does the same out of sample.
attr(Xmod,"importance") <- xgb.importance(model=Xmod)
oos_improvement <- sapply(attr(Xmod,"importance")$Feature,function(feature){
rhsvarsmod <- setdiff(rhsvars,feature)
# Here we look at improvement in oos predictions
gain <- sapply(1:50,function(...){
ins <- sample(1:nrow(x),size=nrow(x)/2,replace=F)
xa <- x[ins,]
xb <- x[-ins,]
xa1 <- xgb.DMatrix(data=as.matrix(xa[,rhsvars,with=F]),label=xa[[depvar]])
xa2 <- xgb.DMatrix(data=as.matrix(xa[,rhsvarsmod,with=F]),label=xa[[depvar]])
amod1 <- xgb.train(params=list(),data = xa1,nrounds = optrounds)
amod2 <- xgb.train(params=list(),data = xa2,nrounds = optrounds)
naive_hat <- mean(xa[[depvar]])
truth <- xb[[depvar]]
xb1 <- xgb.DMatrix(data=as.matrix(xb[,rhsvars,with=F]))
xb2 <- xgb.DMatrix(data=as.matrix(xb[,rhsvarsmod,with=F]))
mae_with <- median(abs(predict(amod1,xb1) - truth))
mae_without <- median(abs(predict(amod2,xb2) - truth))
naive_mae <- median(abs(truth - naive_hat))
(mae_without - mae_with)/naive_mae
})
median(gain)
})
home