XGBoost Tips

Measuring the Out of Sample impact of features on predictive accuracy

XGBoost will report variable importance based on how much each variable improves model performance in sample. The code below does the same out of sample.

attr(Xmod,"importance") <- xgb.importance(model=Xmod) oos_improvement <- sapply(attr(Xmod,"importance")$Feature,function(feature){ rhsvarsmod <- setdiff(rhsvars,feature) # Here we look at improvement in oos predictions gain <- sapply(1:50,function(...){ ins <- sample(1:nrow(x),size=nrow(x)/2,replace=F) xa <- x[ins,] xb <- x[-ins,] xa1 <- xgb.DMatrix(data=as.matrix(xa[,rhsvars,with=F]),label=xa[[depvar]]) xa2 <- xgb.DMatrix(data=as.matrix(xa[,rhsvarsmod,with=F]),label=xa[[depvar]]) amod1 <- xgb.train(params=list(),data = xa1,nrounds = optrounds) amod2 <- xgb.train(params=list(),data = xa2,nrounds = optrounds) naive_hat <- mean(xa[[depvar]]) truth <- xb[[depvar]] xb1 <- xgb.DMatrix(data=as.matrix(xb[,rhsvars,with=F])) xb2 <- xgb.DMatrix(data=as.matrix(xb[,rhsvarsmod,with=F])) mae_with <- median(abs(predict(amod1,xb1) - truth)) mae_without <- median(abs(predict(amod2,xb2) - truth)) naive_mae <- median(abs(truth - naive_hat)) (mae_without - mae_with)/naive_mae }) median(gain) })

home