Regression with XGBoost

Dataset

The dataset is the Boston housing data, which is available on Kaggle, a machine learning competition platform. More info at: https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html

I load the libraries

library(tidymodels)
library(tidyverse)
library(magrittr)
library(MASS) # the dataset lives in this package

I load the data

data(Boston)

Data splitting

I'll split the data into train and test sets.

set.seed(1234)
p_split <- Boston %>%
    initial_split(prop = 0.75)
p_train <- training(p_split)
p_test <- testing(p_split)

glimpse(p_train)
## Rows: 379
## Columns: 14
## $ crim    <dbl> 0.01501, 0.03961, 67.92080, 0.14866, 0.10574, 0.10793, 11.5779…
## $ zn      <dbl> 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, …
## $ indus   <dbl> 1.21, 5.19, 18.10, 8.56, 27.74, 8.56, 18.10, 21.89, 18.10, 18.…
## $ chas    <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ nox     <dbl> 0.401, 0.515, 0.693, 0.520, 0.609, 0.520, 0.700, 0.624, 0.718,…
## $ rm      <dbl> 7.923, 6.037, 5.683, 6.727, 5.983, 6.195, 5.036, 6.372, 6.006,…
## $ age     <dbl> 24.8, 34.5, 100.0, 79.9, 98.8, 54.4, 97.0, 97.9, 95.3, 77.8, 8…
## $ dis     <dbl> 5.8850, 5.9853, 1.4254, 2.7778, 1.8681, 2.7778, 1.7700, 2.3274…
## $ rad     <int> 1, 5, 24, 5, 4, 5, 24, 4, 24, 24, 24, 2, 5, 4, 2, 5, 5, 24, 24…
## $ tax     <dbl> 198, 224, 666, 384, 711, 384, 666, 437, 666, 666, 666, 276, 38…
## $ ptratio <dbl> 13.6, 20.2, 20.2, 20.9, 20.1, 20.9, 20.2, 21.2, 20.2, 20.2, 20…
## $ black   <dbl> 395.52, 396.90, 384.97, 394.76, 390.11, 393.49, 396.90, 385.76…
## $ lstat   <dbl> 3.16, 8.01, 22.98, 9.42, 18.07, 13.00, 25.68, 11.12, 15.70, 29…
## $ medv    <dbl> 50.0, 21.1, 5.0, 27.5, 13.6, 21.7, 9.7, 23.0, 14.2, 6.3, 7.4, …
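
A variant worth noting (not used in the rest of this post): initial_split() can stratify on the outcome, so that train and test end up with similar distributions of medv. A minimal sketch:

# stratified split on the outcome (hypothetical alternative to the split above)
set.seed(1234)
p_split_strat <- Boston %>%
  initial_split(prop = 0.75, strata = medv)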

I'll split the TRAIN data into 3 folds for cross-validation (v = 3), repeated 5 times.

p_folds <- vfold_cv(p_train, v=3, repeats = 5)
p_folds
## #  3-fold cross-validation repeated 5 times 
## # A tibble: 15 × 3
##    splits            id      id2  
##    <list>            <chr>   <chr>
##  1 <split [252/127]> Repeat1 Fold1
##  2 <split [253/126]> Repeat1 Fold2
##  3 <split [253/126]> Repeat1 Fold3
##  4 <split [252/127]> Repeat2 Fold1
##  5 <split [253/126]> Repeat2 Fold2
##  6 <split [253/126]> Repeat2 Fold3
##  7 <split [252/127]> Repeat3 Fold1
##  8 <split [253/126]> Repeat3 Fold2
##  9 <split [253/126]> Repeat3 Fold3
## 10 <split [252/127]> Repeat4 Fold1
## 11 <split [253/126]> Repeat4 Fold2
## 12 <split [253/126]> Repeat4 Fold3
## 13 <split [252/127]> Repeat5 Fold1
## 14 <split [253/126]> Repeat5 Fold2
## 15 <split [253/126]> Repeat5 Fold3
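
If you want to check what each resample contains, rsample's analysis() and assessment() accessors return the two pieces of a split. A quick sketch on the first fold:

first_split <- p_folds$splits[[1]]
dim(analysis(first_split))   # rows used for fitting within this resample
dim(assessment(first_split)) # rows held out for evaluation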

TEST data

head(p_test)
##       crim   zn indus chas   nox    rm   age    dis rad tax ptratio  black
## 1  0.00632 18.0  2.31    0 0.538 6.575  65.2 4.0900   1 296    15.3 396.90
## 5  0.06905  0.0  2.18    0 0.458 7.147  54.2 6.0622   3 222    18.7 396.90
## 7  0.08829 12.5  7.87    0 0.524 6.012  66.6 5.5605   5 311    15.2 395.60
## 8  0.14455 12.5  7.87    0 0.524 6.172  96.1 5.9505   5 311    15.2 396.90
## 9  0.21124 12.5  7.87    0 0.524 5.631 100.0 6.0821   5 311    15.2 386.63
## 11 0.22489 12.5  7.87    0 0.524 6.377  94.3 6.3467   5 311    15.2 392.52
##    lstat medv
## 1   4.98 24.0
## 5   5.33 36.2
## 7  12.43 22.9
## 8  19.15 27.1
## 9  29.93 16.5
## 11 20.45 15.0

Data preprocessing

recipe_rf <- p_train %>%
  recipe(medv ~ .) %>%
  step_corr(all_predictors()) %>% # drop highly correlated predictors
  step_center(all_predictors(), -all_outcomes()) %>% # centering
  step_scale(all_predictors(), -all_outcomes()) %>% # scaling
  prep()
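
Since the recipe was already prep()-ed (a workflow would otherwise do this automatically at fit time), we can peek at the preprocessed training data with bake(). A quick check, relying on prep()'s default retain = TRUE:

recipe_rf %>%
  bake(new_data = NULL) %>% # NULL returns the retained, preprocessed training set
  glimpse()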

XGBoost model

xgb_spec <- boost_tree(
  trees = 1000, 
  tree_depth = tune(), min_n = tune(), 
  loss_reduction = tune(),                     ## first three: model complexity
  sample_size = tune(), mtry = tune(),         ## randomness
  learn_rate = tune(),                         ## step size
) %>% 
  set_engine("xgboost") %>% 
  set_mode("regression")

xgb_spec
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost
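
To see how parsnip will pass these arguments to the xgboost engine, translate() shows the underlying fit template:

xgb_spec %>% translate() # maps the parsnip arguments to the xgboost fit call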

XGBoost tuning grid

xgb_grid <- grid_latin_hypercube(
  tree_depth(),
  min_n(),
  loss_reduction(),
  sample_size = sample_prop(),
  finalize(mtry(), p_train),
  learn_rate(),
  size = 30
)

xgb_grid
## # A tibble: 30 × 6
##    tree_depth min_n loss_reduction sample_size  mtry learn_rate
##         <int> <int>          <dbl>       <dbl> <int>      <dbl>
##  1          4    32   0.000000216        0.481    13   1.30e- 4
##  2          7    30   0.00559            0.590     2   1.24e- 7
##  3          2    17   0.00000144         0.273     9   1.89e-10
##  4         10    20   0.0216             0.230     8   2.19e- 5
##  5          4    37   0.246              0.627    13   4.46e- 2
##  6          9     3   0.0000437          0.721     4   9.51e- 8
##  7         13     4   0.0836             0.502     7   9.29e- 5
##  8         13    33   2.02               0.823     7   9.34e- 4
##  9         12    11   0.0000000200       0.890     3   6.98e- 3
## 10          1    26   0.00000532         0.406     8   3.50e- 6
## # … with 20 more rows
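
A quick sanity check of the Latin hypercube: plotting two of the six dimensions (on log scales, since both parameters span several orders of magnitude) shows how the candidates spread across the space. A sketch using the grid above:

xgb_grid %>%
  ggplot(aes(learn_rate, loss_reduction)) + # two of the six tuned dimensions
  geom_point(alpha = 0.7) +
  scale_x_log10() +
  scale_y_log10()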

Workflow

xgb_wf <- workflow() %>%
  add_recipe(recipe_rf) %>%
  add_model(xgb_spec)

xgb_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_corr()
## • step_center()
## • step_scale()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = tune()
##   trees = 1000
##   min_n = tune()
##   tree_depth = tune()
##   learn_rate = tune()
##   loss_reduction = tune()
##   sample_size = tune()
## 
## Computational engine: xgboost

Model TRAINING

doParallel::registerDoParallel()

set.seed(234)
xgb_res <- tune_grid(
  xgb_wf,
  resamples = p_folds,
  grid = xgb_grid,
  control = control_grid(save_pred = TRUE)
)

xgb_res
## Warning: This tuning result has notes. Example notes on model fitting include:
## internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## internal: A correlation computation is required, but `estimate` is constant and has 0 standard deviation, resulting in a divide by 0 error. `NA` will be returned.
## # Tuning results
## # 3-fold cross-validation repeated 5 times 
## # A tibble: 15 × 6
##    splits            id      id2   .metrics           .notes           .predictions
##    <list>            <chr>   <chr> <list>             <list>           <list>      
##  1 <split [252/127]> Repeat1 Fold1 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
##  2 <split [253/126]> Repeat1 Fold2 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
##  3 <split [253/126]> Repeat1 Fold3 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
##  4 <split [252/127]> Repeat2 Fold1 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
##  5 <split [253/126]> Repeat2 Fold2 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
##  6 <split [253/126]> Repeat2 Fold3 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
##  7 <split [252/127]> Repeat3 Fold1 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
##  8 <split [253/126]> Repeat3 Fold2 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
##  9 <split [253/126]> Repeat3 Fold3 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
## 10 <split [252/127]> Repeat4 Fold1 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
## 11 <split [253/126]> Repeat4 Fold2 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
## 12 <split [253/126]> Repeat4 Fold3 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
## 13 <split [252/127]> Repeat5 Fold1 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
## 14 <split [253/126]> Repeat5 Fold2 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
## 15 <split [253/126]> Repeat5 Fold3 <tibble [60 × 10]> <tibble [1 × 1]> <tibble [3,…
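
The warning about a constant `estimate` shown above most likely comes from candidates with a very small learn_rate: after 1000 trees their predictions are still essentially constant, so the correlation behind rsq cannot be computed and NA is returned. The captured notes live in the .notes column; with a recent version of tune they can be pulled out with collect_notes() (a sketch, assuming that function is available):

# summarize the notes captured during tuning; with older tune versions,
# unnest the .notes column instead
collect_notes(xgb_res) %>%
  count(note)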

Results

collect_metrics(xgb_res)
## # A tibble: 60 × 12
##     mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
##    <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
##  1    13    32          4   1.30e- 4    0.000000216       0.481 rmse   
##  2    13    32          4   1.30e- 4    0.000000216       0.481 rsq    
##  3     2    30          7   1.24e- 7    0.00559           0.590 rmse   
##  4     2    30          7   1.24e- 7    0.00559           0.590 rsq    
##  5     9    17          2   1.89e-10    0.00000144        0.273 rmse   
##  6     9    17          2   1.89e-10    0.00000144        0.273 rsq    
##  7     8    20         10   2.19e- 5    0.0216            0.230 rmse   
##  8     8    20         10   2.19e- 5    0.0216            0.230 rsq    
##  9    13    37          4   4.46e- 2    0.246             0.627 rmse   
## 10    13    37          4   4.46e- 2    0.246             0.627 rsq    
## # … with 50 more rows, and 5 more variables: .estimator <chr>, mean <dbl>,
## #   n <int>, std_err <dbl>, .config <fct>

Let's look at the hyperparameters

xgb_res %>%
  collect_metrics() %>%
  filter(.metric == "rmse") %>% 
  dplyr::select(mtry:sample_size, mean) %>%
  pivot_longer(mtry:sample_size,
               values_to = "value",
               names_to = "parameter"
  ) %>%
  ggplot(aes(value, mean, color = parameter)) +
  geom_point(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~parameter, scales = "free_x") +
  labs(x = NULL, y = "rmse")

show_best(xgb_res, "rmse")
## # A tibble: 5 × 12
##    mtry min_n tree_depth learn_rate loss_reduction sample_size .metric
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <chr>  
## 1     6    19          8    0.0815        4.36e- 9       0.782 rmse   
## 2     3    11         12    0.00698       2.00e- 8       0.890 rmse   
## 3    13     8          4    0.00421       5.02e-10       0.399 rmse   
## 4    13    37          4    0.0446        2.46e- 1       0.627 rmse   
## 5     2    28         12    0.0229        1.10e+ 1       0.433 rmse   
## # … with 5 more variables: .estimator <chr>, mean <dbl>, n <int>,
## #   std_err <dbl>, .config <fct>

Best model.

best_rmse <- select_best(xgb_res, "rmse")
best_rmse
## # A tibble: 1 × 7
##    mtry min_n tree_depth learn_rate loss_reduction sample_size .config          
##   <int> <int>      <int>      <dbl>          <dbl>       <dbl> <fct>            
## 1     6    19          8     0.0815  0.00000000436       0.782 Preprocessor1_Mo…
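
An alternative (not used here) is to prefer a simpler model whose RMSE is within one standard error of the numerically best one; a sketch, ordering the candidates from simplest to most complex by tree_depth:

# simplest candidate within one standard error of the best RMSE
select_by_one_std_err(xgb_res, tree_depth, metric = "rmse")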

I finalize the workflow

final_xgb <- finalize_workflow(
  xgb_wf,
  best_rmse
)

final_xgb
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: boost_tree()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
## 
## • step_corr()
## • step_center()
## • step_scale()
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Boosted Tree Model Specification (regression)
## 
## Main Arguments:
##   mtry = 6
##   trees = 1000
##   min_n = 19
##   tree_depth = 8
##   learn_rate = 0.0815309356647953
##   loss_reduction = 4.36458359896047e-09
##   sample_size = 0.781626974374522
## 
## Computational engine: xgboost

Variable importance

library(vip)
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
final_xgb %>%
  fit(data = p_train) %>%
  pull_workflow_fit() %>%
  vip(geom = "point")
## Warning: `pull_workflow_fit()` was deprecated in workflows 0.2.3.
## Please use `extract_fit_parsnip()` instead.
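
As the warning notes, pull_workflow_fit() is deprecated; with workflows >= 0.2.3 the same plot can be obtained with extract_fit_parsnip() (a sketch assuming that version is installed):

final_xgb %>%
  fit(data = p_train) %>%
  extract_fit_parsnip() %>% # replacement for the deprecated pull_workflow_fit()
  vip(geom = "point")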

Prediction on TEST

final_res <- last_fit(final_xgb, p_split)

collect_metrics(final_res)
## # A tibble: 2 × 4
##   .metric .estimator .estimate .config             
##   <chr>   <chr>          <dbl> <fct>               
## 1 rmse    standard       3.26  Preprocessor1_Model1
## 2 rsq     standard       0.871 Preprocessor1_Model1
collect_predictions(final_res) %>%
  ggplot(aes(medv, .pred)) +
  geom_abline(lty = 2, color = "gray50") +
  geom_point(alpha = 0.5, color = "midnightblue") +
  coord_fixed()
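
For additional test-set metrics (e.g. MAE), yardstick's metrics() can be applied directly to the collected predictions; a sketch, relying on yardstick being loaded by tidymodels:

# rmse, rsq and mae computed from the test-set predictions
collect_predictions(final_res) %>%
  metrics(truth = medv, estimate = .pred)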