The dataset is the Boston housing dataset, available on Kaggle, a machine learning competition platform. More info at: https://www.cs.toronto.edu/~delve/data/boston/bostonDetail.html
library(tidymodels)
library(tidyverse)
library(magrittr)
library(corrr)
library(MASS) # the dataset lives in this package
data(Boston)
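As a quick optional sanity check before splitting (not part of the pipeline itself), we can confirm the dataset's size:
dim(Boston) # 506 rows, 14 columns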
I'm going to split the data into train and test sets.
set.seed(1234)
p_split <- Boston %>%
initial_split(prop = 0.75)
p_train <- training(p_split)
p_test <- testing(p_split)
glimpse(p_train)
## Rows: 379
## Columns: 14
## $ crim <dbl> 0.01501, 0.03961, 67.92080, 0.14866, 0.10574, 0.10793, 11.5779…
## $ zn <dbl> 90, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 20, …
## $ indus <dbl> 1.21, 5.19, 18.10, 8.56, 27.74, 8.56, 18.10, 21.89, 18.10, 18.…
## $ chas <int> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,…
## $ nox <dbl> 0.401, 0.515, 0.693, 0.520, 0.609, 0.520, 0.700, 0.624, 0.718,…
## $ rm <dbl> 7.923, 6.037, 5.683, 6.727, 5.983, 6.195, 5.036, 6.372, 6.006,…
## $ age <dbl> 24.8, 34.5, 100.0, 79.9, 98.8, 54.4, 97.0, 97.9, 95.3, 77.8, 8…
## $ dis <dbl> 5.8850, 5.9853, 1.4254, 2.7778, 1.8681, 2.7778, 1.7700, 2.3274…
## $ rad <int> 1, 5, 24, 5, 4, 5, 24, 4, 24, 24, 24, 2, 5, 4, 2, 5, 5, 24, 24…
## $ tax <dbl> 198, 224, 666, 384, 711, 384, 666, 437, 666, 666, 666, 276, 38…
## $ ptratio <dbl> 13.6, 20.2, 20.2, 20.9, 20.1, 20.9, 20.2, 21.2, 20.2, 20.2, 20…
## $ black <dbl> 395.52, 396.90, 384.97, 394.76, 390.11, 393.49, 396.90, 385.76…
## $ lstat <dbl> 3.16, 8.01, 22.98, 9.42, 18.07, 13.00, 25.68, 11.12, 15.70, 29…
## $ medv <dbl> 50.0, 21.1, 5.0, 27.5, 13.6, 21.7, 9.7, 23.0, 14.2, 6.3, 7.4, …
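As a quick aside, the row counts confirm the 75/25 split:
nrow(p_train) / nrow(Boston) # ~0.75 (379 of 506 rows)
nrow(p_test) # 127 rows held out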
I'll split the TRAIN data into 3 folds for cross-validation (v = 3), repeated 5 times.
p_folds <- vfold_cv(p_train, v=3, repeats = 5)
p_folds
## # 3-fold cross-validation repeated 5 times
## # A tibble: 15 × 3
## splits id id2
## <list> <chr> <chr>
## 1 <split [252/127]> Repeat1 Fold1
## 2 <split [253/126]> Repeat1 Fold2
## 3 <split [253/126]> Repeat1 Fold3
## 4 <split [252/127]> Repeat2 Fold1
## 5 <split [253/126]> Repeat2 Fold2
## 6 <split [253/126]> Repeat2 Fold3
## 7 <split [252/127]> Repeat3 Fold1
## 8 <split [253/126]> Repeat3 Fold2
## 9 <split [253/126]> Repeat3 Fold3
## 10 <split [252/127]> Repeat4 Fold1
## 11 <split [253/126]> Repeat4 Fold2
## 12 <split [253/126]> Repeat4 Fold3
## 13 <split [252/127]> Repeat5 Fold1
## 14 <split [253/126]> Repeat5 Fold2
## 15 <split [253/126]> Repeat5 Fold3
Let's look at the cross-validation splits: 3 folds repeated 5 times, for 15 resamples in total.
p_folds$splits
## [[1]]
## <Analysis/Assess/Total>
## <252/127/379>
##
## [[2]]
## <Analysis/Assess/Total>
## <253/126/379>
##
## [[3]]
## <Analysis/Assess/Total>
## <253/126/379>
##
## [[4]]
## <Analysis/Assess/Total>
## <252/127/379>
##
## [[5]]
## <Analysis/Assess/Total>
## <253/126/379>
##
## [[6]]
## <Analysis/Assess/Total>
## <253/126/379>
##
## [[7]]
## <Analysis/Assess/Total>
## <252/127/379>
##
## [[8]]
## <Analysis/Assess/Total>
## <253/126/379>
##
## [[9]]
## <Analysis/Assess/Total>
## <253/126/379>
##
## [[10]]
## <Analysis/Assess/Total>
## <252/127/379>
##
## [[11]]
## <Analysis/Assess/Total>
## <253/126/379>
##
## [[12]]
## <Analysis/Assess/Total>
## <253/126/379>
##
## [[13]]
## <Analysis/Assess/Total>
## <252/127/379>
##
## [[14]]
## <Analysis/Assess/Total>
## <253/126/379>
##
## [[15]]
## <Analysis/Assess/Total>
## <253/126/379>
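Each split object records which rows go into the analysis (fitting) set and which into the assessment (evaluation) set. As a sketch, rsample's analysis() and assessment() helpers recover the actual data frames for any fold:
first_fold <- p_folds$splits[[1]]
dim(analysis(first_fold)) # 252 x 14: rows used to fit within this resample
dim(assessment(first_fold)) # 127 x 14: rows held out to evaluate it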
The TEST data are kept apart from the cross-validation folds; we don't touch them until the very end.
head(p_test)
## crim zn indus chas nox rm age dis rad tax ptratio black
## 1 0.00632 18.0 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90
## 5 0.06905 0.0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90
## 7 0.08829 12.5 7.87 0 0.524 6.012 66.6 5.5605 5 311 15.2 395.60
## 8 0.14455 12.5 7.87 0 0.524 6.172 96.1 5.9505 5 311 15.2 396.90
## 9 0.21124 12.5 7.87 0 0.524 5.631 100.0 6.0821 5 311 15.2 386.63
## 11 0.22489 12.5 7.87 0 0.524 6.377 94.3 6.3467 5 311 15.2 392.52
## lstat medv
## 1 4.98 24.0
## 5 5.33 36.2
## 7 12.43 22.9
## 8 19.15 27.1
## 9 29.93 16.5
## 11 20.45 15.0
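Next I define the preprocessing recipe: drop highly correlated predictors, then center and scale the remaining ones.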
recipe_rf <- p_train %>%
recipe(medv~.) %>%
step_corr(all_predictors()) %>% # drop highly correlated predictors
step_center(all_predictors(), -all_outcomes()) %>% # center the predictors
step_scale(all_predictors(), -all_outcomes()) %>% # scale the predictors
prep()
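Since the recipe is already prep()-ed, we can peek at the preprocessed training data with juice() (bake(recipe_rf, new_data = NULL) is the newer equivalent): the predictors come out centered and scaled, with any highly correlated ones removed, while the outcome medv is left untouched.
juice(recipe_rf) %>% glimpse()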
At this stage we specify the model to fit: in this case, a baseline Random Forest.
rf_spec <- rand_forest() %>%
set_engine("ranger") %>%
set_mode("regression")
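To see how parsnip will map this spec onto the engine, translate() prints the underlying ranger call template (purely informational):
rf_spec %>% translate()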
I initialize the workflow to keep the process organized, then print it to see the steps.
rf_wf <- workflow() %>%
add_recipe(recipe_rf) %>%
add_model(rf_spec)
rf_wf
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 3 Recipe Steps
##
## • step_corr()
## • step_center()
## • step_scale()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (regression)
##
## Computational engine: ranger
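Now I fit the workflow on each of the 15 resamples, saving the out-of-fold predictions so we can inspect them later.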
set.seed(123)
rf_res <- rf_wf %>%
fit_resamples(
p_folds,
control = control_resamples(save_pred = TRUE)
)
glimpse(rf_res)
## Rows: 15
## Columns: 6
## $ splits <list> [<vfold_split[252 x 127 x 379 x 14]>], [<vfold_split[253…
## $ id <chr> "Repeat1", "Repeat1", "Repeat1", "Repeat2", "Repeat2", "R…
## $ id2 <chr> "Fold1", "Fold2", "Fold3", "Fold1", "Fold2", "Fold3", "Fo…
## $ .metrics <list> [<tbl_df[2 x 4]>], [<tbl_df[2 x 4]>], [<tbl_df[2 x 4]>],…
## $ .notes <list> [<tbl_df[0 x 1]>], [<tbl_df[0 x 1]>], [<tbl_df[0 x 1]>],…
## $ .predictions <list> [<tbl_df[127 x 4]>], [<tbl_df[126 x 4]>], [<tbl_df[126 x…
rf_res %>%
collect_metrics()
## # A tibble: 2 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <fct>
## 1 rmse standard 3.71 15 0.160 Preprocessor1_Model1
## 2 rsq standard 0.852 15 0.0117 Preprocessor1_Model1
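These metrics are averaged over the 15 resamples (hence n = 15 and the standard errors). To see the fold-by-fold values instead, collect_metrics() accepts summarize = FALSE:
rf_res %>% collect_metrics(summarize = FALSE) # one rmse/rsq row per resample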
final_model <- finalize_model(rf_spec, select_best(rf_res, metric = "rmse"))
final_model
## Random Forest Model Specification (regression)
##
## Computational engine: ranger
final_rs <- last_fit(final_model, medv ~ ., p_split)
final_rs
## # Resampling results
## # Manual resampling
## # A tibble: 1 × 6
## splits id .metrics .notes .predictions .workflow
## <list> <chr> <list> <list> <list> <list>
## 1 <split [379/127]> train/test split <tibble [… <tibble… <tibble [127… <workflo…
final_rs %>%
collect_metrics()
## # A tibble: 2 × 4
## .metric .estimator .estimate .config
## <chr> <chr> <dbl> <fct>
## 1 rmse standard 3.34 Preprocessor1_Model1
## 2 rsq standard 0.877 Preprocessor1_Model1
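The test-set RMSE (3.34) and R² (0.877) are in line with the cross-validation estimates (3.71 and 0.852), so the model generalizes about as well as the resampling suggested.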
final_rs %>%
collect_predictions()
## # A tibble: 127 × 5
## id .pred .row medv .config
## <chr> <dbl> <int> <dbl> <fct>
## 1 train/test split 28.7 1 24 Preprocessor1_Model1
## 2 train/test split 33.0 5 36.2 Preprocessor1_Model1
## 3 train/test split 20.5 7 22.9 Preprocessor1_Model1
## 4 train/test split 18.9 8 27.1 Preprocessor1_Model1
## 5 train/test split 17.7 9 16.5 Preprocessor1_Model1
## 6 train/test split 19.3 11 15 Preprocessor1_Model1
## 7 train/test split 16.4 23 15.2 Preprocessor1_Model1
## 8 train/test split 17.1 25 15.6 Preprocessor1_Model1
## 9 train/test split 17.6 27 16.6 Preprocessor1_Model1
## 10 train/test split 18.7 32 14.5 Preprocessor1_Model1
## # … with 117 more rows
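Let's plot the predictions against the observed medv values; for a well-fitted model, the points should sit close to the dashed identity line.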
collect_predictions(final_rs) %>%
ggplot(aes(medv, .pred)) +
geom_abline(lty = 2, color = "gray50") +
geom_point(alpha = 0.5, color = "midnightblue") +
coord_fixed()
Let's plot the MODEL's most important variables using the vip package. More information about this package at https://koalaverse.github.io/vip/
library(vip)
final_model %>%
set_engine("ranger", importance = "permutation") %>%
fit(medv ~ .,
data = juice(recipe_rf)) %>%
vip(geom = "point")
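Note that ranger only computes variable importance if it is requested at fit time, which is why the engine is re-set with importance = "permutation" before refitting on the preprocessed training data. Permutation importance measures how much the model's error grows when a predictor's values are randomly shuffled.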