# If needed, uncomment and run this line once:
# %pip install statsmodels
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.base import BaseEstimator
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import RFECV
from sklearn.feature_selection import mutual_info_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.inspection import permutation_importance
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

Practical 8: Feature Selection
This week we will focus on feature selection for supervised machine learning models. We will use the California housing dataset from sklearn and apply selected feature selection methods to identify the most relevant features for the model before or after training a linear model or random forest model.
Learning Outcomes
- Understand how to apply VIF and Lasso regression for feature selection of linear models.
- Understand how to apply mutual information (MI) and permutation importance (PI) for feature selection of supervised learning models.
- Understand RFECV for recursive feature elimination with cross-validation for selecting features.
Starting the Practical
The process for this week is similar to that of previous weeks: download the notebook to your DSSS folder (or wherever you keep your course materials), switch over to JupyterLab (running in Podman/Docker), and work through each section.
If you want to save the completed notebook to your Github repo, remember to add, commit, and push your work.
Suggestions for a Better Learning Experience:
- Keep software language set to English for easier debugging and searching.
- Back up your work using cloud storage and/or Git.
- Avoid whitespace in filenames and dataset column names.
Load data and prepare feature dataframes
In this practical, we use the Ames Housing dataset, which contains information about residential properties in Ames, Iowa. It consists of 2930 samples and 80 variables. The target variable is house sale price, so this is a regression task. The features are a mixture of structural housing attributes, location-related descriptors, quality/condition indicators, and sale metadata. These features are a mix of numeric and categorical types and contain some missing values, which is common in real-world datasets. There is a lot of noise and redundancy (e.g., ‘GarageArea’ vs ‘GarageCars’, or ‘TotalBsmtSF’ vs ‘1stFlrSF’).
# Retrieve the Ames housing data ("house_prices") from OpenML as pandas objects.
housing = fetch_openml(name="house_prices", version=1, as_frame=True)
X_raw = housing.data.copy()
y = housing.target.astype(float).copy()
# Quick profile of the raw predictor table: size, target name, dtype mix.
print(f"Dataset shape: {X_raw.shape[0]} rows × {X_raw.shape[1]} columns")
print(f"Target: {housing.target_names[0]}")
print("Feature type counts:")
print(X_raw.dtypes.value_counts())
print(f"Missing values in predictors: {int(X_raw.isna().sum().sum())}")
Dataset shape: 1460 rows × 80 columns
Target: SalePrice
Feature type counts:
object 43
int64 34
float64 3
Name: count, dtype: int64
Missing values in predictors: 6965
# One 80/20 hold-out split, shared by every selection method below so that
# results are comparable.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_raw, y, test_size=0.2, random_state=42
)
# Partition the predictors by dtype for the ColumnTransformer.
num_cols = list(X_train_raw.select_dtypes(include=[np.number]).columns)
numeric_set = set(num_cols)
cat_cols = [c for c in X_train_raw.columns if c not in numeric_set]
print(f"Numeric columns: {len(num_cols)}")
print(f"Categorical columns: {len(cat_cols)}")
Numeric columns: 37
Categorical columns: 43
We will use a sklearn Pipeline to preprocess the data, which includes missing value imputation for numeric and categorical features, and one-hot encoding for categorical features. For linear models, we also standardise numeric features to have mean=0 and std=1, which is important for VIF and Lasso. For tree-based models, data standardisation is not necessary, but we will keep the same preprocessing for simplicity.
# Shared preprocessing for the linear models and the random forest:
# numeric -> median-impute + standardise; categorical -> mode-impute + one-hot.
numeric_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler())
])
categorical_pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])
preprocessor = ColumnTransformer([
    ("num", numeric_pipe, num_cols),
    ("cat", categorical_pipe, cat_cols)
])


def _as_dense_frame(matrix, columns):
    # The one-hot step may return a scipy sparse matrix; densify if needed so
    # downstream code can work with a plain, column-labelled DataFrame.
    dense = matrix.toarray() if hasattr(matrix, "toarray") else matrix
    return pd.DataFrame(dense, columns=columns)


# Fit on train only, then apply the fitted transforms to both splits.
X_train_prepared = preprocessor.fit_transform(X_train_raw)
X_test_prepared = preprocessor.transform(X_test_raw)
feature_names = preprocessor.get_feature_names_out()
X_train_prepared = _as_dense_frame(X_train_prepared, feature_names)
X_test_prepared = _as_dense_frame(X_test_prepared, feature_names)
print("Prepared train shape:", X_train_prepared.shape)
print("Prepared test shape:", X_test_prepared.shape)
Prepared train shape: (1168, 287)
Prepared test shape: (292, 287)
VIF for linear regression
VIF addresses multicollinearity among features. We compute VIF for each feature and drop features with VIF above a threshold (e.g. 10). We repeat this process until all remaining features have VIF below the threshold.
Note that the VIF code below takes several minutes, as there are 287 features in the processed data.
# drop columns with VIF until all VIFs are below the threshold
# This function is adjusted from: https://stackoverflow.com/a/51329496/4667568
def drop_column_using_vif_(df, thresh=10.0):
    '''
    Iteratively compute the VIF of every feature and drop the single feature
    with the highest VIF, until all remaining VIFs are at or below `thresh`.
    A constant column must be added before calling variance_inflation_factor,
    otherwise the results are incorrect.
    Adapted from: https://stackoverflow.com/a/51329496/4667568
    :param df: pandas dataframe containing only the predictor features, not the response variable
    :param thresh: (default 10.0) the threshold VIF value. Variables whose VIF
                   exceeds this are removed one at a time
    :return: dataframe with multicollinear features removed
    '''
    while True:
        # has_constant="add" guarantees a 'const' column even if df already
        # contains a constant feature; plain add_constant would silently skip
        # adding it and the drop('const') below would then raise a KeyError.
        df_with_const = add_constant(df, has_constant="add")
        vif = pd.Series(
            [variance_inflation_factor(df_with_const.values, i)
             for i in range(df_with_const.shape[1])],
            index=df_with_const.columns,
            name="VIF",
        ).drop('const')
        if vif.max() > thresh:
            # Drop exactly one variable per iteration — the first one attaining
            # the maximum VIF — to keep as many variables as possible.
            index_to_drop = vif.idxmax()
            print('Dropping: {}'.format(index_to_drop))
            df = df.drop(columns=index_to_drop)
        else:
            # No VIF is above the threshold; we are done.
            break
    return df
# use all prepared features for VIF
# Start from every prepared feature, then drop constant (zero-variance)
# columns, whose VIF is undefined.
X_vif_train = pd.DataFrame(X_train_prepared, columns=feature_names)
X_vif_train = X_vif_train.loc[:, X_vif_train.nunique() > 1]

vif_threshold = 10.0
# Iteratively remove high-VIF columns with the helper defined above.
X_vif_selected = drop_column_using_vif_(X_vif_train.copy(), thresh=vif_threshold)
vif_selected_features = X_vif_selected.columns.tolist()

# Recompute the VIF table for the surviving features.
with_const = add_constant(X_vif_selected, has_constant="add")
vif_values = [
    variance_inflation_factor(with_const.values, i)
    for i in range(with_const.shape[1])
]
vif_table = (
    pd.Series(vif_values, index=with_const.columns, name="VIF")
    .drop("const")
    .sort_values(ascending=False)
    .reset_index()
)
vif_table.columns = ["feature", "VIF"]
print("VIF threshold:", vif_threshold)
print("Number of VIF-selected features:", len(vif_selected_features))
display(vif_table)
Dropping: cat__Utilities_NoSeWa
Dropping: cat__Neighborhood_Blueste
Dropping: cat__Condition1_RRNe
Dropping: cat__Condition2_PosA
Dropping: cat__RoofStyle_Shed
Dropping: cat__RoofMatl_ClyTile
Dropping: cat__Exterior1st_AsphShn
Dropping: cat__Exterior1st_CBlock
Dropping: cat__Exterior2nd_CBlock
Dropping: cat__ExterCond_Po
Dropping: cat__BsmtCond_Po
Dropping: cat__Heating_Floor
Dropping: cat__HeatingQC_Po
Dropping: cat__Functional_Sev
Dropping: cat__PoolQC_Fa
Dropping: cat__MiscFeature_Gar2
Dropping: cat__SaleType_Con
Dropping: cat__LotConfig_FR3
Dropping: cat__SaleCondition_AdjLand
Dropping: cat__MSZoning_'C (all)'
Dropping: cat__Foundation_Wood
Dropping: cat__HouseStyle_2.5Fin
Dropping: cat__Alley_Pave
Dropping: cat__Street_Grvl
Dropping: num__BsmtFinSF1
Dropping: num__1stFlrSF
Dropping: cat__LotShape_IR1
Dropping: cat__LandContour_Bnk
Dropping: cat__LandSlope_Gtl
Dropping: cat__BldgType_1Fam
Dropping: cat__MasVnrType_BrkCmn
Dropping: cat__ExterQual_Ex
Dropping: cat__BsmtQual_Ex
Dropping: cat__BsmtExposure_Av
Dropping: cat__BsmtFinType1_ALQ
Dropping: cat__BsmtFinType2_ALQ
Dropping: cat__CentralAir_N
Dropping: cat__Electrical_FuseA
Dropping: cat__KitchenQual_Ex
Dropping: cat__FireplaceQu_Ex
Dropping: cat__GarageType_2Types
Dropping: cat__GarageFinish_Fin
Dropping: cat__GarageQual_Ex
Dropping: cat__GarageCond_Ex
Dropping: cat__PavedDrive_N
Dropping: cat__Fence_GdPrv
Dropping: cat__Exterior2nd_VinylSd
Dropping: cat__RoofStyle_Gable
Dropping: cat__HeatingQC_Ex
Dropping: cat__Exterior1st_VinylSd
Dropping: cat__BsmtCond_TA
Dropping: cat__ExterCond_TA
Dropping: cat__Neighborhood_NAmes
Dropping: cat__SaleType_New
Dropping: cat__HouseStyle_1Story
Dropping: cat__Condition1_Norm
Dropping: cat__Functional_Typ
Dropping: cat__Foundation_CBlock
Dropping: cat__LotConfig_Inside
Dropping: cat__GarageCond_TA
Dropping: cat__RoofMatl_CompShg
Dropping: cat__MSZoning_RL
Dropping: cat__SaleCondition_Partial
Dropping: cat__GarageType_Attchd
Dropping: num__MSSubClass
Dropping: cat__MiscFeature_Shed
Dropping: cat__Exterior2nd_MetalSd
Dropping: cat__SaleCondition_Normal
Dropping: cat__Heating_GasA
Dropping: cat__MasVnrType_None
Dropping: cat__Condition2_Norm
Dropping: cat__GarageQual_TA
Dropping: cat__Exterior1st_CemntBd
Dropping: num__GrLivArea
Dropping: cat__ExterQual_TA
Dropping: cat__RoofStyle_Flat
Dropping: cat__FireplaceQu_Gd
Dropping: num__YearBuilt
Dropping: cat__Exterior1st_HdBoard
Dropping: cat__BsmtFinType2_Unf
Dropping: num__2ndFlrSF
Dropping: cat__BsmtQual_TA
Dropping: cat__KitchenQual_TA
VIF threshold: 10.0
Number of VIF-selected features: 204
| feature | VIF | |
|---|---|---|
| 0 | cat__PoolQC_Ex | 8.917055 |
| 1 | cat__Exterior2nd_'Wd Sdng' | 8.163297 |
| 2 | cat__Exterior1st_'Wd Sdng' | 7.721266 |
| 3 | cat__Neighborhood_Somerst | 7.488074 |
| 4 | num__TotalBsmtSF | 7.356459 |
| ... | ... | ... |
| 199 | cat__SaleCondition_Family | 1.189712 |
| 200 | num__3SsnPorch | 1.184405 |
| 201 | cat__SaleType_Oth | 1.182672 |
| 202 | cat__Condition2_RRAn | 1.171539 |
| 203 | cat__Exterior2nd_Other | 1.130220 |
204 rows × 2 columns
Lasso for linear regression
Lasso performs embedded feature selection via coefficient shrinkage. We fit LassoCV and keep features with non-zero coefficients.
# Fit LassoCV (5-fold CV) on the standardised design matrix; the regularisation
# strength alpha is chosen automatically by cross-validation.
lasso = LassoCV(cv=5, random_state=42, max_iter=20000)
lasso.fit(X_train_prepared, y_train)

# Features whose coefficient survived the L1 shrinkage (non-zero coefficient).
lasso_mask = ~np.isclose(lasso.coef_, 0.0)
lasso_selected_features = feature_names[lasso_mask].tolist()

# Build the coefficient table once, keep only the selected rows, and sort by
# coefficient magnitude (the previous version built the full table twice).
lasso_selected_coef_df = pd.DataFrame({
    "feature": feature_names,
    "coefficient": lasso.coef_,
    "abs_coefficient": np.abs(lasso.coef_)
})
lasso_selected_coef_df = lasso_selected_coef_df[lasso_mask]
lasso_selected_coef_df = lasso_selected_coef_df.sort_values("abs_coefficient", ascending=False)
print("Lasso alpha:", lasso.alpha_)
print("Number of Lasso-selected features:", len(lasso_selected_features))
display(lasso_selected_coef_df)
Lasso alpha: 198.662927166851
Number of Lasso-selected features: 82
| feature | coefficient | abs_coefficient | |
|---|---|---|---|
| 125 | cat__RoofMatl_ClyTile | -192170.788421 | 192170.788421 |
| 102 | cat__Condition2_PosN | -41431.018711 | 41431.018711 |
| 79 | cat__Neighborhood_NoRidge | 34667.240738 | 34667.240738 |
| 86 | cat__Neighborhood_StoneBr | 28929.234155 | 28929.234155 |
| 223 | cat__KitchenQual_Ex | 26958.306076 | 26958.306076 |
| ... | ... | ... | ... |
| 18 | num__BsmtHalfBath | -259.569426 | 259.569426 |
| 78 | cat__Neighborhood_NWAmes | -203.376518 | 203.376518 |
| 15 | num__LowQualFinSF | -152.663603 | 152.663603 |
| 34 | num__MiscVal | -89.229028 | 89.229028 |
| 10 | num__BsmtFinSF2 | -75.354887 | 75.354887 |
82 rows × 3 columns
To compare the features selected by VIF and Lasso, we will summarise the number of features selected by each method and the overlap between them. Note that VIF and Lasso optimise different goals (collinearity control vs linear sparsity), therefore we do not expect them to select the same features.
# Side-by-side count of features kept by each linear-model method, plus the
# size of their intersection.
vif_set = set(vif_selected_features)
lasso_set = set(lasso_selected_features)
feature_counts = [
    len(vif_set),
    len(lasso_set),
    len(vif_set & lasso_set),
]
summary_vif_lasso = pd.DataFrame({
    "method": ["VIF", "Lasso (non-zero coef)", "Overlap"],
    "n_features": feature_counts,
})
display(summary_vif_lasso)
| method | n_features | |
|---|---|---|
| 0 | VIF | 204 |
| 1 | Lasso (non-zero coef) | 82 |
| 2 | Overlap | 59 |
Mutual information (MI) for random forest features (20 features)
MI measures non-linear dependency between each predictor and target. We compute MI on the prepared matrix and then keep the top 20 features with the highest MI scores.
# Mutual information between each prepared feature and the target — captures
# non-linear dependence, unlike a plain correlation.
mi_scores = mutual_info_regression(X_train_prepared, y_train, random_state=42)
mi_df = (
    pd.DataFrame({"feature": feature_names, "mi_score": mi_scores})
    .sort_values("mi_score", ascending=False)
)
# Keep the 20 highest-scoring features.
mi_selected_features = mi_df["feature"].head(20).tolist()
display(mi_df.head(20))
| feature | mi_score | |
|---|---|---|
| 4 | num__OverallQual | 0.520684 |
| 16 | num__GrLivArea | 0.452883 |
| 26 | num__GarageCars | 0.361264 |
| 27 | num__GarageArea | 0.359418 |
| 12 | num__TotalBsmtSF | 0.346107 |
| 6 | num__YearBuilt | 0.334873 |
| 13 | num__1stFlrSF | 0.294918 |
| 1 | num__MSSubClass | 0.290739 |
| 25 | num__GarageYrBlt | 0.273491 |
| 19 | num__FullBath | 0.258317 |
| 170 | cat__ExterQual_TA | 0.242955 |
| 7 | num__YearRemodAdd | 0.237659 |
| 247 | cat__GarageFinish_Unf | 0.217089 |
| 169 | cat__ExterQual_Gd | 0.211553 |
| 14 | num__2ndFlrSF | 0.207762 |
| 226 | cat__KitchenQual_TA | 0.207178 |
| 185 | cat__BsmtQual_TA | 0.199414 |
| 23 | num__TotRmsAbvGrd | 0.182135 |
| 178 | cat__Foundation_PConc | 0.177595 |
| 3 | num__LotArea | 0.176679 |
Permutation importance (PI) for selecting top 20 features in random forest
PI measures the drop in model performance when one feature is shuffled. We fit a random forest on all prepared features, compute permutation importance, and keep the top 20 features.
# Fit a random forest on all prepared features; permutation importance is then
# measured on the held-out test split.
rf_full = RandomForestRegressor(
    n_estimators=300,
    random_state=42,
    n_jobs=-1
)
rf_full.fit(X_train_prepared, y_train)

# For each feature: mean and std of the R^2 drop across 5 shuffles.
perm = permutation_importance(
    rf_full,
    X_test_prepared,
    y_test,
    scoring="r2",
    n_repeats=5,
    random_state=42,
    n_jobs=-1
)
pi_df = (
    pd.DataFrame({
        "feature": feature_names,
        "pi_mean": perm.importances_mean,
        "pi_std": perm.importances_std,
    })
    .sort_values("pi_mean", ascending=False)
)
# Keep the 20 features whose shuffling hurts the model the most.
pi_selected_features = pi_df["feature"].head(20).tolist()
print("Number of PI-selected features:", len(pi_selected_features))
display(pi_df.head(20))
Number of PI-selected features: 20
| feature | pi_mean | pi_std | |
|---|---|---|---|
| 4 | num__OverallQual | 0.487505 | 0.022181 |
| 16 | num__GrLivArea | 0.119808 | 0.010768 |
| 14 | num__2ndFlrSF | 0.031823 | 0.003460 |
| 12 | num__TotalBsmtSF | 0.024634 | 0.003295 |
| 13 | num__1stFlrSF | 0.017895 | 0.003705 |
| 26 | num__GarageCars | 0.016992 | 0.001407 |
| 9 | num__BsmtFinSF1 | 0.015890 | 0.003530 |
| 3 | num__LotArea | 0.012041 | 0.001757 |
| 27 | num__GarageArea | 0.006000 | 0.001709 |
| 6 | num__YearBuilt | 0.005485 | 0.000995 |
| 7 | num__YearRemodAdd | 0.003606 | 0.000747 |
| 182 | cat__BsmtQual_Ex | 0.003436 | 0.000430 |
| 23 | num__TotRmsAbvGrd | 0.002732 | 0.000641 |
| 19 | num__FullBath | 0.002723 | 0.000410 |
| 5 | num__OverallCond | 0.002394 | 0.000294 |
| 25 | num__GarageYrBlt | 0.002003 | 0.000359 |
| 24 | num__Fireplaces | 0.001416 | 0.000376 |
| 225 | cat__KitchenQual_Gd | 0.001320 | 0.000178 |
| 247 | cat__GarageFinish_Unf | 0.001256 | 0.000297 |
| 244 | cat__GarageType_Detchd | 0.001013 | 0.000599 |
RFECV
RFECV recursively removes weak features with cross-validation. To keep exactly 20 features, we use the final feature ranking and keep the top 20 ranked variables.
# Recursive feature elimination with 5-fold CV: remove 20 features per step,
# never going below 20 features.
rfecv = RFECV(
    estimator=RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1),
    step=20,
    cv=5,
    scoring="r2",
    min_features_to_select=20,
    n_jobs=-1
)
rfecv.fit(X_train_prepared, y_train)

# ranking_ == 1 marks the features RFECV would keep; sorting by rank and taking
# the first 20 yields a fixed-size subset comparable with MI/PI above.
rfecv_df = (
    pd.DataFrame({"feature": feature_names, "ranking": rfecv.ranking_})
    .sort_values("ranking", ascending=True)
)
rfecv_selected_features = rfecv_df["feature"].head(20).tolist()
print("RFECV optimal n_features_:", rfecv.n_features_)
print("Number of RFECV-selected features used here:", len(rfecv_selected_features))
display(rfecv_df.head(20))
RFECV optimal n_features_: 107
Number of RFECV-selected features used here: 20
| feature | ranking | |
|---|---|---|
| 0 | num__Id | 1 |
| 1 | num__MSSubClass | 1 |
| 2 | num__LotFrontage | 1 |
| 3 | num__LotArea | 1 |
| 4 | num__OverallQual | 1 |
| 5 | num__OverallCond | 1 |
| 6 | num__YearBuilt | 1 |
| 7 | num__YearRemodAdd | 1 |
| 8 | num__MasVnrArea | 1 |
| 9 | num__BsmtFinSF1 | 1 |
| 10 | num__BsmtFinSF2 | 1 |
| 11 | num__BsmtUnfSF | 1 |
| 12 | num__TotalBsmtSF | 1 |
| 13 | num__1stFlrSF | 1 |
| 14 | num__2ndFlrSF | 1 |
| 32 | num__ScreenPorch | 1 |
| 33 | num__PoolArea | 1 |
| 35 | num__MoSold | 1 |
| 36 | num__YrSold | 1 |
| 40 | cat__MSZoning_RL | 1 |
Model performance: all features vs selected features
For MI/PI/RFECV, we now compare random forest performance using all prepared features versus each selected 20-feature subset.
def evaluate_rf_subset(selected_features, label):
    """Fit a fresh random forest on one feature subset and report R^2 scores.

    Uses the module-level prepared train/test splits (X_train_prepared,
    X_test_prepared, y_train, y_test). Returns a dict with the subset label,
    its size, and training/testing R^2.
    """
    rf = RandomForestRegressor(
        n_estimators=300,
        random_state=42,
        n_jobs=-1
    )
    # Restrict both splits to the chosen columns once, up front.
    X_tr = X_train_prepared[selected_features]
    X_te = X_test_prepared[selected_features]
    rf.fit(X_tr, y_train)
    return {
        "feature_set": label,
        "n_features": len(selected_features),
        "Training_R2": r2_score(y_train, rf.predict(X_tr)),
        "Testing_R2": r2_score(y_test, rf.predict(X_te)),
    }
# Baseline (all prepared features) versus each 20-feature subset.
subset_specs = [
    (feature_names, "All features"),
    (mi_selected_features, "MI selected (20)"),
    (pi_selected_features, "PI selected (20)"),
    (rfecv_selected_features, "RFECV selected (20)"),
]
performance_rows = [evaluate_rf_subset(feats, label) for feats, label in subset_specs]
performance_df = pd.DataFrame(performance_rows).sort_values("Training_R2", ascending=False)
display(performance_df)
| feature_set | n_features | Training_R2 | Testing_R2 | |
|---|---|---|---|---|
| 2 | PI selected (20) | 20 | 0.979675 | 0.898113 |
| 0 | All features | 287 | 0.979553 | 0.894106 |
| 1 | MI selected (20) | 20 | 0.977418 | 0.888593 |
| 3 | RFECV selected (20) | 20 | 0.976143 | 0.877697 |
Can you see the trade-off between performance and model complexity, when using all features vs a smaller subset of features?
Summary
In this practical, we used Ames Housing data to compare feature selection for linear and tree-based models:
- VIF reduced multicollinearity among numeric predictors.
- Lasso selected features by coefficient shrinkage for linear regression.
- MI, PI, and RFECV selected 20 features each for random forest modelling.
- We compared random forest performance using all features and each selected subset.