Huanfa Chen - huanfa.chen@ucl.ac.uk
13/03/2026
TunedThresholdClassifierCV from sklearn to tune threshold with CV
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import recall_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.model_selection import TunedThresholdClassifierCV
import numpy as np
# Breast-cancer data. Comparing the raw target with 0 recodes the labels so
# that, from here on, 1 = malignant and 0 = benign.
data = load_breast_cancer(as_frame=True)
X = data.data
y = data.target.eq(0).astype(int)

# Hold out 20% of the rows for testing, keeping the class ratio in both parts
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Base classifier whose decision threshold is tuned below
rf = RandomForestClassifier(n_estimators=100, random_state=42)

# Candidate thresholds (0.1 to 0.9) and the shuffled, stratified 5-fold
# scheme used to score each of them
thresholds = np.arange(0.1, 1.0, 0.1)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Pick the candidate threshold with the best cross-validated recall
tuned_rf = TunedThresholdClassifierCV(
    rf,
    scoring="recall",
    thresholds=thresholds,
    cv=cv,
).fit(X_train, y_train)

# Report the selected threshold and the recall it gives on the held-out data
optimal_thresh = tuned_rf.best_threshold_
y_pred = tuned_rf.predict(X_test)
test_recall = recall_score(y_test, y_pred, pos_label=1, zero_division=0)
print(f"Thresholds tested: {thresholds.min():.1f} to {thresholds.max():.1f}, step length = 0.1")
print(f"Optimal threshold: {optimal_thresh:.1f}")
print(f"Test recall at optimal threshold: {test_recall:.4f}")
Thresholds tested: 0.1 to 0.9, step length = 0.1
Optimal threshold: 0.1
Test recall at optimal threshold: 1.0000





pip install -U imbalanced-learn
# train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Classes used by the pipeline below. They are imported here so the snippet
# runs on its own; otherwise Pipeline, SimpleImputer and StandardScaler are
# undefined names at this point.
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# sklearn Pipeline: imputation → normalisation → classifier (no resampling)
sklearn_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),  # fill gaps with the column median
    ('scaler', StandardScaler()),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Fit every step on the training rows only, then predict the held-out rows
sklearn_pipeline.fit(X_train, y_train)
y_pred = sklearn_pipeline.predict(X_test)
print(f"Recall (malignant=1): {recall_score(y_test, y_pred, pos_label=1):.4f}")
from imblearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.under_sampling import RandomUnderSampler
# Preprocessing classes used below, imported here so the snippet is
# self-contained; they are not imported anywhere earlier in the file.
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# imblearn Pipeline: imputation → normalisation → undersampling → classifier
imb_pipeline = ImbPipeline([
    ('imputer', SimpleImputer(strategy='median')),  # fill gaps with the column median
    ('scaler', StandardScaler()),
    ('sampler', RandomUnderSampler(random_state=42)),
    ('clf', RandomForestClassifier(n_estimators=100, random_state=42)),
])

# Sampler only runs during fit, NOT at prediction time
imb_pipeline.fit(X_train, y_train)
y_pred = imb_pipeline.predict(X_test)
print(f"Recall (malignant=1): {recall_score(y_test, y_pred, pos_label=1):.4f}")
BalancedBaggingClassifier (sklearn API compatible)
© CASA | ucl.ac.uk/bartlett/casa