Note
Go to the end to download the full example code
HistGradientBoosting Optimization
Hyperparameter optimization for sklearn’s fast HistGradientBoosting algorithms.
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score
import numpy as np
import plotly
from mloptimizer.interfaces import HyperparameterSpaceBuilder, GeneticSearch
from mloptimizer.application.reporting.plots import plotly_search_space, plotly_logbook
Load and prepare the dataset
# Announce the step, then fetch the bundled Breast Cancer dataset.
print("Loading Breast Cancer dataset...")
data = load_breast_cancer()

# Pull the feature matrix and the target vector out of the loaded object.
X = data.data
y = data.target
print(f"Dataset shape: {X.shape}")
Loading Breast Cancer dataset...
Dataset shape: (569, 30)
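Split the data
# Hold out part of the samples for the final evaluation of the tuned model.
# NOTE(review): this snippet was missing from the page; test_size and
# random_state are assumed -- confirm against the downloadable example.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)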
Define the hyperparameter space
# Ask the builder for the default search space of the estimator being tuned.
hyperparam_space = HyperparameterSpaceBuilder.get_default_space(
    estimator_class=HistGradientBoostingClassifier,
)
Configure and run the genetic optimization
# Genetic-algorithm settings, forwarded to GeneticSearch as keyword arguments.
genetic_params = dict(
    generations=5,
    population_size=8,
    n_elites=2,
    seed=42,
    use_mlflow=False,
    use_parallel=False,
)

# Build the optimizer for the classifier over the space defined above,
# using accuracy as the scoring metric.
opt = GeneticSearch(
    estimator_class=HistGradientBoostingClassifier,
    hyperparam_space=hyperparam_space,
    cv=3,
    scoring='accuracy',
    **genetic_params,
)

# Run the search on the training split only.
print("Starting HistGradientBoostingClassifier optimization...")
opt.fit(X_train, y_train)
Starting HistGradientBoostingClassifier optimization...
Genetic execution: 0%| | 0/6 [00:00<?, ?it/s, best fitness=?]
Genetic execution: 17%|█▋ | 1/6 [00:00<00:01, 4.36it/s, best fitness=0.969]
Genetic execution: 17%|█▋ | 1/6 [00:00<00:04, 1.16it/s, best fitness=0.971]
Genetic execution: 33%|███▎ | 2/6 [00:01<00:03, 1.23it/s, best fitness=0.971]
Genetic execution: 50%|█████ | 3/6 [00:03<00:03, 1.22s/it, best fitness=0.971]
Genetic execution: 50%|█████ | 3/6 [00:04<00:03, 1.22s/it, best fitness=0.974]
Genetic execution: 67%|██████▋ | 4/6 [00:04<00:02, 1.27s/it, best fitness=0.974]
Genetic execution: 83%|████████▎ | 5/6 [00:06<00:01, 1.37s/it, best fitness=0.974]
Genetic execution: 83%|████████▎ | 5/6 [00:07<00:01, 1.37s/it, best fitness=0.976]
Genetic execution: 100%|██████████| 6/6 [00:07<00:00, 1.40s/it, best fitness=0.976]
Genetic execution: 100%|██████████| 6/6 [00:09<00:00, 1.53s/it, best fitness=0.976]
Evaluate the optimized model
# Score the best estimator found by the search on the held-out test split.
best_clf = opt.best_estimator_
y_pred = best_clf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_f1 = f1_score(y_test, y_pred, average='binary')

# Plain string literal: the message has no placeholders, so it needs no
# f-prefix. The printed output is unchanged.
print("\nOptimization completed!")
print(f"Best parameters: {opt.best_params_}")
print(f"Test accuracy: {test_accuracy:.4f}")
print(f"Test F1: {test_f1:.4f}")
Optimization completed!
Best parameters: {'categorical_features': 'warn', 'class_weight': None, 'early_stopping': 'auto', 'interaction_cst': None, 'l2_regularization': 0.9, 'learning_rate': 0.09, 'loss': 'log_loss', 'max_bins': 255, 'max_depth': 2, 'max_features': 1.0, 'max_iter': 264, 'max_leaf_nodes': 48, 'min_samples_leaf': 50, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': 42, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
Test accuracy: 0.9561
Test F1: 0.9660
Visualize the search space
# Every individual recorded by the search, as exposed by the optimizer.
population_df = opt.populations_

# Restrict the plot to a handful of hyperparameters plus the fitness column.
top_params = [
    'learning_rate',
    'max_depth',
    'max_iter',
    'max_leaf_nodes',
    'fitness',
]
df_filtered = population_df[top_params]

# Build the figure, then let it size itself horizontally at a fixed height.
g_search_space = plotly_search_space(df_filtered, top_params)
g_search_space.update_layout(
    title="HistGradientBoostingClassifier Hyperparameter Search Space",
    autosize=True,
    width=None,
    height=650,
)
plotly.io.show(g_search_space, config={'responsive': True})
Visualize the optimization evolution
# Plot the optimizer's logbook together with the recorded populations.
g_logbook = plotly_logbook(opt.logbook_, population_df)

# Same responsive layout as the search-space figure, with a shorter height.
g_logbook.update_layout(
    title="HistGradientBoostingClassifier Optimization Evolution",
    autosize=True,
    width=None,
    height=500,
)
plotly.io.show(g_logbook, config={'responsive': True})
Analyze optimization performance
# Report how many evaluations the search ran and how long they took.
print("\n=== Optimization Performance ===")
n_evaluations = opt.n_trials_
print(f"Unique evaluations performed: {n_evaluations}")
print(f"Total individuals in population history: {len(population_df)}")
elapsed_seconds = opt.optimization_time_
print(f"Optimization time: {elapsed_seconds:.4f} seconds")
# Average wall-clock cost of a single evaluation.
print(f"Time per evaluation: {elapsed_seconds / n_evaluations:.4f} seconds")
=== Optimization Performance ===
Unique evaluations performed: 38
Total individuals in population history: 48
Optimization time: 9.2006 seconds
Time per evaluation: 0.2421 seconds
Total running time of the script: (0 minutes 10.543 seconds)