# Coding-For-MBA

Optimisation is the bridge between baseline models and production-grade performance. Day 53 introduces reproducible workflows for:

- exhaustive hyperparameter tuning with `GridSearchCV`
- Bayesian optimisation with scikit-optimize's `BayesSearchCV`
- feature importance diagnostics via permutation importance
- feature selection with recursive feature elimination (RFE)

Execute `python Day_53_Model_Tuning_and_Feature_Selection/solutions.py` to see both search strategies in action alongside feature importance diagnostics.
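
If you would rather explore the utilities interactively, here is a minimal sketch (assuming the repository root is on your `PYTHONPATH`, so the lesson folder can be imported as a package) that runs just the grid search and prints its best configuration:

```python
from Day_53_Model_Tuning_and_Feature_Selection.solutions import (
    generate_tuning_dataset,
    run_grid_search,
)

# Build the deterministic demo dataset and tune the scaled logistic regression pipeline.
X, y = generate_tuning_dataset()
grid_result = run_grid_search(X, y)

print("Best parameters:", grid_result.best_params)
print(f"Best cross-validated ROC AUC: {grid_result.best_score:.3f}")
```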

## Interactive Notebooks

Run this lesson’s code interactively in your browser:

!!! tip "About JupyterLite"
    JupyterLite runs entirely in your browser using WebAssembly. No installation or server required! Note: the first launch may take a moment to load.

## Additional Materials

???+ example "solutions.py"
    View on GitHub

```python title="solutions.py"
"""Model tuning and feature selection utilities for Day 53."""

from __future__ import annotations

from dataclasses import dataclass
from typing import Dict, Iterable, Tuple

import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from skopt import BayesSearchCV
from skopt.space import Categorical, Real


@dataclass
class TuningResult:
    """Lightweight container for fitted search objects."""

    name: str
    search: object
    best_params: Dict[str, object]
    best_score: float


def generate_tuning_dataset(
    n_samples: int = 300,
    n_features: int = 10,
    n_informative: int = 5,
    class_sep: float = 2.0,
    random_state: int = 53,
) -> Tuple[np.ndarray, np.ndarray]:
    """Create a deterministic binary classification dataset for tuning demos."""

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=0,
        n_repeated=0,
        class_sep=class_sep,
        flip_y=0.01,
        random_state=random_state,
    )
    return X, y


def build_logistic_pipeline(random_state: int = 53) -> Pipeline:
    """Return a scaled logistic regression pipeline."""

    return make_pipeline(
        StandardScaler(),
        LogisticRegression(max_iter=2000, solver="lbfgs", random_state=random_state),
    )


def run_grid_search(
    X: np.ndarray,
    y: np.ndarray,
    param_grid: Dict[str, Iterable[float | int]] | None = None,
    cv: int = 5,
    scoring: str = "roc_auc",
    random_state: int = 53,
) -> TuningResult:
    """Execute a deterministic grid search for logistic regression hyperparameters."""

    pipeline = build_logistic_pipeline(random_state=random_state)
    if param_grid is None:
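        # The lbfgs solver supports only the l2 penalty, so the default grid just sweeps C on a log scale.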
        param_grid = {
            "logisticregression__C": [0.1, 1.0, 10.0],
            "logisticregression__penalty": ["l2"],
        }
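    # Stratified folds preserve the class balance; a fixed seed keeps the splits reproducible.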
    cv_strategy = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    grid = GridSearchCV(
        pipeline,
        param_grid=param_grid,
        cv=cv_strategy,
        scoring=scoring,
        n_jobs=-1,
    )
    grid.fit(X, y)
    return TuningResult(
        name="grid_search",
        search=grid,
        best_params=grid.best_params_,
        best_score=float(grid.best_score_),
    )


def run_bayesian_optimisation(
    X: np.ndarray,
    y: np.ndarray,
    search_spaces: Dict[str, Iterable] | None = None,
    n_iter: int = 16,
    cv: int = 5,
    scoring: str = "roc_auc",
    random_state: int = 53,
) -> TuningResult:
    """Perform Bayesian optimisation using scikit-optimize's BayesSearchCV."""

    pipeline = build_logistic_pipeline(random_state=random_state)
    if search_spaces is None:
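        # A log-uniform prior lets the optimiser explore C evenly across several orders of magnitude.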
        search_spaces = {
            "logisticregression__C": Real(1e-2, 1e2, prior="log-uniform"),
            "logisticregression__penalty": Categorical(["l2"]),
        }
    cv_strategy = StratifiedKFold(n_splits=cv, shuffle=True, random_state=random_state)
    bayes = BayesSearchCV(
        pipeline,
        search_spaces=search_spaces,
        n_iter=n_iter,
        cv=cv_strategy,
        scoring=scoring,
        random_state=random_state,
        n_jobs=-1,
        refit=True,
    )
    bayes.fit(X, y)
    return TuningResult(
        name="bayesian_search",
        search=bayes,
        best_params=bayes.best_params_,
        best_score=float(bayes.best_score_),
    )


def compute_permutation_importance(
    model,
    X: np.ndarray,
    y: np.ndarray,
    n_repeats: int = 15,
    random_state: int = 53,
) -> pd.DataFrame:
    """Return permutation importance scores as a DataFrame."""

    result = permutation_importance(
        model,
        X,
        y,
        n_repeats=n_repeats,
        random_state=random_state,
        scoring="accuracy",
    )
    df = pd.DataFrame(
        {
            "feature": [f"feature_{i}" for i in range(X.shape[1])],
            "importance_mean": result.importances_mean,
            "importance_std": result.importances_std,
        }
    ).sort_values("importance_mean", ascending=False)
    return df.reset_index(drop=True)


def run_recursive_feature_elimination(
    X: np.ndarray,
    y: np.ndarray,
    n_features_to_select: int = 5,
    random_state: int = 53,
) -> Tuple[RFE, np.ndarray]:
    """Perform recursive feature elimination with logistic regression."""

    estimator = LogisticRegression(
        max_iter=2000, solver="lbfgs", random_state=random_state
    )
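    # RFE refits the estimator and drops the weakest feature each round (default step=1).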
    selector = RFE(estimator, n_features_to_select=n_features_to_select)
    selector.fit(X, y)
    support = selector.support_
    return selector, support


def evaluate_selected_features(
    selector: RFE,
    X: np.ndarray,
    y: np.ndarray,
    cv: int = 5,
) -> float:
    """Evaluate the selected features with cross-validation accuracy."""

    X_selected = selector.transform(X)
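    # Refit a fresh logistic regression on the retained columns and average the CV accuracy.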
    model = LogisticRegression(max_iter=2000)
    scores = cross_val_score(model, X_selected, y, cv=cv, scoring="accuracy")
    return float(np.mean(scores))


def run_day53_demo() -> Dict[str, TuningResult]:
    """Execute grid search and Bayesian optimisation workflows."""

    X, y = generate_tuning_dataset()
    grid = run_grid_search(X, y)
    bayes = run_bayesian_optimisation(X, y)

    pipeline = grid.search.best_estimator_
    permutation_df = compute_permutation_importance(pipeline, X, y)
    selector, support = run_recursive_feature_elimination(X, y)
    selected_score = evaluate_selected_features(selector, X, y)

    print("Permutation importance head:\n", permutation_df.head())
    print("Selected features:", np.where(support)[0])
    print(f"Cross-validated accuracy with selected features: {selected_score:.3f}")

    return {
        "grid": grid,
        "bayes": bayes,
    }


if __name__ == "__main__":
    results = run_day53_demo()
    for name, result in results.items():
        print(f"{name}: best score = {result.best_score:.3f}")
```
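
Both search objects expose scikit-learn's `cv_results_` table, which makes it easy to compare the strategies beyond a single best score. A minimal sketch, again assuming the lesson folder is importable:

```python
import pandas as pd

from Day_53_Model_Tuning_and_Feature_Selection.solutions import run_day53_demo

results = run_day53_demo()
for name, result in results.items():
    # cv_results_ holds one row per evaluated parameter combination.
    cv_df = pd.DataFrame(result.search.cv_results_)
    top = cv_df.sort_values("mean_test_score", ascending=False).head(3)
    print(f"\n{name}: top configurations by mean ROC AUC")
    print(top[["params", "mean_test_score", "std_test_score"]].to_string(index=False))
```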