# Coding-For-MBA

Day 52 highlights how bagging, boosting, and stacking can deliver better accuracy than a single estimator. Use the notebook or the `solutions.py` helpers to:

- Execute `python Day_52_Ensemble_Methods/solutions.py` to print validation scores for each ensemble configuration, or import the helpers directly as sketched below.
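
If you prefer to drive the helpers from your own script or notebook, a minimal sketch (assuming the repository root is on your Python path) looks like this:

```python
# Hypothetical driver script; it mirrors the __main__ block of solutions.py.
from Day_52_Ensemble_Methods.solutions import run_day52_demo

results = run_day52_demo()
for name, result in results.items():
    print(f"{name}: validation score = {result.score:.3f}")
```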

## Interactive Notebooks

Run this lesson’s code interactively in your browser:

!!! tip "About JupyterLite"
    JupyterLite runs entirely in your browser using WebAssembly. No installation or server required! Note: First launch may take a moment to load.

## Additional Materials

???+ example "solutions.py"
    View on GitHub

```python title="solutions.py"
"""Reusable ensemble helpers for Day 52."""

from __future__ import annotations

from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Sequence, Tuple

import numpy as np
import pandas as pd
from sklearn.calibration import CalibratedClassifierCV
from sklearn.datasets import make_classification
from sklearn.ensemble import (
    GradientBoostingClassifier,
    RandomForestClassifier,
    StackingClassifier,
)
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, brier_score_loss
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler


@dataclass
class EnsembleResult:
    """Summary of an ensemble model and its validation score."""

    name: str
    model: object
    score: float


def generate_classification_data(
    n_samples: int = 400,
    n_features: int = 12,
    n_informative: int = 6,
    class_sep: float = 1.8,
    random_state: int = 52,
) -> Tuple[np.ndarray, np.ndarray]:
    """Return a deterministic classification dataset suitable for ensembles."""

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=n_informative,
        n_redundant=0,
        class_sep=class_sep,
        random_state=random_state,
    )
    return X, y


def train_random_forest(
    X: np.ndarray,
    y: np.ndarray,
    n_estimators: int = 200,
    max_depth: int | None = None,
    random_state: int = 52,
) -> RandomForestClassifier:
    """Fit a random forest classifier with out-of-bag scoring enabled."""

    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        oob_score=True,
        random_state=random_state,
        bootstrap=True,
        n_jobs=-1,
    )
    model.fit(X, y)
    return model


def train_gradient_boosting(
    X: np.ndarray,
    y: np.ndarray,
    learning_rate: float = 0.05,
    n_estimators: int = 300,
    max_depth: int = 2,
    random_state: int = 52,
) -> GradientBoostingClassifier:
    """Train a gradient boosting classifier with mild regularisation."""

    model = GradientBoostingClassifier(
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X, y)
    return model


def build_stacking_classifier(
    estimators: List[Tuple[str, Pipeline]] | None = None,
    random_state: int = 52,
) -> StackingClassifier:
    """Create a stacking classifier with logistic regression as the final estimator."""

    if estimators is None:
        estimators = [
            (
                "rf",
                make_pipeline(
                    StandardScaler(with_mean=False),
                    RandomForestClassifier(
                        n_estimators=150,
                        max_depth=None,
                        random_state=random_state,
                        n_jobs=-1,
                    ),
                ),
            ),
            (
                "gb",
                make_pipeline(
                    StandardScaler(),
                    GradientBoostingClassifier(
                        learning_rate=0.05,
                        n_estimators=200,
                        max_depth=2,
                        random_state=random_state,
                    ),
                ),
            ),
        ]
    final_estimator = LogisticRegression(max_iter=1000, random_state=random_state)
    stacking = StackingClassifier(
        estimators=estimators,
        final_estimator=final_estimator,
        passthrough=False,
        stack_method="predict_proba",
        n_jobs=-1,
    )
    return stacking


def calibrate_classifier(
    model,
    X_train: np.ndarray,
    y_train: np.ndarray,
    method: str = "isotonic",
    cv: int = 3,
) -> CalibratedClassifierCV:
    """Wrap a fitted classifier with probability calibration."""

    calibrated = CalibratedClassifierCV(estimator=model, method=method, cv=cv)
    calibrated.fit(X_train, y_train)
    return calibrated


def evaluate_classifier(
    model, X_test: np.ndarray, y_test: np.ndarray
) -> Dict[str, float]:
    """Return accuracy and Brier score for the provided classifier."""

    probs = model.predict_proba(X_test)[:, 1]
    preds = (probs >= 0.5).astype(int)
    return {
        "accuracy": float(accuracy_score(y_test, preds)),
        "brier": float(brier_score_loss(y_test, probs)),
    }


def export_feature_importance(
    model: RandomForestClassifier,
    feature_names: Sequence[str],
    output_path: str | Path | None = None,
) -> pd.DataFrame:
    """Return and optionally persist feature importances as a DataFrame."""

    importances = pd.DataFrame(
        {
            "feature": list(feature_names),
            "importance": model.feature_importances_,
        }
    ).sort_values("importance", ascending=False)
    if output_path is not None:
        output_path = Path(output_path)
        importances.to_csv(output_path, index=False)
    return importances.reset_index(drop=True)


def evaluate_with_cross_validation(
    model,
    X: np.ndarray,
    y: np.ndarray,
    cv: int = 5,
    scoring: str = "roc_auc",
) -> float:
    """Return the mean cross-validated score for a classifier."""

    scores = cross_val_score(model, X, y, cv=cv, scoring=scoring)
    return float(np.mean(scores))


def run_day52_demo() -> Dict[str, EnsembleResult]:
    """Train and evaluate the featured ensemble models."""

    X, y = generate_classification_data()
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=52, stratify=y
    )

    rf = train_random_forest(X_train, y_train)
    gb = train_gradient_boosting(X_train, y_train)

    stacking = build_stacking_classifier()
    stacking.fit(X_train, y_train)
    calibrated = calibrate_classifier(stacking, X_train, y_train)
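
    # Each entry reports a different validation signal: OOB accuracy for the forest, mean
    # 5-fold ROC AUC for the boosted model, and held-out test accuracy for the calibrated
    # stack, so compare configurations within a family rather than across families.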

    results = {
        "random_forest": EnsembleResult(
            name="random_forest",
            model=rf,
            score=float(rf.oob_score_),
        ),
        "gradient_boosting": EnsembleResult(
            name="gradient_boosting",
            model=gb,
            score=float(evaluate_with_cross_validation(gb, X, y)),
        ),
        "stacking_calibrated": EnsembleResult(
            name="stacking_calibrated",
            model=calibrated,
            score=evaluate_classifier(calibrated, X_test, y_test)["accuracy"],
        ),
    }
    return results


if __name__ == "__main__":
    results = run_day52_demo()
    for name, result in results.items():
        print(f"{name}: validation score = {result.score:.3f}")
```