Regularized logistic regression introduces hyperparameters that cannot be learned from the training data alone; they must be selected through external validation. For the models we've covered, these are the regularization strength λ for Ridge (L2) and Lasso (L1), and both λ and the L1 ratio α for Elastic Net.
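As a reference point, one common parameterization of the penalized objective that these hyperparameters control is shown below (libraries differ in exact scaling; scikit-learn, for example, works with C = 1/λ):

$$\min_{\beta_0, \beta} \; -\frac{1}{n} \sum_{i=1}^{n} \Big[ y_i \log p_i + (1 - y_i) \log(1 - p_i) \Big] + \lambda \Big[ \alpha \|\beta\|_1 + \tfrac{1 - \alpha}{2} \|\beta\|_2^2 \Big]$$

where $p_i = \sigma(\beta_0 + x_i^\top \beta)$. Setting α = 1 recovers the pure L1 (Lasso) penalty and α = 0 the pure L2 (Ridge) penalty.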
The choice of these hyperparameters dramatically affects model performance. Too little regularization leads to overfitting; too much leads to underfitting. Finding the optimal balance requires systematic exploration and validation.
This page develops principled approaches to hyperparameter selection, from simple grid search to advanced optimization techniques, with practical guidance for real-world applications.
Hyperparameter selection is fundamentally about navigating the bias-variance tradeoff. Strong regularization (large λ) increases bias but reduces variance. Weak regularization (small λ) reduces bias but increases variance. The optimal point minimizes total expected error, balancing these opposing forces.
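For intuition, recall the classical decomposition of expected prediction error (stated here for squared error; classification losses admit analogous but messier decompositions):

$$\mathbb{E}\big[(y - \hat{f}(x))^2\big] = \underbrace{\big(\mathbb{E}[\hat{f}(x)] - f(x)\big)^2}_{\text{bias}^2} + \underbrace{\mathbb{E}\big[(\hat{f}(x) - \mathbb{E}[\hat{f}(x)])^2\big]}_{\text{variance}} + \sigma^2$$

Increasing λ shifts error from the variance term into the bias term; the optimal λ sits wherever the sum is smallest.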
Hyperparameters cannot be selected by optimizing training loss—this would invariably choose minimal regularization (λ → 0). We need an estimate of out-of-sample performance.
Hold-out validation (a single train/validation/test split) provides such an estimate, but it sets aside data that is never used for fitting, and the resulting estimate depends heavily on which observations happen to fall in the validation set.
K-fold cross-validation addresses these issues: every observation is used for validation exactly once and for training K − 1 times, and averaging across folds reduces the variance of the estimate.
Common choices for K are compared in the table below.
Stratified K-Fold: For classification, maintain class proportions in each fold. Crucial for imbalanced datasets where naive splitting might leave some folds without minority class examples.
Repeated K-Fold: Run K-fold CV multiple times with different shuffles. Reduces variance in the estimated performance at the cost of more computation.
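A minimal sketch of how these splitters are constructed in scikit-learn (the fold and repeat counts are illustrative, not prescriptive):

```python
from sklearn.model_selection import StratifiedKFold, RepeatedStratifiedKFold

# Stratified 5-fold: each fold preserves the overall class proportions
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# Repeated stratified 5-fold: 3 different shuffles -> 15 scores to average,
# which stabilizes the estimate at 3x the computational cost
rskf = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=0)
```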
| Strategy | K | Bias | Variance | Computation | Use Case |
|---|---|---|---|---|---|
| 5-Fold | 5 | Medium | Medium | 5x | Default choice |
| 10-Fold | 10 | Lower | Higher | 10x | Medium-sized datasets |
| LOOCV | n | Lowest | Highest | nx | Small datasets (n < 100) |
| Repeated 5-Fold | 5×r | Medium | Lower | 5rx | When stability matters |
| 3-Fold | 3 | Higher | Lower | 3x | Large datasets, quick estimates |
For classification, common metrics for hyperparameter selection:
| Metric | Strengths | Weaknesses |
|---|---|---|
| AUC-ROC | Threshold-agnostic, balanced | Insensitive to calibration |
| Log-loss | Proper scoring rule, calibration-sensitive | Hard to interpret magnitudes |
| Accuracy | Intuitive, domain-relevant | Misleading with class imbalance |
| F1-score | Balances precision/recall | Ignores true negatives |
Recommendation: Use AUC-ROC for general comparison (ranking ability) and log-loss when probability calibration matters. Avoid accuracy unless classes are balanced.
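A small illustration of how these metrics can disagree on an imbalanced toy example (the numbers are made up purely for illustration):

```python
import numpy as np
from sklearn.metrics import roc_auc_score, log_loss, accuracy_score

# 10 examples, only 2 positives (imbalanced)
y_true = np.array([0, 0, 0, 0, 0, 0, 0, 0, 1, 1])
p_hat = np.array([0.1, 0.2, 0.15, 0.05, 0.3, 0.2, 0.1, 0.25, 0.6, 0.7])

print(roc_auc_score(y_true, p_hat))          # ranking quality: 1.0 here, positives ranked on top
print(log_loss(y_true, p_hat))               # penalizes miscalibrated probabilities
print(accuracy_score(y_true, p_hat >= 0.5))  # 1.0 here, but always predicting 0 would already give 0.8
```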
```python
import numpy as np
from sklearn.model_selection import (
    cross_val_score, StratifiedKFold, RepeatedStratifiedKFold
)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, roc_auc_score, log_loss


def evaluate_with_cv(X, y, lambda_reg, penalty='l2', l1_ratio=None,
                     cv_strategy='5-fold', scoring='roc_auc'):
    """
    Evaluate a regularized logistic regression model using cross-validation.

    Parameters
    ----------
    X : array, shape (n, p)
        Feature matrix
    y : array, shape (n,)
        Binary labels
    lambda_reg : float
        Regularization strength (note: scikit-learn uses C = 1/lambda)
    penalty : str
        Type of regularization: 'l1', 'l2', or 'elasticnet'
    l1_ratio : float, optional
        L1 ratio for elastic net (0 = pure L2, 1 = pure L1)
    cv_strategy : str
        Cross-validation strategy: '5-fold', '10-fold', 'loocv', 'repeated'
    scoring : str
        Metric: 'roc_auc', 'neg_log_loss', 'accuracy'

    Returns
    -------
    mean_score : float
        Mean CV score
    std_score : float
        Standard deviation of CV scores
    scores : array
        Individual fold scores
    """
    # Build pipeline
    C = 1 / lambda_reg if lambda_reg > 0 else 1e6  # Handle lambda=0

    if penalty == 'elasticnet':
        solver = 'saga'
        clf_params = {'penalty': penalty, 'C': C, 'l1_ratio': l1_ratio,
                      'solver': solver, 'max_iter': 2000}
    elif penalty == 'l1':
        solver = 'saga'
        clf_params = {'penalty': penalty, 'C': C,
                      'solver': solver, 'max_iter': 2000}
    else:
        solver = 'lbfgs'
        clf_params = {'penalty': penalty, 'C': C,
                      'solver': solver, 'max_iter': 2000}

    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(**clf_params))
    ])

    # Setup CV strategy
    if cv_strategy == '5-fold':
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    elif cv_strategy == '10-fold':
        cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
    elif cv_strategy == 'loocv':
        from sklearn.model_selection import LeaveOneOut
        cv = LeaveOneOut()
    elif cv_strategy == 'repeated':
        cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)
    else:
        cv = 5  # Default

    # Run cross-validation
    scores = cross_val_score(pipeline, X, y, cv=cv, scoring=scoring, n_jobs=-1)

    return scores.mean(), scores.std(), scores


def compare_cv_strategies(X, y, lambda_reg, penalty='l2'):
    """
    Compare different CV strategies on the same hyperparameters.
    """
    strategies = ['5-fold', '10-fold', 'repeated']

    print(f"CV Strategy Comparison (lambda={lambda_reg}, penalty={penalty})")
    print("=" * 60)
    print(f"{'Strategy':<20} {'Mean AUC':>12} {'Std':>12}")
    print("-" * 60)

    for strategy in strategies:
        mean, std, _ = evaluate_with_cv(X, y, lambda_reg, penalty,
                                        cv_strategy=strategy)
        print(f"{strategy:<20} {mean:>12.4f} {std:>12.4f}")


# Example usage
if __name__ == "__main__":
    from scipy.special import expit

    np.random.seed(42)

    # Generate data
    n, p = 200, 20
    X = np.random.randn(n, p)
    true_coef = np.zeros(p)
    true_coef[:5] = [1.5, -1.0, 0.8, -0.5, 0.3]
    prob = expit(X @ true_coef)
    y = (np.random.rand(n) < prob).astype(float)

    # Compare strategies
    compare_cv_strategies(X, y, lambda_reg=0.1, penalty='l1')
```

Grid search exhaustively evaluates all combinations of hyperparameter values on a predefined grid:
For a single hyperparameter (L1 or L2): $$\lambda \in \{\lambda_1, \lambda_2, \ldots, \lambda_K\}$$
For two hyperparameters (Elastic Net): $$(\lambda, \alpha) \in \{\lambda_1, \ldots, \lambda_K\} \times \{\alpha_1, \ldots, \alpha_M\}$$
Grid construction recommendations:
For λ (regularization strength): use a logarithmically spaced grid spanning at least four orders of magnitude (e.g., 15-25 values of λ between 10⁻⁴ and 10²); in scikit-learn this translates into a grid of C = 1/λ values built with np.logspace.
For α (L1 ratio in Elastic Net): a coarse grid such as {0.1, 0.3, 0.5, 0.7, 0.9} is usually sufficient; 0 and 1 can be omitted when pure Ridge and pure Lasso are evaluated separately. A short construction sketch follows.
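A small sketch of grid construction following these recommendations (the specific ranges and counts are illustrative and mirror the code below):

```python
import numpy as np

# Lambda on a log scale spanning six orders of magnitude
lambdas = np.logspace(-4, 2, 20)   # 1e-4 ... 1e2
C_grid = 1.0 / lambdas             # scikit-learn parameterizes strength as C = 1/lambda

# Coarse grid for the Elastic Net mixing parameter
alpha_grid = [0.1, 0.3, 0.5, 0.7, 0.9]

print(f"{len(C_grid) * len(alpha_grid)} (lambda, alpha) combinations")  # 100
```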
```python
import numpy as np
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
import time


def grid_search_regularization(X, y, penalty='l2', cv=5, verbose=True):
    """
    Perform grid search for regularization hyperparameters.

    For L1/L2: searches over lambda (via C = 1/lambda)
    For Elastic Net: searches over both lambda and l1_ratio
    """
    # Build pipeline
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=2000, random_state=42))
    ])

    # Define parameter grid
    # Note: scikit-learn uses C = 1/lambda
    C_values = np.logspace(-4, 2, 20)  # lambda from 0.01 to 10000

    if penalty == 'elasticnet':
        param_grid = {
            'classifier__penalty': ['elasticnet'],
            'classifier__solver': ['saga'],
            'classifier__C': C_values,
            'classifier__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
        }
    elif penalty == 'l1':
        param_grid = {
            'classifier__penalty': ['l1'],
            'classifier__solver': ['saga'],
            'classifier__C': C_values
        }
    else:  # L2
        param_grid = {
            'classifier__penalty': ['l2'],
            'classifier__solver': ['lbfgs'],
            'classifier__C': C_values
        }

    # Grid search with cross-validation
    cv_strategy = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    start_time = time.time()
    grid_search = GridSearchCV(
        pipeline, param_grid,
        cv=cv_strategy,
        scoring='roc_auc',
        n_jobs=-1,
        verbose=1 if verbose else 0,
        return_train_score=True
    )
    grid_search.fit(X, y)
    elapsed_time = time.time() - start_time

    if verbose:
        print(f"\nGrid Search Results ({penalty} penalty)")
        print("=" * 60)
        print(f"Best parameters: {grid_search.best_params_}")
        print(f"Best CV AUC: {grid_search.best_score_:.4f}")
        print(f"Time elapsed: {elapsed_time:.2f}s")

        # Extract best lambda
        best_C = grid_search.best_params_['classifier__C']
        print(f"Best lambda: {1/best_C:.4f} (C = {best_C:.4f})")

        if penalty == 'elasticnet':
            print(f"Best l1_ratio: {grid_search.best_params_['classifier__l1_ratio']}")

    return grid_search


def visualize_grid_search_results(grid_search, penalty='l2'):
    """
    Visualize grid search results.
    """
    import matplotlib.pyplot as plt

    results = grid_search.cv_results_

    if penalty == 'elasticnet':
        # 2D heatmap for Elastic Net
        C_values = np.unique([p['classifier__C'] for p in results['params']])
        l1_ratios = np.unique([p['classifier__l1_ratio'] for p in results['params']])

        # Reshape scores into grid
        scores = np.zeros((len(l1_ratios), len(C_values)))
        for i, params in enumerate(results['params']):
            c_idx = np.where(C_values == params['classifier__C'])[0][0]
            l1_idx = np.where(l1_ratios == params['classifier__l1_ratio'])[0][0]
            scores[l1_idx, c_idx] = results['mean_test_score'][i]

        fig, ax = plt.subplots(figsize=(10, 6))
        im = ax.imshow(scores, aspect='auto', cmap='viridis')
        ax.set_xticks(np.arange(len(C_values))[::3])
        ax.set_xticklabels([f'{1/c:.2e}' for c in C_values[::3]], rotation=45)
        ax.set_yticks(np.arange(len(l1_ratios)))
        ax.set_yticklabels([f'{r:.1f}' for r in l1_ratios])
        ax.set_xlabel('λ (regularization strength)')
        ax.set_ylabel('α (L1 ratio)')
        ax.set_title('Elastic Net Grid Search: CV AUC')
        plt.colorbar(im, ax=ax, label='AUC')

        # Mark best
        best_idx = np.argmax(results['mean_test_score'])
        best_C = results['params'][best_idx]['classifier__C']
        best_l1 = results['params'][best_idx]['classifier__l1_ratio']
        best_c_idx = np.where(C_values == best_C)[0][0]
        best_l1_idx = np.where(l1_ratios == best_l1)[0][0]
        ax.scatter([best_c_idx], [best_l1_idx], color='red', s=200, marker='*',
                   edgecolor='white', linewidth=2, zorder=5)
    else:
        # 1D plot for L1 or L2
        fig, axes = plt.subplots(1, 2, figsize=(12, 4))

        C_values = [p['classifier__C'] for p in results['params']]
        lambdas = [1/c for c in C_values]
        mean_scores = results['mean_test_score']
        std_scores = results['std_test_score']

        # Plot AUC vs lambda
        ax1 = axes[0]
        ax1.semilogx(lambdas, mean_scores, 'b-', linewidth=2)
        ax1.fill_between(lambdas, mean_scores - std_scores,
                         mean_scores + std_scores, alpha=0.2)
        ax1.set_xlabel('λ (regularization strength)', fontsize=12)
        ax1.set_ylabel('CV AUC', fontsize=12)
        ax1.set_title(f'{penalty.upper()} Penalty: Grid Search', fontsize=14)

        # Mark best
        best_idx = np.argmax(mean_scores)
        ax1.axvline(lambdas[best_idx], color='red', linestyle='--',
                    label=f'Best λ = {lambdas[best_idx]:.4f}')
        ax1.legend()

        # Training vs validation score (detect overfitting)
        ax2 = axes[1]
        train_scores = results['mean_train_score']
        ax2.semilogx(lambdas, train_scores, 'g--', label='Train', linewidth=2)
        ax2.semilogx(lambdas, mean_scores, 'b-', label='Validation', linewidth=2)
        ax2.set_xlabel('λ (regularization strength)', fontsize=12)
        ax2.set_ylabel('AUC', fontsize=12)
        ax2.set_title('Train vs Validation', fontsize=14)
        ax2.legend()

        # Highlight overfitting region
        gap = train_scores - np.array(mean_scores)
        ax2.fill_between(lambdas, mean_scores, train_scores,
                         where=(gap > 0.05), alpha=0.3, color='red',
                         label='Overfitting region')

    plt.tight_layout()
    plt.savefig('grid_search_results.png', dpi=150)
    plt.show()


# Example
if __name__ == "__main__":
    from scipy.special import expit

    np.random.seed(42)

    # Generate data
    n, p = 300, 30
    X = np.random.randn(n, p)
    true_coef = np.zeros(p)
    true_coef[:5] = [1.5, -1.0, 0.8, -0.5, 0.3]
    prob = expit(X @ true_coef)
    y = (np.random.rand(n) < prob).astype(float)

    # Grid search for each penalty type
    for penalty in ['l2', 'l1', 'elasticnet']:
        print(f"\n{'='*60}")
        print(f"Testing {penalty} penalty...")
        grid_search = grid_search_regularization(X, y, penalty=penalty)
        visualize_grid_search_results(grid_search, penalty=penalty)
```

Grid search has computational complexity O(K × M × F × T), where K is the number of grid points for λ, M is the number of grid points for α, F is the number of CV folds, and T is the training time per fit. For Elastic Net with 20 λ values, 5 α values, and 5-fold CV, this means 500 model fits. Large grids can be prohibitively expensive.
Random search offers surprising efficiency advantages over grid search. Bergstra & Bengio (2012) showed that random search typically matches or beats grid search while using only a fraction of the computational budget, largely because in practice only a few hyperparameters matter for performance (the objective has low effective dimensionality).
Key insight: With grid search, if one hyperparameter doesn't matter much, we "waste" evaluations varying it while good values of important hyperparameters go unexplored. Random search allocates more unique values to each dimension.
Instead of a fixed grid, we sample hyperparameters from distributions:
For λ (should span orders of magnitude): $$\log_{10}(\lambda) \sim \text{Uniform}(-4, 2)$$
This is equivalent to sampling λ uniformly on a log scale.
For α (bounded between 0 and 1): $$\alpha \sim \text{Uniform}(0.1, 1.0)$$
Or use a Beta distribution for more control: $$\alpha \sim \text{Beta}(2, 2)$$ centers its mass around 0.5, while $$\alpha \sim \text{Beta}(5, 2)$$ favors higher α (more L1).
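A brief sketch of drawing candidates from these distributions with scipy (the Beta parameters follow the examples above; this is illustrative only, and the search code below uses a uniform α instead):

```python
import numpy as np
from scipy.stats import loguniform, beta

rng = np.random.default_rng(0)

# Lambda uniform on a log scale over [1e-4, 1e2]
lam_samples = loguniform(1e-4, 1e2).rvs(size=5, random_state=rng)

# Alpha from Beta(2, 2) (mass near 0.5) or Beta(5, 2) (favors L1-heavy mixes)
alpha_balanced = beta(2, 2).rvs(size=5, random_state=rng)
alpha_l1_heavy = beta(5, 2).rvs(size=5, random_state=rng)

print(lam_samples, alpha_balanced, alpha_l1_heavy)
```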
```python
import numpy as np
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from scipy.stats import loguniform, uniform
import time


def random_search_regularization(X, y, penalty='elasticnet', n_iter=50, cv=5):
    """
    Random search for regularization hyperparameters.

    Uses efficient distributions for sampling:
    - Log-uniform for lambda (via C)
    - Uniform for l1_ratio
    """
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=2000, random_state=42))
    ])

    # Define parameter distributions
    if penalty == 'elasticnet':
        # loguniform(a, b) samples uniformly on a log scale over [a, b]
        param_distributions = {
            'classifier__penalty': ['elasticnet'],
            'classifier__solver': ['saga'],
            'classifier__C': loguniform(1e-2, 1e4),    # C in [0.01, 10000]
            'classifier__l1_ratio': uniform(0.1, 0.9)  # l1_ratio in [0.1, 1.0]
        }
    elif penalty == 'l1':
        param_distributions = {
            'classifier__penalty': ['l1'],
            'classifier__solver': ['saga'],
            'classifier__C': loguniform(1e-2, 1e4)
        }
    else:
        param_distributions = {
            'classifier__penalty': ['l2'],
            'classifier__solver': ['lbfgs'],
            'classifier__C': loguniform(1e-2, 1e4)
        }

    cv_strategy = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    start_time = time.time()
    random_search = RandomizedSearchCV(
        pipeline, param_distributions,
        n_iter=n_iter,
        cv=cv_strategy,
        scoring='roc_auc',
        n_jobs=-1,
        random_state=42,
        verbose=1
    )
    random_search.fit(X, y)
    elapsed_time = time.time() - start_time

    print(f"\nRandom Search Results ({penalty}, {n_iter} iterations)")
    print("=" * 60)
    print(f"Best parameters: {random_search.best_params_}")
    print(f"Best CV AUC: {random_search.best_score_:.4f}")
    print(f"Time elapsed: {elapsed_time:.2f}s")

    best_C = random_search.best_params_['classifier__C']
    print(f"Best lambda: {1/best_C:.4f}")

    return random_search


def compare_grid_vs_random(X, y, penalty='l2', budget=50):
    """
    Compare grid search and random search with the same computational budget.

    Both methods get 'budget' total model evaluations (ignoring CV overhead).
    """
    print(f"\nGrid vs Random Search Comparison (budget={budget} evaluations)")
    print("=" * 70)

    # Grid search: sqrt(budget) points per dimension for 1D
    # For 2D (Elastic Net): roughly budget^0.5 per dimension
    if penalty == 'elasticnet':
        n_C = int(np.sqrt(budget))
        n_alpha = int(budget / n_C)
    else:
        n_C = budget

    # Grid search
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', LogisticRegression(max_iter=2000, random_state=42))
    ])

    C_values = np.logspace(-2, 2, n_C)
    if penalty == 'elasticnet':
        param_grid = {
            'classifier__penalty': ['elasticnet'],
            'classifier__solver': ['saga'],
            'classifier__C': C_values,
            'classifier__l1_ratio': np.linspace(0.1, 0.9, n_alpha)
        }
    else:
        param_grid = {
            'classifier__penalty': [penalty],
            'classifier__solver': ['saga' if penalty == 'l1' else 'lbfgs'],
            'classifier__C': C_values
        }

    start_time = time.time()
    grid_search = GridSearchCV(pipeline, param_grid, cv=5,
                               scoring='roc_auc', n_jobs=-1)
    grid_search.fit(X, y)
    grid_time = time.time() - start_time

    # Random search with same budget
    start_time = time.time()
    random_search = random_search_regularization(X, y, penalty=penalty,
                                                 n_iter=budget, cv=5)
    random_time = time.time() - start_time

    print(f"\n{'Method':<20} {'Best AUC':>12} {'Time (s)':>10}")
    print("-" * 50)
    print(f"{'Grid Search':<20} {grid_search.best_score_:>12.4f} {grid_time:>10.2f}")
    print(f"{'Random Search':<20} {random_search.best_score_:>12.4f} {random_time:>10.2f}")

    return grid_search, random_search


# Example
if __name__ == "__main__":
    from scipy.special import expit

    np.random.seed(42)

    n, p = 300, 30
    X = np.random.randn(n, p)
    true_coef = np.zeros(p)
    true_coef[:5] = [1.5, -1.0, 0.8, -0.5, 0.3]
    prob = expit(X @ true_coef)
    y = (np.random.rand(n) < prob).astype(float)

    # Compare methods
    compare_grid_vs_random(X, y, penalty='elasticnet', budget=60)
```

| Aspect | Grid Search | Random Search |
|---|---|---|
| Exploration | Exhaustive on grid | Stochastic, covers space |
| Reproducibility | Fully deterministic | Set random seed |
| High dimensions | Exponential scaling | Linear scaling |
| Important + unimportant params | Wastes budget | Explores efficiently |
| Easy to visualize | Yes (regular grid) | Harder (scattered points) |
| Finding exact optimum | Limited to grid | May miss if unlucky |
Grid and random search treat each evaluation independently; they don't learn from previous results. Bayesian optimization builds a probabilistic model of the objective function and uses it to select promising hyperparameters: fit a surrogate model (such as a Gaussian process or a Tree-structured Parzen Estimator) to the (hyperparameter, score) pairs observed so far, use an acquisition function to pick the next configuration to try, evaluate it, and update the surrogate.
Key advantages: it typically needs far fewer evaluations to find good configurations, it balances exploration of uncertain regions with exploitation of promising ones, and it handles continuous hyperparameters without committing to a fixed grid.
Bayesian optimization is most valuable when evaluations are expensive (model training takes minutes/hours) and the hyperparameter space is low-dimensional (2-10 dimensions). For simple logistic regression with fast training, the overhead may not be worth it. But for complex pipelines or large datasets, it can significantly reduce total optimization time.
```python
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, StratifiedKFold

# Using Optuna for Bayesian optimization
# pip install optuna
try:
    import optuna
    from optuna.samplers import TPESampler
    OPTUNA_AVAILABLE = True
except ImportError:
    OPTUNA_AVAILABLE = False
    print("Optuna not installed. Install with: pip install optuna")


def bayesian_optimize_logistic(X, y, penalty='elasticnet', n_trials=30, cv=5):
    """
    Bayesian optimization for logistic regression hyperparameters using Optuna.

    Uses Tree-structured Parzen Estimator (TPE) as the surrogate model.
    """
    if not OPTUNA_AVAILABLE:
        print("Optuna required for Bayesian optimization")
        return None

    cv_strategy = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)

    # Standardize features once
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)

    def objective(trial):
        """Objective function to maximize (CV AUC)."""
        # Sample hyperparameters
        # Log-uniform for lambda (via C)
        C = trial.suggest_float('C', 1e-3, 1e2, log=True)

        if penalty == 'elasticnet':
            l1_ratio = trial.suggest_float('l1_ratio', 0.1, 0.99)
            model = LogisticRegression(
                penalty='elasticnet', C=C, l1_ratio=l1_ratio,
                solver='saga', max_iter=2000, random_state=42
            )
        elif penalty == 'l1':
            model = LogisticRegression(
                penalty='l1', C=C, solver='saga',
                max_iter=2000, random_state=42
            )
        else:
            model = LogisticRegression(
                penalty='l2', C=C, solver='lbfgs',
                max_iter=2000, random_state=42
            )

        # Cross-validation score
        scores = cross_val_score(model, X_scaled, y, cv=cv_strategy,
                                 scoring='roc_auc', n_jobs=-1)
        return scores.mean()

    # Create study with TPE sampler
    study = optuna.create_study(
        direction='maximize',
        sampler=TPESampler(seed=42)
    )

    # Suppress Optuna logging
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    # Run optimization
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)

    print(f"\nBayesian Optimization Results ({penalty}, {n_trials} trials)")
    print("=" * 60)
    print(f"Best CV AUC: {study.best_value:.4f}")
    print("Best parameters:")
    for key, value in study.best_params.items():
        if key == 'C':
            print(f"  lambda = {1/value:.4f} (C = {value:.4f})")
        else:
            print(f"  {key} = {value:.4f}")

    return study


def visualize_optimization_history(study):
    """
    Visualize the optimization history.
    """
    import matplotlib.pyplot as plt

    # Extract trial data
    trials = study.trials
    values = [t.value for t in trials]
    best_so_far = np.maximum.accumulate(values)
    C_vals = [t.params['C'] for t in trials]  # defined here so both branches below can use it

    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    # Optimization history
    ax1 = axes[0]
    ax1.scatter(range(len(values)), values, alpha=0.6, label='Trial value')
    ax1.plot(best_so_far, 'r-', linewidth=2, label='Best so far')
    ax1.set_xlabel('Trial', fontsize=12)
    ax1.set_ylabel('CV AUC', fontsize=12)
    ax1.set_title('Optimization History', fontsize=14)
    ax1.legend()

    # Parameter exploration (if multiple hyperparameters)
    ax2 = axes[1]
    if len(study.best_params) > 1 and 'l1_ratio' in trials[0].params:
        # Show sampled parameter values colored by objective
        l1_vals = [t.params['l1_ratio'] for t in trials]
        sc = ax2.scatter(C_vals, l1_vals, c=values, cmap='viridis',
                         s=50, alpha=0.7)
        ax2.set_xscale('log')
        ax2.set_xlabel('C (1/λ)', fontsize=12)
        ax2.set_ylabel('L1 Ratio (α)', fontsize=12)
        ax2.set_title('Parameter Space Exploration', fontsize=14)
        plt.colorbar(sc, ax=ax2, label='CV AUC')

        # Mark best
        ax2.scatter([study.best_params['C']], [study.best_params['l1_ratio']],
                    color='red', s=200, marker='*', edgecolor='white',
                    linewidth=2, zorder=5)
    else:
        ax2.semilogx(C_vals, values, 'bo', alpha=0.6)
        ax2.set_xlabel('C (1/λ)', fontsize=12)
        ax2.set_ylabel('CV AUC', fontsize=12)
        ax2.set_title('Objective vs C', fontsize=14)

    plt.tight_layout()
    plt.savefig('bayesian_optimization.png', dpi=150)
    plt.show()


# Example
if __name__ == "__main__":
    from scipy.special import expit

    np.random.seed(42)

    n, p = 300, 30
    X = np.random.randn(n, p)
    true_coef = np.zeros(p)
    true_coef[:5] = [1.5, -1.0, 0.8, -0.5, 0.3]
    prob = expit(X @ true_coef)
    y = (np.random.rand(n) < prob).astype(float)

    if OPTUNA_AVAILABLE:
        study = bayesian_optimize_logistic(X, y, penalty='elasticnet', n_trials=30)
        visualize_optimization_history(study)
```

A practical workflow for hyperparameter selection:
Step 1: Initial Exploration (Quick). Run a coarse random search or a sparse log-spaced grid over a wide λ range (at least four orders of magnitude) with 3- or 5-fold CV to locate the promising region.
Step 2: Refinement (Moderate). Use a finer grid or Bayesian optimization concentrated around the best region from Step 1, including the choice of penalty type (L1, L2, Elastic Net) and, for Elastic Net, the L1 ratio α.
Step 3: Validation (Thorough). Re-evaluate the selected configuration with repeated or nested cross-validation for a stable, honest estimate, then confirm performance once on a held-out test set.
| Pitfall | Consequence | Solution |
|---|---|---|
| Optimizing on test set | Overfit to test data; biased estimates | Strict train/val/test separation |
| Too narrow λ range | Miss optimal regularization | Start with 4+ orders of magnitude |
| Too few CV folds | High variance in estimates | Use 5+ folds; repeat if possible |
| Ignoring class imbalance | Models that predict majority class | Use stratified K-fold; weighted metrics |
| Not standardizing inside CV | Data leakage from test to train | Put scaler in pipeline |
| Reporting best CV score | Optimistic bias | Report mean and std; use nested CV |
When you select hyperparameters via CV and report that same CV score as your model's performance, you introduce optimistic bias—you're measuring performance on data used for selection. For unbiased estimation, use nested CV: an outer loop for performance estimation, an inner loop for hyperparameter selection. This is more expensive but statistically rigorous.
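A minimal sketch of nested CV with scikit-learn: the inner GridSearchCV selects λ, while the outer cross_val_score measures the performance of the whole selection procedure (the grid, fold counts, and synthetic data are illustrative):

```python
import numpy as np
from scipy.special import expit
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score

# Synthetic data, mirroring the examples above
rng = np.random.default_rng(42)
X = rng.standard_normal((300, 30))
true_coef = np.zeros(30)
true_coef[:5] = [1.5, -1.0, 0.8, -0.5, 0.3]
y = (rng.random(300) < expit(X @ true_coef)).astype(int)

pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression(penalty='l2', solver='lbfgs', max_iter=2000))
])
param_grid = {'classifier__C': np.logspace(-2, 2, 10)}

inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)  # selects hyperparameters
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)  # estimates performance

search = GridSearchCV(pipe, param_grid, cv=inner_cv, scoring='roc_auc', n_jobs=-1)
nested_scores = cross_val_score(search, X, y, cv=outer_cv, scoring='roc_auc')

print(f"Nested CV AUC: {nested_scores.mean():.3f} ± {nested_scores.std():.3f}")
```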
```python
import numpy as np
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import (
    train_test_split, StratifiedKFold, cross_val_score,
    GridSearchCV, RandomizedSearchCV
)
from sklearn.metrics import (
    roc_auc_score, log_loss, classification_report, confusion_matrix
)
from scipy.stats import loguniform, uniform
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')


def complete_hyperparameter_workflow(X, y, test_size=0.2, random_state=42):
    """
    Complete end-to-end workflow for hyperparameter selection
    in regularized logistic regression.

    Workflow:
    1. Split data into train+validation and held-out test
    2. Compare L1, L2, and Elastic Net using CV
    3. For best penalty type, fine-tune hyperparameters
    4. Evaluate final model on held-out test set
    5. Report comprehensive metrics
    """
    print("=" * 70)
    print("COMPLETE HYPERPARAMETER SELECTION WORKFLOW")
    print("=" * 70)

    # ===== STEP 1: Train/Test Split =====
    print("\n[Step 1] Splitting data...")
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )
    print(f"  Training samples: {len(y_train)}")
    print(f"  Test samples: {len(y_test)}")
    print(f"  Class balance (train): {np.mean(y_train):.3f}")

    # ===== STEP 2: Compare Penalty Types =====
    print("\n[Step 2] Comparing penalty types...")

    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_state)

    penalties = {
        'L2 (Ridge)': {'penalty': 'l2', 'solver': 'lbfgs'},
        'L1 (Lasso)': {'penalty': 'l1', 'solver': 'saga'},
        'Elastic Net': {'penalty': 'elasticnet', 'solver': 'saga',
                        'l1_ratios': [0.5]}  # LogisticRegressionCV expects l1_ratios (a list)
    }

    penalty_results = {}

    for name, params in penalties.items():
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('classifier', LogisticRegressionCV(
                Cs=np.logspace(-3, 2, 20),
                cv=cv,
                scoring='roc_auc',
                max_iter=2000,
                random_state=random_state,
                n_jobs=-1,
                **params
            ))
        ])

        # We can't easily use cross_val_score with LogisticRegressionCV,
        # so we fit and extract scores differently
        pipeline.fit(X_train, y_train)
        clf = pipeline.named_steps['classifier']

        # Best score from inner CV (approximate); flatten the (C, l1_ratio) axes
        fold_scores = clf.scores_[1].reshape(clf.scores_[1].shape[0], -1)
        best_idx = np.argmax(fold_scores.mean(axis=0))
        cv_score = fold_scores[:, best_idx].mean()
        cv_std = fold_scores[:, best_idx].std()

        penalty_results[name] = {
            'cv_score': cv_score,
            'cv_std': cv_std,
            'best_C': clf.C_[0]
        }
        print(f"  {name}: AUC = {cv_score:.4f} ± {cv_std:.4f}, C = {clf.C_[0]:.4f}")

    # Select best penalty type
    best_penalty = max(penalty_results, key=lambda k: penalty_results[k]['cv_score'])
    print(f"  → Best penalty type: {best_penalty}")

    # ===== STEP 3: Fine-tune Best Model =====
    print(f"\n[Step 3] Fine-tuning {best_penalty}...")

    if best_penalty == 'Elastic Net':
        # Grid search over both C and l1_ratio
        param_grid = {
            'classifier__C': np.logspace(-2, 1, 15),
            'classifier__l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
        }
        base_model = LogisticRegression(
            penalty='elasticnet', solver='saga',
            max_iter=2000, random_state=random_state
        )
    elif best_penalty == 'L1 (Lasso)':
        param_grid = {
            'classifier__C': np.logspace(-2, 1, 25)
        }
        base_model = LogisticRegression(
            penalty='l1', solver='saga',
            max_iter=2000, random_state=random_state
        )
    else:
        param_grid = {
            'classifier__C': np.logspace(-2, 1, 25)
        }
        base_model = LogisticRegression(
            penalty='l2', solver='lbfgs',
            max_iter=2000, random_state=random_state
        )

    fine_tune_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('classifier', base_model)
    ])

    grid_search = GridSearchCV(
        fine_tune_pipeline, param_grid,
        cv=cv,
        scoring='roc_auc',
        n_jobs=-1,
        return_train_score=True
    )
    grid_search.fit(X_train, y_train)

    print(f"  Best hyperparameters: {grid_search.best_params_}")
    print(f"  Best CV AUC: {grid_search.best_score_:.4f}")

    # ===== STEP 4: Final Model on Test Set =====
    print("\n[Step 4] Evaluating on held-out test set...")

    final_model = grid_search.best_estimator_

    # Predictions
    y_pred_proba = final_model.predict_proba(X_test)[:, 1]
    y_pred = final_model.predict(X_test)

    # Metrics
    test_auc = roc_auc_score(y_test, y_pred_proba)
    test_logloss = log_loss(y_test, y_pred_proba)
    test_accuracy = np.mean(y_pred == y_test)

    print(f"  Test AUC: {test_auc:.4f}")
    print(f"  Test Log-Loss: {test_logloss:.4f}")
    print(f"  Test Accuracy: {test_accuracy:.4f}")

    # ===== STEP 5: Model Analysis =====
    print("\n[Step 5] Model analysis...")

    coef = final_model.named_steps['classifier'].coef_.flatten()
    n_features = len(coef)
    n_nonzero = np.sum(np.abs(coef) > 1e-6)

    print(f"  Total features: {n_features}")
    print(f"  Non-zero coefficients: {n_nonzero}")
    print(f"  Sparsity: {100 * (1 - n_nonzero/n_features):.1f}%")

    # Top important features
    top_idx = np.argsort(np.abs(coef))[-5:][::-1]
    print("  Top 5 features (by |coefficient|):")
    for i, idx in enumerate(top_idx, 1):
        print(f"    {i}. Feature {idx}: {coef[idx]:.4f}")

    # ===== Summary =====
    print("\n" + "=" * 70)
    print("SUMMARY")
    print("=" * 70)
    print(f"Penalty type: {best_penalty}")
    print(f"Regularization strength λ: {1/grid_search.best_params_['classifier__C']:.4f}")
    if 'classifier__l1_ratio' in grid_search.best_params_:
        print(f"L1 ratio α: {grid_search.best_params_['classifier__l1_ratio']:.2f}")
    print(f"Cross-validation AUC: {grid_search.best_score_:.4f}")
    print(f"Test set AUC: {test_auc:.4f}")
    print(f"Selected features: {n_nonzero}/{n_features}")

    return {
        'best_model': final_model,
        'grid_search': grid_search,
        'test_auc': test_auc,
        'test_logloss': test_logloss,
        'penalty_comparison': penalty_results
    }


# Run the workflow
if __name__ == "__main__":
    from scipy.special import expit

    np.random.seed(42)

    # Generate synthetic data
    n, p = 500, 50
    X = np.random.randn(n, p)

    # Add correlation structure
    X[:, 10:15] = X[:, 0:1] + 0.2 * np.random.randn(n, 5)

    # Sparse true model
    true_coef = np.zeros(p)
    true_coef[:5] = [1.5, -1.2, 0.8, -0.6, 0.4]
    true_coef[10:12] = [0.5, -0.3]  # Correlated features also contribute

    prob = expit(X @ true_coef)
    y = (np.random.rand(n) < prob).astype(float)

    # Run complete workflow
    results = complete_hyperparameter_workflow(X, y)
```

You've completed the module on Regularized Logistic Regression! You now understand L2, L1, and Elastic Net regularization from mathematical, geometric, and Bayesian perspectives. You can compute regularization paths, interpret feature selection behavior, and systematically tune hyperparameters. These skills form the foundation for building robust, production-ready classification models.