Many powerful evaluation tools—ROC curves, precision-recall curves, threshold analysis—are inherently binary. They answer questions like "How well does the model distinguish class A from not-A?" But in multi-class problems, we need to evaluate K classes simultaneously.
One-vs-Rest (OvR) evaluation bridges this gap by decomposing the K-class problem into K binary problems. For each class k, we treat k as the positive class and all other classes as negative. This enables threshold-based analysis, per-class ROC/PR curves, and a richer understanding of classifier behavior than aggregate metrics alone provide.
By the end of this page, you will decompose multi-class problems into OvR binary tasks, compute per-class ROC and PR curves, aggregate curves into multi-class summaries, and understand when OvR evaluation is most informative.
For a K-class problem, OvR creates K binary classification tasks:
Task k: Class k (positive) vs. All other classes (negative)
For each sample:

• Its binary label is 1 if its true class is k, and 0 otherwise
• Its score is the model's predicted probability for class k

This decomposition enables:

• Per-class ROC and precision-recall curves
• Threshold analysis and per-class operating points
• A clearer picture of which specific classes the model struggles with
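To make the mapping concrete, here is a minimal sketch (the toy labels are purely illustrative) of what the binarized targets look like for a 3-class problem:

```python
import numpy as np
from sklearn.preprocessing import label_binarize

# Toy 3-class labels (illustrative only)
y_true = np.array([0, 1, 2, 1, 0, 2])

# One column per class: column k is the binary target for "class k vs. rest"
y_true_bin = label_binarize(y_true, classes=[0, 1, 2])
print(y_true_bin)
# [[1 0 0]
#  [0 1 0]
#  [0 0 1]
#  [0 1 0]
#  [1 0 0]
#  [0 0 1]]
```

Pairing column k with the model's predicted probability for class k yields the k-th binary task. The implementation below packages this decomposition into reusable functions.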
```python
import numpy as np
from sklearn.preprocessing import label_binarize
from typing import List, Tuple


def ovr_decomposition(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    classes: List = None
) -> List[Tuple[np.ndarray, np.ndarray]]:
    """
    Decompose multi-class problem into K binary OvR tasks.

    Parameters
    ----------
    y_true : array of shape (n_samples,)
        True class labels
    y_prob : array of shape (n_samples, n_classes)
        Probability predictions for each class
    classes : list
        List of class labels in order matching y_prob columns

    Returns
    -------
    binary_tasks : list of (y_true_binary, y_score) tuples
        One tuple per class, ready for ROC/PR computation
    """
    if classes is None:
        classes = sorted(np.unique(y_true))

    K = len(classes)

    # Binarize true labels: shape (n_samples, n_classes)
    y_true_bin = label_binarize(y_true, classes=classes)

    # Binary case: label_binarize returns a single column, so expand to two
    if y_true_bin.shape[1] == 1:
        y_true_bin = np.column_stack([1 - y_true_bin, y_true_bin])

    binary_tasks = []
    for k in range(K):
        y_binary = y_true_bin[:, k]
        y_score = y_prob[:, k]
        binary_tasks.append((y_binary, y_score))

    return binary_tasks


def demonstrate_ovr():
    """Demonstrate OvR decomposition with example data."""
    np.random.seed(42)

    # Simulate 3-class problem
    n_samples = 300
    y_true = np.array([0]*100 + [1]*100 + [2]*100)

    # Simulated probability predictions
    y_prob = np.random.dirichlet([3, 2, 1], n_samples)

    # Make predictions better correlated with true labels
    for i, true_class in enumerate(y_true):
        y_prob[i, true_class] += 0.3
    y_prob = y_prob / y_prob.sum(axis=1, keepdims=True)  # Renormalize

    binary_tasks = ovr_decomposition(y_true, y_prob, classes=[0, 1, 2])

    print("OvR Decomposition Results:")
    print("-" * 50)
    for k, (y_bin, y_score) in enumerate(binary_tasks):
        n_pos = y_bin.sum()
        n_neg = len(y_bin) - n_pos
        mean_score_pos = y_score[y_bin == 1].mean()
        mean_score_neg = y_score[y_bin == 0].mean()
        print(f"Class {k}: {n_pos} positive, {n_neg} negative")
        print(f"  Mean score for positives: {mean_score_pos:.3f}")
        print(f"  Mean score for negatives: {mean_score_neg:.3f}")
        print()

    return binary_tasks


if __name__ == "__main__":
    demonstrate_ovr()
```

With OvR decomposition, we can compute a ROC curve for each class. Each curve shows the tradeoff between true positive rate and false positive rate for detecting that specific class.
```python
import numpy as np
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
from typing import Dict, List


def compute_ovr_roc(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    classes: List
) -> Dict:
    """
    Compute per-class ROC curves using OvR strategy.

    Returns
    -------
    roc_data : dict
        For each class: fpr, tpr, thresholds, auc
    """
    y_true_bin = label_binarize(y_true, classes=classes)
    if y_true_bin.shape[1] == 1:  # binary case: expand the single column
        y_true_bin = np.column_stack([1 - y_true_bin, y_true_bin])

    roc_data = {}
    for k, class_name in enumerate(classes):
        fpr, tpr, thresholds = roc_curve(y_true_bin[:, k], y_prob[:, k])
        roc_auc = auc(fpr, tpr)
        roc_data[class_name] = {
            'fpr': fpr,
            'tpr': tpr,
            'thresholds': thresholds,
            'auc': roc_auc
        }

    return roc_data


def compute_macro_micro_auc(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    classes: List
) -> Dict[str, float]:
    """
    Compute macro and micro-averaged AUC.

    Macro-AUC: Average of per-class AUCs
    Micro-AUC: Pool all binary predictions, compute single AUC
    """
    y_true_bin = label_binarize(y_true, classes=classes)
    if y_true_bin.shape[1] == 1:
        y_true_bin = np.column_stack([1 - y_true_bin, y_true_bin])

    # Per-class AUC
    class_aucs = []
    for k in range(len(classes)):
        auc_k = roc_auc_score(y_true_bin[:, k], y_prob[:, k])
        class_aucs.append(auc_k)
    macro_auc = np.mean(class_aucs)

    # Micro: flatten and compute
    micro_auc = roc_auc_score(y_true_bin.ravel(), y_prob.ravel())

    return {
        'macro_auc': macro_auc,
        'micro_auc': micro_auc,
        'per_class_auc': dict(zip(classes, class_aucs))
    }


def plot_ovr_roc(roc_data: Dict, title: str = "OvR ROC Curves"):
    """Plot all per-class ROC curves on a single figure."""
    fig, ax = plt.subplots(figsize=(8, 8))
    colors = plt.cm.Set1(np.linspace(0, 1, len(roc_data)))

    for (class_name, data), color in zip(roc_data.items(), colors):
        ax.plot(
            data['fpr'], data['tpr'],
            color=color, lw=2,
            label=f"{class_name} (AUC = {data['auc']:.3f})"
        )

    # Diagonal reference
    ax.plot([0, 1], [0, 1], 'k--', lw=1, label='Random')

    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1.05])
    ax.set_xlabel('False Positive Rate', fontsize=12)
    ax.set_ylabel('True Positive Rate', fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.legend(loc='lower right')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()

    return fig
```

Reading the per-class curves:

• High AUC class: Easy to distinguish from all others
• Low AUC class: Frequently confused with other classes
• AUC ≈ 0.5: No discriminative power for this class
Compare curves to identify which classes need improvement.
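For example, a brief usage sketch, assuming compute_ovr_roc, compute_macro_micro_auc, and plot_ovr_roc from above are available in the same module, and reusing the synthetic data recipe from demonstrate_ovr():

```python
import numpy as np

# Synthetic 3-class data (same recipe as demonstrate_ovr)
np.random.seed(42)
y_true = np.array([0]*100 + [1]*100 + [2]*100)
y_prob = np.random.dirichlet([3, 2, 1], 300)
for i, c in enumerate(y_true):
    y_prob[i, c] += 0.3
y_prob = y_prob / y_prob.sum(axis=1, keepdims=True)

classes = [0, 1, 2]
roc_data = compute_ovr_roc(y_true, y_prob, classes)
summary = compute_macro_micro_auc(y_true, y_prob, classes)

# Rank classes by AUC to see which are hardest to separate from the rest
for c, auc_c in sorted(summary['per_class_auc'].items(), key=lambda kv: kv[1]):
    print(f"Class {c}: OvR AUC = {auc_c:.3f}")
print(f"Macro AUC = {summary['macro_auc']:.3f}, Micro AUC = {summary['micro_auc']:.3f}")

fig = plot_ovr_roc(roc_data)  # all per-class ROC curves on one figure
```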
For imbalanced classes, precision-recall curves are often more informative than ROC curves. The baseline for random performance in PR space depends on class prevalence, making it immediately clear when a class is difficult due to rarity vs. feature overlap.
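As a quick illustration of the prevalence baseline (synthetic data, names are illustrative): an uninformative ranking yields an average precision close to the positive-class prevalence, not 0.5.

```python
import numpy as np
from sklearn.metrics import average_precision_score

rng = np.random.default_rng(0)
y = (rng.random(10_000) < 0.05).astype(int)   # rare class, ~5% prevalence
random_scores = rng.random(10_000)            # uninformative scores

print(f"Prevalence:          {y.mean():.3f}")
print(f"AP of random scores: {average_precision_score(y, random_scores):.3f}")
# Both are ~0.05: the PR baseline tracks prevalence, unlike the 0.5 ROC baseline.
```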
```python
import numpy as np
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt
from typing import Dict, List


def compute_ovr_pr(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    classes: List
) -> Dict:
    """
    Compute per-class Precision-Recall curves using OvR.

    Returns
    -------
    pr_data : dict
        For each class: precision, recall, thresholds, average_precision
    """
    y_true_bin = label_binarize(y_true, classes=classes)
    if y_true_bin.shape[1] == 1:  # binary case: expand the single column
        y_true_bin = np.column_stack([1 - y_true_bin, y_true_bin])

    pr_data = {}
    for k, class_name in enumerate(classes):
        precision, recall, thresholds = precision_recall_curve(
            y_true_bin[:, k], y_prob[:, k]
        )
        ap = average_precision_score(y_true_bin[:, k], y_prob[:, k])

        # Baseline: class prevalence
        prevalence = y_true_bin[:, k].mean()

        pr_data[class_name] = {
            'precision': precision,
            'recall': recall,
            'thresholds': thresholds,
            'average_precision': ap,
            'prevalence': prevalence
        }

    return pr_data


def compute_mean_average_precision(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    classes: List
) -> float:
    """
    Compute Mean Average Precision (mAP) across all classes.

    mAP is the macro-average of per-class Average Precision.
    Commonly used in object detection and multi-label classification.
    """
    y_true_bin = label_binarize(y_true, classes=classes)
    if y_true_bin.shape[1] == 1:
        y_true_bin = np.column_stack([1 - y_true_bin, y_true_bin])

    aps = []
    for k in range(len(classes)):
        ap = average_precision_score(y_true_bin[:, k], y_prob[:, k])
        aps.append(ap)

    return np.mean(aps)


def plot_ovr_pr(pr_data: Dict, title: str = "OvR Precision-Recall Curves"):
    """Plot per-class PR curves with prevalence baselines."""
    fig, ax = plt.subplots(figsize=(8, 8))
    colors = plt.cm.Set1(np.linspace(0, 1, len(pr_data)))

    for (class_name, data), color in zip(pr_data.items(), colors):
        ax.plot(
            data['recall'], data['precision'],
            color=color, lw=2,
            label=f"{class_name} (AP = {data['average_precision']:.3f})"
        )
        # Dashed prevalence baseline for this class
        ax.axhline(
            data['prevalence'], color=color,
            linestyle='--', alpha=0.3, lw=1
        )

    ax.set_xlim([0, 1])
    ax.set_ylim([0, 1.05])
    ax.set_xlabel('Recall', fontsize=12)
    ax.set_ylabel('Precision', fontsize=12)
    ax.set_title(title, fontsize=14)
    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3)
    plt.tight_layout()

    return fig
```

After computing per-class metrics, we often need a single summary number. The same averaging strategies from earlier apply:
| Strategy | Formula | Interpretation |
|---|---|---|
| Macro | (1/K) × Σₖ AUCₖ | Average performance across classes |
| Weighted | Σₖ (supportₖ/N) × AUCₖ | Weight by class frequency |
| Micro | AUC(concat all OvR tasks) | Pool all binary predictions |
| One-vs-One (OvO) | Average pairwise AUCs | Alternative decomposition |
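For the macro and weighted rows, and for the OvO variant, scikit-learn's roc_auc_score can compute the result directly from the probability matrix (rows must sum to one); micro-averaging is obtained by pooling the binarized OvR tasks, as in the implementation below. A minimal sketch on synthetic data (labels and probabilities are purely illustrative):

```python
import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.default_rng(0)
y_true = rng.integers(0, 3, size=300)        # illustrative 3-class labels
y_prob = rng.dirichlet([2, 2, 2], size=300)  # rows sum to 1, as required

# Built-in multi-class AUC variants (scikit-learn >= 0.22)
print(roc_auc_score(y_true, y_prob, multi_class='ovr', average='macro'))
print(roc_auc_score(y_true, y_prob, multi_class='ovr', average='weighted'))
print(roc_auc_score(y_true, y_prob, multi_class='ovo', average='macro'))
```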
```python
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import label_binarize


def aggregate_ovr_auc(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    classes: list,
    average: str = 'macro'
) -> float:
    """
    Compute aggregated AUC using specified strategy.

    Parameters
    ----------
    average : {'macro', 'weighted', 'micro'}
        Aggregation strategy

    Returns
    -------
    auc : float
        Aggregated AUC score
    """
    y_true_bin = label_binarize(y_true, classes=classes)
    if y_true_bin.shape[1] == 1:  # binary case: expand the single column
        y_true_bin = np.column_stack([1 - y_true_bin, y_true_bin])

    if average == 'micro':
        return roc_auc_score(y_true_bin.ravel(), y_prob.ravel())

    elif average == 'macro':
        aucs = [roc_auc_score(y_true_bin[:, k], y_prob[:, k])
                for k in range(len(classes))]
        return np.mean(aucs)

    elif average == 'weighted':
        aucs = []
        weights = []
        for k in range(len(classes)):
            aucs.append(roc_auc_score(y_true_bin[:, k], y_prob[:, k]))
            weights.append(y_true_bin[:, k].sum())
        return np.average(aucs, weights=weights)

    else:
        raise ValueError(f"Unknown average: {average}")


def compare_aggregation_methods(y_true, y_prob, classes):
    """Compare different aggregation methods."""
    print("AUC Aggregation Comparison:")
    print("-" * 40)
    for avg in ['macro', 'weighted', 'micro']:
        auc = aggregate_ovr_auc(y_true, y_prob, classes, average=avg)
        print(f"  {avg.capitalize():<10}: {auc:.4f}")
```

A key advantage of OvR analysis is the ability to set a different classification threshold for each class. This is powerful when classes have different costs or requirements.
```python
import numpy as np
from sklearn.metrics import precision_recall_curve
from sklearn.preprocessing import label_binarize
from typing import Dict, List


def find_optimal_thresholds(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    classes: List,
    criterion: str = 'f1'
) -> Dict:
    """
    Find optimal threshold for each class independently.

    Parameters
    ----------
    criterion : {'f1', 'precision', 'recall'}
        What to optimize (F1 balances precision/recall)

    Returns
    -------
    thresholds : dict
        Optimal threshold for each class
    """
    y_true_bin = label_binarize(y_true, classes=classes)
    if y_true_bin.shape[1] == 1:  # binary case: expand the single column
        y_true_bin = np.column_stack([1 - y_true_bin, y_true_bin])

    optimal_thresholds = {}

    for k, class_name in enumerate(classes):
        precision, recall, thresholds = precision_recall_curve(
            y_true_bin[:, k], y_prob[:, k]
        )

        if criterion == 'f1':
            # F1 = 2 * P * R / (P + R)
            f1_scores = 2 * precision * recall / (precision + recall + 1e-10)
            # Note: thresholds has length len(precision) - 1
            best_idx = np.argmax(f1_scores[:-1])
            optimal_thresholds[class_name] = thresholds[best_idx]

        elif criterion == 'precision':
            # Highest recall subject to a minimum acceptable precision
            min_precision = 0.8
            valid = np.where(precision[:-1] >= min_precision)[0]
            if valid.size > 0:
                # Recall is non-increasing in the threshold, so the lowest
                # qualifying threshold gives the highest recall
                optimal_thresholds[class_name] = thresholds[valid[0]]
            else:
                optimal_thresholds[class_name] = 0.5

        elif criterion == 'recall':
            # Highest precision subject to a minimum acceptable recall
            min_recall = 0.8
            valid = np.where(recall[:-1] >= min_recall)[0]
            if valid.size > 0:
                # Among qualifying thresholds, pick the one with the best precision
                best = valid[np.argmax(precision[:-1][valid])]
                optimal_thresholds[class_name] = thresholds[best]
            else:
                optimal_thresholds[class_name] = 0.5

    return optimal_thresholds


def apply_class_thresholds(
    y_prob: np.ndarray,
    thresholds: Dict,
    classes: List
) -> np.ndarray:
    """
    Apply per-class thresholds to get final predictions.

    For each sample, predict the class with the highest margin
    (probability - threshold) among classes exceeding their threshold.
    """
    n_samples = y_prob.shape[0]
    K = len(classes)

    # Compute margin above threshold for each class
    margins = np.zeros((n_samples, K))
    for k, class_name in enumerate(classes):
        margins[:, k] = y_prob[:, k] - thresholds.get(class_name, 0.5)

    # Predict the class with the highest positive margin;
    # if no margin is positive, fall back to the highest probability
    predictions = []
    for i in range(n_samples):
        if margins[i].max() > 0:
            pred_idx = margins[i].argmax()
        else:
            pred_idx = y_prob[i].argmax()
        predictions.append(classes[pred_idx])

    return np.array(predictions)
```

Per-class thresholds are most useful when:

• Asymmetric costs: Set lower thresholds for classes where a miss is expensive
• Varying base rates: Rare classes may need lower thresholds
• Multi-label problems: Each class decision is independent
• Deployment constraints: Specific precision/recall targets may be required per class
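A possible end-to-end usage sketch, reusing the synthetic 3-class data from demonstrate_ovr() and assuming find_optimal_thresholds and apply_class_thresholds are defined as above:

```python
import numpy as np
from sklearn.metrics import classification_report

# Synthetic 3-class data (same recipe as demonstrate_ovr)
np.random.seed(42)
y_true = np.array([0]*100 + [1]*100 + [2]*100)
y_prob = np.random.dirichlet([3, 2, 1], 300)
for i, c in enumerate(y_true):
    y_prob[i, c] += 0.3
y_prob = y_prob / y_prob.sum(axis=1, keepdims=True)

classes = [0, 1, 2]
thresholds = find_optimal_thresholds(y_true, y_prob, classes, criterion='f1')
y_pred = apply_class_thresholds(y_prob, thresholds, classes)

print("Per-class F1-optimal thresholds:", thresholds)
print(classification_report(y_true, y_pred))
```

In practice, tune the per-class thresholds on held-out validation data rather than on the same data used for the final evaluation.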
Congratulations! You've completed Module 3: Multi-class Metrics. You now understand micro, macro, and weighted averaging; Cohen's kappa for chance-corrected evaluation; confusion matrix analysis; and one-vs-rest strategies for per-class evaluation. These tools form the foundation for rigorous multi-class model evaluation.