File size: 3,537 Bytes
1e5b98a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
# src/cross_validation.py
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, f1_score, precision_score, recall_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline


def perform_cross_validation(
    model: Pipeline,
    X: pd.DataFrame,
    y: np.ndarray,
    n_splits: int = 5,
    random_state: int = 42,
    threshold: float = 0.5,
) -> Dict[str, Dict[str, float]]:
    """
    Run stratified k-fold cross-validation for a binary classifier.

    For every fold the model is fitted on the training split, positive-class
    probabilities are predicted for the validation split and thresholded into
    hard labels, and AUC, F1, precision, recall and accuracy are computed.
    Per-fold metrics and a final summary are printed to stdout.

    Note: ``model`` is refit in place on every fold, so after this function
    returns it is left fitted on the last fold's training split.

    Args:
        model: Estimator or pipeline exposing ``fit`` and ``predict_proba``.
        X: Feature matrix; rows are selected positionally with ``.iloc``.
        y: Binary target aligned row-by-row with ``X``. Any array-like is
            accepted (including a ``pd.Series`` with an arbitrary index); it
            is converted to a NumPy array so fold indices are always applied
            positionally.
        n_splits: Number of folds for cross-validation.
        random_state: Seed for the shuffled fold assignment.
        threshold: Probability at or above which a sample is labelled 1.

    Returns:
        Dictionary keyed by metric name (``"auc"``, ``"f1"``, ``"precision"``,
        ``"recall"``, ``"accuracy"``). Each entry holds ``"mean"`` and
        ``"std"`` (``np.std`` with its default settings) across folds, plus
        ``"values"``, the list of per-fold scores.
    """
    # StratifiedKFold yields positional indices. Indexing a pandas Series with
    # them via ``y[idx]`` would be label-based and could pick the wrong rows
    # (or raise KeyError) when the index is not 0..n-1, so work on a plain
    # array instead.
    y_values = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Per-fold scores; insertion order drives the order of the printed summary.
    fold_metrics: Dict[str, List[float]] = {
        "auc": [],
        "f1": [],
        "precision": [],
        "recall": [],
        "accuracy": [],
    }

    print(f"\n{'='*60}")
    print(f"Performing {n_splits}-Fold Stratified Cross-Validation")
    print(f"{'='*60}\n")

    for fold_idx, (train_idx, val_idx) in enumerate(skf.split(X, y_values), 1):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y_values[train_idx], y_values[val_idx]

        # Train model on this fold's training split
        model.fit(X_train_fold, y_train_fold)

        # Probability of the positive class (column 1), then hard labels
        y_proba_fold = model.predict_proba(X_val_fold)[:, 1]
        y_pred_fold = (y_proba_fold >= threshold).astype(int)

        # AUC is scored on probabilities; the other metrics on hard labels
        scores = {
            "auc": roc_auc_score(y_val_fold, y_proba_fold),
            "f1": f1_score(y_val_fold, y_pred_fold),
            "precision": precision_score(y_val_fold, y_pred_fold),
            "recall": recall_score(y_val_fold, y_pred_fold),
            "accuracy": accuracy_score(y_val_fold, y_pred_fold),
        }
        for metric_name, score in scores.items():
            fold_metrics[metric_name].append(score)

        print(f"Fold {fold_idx}/{n_splits}:")
        print(
            f"  AUC: {scores['auc']:.4f} | F1: {scores['f1']:.4f} | "
            f"Precision: {scores['precision']:.4f} | Recall: {scores['recall']:.4f} | "
            f"Accuracy: {scores['accuracy']:.4f}"
        )

    # Aggregate across folds; cast to built-in floats so results serialise cleanly
    cv_results = {
        metric_name: {
            "mean": float(np.mean(values)),
            "std": float(np.std(values)),
            "values": [float(v) for v in values],
        }
        for metric_name, values in fold_metrics.items()
    }

    print(f"\n{'='*60}")
    print("Cross-Validation Results (Mean ± Std)")
    print(f"{'='*60}")
    for metric_name, stats in cv_results.items():
        print(f"{metric_name.upper():12s}: {stats['mean']:.4f} ± {stats['std']:.4f}")
    print(f"{'='*60}\n")

    return cv_results


def get_cv_summary(cv_results: Dict[str, Dict[str, float]]) -> Dict[str, float]:
    """
    Flatten cross-validation results down to their per-metric means.

    Args:
        cv_results: Mapping of metric name to a stats dictionary that
            contains at least a "mean" entry

    Returns:
        Dictionary mapping "cv_<metric>_mean" to that metric's mean value
    """
    summary = {}
    for metric in cv_results:
        # Only the mean is kept; every other statistic is dropped.
        summary["cv_" + str(metric) + "_mean"] = cv_results[metric]["mean"]
    return summary