"""
evaluate.py
───────────
Step 5 (optional). Loads saved model artifacts and runs a full evaluation
report. Saves plots to model_artifacts/ and prints a hackathon-ready summary.

Run after train_models.py.
"""

import json
import sys
import warnings
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rich.console import Console
from rich.panel import Panel
from rich.table import Table
from sklearn.metrics import (
    roc_curve,
    auc,
    classification_report,
    confusion_matrix,
    precision_recall_curve,
    average_precision_score,
)
from sklearn.model_selection import train_test_split

warnings.filterwarnings("ignore")
console = Console()

# Make the sibling ``config`` module importable when this file is run as a
# script from another working directory.
sys.path.insert(0, str(Path(__file__).parent))
from config import (
    PROCESSED_DIR,
    ARTIFACTS_DIR,
    BEHAVIORAL_FEATURES,
    TARGET_FAILURE,
    TARGET_STYLE,
    RANDOM_STATE,
    TEST_SIZE,
)


def load_artifacts():
    """Load the five persisted model artifacts from ``ARTIFACTS_DIR``.

    Returns:
        A tuple ``(failure_model, scaler, ws_model, ws_encoder, ds_model)``.

    Raises:
        FileNotFoundError: if any of the expected ``.pkl`` files is missing.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only point ARTIFACTS_DIR
    # at files produced by a trusted training run.
    failure_model = joblib.load(ARTIFACTS_DIR / "failure_predictor.pkl")
    scaler = joblib.load(ARTIFACTS_DIR / "feature_scaler.pkl")
    ws_model = joblib.load(ARTIFACTS_DIR / "work_style_classifier.pkl")
    ws_encoder = joblib.load(ARTIFACTS_DIR / "work_style_label_encoder.pkl")
    ds_model = joblib.load(ARTIFACTS_DIR / "distraction_scorer.pkl")
    return failure_model, scaler, ws_model, ws_encoder, ds_model


def evaluate_failure_predictor(model, scaler, df: pd.DataFrame, *, threshold: float = 0.5) -> None:
    """Evaluate the failure predictor on a held-out split and save the plots.

    The scaled features are split with ``TEST_SIZE`` / ``RANDOM_STATE``
    (stratified on the target) and only the test part is scored. A three-panel
    figure (ROC curve, precision-recall curve, score distribution) is written
    to ``ARTIFACTS_DIR / "failure_predictor_eval.png"`` and a classification
    report is printed.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted transformer applied to the feature matrix.
        df: Dataset containing the behavioral feature columns and the
            ``TARGET_FAILURE`` column.
        threshold: Probability cut-off used both for the hard predictions in
            the classification report and for the vertical marker in the
            score-distribution panel, so the two always agree.
    """
    console.rule("[bold cyan]A — Failure Predictor Evaluation[/bold cyan]")

    feat_cols = [c for c in BEHAVIORAL_FEATURES if c in df.columns]
    X = scaler.transform(df[feat_cols].values)
    y = df[TARGET_FAILURE].values

    # Only the held-out part is needed; the training part is discarded.
    _, X_test, _, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= threshold).astype(int)

    # ROC Curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    # PR Curve
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    ap = average_precision_score(y_test, y_proba)

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    fig.suptitle("Task Failure Predictor — Evaluation Report", fontsize=14, fontweight="bold")

    # Plot ROC
    axes[0].plot(fpr, tpr, color="#00D4FF", lw=2, label=f"AUC = {roc_auc:.3f}")
    axes[0].plot([0, 1], [0, 1], "k--", lw=1)
    axes[0].set_xlabel("False Positive Rate")
    axes[0].set_ylabel("True Positive Rate")
    axes[0].set_title("ROC Curve")
    axes[0].legend()
    axes[0].set_facecolor("#0D1B2A")
    axes[0].figure.patch.set_alpha(0)

    # Plot PR
    axes[1].plot(recall, precision, color="#FF6B6B", lw=2, label=f"AP = {ap:.3f}")
    axes[1].set_xlabel("Recall")
    axes[1].set_ylabel("Precision")
    axes[1].set_title("Precision–Recall Curve")
    axes[1].legend()

    # Plot score distribution.
    # The x-axis is the class-1 probability and is titled "Predicted Failure
    # Probability", so class 1 is treated as "failed" and class 0 as
    # "completed" — NOTE(review): confirm against the TARGET_FAILURE encoding.
    axes[2].hist(y_proba[y_test == 1], bins=30, alpha=0.6, color="#FF6B6B", label="Failed Tasks")
    axes[2].hist(y_proba[y_test == 0], bins=30, alpha=0.6, color="#4ECDC4", label="Completed Tasks")
    axes[2].axvline(threshold, color="yellow", linestyle="--", label=f"Threshold ({threshold:.2f})")
    axes[2].set_xlabel("Predicted Failure Probability")
    axes[2].set_ylabel("Count")
    axes[2].set_title("Score Distribution")
    axes[2].legend()

    plt.tight_layout()
    fig.savefig(ARTIFACTS_DIR / "failure_predictor_eval.png", dpi=150, bbox_inches="tight")
    # Close this specific figure rather than whatever happens to be current.
    plt.close(fig)
    console.log("[green]✓ Saved: failure_predictor_eval.png[/green]")

    console.print(classification_report(y_test, y_pred))
    console.log(f"ROC-AUC: {roc_auc:.4f} | Avg Precision: {ap:.4f}")


def evaluate_work_style(ws_model, ws_encoder, df: pd.DataFrame) -> None:
    """Evaluate the work-style classifier and save its confusion matrix.

    Rows whose ``TARGET_STYLE`` is one of ``turtle`` / ``hare`` / ``hybrid``
    are kept; if fewer than 50 remain the evaluation is skipped with a
    warning. Otherwise a classification report is printed and the confusion
    matrix is written to
    ``ARTIFACTS_DIR / "work_style_confusion_matrix.png"``.

    Args:
        ws_model: Fitted classifier exposing ``predict``.
        ws_encoder: Fitted label encoder exposing ``transform`` and ``classes_``.
        df: Dataset containing the work-style feature columns and the
            ``TARGET_STYLE`` column.
    """
    console.rule("[bold cyan]B — Work Style Classifier Evaluation[/bold cyan]")

    # Presumably mirrors the feature list used when the classifier was
    # trained — verify against train_models.py.
    WORK_STYLE_FEATURES = [
        "session_duration_minutes",
        "break_count",
        "distraction_events",
        "stress_level",
        "motivation_level",
        "previous_completion_rate",
        "deadline_days_remaining",
    ]

    df_ws = df[df[TARGET_STYLE].isin(["turtle", "hare", "hybrid"])].copy()
    if len(df_ws) < 50:
        console.log("[yellow]Insufficient labelled rows for work style evaluation.[/yellow]")
        return

    feat_cols = [c for c in WORK_STYLE_FEATURES if c in df_ws.columns]
    X = df_ws[feat_cols].values
    y = ws_encoder.transform(df_ws[TARGET_STYLE])

    _, X_test, _, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    y_pred = ws_model.predict(X_test)

    console.print(classification_report(y_test, y_pred, target_names=ws_encoder.classes_))

    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=ws_encoder.classes_,
        yticklabels=ws_encoder.classes_,
        ax=ax,
    )
    ax.set_title("Work Style Classifier — Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    fig.savefig(ARTIFACTS_DIR / "work_style_confusion_matrix.png", dpi=150)
    plt.close(fig)
    console.log("[green]✓ Saved: work_style_confusion_matrix.png[/green]")


def print_hackathon_summary() -> None:
    """Print the metrics table from ``metrics.json`` and list saved artifacts.

    If ``ARTIFACTS_DIR / "metrics.json"`` does not exist, an error is logged
    and nothing else is printed.
    """
    console.print(Panel.fit("🏆 Hackathon-Ready Model Summary", style="bold yellow"))

    metrics_path = ARTIFACTS_DIR / "metrics.json"
    if not metrics_path.exists():
        console.log("[red]metrics.json not found. Run train_models.py first.[/red]")
        return

    with open(metrics_path, encoding="utf-8") as fh:
        metrics = json.load(fh)

    table = Table(title="Model Performance Metrics")
    table.add_column("Model", style="cyan", min_width=30)
    table.add_column("Metric", style="white")
    table.add_column("Result", style="bold green", justify="right")

    fp = metrics.get("failure_predictor", {})
    ws = metrics.get("work_style_classifier", {})
    ds = metrics.get("distraction_scorer", {})

    # Each model contributes rows only when its section is present and
    # non-empty in metrics.json.
    if fp:
        table.add_row("Task Failure Predictor (XGBoost)", "AUC-ROC", str(fp.get("auc_roc")))
        table.add_row("", "F1 Score", str(fp.get("f1_score")))
        table.add_row("", "Accuracy", str(fp.get("accuracy")))
        table.add_row("", "5-fold CV AUC", f"{fp.get('cv_auc_mean')} ± {fp.get('cv_auc_std')}")
    if ws:
        table.add_row("Work Style Classifier (RF)", "Accuracy", str(ws.get("accuracy")))
        table.add_row("", "Macro-F1", str(ws.get("macro_f1")))
    if ds:
        table.add_row("Distraction Scorer (GBR)", "RMSE", str(ds.get("rmse")))
        table.add_row("", "R²", str(ds.get("r2")))

    console.print(table)

    console.print("\n[bold]Artifacts ready for backend integration:[/bold]")
    for artifact in sorted(ARTIFACTS_DIR.iterdir()):
        console.print(f" [dim]→[/dim] {artifact.name}")


def main() -> None:
    """Run the full evaluation: both model reports, then the summary table.

    Exits with status 1 if the processed training dataset is missing.
    """
    console.print(Panel.fit("📊 Step 5 — Full Evaluation Report", style="bold magenta"))

    training_path = PROCESSED_DIR / "training_dataset.csv"
    if not training_path.exists():
        console.print("[red]Run generate_data.py and preprocess.py first.[/red]")
        sys.exit(1)

    df = pd.read_csv(training_path)
    # ds_model is loaded alongside the others but is not evaluated here.
    failure_model, scaler, ws_model, ws_encoder, ds_model = load_artifacts()

    evaluate_failure_predictor(failure_model, scaler, df)
    evaluate_work_style(ws_model, ws_encoder, df)
    print_hackathon_summary()

    console.print("\n[bold green]✅ Evaluation complete. All plots saved to model_artifacts/[/bold green]")


if __name__ == "__main__":
    main()