Spaces:
Sleeping
Sleeping
| """ | |
| evaluate.py | |
| ─────────── | |
| Step 5 (optional). Loads saved model artifacts and runs a full evaluation report. | |
| Saves plots to model_artifacts/ and prints a hackathon-ready summary. | |
| Run after train_models.py. | |
| """ | |
| import sys, json, warnings | |
| import numpy as np | |
| import pandas as pd | |
| import joblib | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from pathlib import Path | |
| from rich.console import Console | |
| from rich.table import Table | |
| from rich.panel import Panel | |
| from sklearn.metrics import ( | |
| roc_curve, auc, classification_report, | |
| confusion_matrix, precision_recall_curve, average_precision_score, | |
| ) | |
| warnings.filterwarnings("ignore") | |
| console = Console() | |
| sys.path.insert(0, str(Path(__file__).parent)) | |
| from config import ( | |
| PROCESSED_DIR, ARTIFACTS_DIR, BEHAVIORAL_FEATURES, | |
| TARGET_FAILURE, TARGET_STYLE, RANDOM_STATE, TEST_SIZE | |
| ) | |
| from sklearn.model_selection import train_test_split | |
def load_artifacts():
    """Load every persisted model artifact from ``ARTIFACTS_DIR``.

    Returns:
        A 5-tuple ``(failure_model, scaler, ws_model, ws_encoder, ds_model)``,
        each element deserialised with ``joblib.load``.
    """
    # Order matters: callers unpack the returned tuple positionally.
    artifact_files = (
        "failure_predictor.pkl",
        "feature_scaler.pkl",
        "work_style_classifier.pkl",
        "work_style_label_encoder.pkl",
        "distraction_scorer.pkl",
    )
    return tuple(joblib.load(ARTIFACTS_DIR / filename) for filename in artifact_files)
def evaluate_failure_predictor(model, scaler, df, *, decision_threshold=0.5, alert_threshold=0.65):
    """Evaluate the failure predictor on a held-out split and save a 3-panel plot.

    The feature columns are scaled with ``scaler``, the data is re-split with
    ``TEST_SIZE`` / ``RANDOM_STATE`` (stratified on the target), and the test
    portion is scored with ``model.predict_proba``. A figure with the ROC curve,
    the precision-recall curve and the score distribution is written to
    ``ARTIFACTS_DIR / "failure_predictor_eval.png"``; a classification report
    and the ROC-AUC / average precision are printed to the console.

    Args:
        model: Fitted classifier exposing ``predict_proba``.
        scaler: Fitted transformer exposing ``transform``.
        df: DataFrame containing the ``TARGET_FAILURE`` column and the
            behavioural feature columns.
        decision_threshold: Probability cut-off used to derive hard
            predictions for the classification report.
        alert_threshold: Probability marked with a vertical line on the score
            distribution panel. It is only drawn, never used for predictions.

    Returns:
        None.
    """
    console.rule("[bold cyan]A — Failure Predictor Evaluation[/bold cyan]")

    # NOTE(review): features missing from df are skipped silently; presumably
    # the resulting column set must match what the scaler was fitted on — verify.
    feat_cols = [c for c in BEHAVIORAL_FEATURES if c in df.columns]
    X = scaler.transform(df[feat_cols].values)
    y = df[TARGET_FAILURE].values

    # Only the test half is needed here.
    # NOTE(review): presumably mirrors the split made in train_models.py — confirm.
    _, X_test, _, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= decision_threshold).astype(int)

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    # Precision-recall curve
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    ap = average_precision_score(y_test, y_proba)

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    fig.suptitle("Task Failure Predictor — Evaluation Report", fontsize=14, fontweight="bold")
    roc_ax, pr_ax, dist_ax = axes

    # Panel 1: ROC
    roc_ax.plot(fpr, tpr, color="#00D4FF", lw=2, label=f"AUC = {roc_auc:.3f}")
    roc_ax.plot([0, 1], [0, 1], "k--", lw=1)  # chance diagonal
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title("ROC Curve")
    roc_ax.legend()
    roc_ax.set_facecolor("#0D1B2A")
    roc_ax.figure.patch.set_alpha(0)

    # Panel 2: precision-recall
    pr_ax.plot(recall, precision, color="#FF6B6B", lw=2, label=f"AP = {ap:.3f}")
    pr_ax.set_xlabel("Recall")
    pr_ax.set_ylabel("Precision")
    pr_ax.set_title("Precision–Recall Curve")
    pr_ax.legend()

    # Panel 3: score distribution per true class.
    # The x-axis is the class-1 probability and is labelled as the failure
    # probability, so y == 1 is the failed class and y == 0 the completed one.
    dist_ax.hist(y_proba[y_test == 0], bins=30, alpha=0.6, color="#4ECDC4", label="Completed Tasks")
    dist_ax.hist(y_proba[y_test == 1], bins=30, alpha=0.6, color="#FF6B6B", label="Failed Tasks")
    dist_ax.axvline(
        alert_threshold,
        color="yellow",
        linestyle="--",
        label=f"Threshold ({alert_threshold:.2f})",
    )
    dist_ax.set_xlabel("Predicted Failure Probability")
    dist_ax.set_ylabel("Count")
    dist_ax.set_title("Score Distribution")
    dist_ax.legend()

    plt.tight_layout()
    fig.savefig(ARTIFACTS_DIR / "failure_predictor_eval.png", dpi=150, bbox_inches="tight")
    # Close this specific figure rather than whatever pyplot considers current.
    plt.close(fig)
    console.log("[green]✓ Saved: failure_predictor_eval.png[/green]")

    console.print(classification_report(y_test, y_pred))
    console.log(f"ROC-AUC: {roc_auc:.4f} | Avg Precision: {ap:.4f}")
def evaluate_work_style(ws_model, ws_encoder, df):
    """Evaluate the work-style classifier and save its confusion matrix.

    Rows whose ``TARGET_STYLE`` is one of ``turtle`` / ``hare`` / ``hybrid`` are
    kept, re-split with ``TEST_SIZE`` / ``RANDOM_STATE`` (stratified), and the
    test portion is scored with ``ws_model.predict``. A classification report
    is printed and a heatmap is written to
    ``ARTIFACTS_DIR / "work_style_confusion_matrix.png"``.

    Args:
        ws_model: Fitted classifier exposing ``predict``.
        ws_encoder: Fitted label encoder exposing ``transform`` and ``classes_``.
        df: DataFrame containing ``TARGET_STYLE`` and the work-style features.

    Returns:
        None. Returns early (after logging a warning) when fewer than 50
        labelled rows are available.
    """
    console.rule("[bold cyan]B — Work Style Classifier Evaluation[/bold cyan]")

    WORK_STYLE_FEATURES = [
        "session_duration_minutes", "break_count", "distraction_events",
        "stress_level", "motivation_level", "previous_completion_rate",
        "deadline_days_remaining",
    ]

    df_ws = df[df[TARGET_STYLE].isin(["turtle", "hare", "hybrid"])].copy()
    if len(df_ws) < 50:
        console.log("[yellow]Insufficient labelled rows for work style evaluation.[/yellow]")
        return

    feat_cols = [c for c in WORK_STYLE_FEATURES if c in df_ws.columns]
    # NOTE(review): features are passed unscaled here, unlike the failure
    # predictor — confirm this matches how the classifier was trained.
    X = df_ws[feat_cols].values
    y = ws_encoder.transform(df_ws[TARGET_STYLE])

    _, X_test, _, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    y_pred = ws_model.predict(X_test)

    # Pin the label set to the encoder's classes so the report rows and the
    # confusion-matrix axes always line up with the class names, even when a
    # class is absent from this particular test split.
    class_names = ws_encoder.classes_
    label_ids = list(range(len(class_names)))

    console.print(
        classification_report(y_test, y_pred, labels=label_ids, target_names=class_names)
    )

    cm = confusion_matrix(y_test, y_pred, labels=label_ids)
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        cm,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_title("Work Style Classifier — Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    fig.savefig(ARTIFACTS_DIR / "work_style_confusion_matrix.png", dpi=150)
    # Close this specific figure rather than whatever pyplot considers current.
    plt.close(fig)
    console.log("[green]✓ Saved: work_style_confusion_matrix.png[/green]")
def print_hackathon_summary():
    """Print a metrics table built from ``metrics.json`` and list the artifacts.

    Reads ``ARTIFACTS_DIR / "metrics.json"``; when the file is missing, an error
    is logged and nothing else is printed. Otherwise one table section is
    rendered per model that has a non-empty entry in the file, followed by the
    sorted names of everything inside ``ARTIFACTS_DIR``.
    """
    console.print(Panel.fit("🏆 Hackathon-Ready Model Summary", style="bold yellow"))

    metrics_file = ARTIFACTS_DIR / "metrics.json"
    if not metrics_file.exists():
        console.log("[red]metrics.json not found. Run train_models.py first.[/red]")
        return

    with open(metrics_file) as handle:
        metrics = json.load(handle)

    table = Table(title="Model Performance Metrics")
    table.add_column("Model", style="cyan", min_width=30)
    table.add_column("Metric", style="white")
    table.add_column("Result", style="bold green", justify="right")

    failure = metrics.get("failure_predictor", {})
    work_style = metrics.get("work_style_classifier", {})
    distraction = metrics.get("distraction_scorer", {})

    # Each section contributes rows only when its metrics entry is non-empty;
    # the model name appears on the first row of a section only.
    if failure:
        cv_auc = f"{failure.get('cv_auc_mean')} ± {failure.get('cv_auc_std')}"
        for row in (
            ("Task Failure Predictor (XGBoost)", "AUC-ROC", str(failure.get("auc_roc"))),
            ("", "F1 Score", str(failure.get("f1_score"))),
            ("", "Accuracy", str(failure.get("accuracy"))),
            ("", "5-fold CV AUC", cv_auc),
        ):
            table.add_row(*row)

    if work_style:
        for row in (
            ("Work Style Classifier (RF)", "Accuracy", str(work_style.get("accuracy"))),
            ("", "Macro-F1", str(work_style.get("macro_f1"))),
        ):
            table.add_row(*row)

    if distraction:
        for row in (
            ("Distraction Scorer (GBR)", "RMSE", str(distraction.get("rmse"))),
            ("", "R²", str(distraction.get("r2"))),
        ):
            table.add_row(*row)

    console.print(table)

    console.print("\n[bold]Artifacts ready for backend integration:[/bold]")
    for artifact in sorted(ARTIFACTS_DIR.iterdir()):
        console.print(f" [dim]→[/dim] {artifact.name}")
def main():
    """Run the full evaluation: both classifier reports, then the summary.

    Exits with status 1 when ``PROCESSED_DIR / "training_dataset.csv"`` does
    not exist. Otherwise the dataset and the saved artifacts are loaded and the
    failure-predictor evaluation, the work-style evaluation and the summary
    table are produced in that order.
    """
    console.print(Panel.fit("📊 Step 5 — Full Evaluation Report", style="bold magenta"))

    dataset_csv = PROCESSED_DIR / "training_dataset.csv"
    if not dataset_csv.exists():
        console.print("[red]Run generate_data.py and preprocess.py first.[/red]")
        raise SystemExit(1)

    frame = pd.read_csv(dataset_csv)

    # The distraction scorer is loaded with the rest but not evaluated here.
    failure_model, scaler, ws_model, ws_encoder, _distraction_model = load_artifacts()

    evaluate_failure_predictor(failure_model, scaler, frame)
    evaluate_work_style(ws_model, ws_encoder, frame)
    print_hackathon_summary()

    console.print("\n[bold green]✅ Evaluation complete. All plots saved to model_artifacts/[/bold green]")


if __name__ == "__main__":
    main()