# Source: data_pipeline/evaluate.py — uploaded by agentbaba with huggingface_hub (commit 836a349, verified).
"""
evaluate.py
───────────
Step 5 (optional). Loads saved model artifacts and runs a full evaluation report.
Saves plots to model_artifacts/ and prints a hackathon-ready summary.
Run after train_models.py.
"""
import sys, json, warnings
import numpy as np
import pandas as pd
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from rich.console import Console
from rich.table import Table
from rich.panel import Panel
from sklearn.metrics import (
roc_curve, auc, classification_report,
confusion_matrix, precision_recall_curve, average_precision_score,
)
warnings.filterwarnings("ignore")
console = Console()
sys.path.insert(0, str(Path(__file__).parent))
from config import (
PROCESSED_DIR, ARTIFACTS_DIR, BEHAVIORAL_FEATURES,
TARGET_FAILURE, TARGET_STYLE, RANDOM_STATE, TEST_SIZE
)
from sklearn.model_selection import train_test_split
def load_artifacts():
    """Load the five persisted model artifacts from ``ARTIFACTS_DIR``.

    Returns:
        A 5-tuple ``(failure_model, scaler, ws_model, ws_encoder, ds_model)``,
        in that order.

    Note:
        ``joblib.load`` unpickles the files, so only load artifacts that were
        produced by a trusted training run.
    """
    # Order matters: callers unpack the result positionally.
    artifact_files = (
        "failure_predictor.pkl",
        "feature_scaler.pkl",
        "work_style_classifier.pkl",
        "work_style_label_encoder.pkl",
        "distraction_scorer.pkl",
    )
    return tuple(joblib.load(ARTIFACTS_DIR / filename) for filename in artifact_files)
def evaluate_failure_predictor(model, scaler, df):
    """Score the failure classifier on a held-out split and save a 3-panel report.

    The frame is scaled with ``scaler``, split with the configured
    ``TEST_SIZE`` / ``RANDOM_STATE`` (stratified on the target), and only the
    test partition is scored. The figure (ROC curve, precision-recall curve,
    score histogram) is written to ``ARTIFACTS_DIR / "failure_predictor_eval.png"``
    and a classification report plus ROC-AUC / average precision are printed.

    Args:
        model: Fitted binary classifier exposing ``predict_proba``; column 1 is
            treated as the failure probability.
        scaler: Fitted transformer applied to the feature matrix.
        df: DataFrame containing the ``TARGET_FAILURE`` column and (a subset
            of) the ``BEHAVIORAL_FEATURES`` columns.

    Returns:
        None.
    """
    console.rule("[bold cyan]A — Failure Predictor Evaluation[/bold cyan]")

    # Cut-off used for the printed classification report.
    decision_threshold = 0.5
    # Reference line drawn on the score histogram.
    # NOTE(review): this differs from decision_threshold — confirm which value
    # the backend actually applies and align the two if they should match.
    plot_threshold = 0.65

    # Use only the configured features that are present in the frame.
    feat_cols = [c for c in BEHAVIORAL_FEATURES if c in df.columns]
    X = scaler.transform(df[feat_cols].values)
    y = df[TARGET_FAILURE].values

    # Only the test partition is evaluated; the train partition is discarded.
    _, X_test, _, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    y_proba = model.predict_proba(X_test)[:, 1]
    y_pred = (y_proba >= decision_threshold).astype(int)

    # ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_proba)
    roc_auc = auc(fpr, tpr)

    # Precision-recall curve
    precision, recall, _ = precision_recall_curve(y_test, y_proba)
    ap = average_precision_score(y_test, y_proba)

    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
    fig.suptitle("Task Failure Predictor — Evaluation Report", fontsize=14, fontweight="bold")

    # Panel 1: ROC
    roc_ax = axes[0]
    roc_ax.plot(fpr, tpr, color="#00D4FF", lw=2, label=f"AUC = {roc_auc:.3f}")
    roc_ax.plot([0, 1], [0, 1], "k--", lw=1)  # chance diagonal
    roc_ax.set_xlabel("False Positive Rate")
    roc_ax.set_ylabel("True Positive Rate")
    roc_ax.set_title("ROC Curve")
    roc_ax.legend()
    roc_ax.set_facecolor("#0D1B2A")
    roc_ax.figure.patch.set_alpha(0)  # transparent figure background

    # Panel 2: precision-recall
    pr_ax = axes[1]
    pr_ax.plot(recall, precision, color="#FF6B6B", lw=2, label=f"AP = {ap:.3f}")
    pr_ax.set_xlabel("Recall")
    pr_ax.set_ylabel("Precision")
    pr_ax.set_title("Precision–Recall Curve")
    pr_ax.legend()

    # Panel 3: score distribution per true class.
    # The plotted score is the class-1 probability and the axis is labelled
    # "Predicted Failure Probability", so class 1 is the failed class and
    # class 0 the completed class (the legend previously had these swapped).
    dist_ax = axes[2]
    dist_ax.hist(y_proba[y_test == 0], bins=30, alpha=0.6, color="#4ECDC4", label="Completed Tasks")
    dist_ax.hist(y_proba[y_test == 1], bins=30, alpha=0.6, color="#FF6B6B", label="Failed Tasks")
    dist_ax.axvline(
        plot_threshold, color="yellow", linestyle="--", label=f"Threshold ({plot_threshold})"
    )
    dist_ax.set_xlabel("Predicted Failure Probability")
    dist_ax.set_ylabel("Count")
    dist_ax.set_title("Score Distribution")
    dist_ax.legend()

    plt.tight_layout()
    fig.savefig(ARTIFACTS_DIR / "failure_predictor_eval.png", dpi=150, bbox_inches="tight")
    # Close this specific figure rather than whichever one happens to be current.
    plt.close(fig)

    console.log("[green]✓ Saved: failure_predictor_eval.png[/green]")
    console.print(classification_report(y_test, y_pred))
    console.log(f"ROC-AUC: {roc_auc:.4f} | Avg Precision: {ap:.4f}")
def evaluate_work_style(ws_model, ws_encoder, df):
    """Evaluate the work-style classifier and save its confusion matrix.

    Rows whose ``TARGET_STYLE`` is one of ``turtle`` / ``hare`` / ``hybrid``
    are kept, split with the configured ``TEST_SIZE`` / ``RANDOM_STATE``
    (stratified), and the test partition is scored. A classification report is
    printed and the confusion-matrix heatmap is written to
    ``ARTIFACTS_DIR / "work_style_confusion_matrix.png"``.

    Args:
        ws_model: Fitted classifier exposing ``predict``.
        ws_encoder: Fitted label encoder exposing ``transform`` and ``classes_``.
        df: DataFrame containing ``TARGET_STYLE`` and the work-style features.

    Returns:
        None. Returns early (after logging a warning) when fewer than 50
        labelled rows are available.
    """
    console.rule("[bold cyan]B — Work Style Classifier Evaluation[/bold cyan]")

    work_style_features = [
        "session_duration_minutes", "break_count", "distraction_events",
        "stress_level", "motivation_level", "previous_completion_rate",
        "deadline_days_remaining",
    ]

    df_ws = df[df[TARGET_STYLE].isin(["turtle", "hare", "hybrid"])].copy()
    if len(df_ws) < 50:
        console.log("[yellow]Insufficient labelled rows for work style evaluation.[/yellow]")
        return

    # Use only the listed features that are present in the frame.
    feat_cols = [c for c in work_style_features if c in df_ws.columns]
    X = df_ws[feat_cols].values
    y = ws_encoder.transform(df_ws[TARGET_STYLE])

    # Only the test partition is evaluated.
    _, X_test, _, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )
    y_pred = ws_model.predict(X_test)

    # Pin the label set to every class the encoder knows. Without this, a class
    # missing from the evaluated rows makes classification_report raise on the
    # target_names length mismatch and shrinks the confusion matrix so it no
    # longer lines up with the tick labels.
    class_ids = np.arange(len(ws_encoder.classes_))

    console.print(
        classification_report(
            y_test, y_pred, labels=class_ids, target_names=ws_encoder.classes_
        )
    )

    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    fig, ax = plt.subplots(figsize=(6, 5))
    sns.heatmap(
        cm, annot=True, fmt="d", cmap="Blues",
        xticklabels=ws_encoder.classes_,
        yticklabels=ws_encoder.classes_, ax=ax,
    )
    ax.set_title("Work Style Classifier — Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    fig.tight_layout()
    fig.savefig(ARTIFACTS_DIR / "work_style_confusion_matrix.png", dpi=150)
    # Close this specific figure rather than whichever one happens to be current.
    plt.close(fig)
    console.log("[green]✓ Saved: work_style_confusion_matrix.png[/green]")
def print_hackathon_summary():
    """Print a metrics table from ``metrics.json`` and list the artifact files.

    Reads ``ARTIFACTS_DIR / "metrics.json"``; if the file is missing, logs an
    error and returns. Otherwise renders one table section per model whose
    entry is present and non-empty (``failure_predictor``,
    ``work_style_classifier``, ``distraction_scorer``), then prints the name of
    every entry in ``ARTIFACTS_DIR`` in sorted order.

    Returns:
        None.
    """
    console.print(Panel.fit("🏆 Hackathon-Ready Model Summary", style="bold yellow"))

    metrics_path = ARTIFACTS_DIR / "metrics.json"
    if not metrics_path.exists():
        console.log("[red]metrics.json not found. Run train_models.py first.[/red]")
        return

    # JSON is UTF-8 by specification; do not depend on the platform locale.
    with open(metrics_path, encoding="utf-8") as f:
        metrics = json.load(f)

    table = Table(title="Model Performance Metrics")
    table.add_column("Model", style="cyan", min_width=30)
    table.add_column("Metric", style="white")
    table.add_column("Result", style="bold green", justify="right")

    def add_section(title, rows):
        """Add ``rows`` to the table, showing the model title on the first row only."""
        for index, (metric_name, value) in enumerate(rows):
            table.add_row(title if index == 0 else "", metric_name, value)

    fp = metrics.get("failure_predictor", {})
    ws = metrics.get("work_style_classifier", {})
    ds = metrics.get("distraction_scorer", {})

    if fp:
        add_section("Task Failure Predictor (XGBoost)", [
            ("AUC-ROC", str(fp.get("auc_roc"))),
            ("F1 Score", str(fp.get("f1_score"))),
            ("Accuracy", str(fp.get("accuracy"))),
            ("5-fold CV AUC", f"{fp.get('cv_auc_mean')} ± {fp.get('cv_auc_std')}"),
        ])
    if ws:
        add_section("Work Style Classifier (RF)", [
            ("Accuracy", str(ws.get("accuracy"))),
            ("Macro-F1", str(ws.get("macro_f1"))),
        ])
    if ds:
        add_section("Distraction Scorer (GBR)", [
            ("RMSE", str(ds.get("rmse"))),
            ("R²", str(ds.get("r2"))),
        ])

    console.print(table)

    console.print("\n[bold]Artifacts ready for backend integration:[/bold]")
    for artifact in sorted(ARTIFACTS_DIR.iterdir()):
        console.print(f" [dim]→[/dim] {artifact.name}")
def main():
    """Run the full evaluation: both classifier reports, then the summary table.

    Exits with status 1 if ``PROCESSED_DIR / "training_dataset.csv"`` does not
    exist.
    """
    console.print(Panel.fit("📊 Step 5 — Full Evaluation Report", style="bold magenta"))

    dataset_csv = PROCESSED_DIR / "training_dataset.csv"
    if not dataset_csv.exists():
        console.print("[red]Run generate_data.py and preprocess.py first.[/red]")
        sys.exit(1)

    frame = pd.read_csv(dataset_csv)

    # The fifth artifact (distraction scorer) is loaded but not evaluated here.
    failure_model, scaler, ws_model, ws_encoder, _distraction_scorer = load_artifacts()

    evaluate_failure_predictor(failure_model, scaler, frame)
    evaluate_work_style(ws_model, ws_encoder, frame)
    print_hackathon_summary()

    console.print("\n[bold green]✅ Evaluation complete. All plots saved to model_artifacts/[/bold green]")
# Script entry point: run the evaluation only when executed directly, not on import.
if __name__ == "__main__":
    main()