Spaces:

agentbaba
/

productivity-copilot-env

Sleeping

App Files Files Community

productivity-copilot-env / data_pipeline /evaluate.py

agentbaba

Upload data_pipeline/evaluate.py with huggingface_hub

836a349 verified about 2 months ago

raw

history blame contribute delete

7.7 kB

	"""
	evaluate.py
	───────────
	Step 5 (optional). Loads saved model artifacts and runs a full evaluation report.
	Saves plots to model_artifacts/ and prints a hackathon-ready summary.

	Run after train_models.py.
	"""

	import sys, json, warnings
	import numpy as np
	import pandas as pd
	import joblib
	import matplotlib.pyplot as plt
	import seaborn as sns
	from pathlib import Path
	from rich.console import Console
	from rich.table import Table
	from rich.panel import Panel

	from sklearn.metrics import (
	roc_curve, auc, classification_report,
	confusion_matrix, precision_recall_curve, average_precision_score,
	)

	# Silence every warning category for the rest of the process so library
	# warnings do not clutter the printed report.
	warnings.filterwarnings("ignore")
	# One rich console instance used by the functions in this file.
	console = Console()

	# Put this file's own directory at the front of the import path before the
	# `from config import ...` statement that follows.
	sys.path.insert(0, str(Path(__file__).parent))
	from config import (
	PROCESSED_DIR, ARTIFACTS_DIR, BEHAVIORAL_FEATURES,
	TARGET_FAILURE, TARGET_STYLE, RANDOM_STATE, TEST_SIZE
	)
	from sklearn.model_selection import train_test_split


	def load_artifacts():
	    """Load every persisted artifact from ``ARTIFACTS_DIR`` with joblib.

	    Returns:
	        tuple: ``(failure_model, scaler, ws_model, ws_encoder, ds_model)``,
	        loaded and returned in exactly that order.
	    """
	    # File names are listed in the order callers unpack the result.
	    artifact_files = (
	        "failure_predictor.pkl",
	        "feature_scaler.pkl",
	        "work_style_classifier.pkl",
	        "work_style_label_encoder.pkl",
	        "distraction_scorer.pkl",
	    )
	    return tuple(joblib.load(ARTIFACTS_DIR / file_name) for file_name in artifact_files)


	def evaluate_failure_predictor(model, scaler, df):
	    """Score the failure classifier on a held-out split and save diagnostic plots.

	    The feature columns listed in ``BEHAVIORAL_FEATURES`` (those present in
	    ``df``) are passed through the already-fitted ``scaler``; a stratified
	    split driven by ``TEST_SIZE`` and ``RANDOM_STATE`` provides the test rows.
	    A three-panel figure (ROC curve, precision-recall curve, score histogram)
	    is written to ``ARTIFACTS_DIR / "failure_predictor_eval.png"`` and a
	    classification report plus ROC-AUC / average precision are printed.

	    Args:
	        model: Fitted binary classifier exposing ``predict_proba``.
	        scaler: Fitted transformer exposing ``transform``.
	        df: DataFrame holding the feature columns and the ``TARGET_FAILURE``
	            column.

	    Returns:
	        None. Results are printed and the figure is saved to disk.
	    """
	    console.rule("[bold cyan]A — Failure Predictor Evaluation[/bold cyan]")
	    feat_cols = [c for c in BEHAVIORAL_FEATURES if c in df.columns]
	    X = scaler.transform(df[feat_cols].values)
	    y = df[TARGET_FAILURE].values

	    # Only the held-out portion is evaluated; the training portion is discarded.
	    # NOTE(review): presumably mirrors the split made in train_models.py — confirm.
	    _, X_test, _, y_test = train_test_split(
	        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
	    )

	    # Column 1 of predict_proba is the probability of the positive (failure) class.
	    y_proba = model.predict_proba(X_test)[:, 1]
	    y_pred = (y_proba >= 0.5).astype(int)

	    # ROC curve
	    fpr, tpr, _ = roc_curve(y_test, y_proba)
	    roc_auc = auc(fpr, tpr)

	    # Precision-recall curve
	    precision, recall, _ = precision_recall_curve(y_test, y_proba)
	    ap = average_precision_score(y_test, y_proba)

	    fig, axes = plt.subplots(1, 3, figsize=(16, 5))
	    fig.suptitle("Task Failure Predictor — Evaluation Report", fontsize=14, fontweight="bold")

	    # Panel 1: ROC curve with the chance diagonal for reference.
	    axes[0].plot(fpr, tpr, color="#00D4FF", lw=2, label=f"AUC = {roc_auc:.3f}")
	    axes[0].plot([0, 1], [0, 1], "k--", lw=1)
	    axes[0].set_xlabel("False Positive Rate")
	    axes[0].set_ylabel("True Positive Rate")
	    axes[0].set_title("ROC Curve")
	    axes[0].legend()
	    axes[0].set_facecolor("#0D1B2A")
	    axes[0].figure.patch.set_alpha(0)

	    # Panel 2: precision-recall curve.
	    axes[1].plot(recall, precision, color="#FF6B6B", lw=2, label=f"AP = {ap:.3f}")
	    axes[1].set_xlabel("Recall")
	    axes[1].set_ylabel("Precision")
	    axes[1].set_title("Precision–Recall Curve")
	    axes[1].legend()

	    # Panel 3: predicted failure probability per true class. The positive
	    # class (label 1) is the failure class, so those rows are the failed tasks.
	    axes[2].hist(y_proba[y_test == 1], bins=30, alpha=0.6, color="#FF6B6B", label="Failed Tasks")
	    axes[2].hist(y_proba[y_test == 0], bins=30, alpha=0.6, color="#4ECDC4", label="Completed Tasks")
	    # NOTE(review): the marker sits at 0.65 while y_pred above is cut at 0.5 —
	    # confirm which threshold the backend actually applies.
	    axes[2].axvline(0.65, color="yellow", linestyle="--", label="Threshold (0.65)")
	    axes[2].set_xlabel("Predicted Failure Probability")
	    axes[2].set_ylabel("Count")
	    axes[2].set_title("Score Distribution")
	    axes[2].legend()

	    plt.tight_layout()
	    fig.savefig(ARTIFACTS_DIR / "failure_predictor_eval.png", dpi=150, bbox_inches="tight")
	    # Close this specific figure rather than whichever one happens to be current.
	    plt.close(fig)
	    console.log("[green]✓ Saved: failure_predictor_eval.png[/green]")

	    console.print(classification_report(y_test, y_pred))
	    console.log(f"ROC-AUC: {roc_auc:.4f} | Avg Precision: {ap:.4f}")


	def evaluate_work_style(ws_model, ws_encoder, df):
	    """Evaluate the work-style classifier and save its confusion matrix.

	    Rows whose ``TARGET_STYLE`` value is ``"turtle"``, ``"hare"`` or
	    ``"hybrid"`` are kept; when fewer than 50 such rows exist the function
	    logs a notice and returns without evaluating. Otherwise a stratified
	    split driven by ``TEST_SIZE`` and ``RANDOM_STATE`` supplies the test rows,
	    a classification report is printed, and the confusion-matrix heatmap is
	    written to ``ARTIFACTS_DIR / "work_style_confusion_matrix.png"``.

	    Args:
	        ws_model: Fitted classifier exposing ``predict``.
	        ws_encoder: Fitted label encoder exposing ``transform`` and ``classes_``.
	        df: DataFrame holding the work-style feature columns and the
	            ``TARGET_STYLE`` column.

	    Returns:
	        None. Results are printed and the figure is saved to disk.
	    """
	    console.rule("[bold cyan]B — Work Style Classifier Evaluation[/bold cyan]")
	    WORK_STYLE_FEATURES = [
	        "session_duration_minutes", "break_count", "distraction_events",
	        "stress_level", "motivation_level", "previous_completion_rate",
	        "deadline_days_remaining",
	    ]
	    df_ws = df[df[TARGET_STYLE].isin(["turtle", "hare", "hybrid"])].copy()
	    if len(df_ws) < 50:
	        console.log("[yellow]Insufficient labelled rows for work style evaluation.[/yellow]")
	        return

	    feat_cols = [c for c in WORK_STYLE_FEATURES if c in df_ws.columns]
	    # NOTE(review): features reach the model unscaled here — confirm that
	    # matches how the classifier was trained.
	    X = df_ws[feat_cols].values
	    y = ws_encoder.transform(df_ws[TARGET_STYLE])
	    _, X_test, _, y_test = train_test_split(
	        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
	    )
	    y_pred = ws_model.predict(X_test)

	    # Pin the label set to every class the encoder knows, so the report rows
	    # and the heatmap ticks line up with classes_ even when a class is
	    # missing from this split.
	    class_ids = list(range(len(ws_encoder.classes_)))

	    console.print(
	        classification_report(
	            y_test, y_pred, labels=class_ids, target_names=ws_encoder.classes_
	        )
	    )

	    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
	    fig, ax = plt.subplots(figsize=(6, 5))
	    sns.heatmap(
	        cm, annot=True, fmt="d", cmap="Blues",
	        xticklabels=ws_encoder.classes_,
	        yticklabels=ws_encoder.classes_, ax=ax,
	    )
	    ax.set_title("Work Style Classifier — Confusion Matrix")
	    ax.set_xlabel("Predicted")
	    ax.set_ylabel("Actual")
	    fig.tight_layout()
	    fig.savefig(ARTIFACTS_DIR / "work_style_confusion_matrix.png", dpi=150)
	    # Close this specific figure rather than whichever one happens to be current.
	    plt.close(fig)
	    console.log("[green]✓ Saved: work_style_confusion_matrix.png[/green]")


	def print_hackathon_summary():
	    """Print the stored training metrics as a table, then list the artifact files.

	    Reads ``metrics.json`` from ``ARTIFACTS_DIR``. When that file is absent a
	    message is logged and nothing else is printed. Each model section is
	    rendered only when its entry in the metrics mapping is non-empty.
	    """
	    console.print(Panel.fit("🏆 Hackathon-Ready Model Summary", style="bold yellow"))

	    metrics_file = ARTIFACTS_DIR / "metrics.json"
	    if not metrics_file.exists():
	        console.log("[red]metrics.json not found. Run train_models.py first.[/red]")
	        return
	    with open(metrics_file) as handle:
	        metrics = json.load(handle)

	    summary = Table(title="Model Performance Metrics")
	    summary.add_column("Model", style="cyan", min_width=30)
	    summary.add_column("Metric", style="white")
	    summary.add_column("Result", style="bold green", justify="right")

	    failure = metrics.get("failure_predictor", {})
	    style = metrics.get("work_style_classifier", {})
	    distraction = metrics.get("distraction_scorer", {})

	    # Gather (model, metric, result) rows first; only the first row of each
	    # section carries the model name.
	    rows = []
	    if failure:
	        rows.extend([
	            ("Task Failure Predictor (XGBoost)", "AUC-ROC", str(failure.get("auc_roc"))),
	            ("", "F1 Score", str(failure.get("f1_score"))),
	            ("", "Accuracy", str(failure.get("accuracy"))),
	            ("", "5-fold CV AUC", f"{failure.get('cv_auc_mean')} ± {failure.get('cv_auc_std')}"),
	        ])
	    if style:
	        rows.extend([
	            ("Work Style Classifier (RF)", "Accuracy", str(style.get("accuracy"))),
	            ("", "Macro-F1", str(style.get("macro_f1"))),
	        ])
	    if distraction:
	        rows.extend([
	            ("Distraction Scorer (GBR)", "RMSE", str(distraction.get("rmse"))),
	            ("", "R²", str(distraction.get("r2"))),
	        ])
	    for row in rows:
	        summary.add_row(*row)

	    console.print(summary)
	    console.print("\n[bold]Artifacts ready for backend integration:[/bold]")
	    for artifact in sorted(ARTIFACTS_DIR.iterdir()):
	        console.print(f" [dim]→[/dim] {artifact.name}")


	def main():
	    """Run the evaluation report end to end.

	    Exits with status 1 when ``training_dataset.csv`` is missing from
	    ``PROCESSED_DIR``. Otherwise reads the dataset, loads the saved artifacts,
	    runs both classifier evaluations and prints the metrics summary.
	    """
	    console.print(Panel.fit("📊 Step 5 — Full Evaluation Report", style="bold magenta"))

	    dataset_csv = PROCESSED_DIR / "training_dataset.csv"
	    if not dataset_csv.exists():
	        console.print("[red]Run generate_data.py and preprocess.py first.[/red]")
	        sys.exit(1)

	    frame = pd.read_csv(dataset_csv)
	    # The distraction scorer is loaded with the rest but not evaluated here.
	    failure_model, scaler, ws_model, ws_encoder, _ds_model = load_artifacts()

	    evaluate_failure_predictor(failure_model, scaler, frame)
	    evaluate_work_style(ws_model, ws_encoder, frame)
	    print_hackathon_summary()

	    console.print("\n[bold green]✅ Evaluation complete. All plots saved to model_artifacts/[/bold green]")


	# Run the report only when this file is executed as a script.
	if __name__ == "__main__":
	    main()