| """Evaluation entrypoint for FraudShield trainable agents.""" | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import os | |
| import subprocess | |
| from pathlib import Path | |
| from typing import Any | |
| import matplotlib.pyplot as plt | |
| from config import ExperimentConfig | |
| from utils import ensure_dir, moving_average, save_json, seed_everything | |


def _run_inference(extra_env: dict[str, str] | None = None) -> dict[str, Any]:
    """Run ``inference.py`` with a controlled environment and return its report."""
    # Copy the parent environment, then drop credential and model-override keys so the
    # subprocess only sees whatever is passed explicitly through ``extra_env``.
    env_vars = os.environ.copy()
    for key in (
        "HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "API_KEY", "OPENAI_API_KEY",
        "API_BASE_URL", "MODEL_NAME", "LOCAL_MODEL_PATH",
    ):
        env_vars.pop(key, None)
    if extra_env:
        env_vars.update(extra_env)
    subprocess.run(
        ["python", "inference.py"],
        check=True,
        capture_output=True,
        text=True,
        env=env_vars,
    )
    with open("fraudshield_baseline_results.json", "r", encoding="utf-8") as handle:
        return json.load(handle)


def evaluate_agent(config: ExperimentConfig) -> dict[str, Any]:
    """Compare heuristic baseline against the trained local checkpoint."""
    seed_everything(config.seed)
    trained_model_path = str(Path(config.training.output_dir) / "trained_policy")

    baseline_results = _run_inference()
    trained_results = _run_inference({"LOCAL_MODEL_PATH": trained_model_path})

    comparison_rows = []
    win_count = 0
    for task in config.evaluation.tasks:
        baseline_score = float(baseline_results[task]["score"])
        trained_score = float(trained_results[task]["score"])
        if trained_score > baseline_score:
            win_count += 1
        comparison_rows.append(
            {
                "task": task,
                "baseline_score": baseline_score,
                "trained_score": trained_score,
                "delta": trained_score - baseline_score,
            }
        )

    return {
        "baseline": {
            "easy": baseline_results["easy"]["score"],
            "medium": baseline_results["medium"]["score"],
            "hard": baseline_results["hard"]["score"],
            "final_score": baseline_results["final_score"],
            "agent_metadata": baseline_results.get("metadata", {}),
        },
        "trained": {
            "easy": trained_results["easy"]["score"],
            "medium": trained_results["medium"]["score"],
            "hard": trained_results["hard"]["score"],
            "final_score": trained_results["final_score"],
            "agent_metadata": trained_results.get("metadata", {}),
            "local_model_path": trained_model_path,
        },
        "comparison": comparison_rows,
        "success_rate": win_count / max(1, len(config.evaluation.tasks)),
        "preference_score": trained_results["final_score"] - baseline_results["final_score"],
        "before_after": {
            "base_model_final": baseline_results["final_score"],
            "trained_model_final": trained_results["final_score"],
        },
    }


def save_evaluation_artifacts(report: dict[str, Any], config: ExperimentConfig) -> None:
    """Persist evaluation metrics and plots."""
    plots_dir = ensure_dir(config.evaluation.plots_dir)

    baseline_scores = [row["baseline_score"] for row in report["comparison"]]
    trained_scores = [row["trained_score"] for row in report["comparison"]]
    deltas = [row["delta"] for row in report["comparison"]]
    labels = [row["task"] for row in report["comparison"]]

    plt.figure(figsize=(8, 4))
    plt.plot(range(1, len(baseline_scores) + 1), baseline_scores, marker="o", label="baseline")
    plt.plot(range(1, len(trained_scores) + 1), trained_scores, marker="o", label="trained")
    plt.xticks(range(1, len(labels) + 1), labels)
    plt.ylabel("task score")
    plt.title("FraudShield before vs after")
    plt.legend()
    plt.tight_layout()
    plt.savefig(plots_dir / "before_after_scores.png")
    plt.close()

    plt.figure(figsize=(8, 4))
    plt.plot(range(1, len(deltas) + 1), deltas, marker="o", label="score_delta")
    plt.plot(range(1, len(deltas) + 1), moving_average(deltas, window=2), marker="x", label="moving_avg_delta")
    plt.xticks(range(1, len(labels) + 1), labels)
    plt.ylabel("score delta")
    plt.title("FraudShield score improvement by task")
    plt.legend()
    plt.tight_layout()
    plt.savefig(plots_dir / "evaluation_rewards.png")
    plt.close()

    save_json(report, Path(config.training.output_dir) / "evaluation_report.json")
    save_json(
        {
            "status": "completed",
            "trainer": config.training.algorithm,
            "baseline": report["baseline"],
            "trained": report["trained"],
            "task_comparison": report["comparison"],
            "score_delta": report["preference_score"],
            "success_rate": report["success_rate"],
            "artifact_urls": {
                "before_after_plot": str(plots_dir / "before_after_scores.png"),
                "reward_plot": str(plots_dir / "evaluation_rewards.png"),
                "comparison_table": str(Path(config.training.output_dir) / "evaluation_report.json"),
            },
        },
        Path("artifacts") / "training_summary.json",
    )


def main() -> None:
    parser = argparse.ArgumentParser(description="Evaluate FraudShield trainable agents.")
    parser.add_argument("--config", default="configs/colab_qlora_grpo.json", help="Path to experiment config JSON.")
    args = parser.parse_args()

    config = ExperimentConfig.load(args.config)
    report = evaluate_agent(config)
    save_evaluation_artifacts(report, config)
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()
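
# Example invocation, for reference only. The script filename is an assumption here;
# the config path is simply the argparse default defined above.
#   python evaluate.py --config configs/colab_qlora_grpo.json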