"""Evaluation entrypoint for FraudShield trainable agents."""
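
# Illustrative invocation (the config path is the argparse default in ``main``,
# and the script name assumes this file is saved as ``evaluate.py``; adjust
# both for your checkout):
#
#     python evaluate.py --config configs/colab_qlora_grpo.json
#
# A run writes ``evaluation_report.json`` to the training output directory,
# ``training_summary.json`` to ``artifacts/``, and two PNG plots to the
# configured plots directory.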

from __future__ import annotations

import argparse
import json
import os
import subprocess
from pathlib import Path
from typing import Any

import matplotlib.pyplot as plt

from config import ExperimentConfig
from utils import ensure_dir, moving_average, save_json, seed_everything


def _run_inference(extra_env: dict[str, str] | None = None) -> dict[str, Any]:
    """Run ``inference.py`` with a controlled environment and return its report."""
    env_vars = os.environ.copy()
    # Drop credential and model-override variables so the subprocess cannot fall
    # back to a remote API; only values passed explicitly via ``extra_env`` apply.
    for key in (
        "HF_TOKEN",
        "HUGGINGFACEHUB_API_TOKEN",
        "API_KEY",
        "OPENAI_API_KEY",
        "API_BASE_URL",
        "MODEL_NAME",
        "LOCAL_MODEL_PATH",
    ):
        env_vars.pop(key, None)
    if extra_env:
        env_vars.update(extra_env)
    subprocess.run(
        ["python", "inference.py"],
        check=True,
        capture_output=True,
        text=True,
        env=env_vars,
    )
    # ``inference.py`` is expected to (re)write this report in the working
    # directory on every run; read it back as the result of this invocation.
    with open("fraudshield_baseline_results.json", "r", encoding="utf-8") as handle:
        return json.load(handle)
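

# Each run of ``inference.py`` is assumed to produce a JSON report shaped
# roughly like the sketch below (inferred from how ``evaluate_agent`` reads it;
# the numbers are placeholders and any extra fields are simply ignored):
#
#     {
#         "easy": {"score": 0.72},
#         "medium": {"score": 0.55},
#         "hard": {"score": 0.31},
#         "final_score": 0.53,
#         "metadata": {"agent": "baseline"}
#     }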
def evaluate_agent(config: ExperimentConfig) -> dict[str, Any]:
    """Compare heuristic baseline against the trained local checkpoint."""
    seed_everything(config.seed)
    trained_model_path = str(Path(config.training.output_dir) / "trained_policy")
    # First run uses the baseline agent; the second points the same inference
    # script at the trained checkpoint via LOCAL_MODEL_PATH.
    baseline_results = _run_inference()
    trained_results = _run_inference({"LOCAL_MODEL_PATH": trained_model_path})
    comparison_rows = []
    win_count = 0
    for task in config.evaluation.tasks:
        baseline_score = float(baseline_results[task]["score"])
        trained_score = float(trained_results[task]["score"])
        if trained_score > baseline_score:
            win_count += 1
        comparison_rows.append(
            {
                "task": task,
                "baseline_score": baseline_score,
                "trained_score": trained_score,
                "delta": trained_score - baseline_score,
            }
        )
    return {
        "baseline": {
            "easy": baseline_results["easy"]["score"],
            "medium": baseline_results["medium"]["score"],
            "hard": baseline_results["hard"]["score"],
            "final_score": baseline_results["final_score"],
            "agent_metadata": baseline_results.get("metadata", {}),
        },
        "trained": {
            "easy": trained_results["easy"]["score"],
            "medium": trained_results["medium"]["score"],
            "hard": trained_results["hard"]["score"],
            "final_score": trained_results["final_score"],
            "agent_metadata": trained_results.get("metadata", {}),
            "local_model_path": trained_model_path,
        },
        "comparison": comparison_rows,
        "success_rate": win_count / max(1, len(config.evaluation.tasks)),
        "preference_score": trained_results["final_score"] - baseline_results["final_score"],
        "before_after": {
            "base_model_final": baseline_results["final_score"],
            "trained_model_final": trained_results["final_score"],
        },
    }
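

# Worked example (hypothetical numbers): with tasks ["easy", "medium", "hard"],
# a trained checkpoint that beats the baseline on two of the three tasks gives
# success_rate = 2 / 3 ≈ 0.67, and preference_score is the difference of the
# overall scores, e.g. 0.61 - 0.53 = 0.08. ``save_evaluation_artifacts`` below
# turns the per-task comparison rows into the two plots and the JSON summary.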
def save_evaluation_artifacts(report: dict[str, Any], config: ExperimentConfig) -> None:
    """Persist evaluation metrics and plots."""
    plots_dir = ensure_dir(config.evaluation.plots_dir)
    baseline_scores = [row["baseline_score"] for row in report["comparison"]]
    trained_scores = [row["trained_score"] for row in report["comparison"]]
    deltas = [row["delta"] for row in report["comparison"]]
    labels = [row["task"] for row in report["comparison"]]
    # Per-task baseline vs trained scores.
    plt.figure(figsize=(8, 4))
    plt.plot(range(1, len(baseline_scores) + 1), baseline_scores, marker="o", label="baseline")
    plt.plot(range(1, len(trained_scores) + 1), trained_scores, marker="o", label="trained")
    plt.xticks(range(1, len(labels) + 1), labels)
    plt.ylabel("task score")
    plt.title("FraudShield before vs after")
    plt.legend()
    plt.tight_layout()
    plt.savefig(plots_dir / "before_after_scores.png")
    plt.close()
    # Per-task improvement (delta) with a smoothed trend line.
    plt.figure(figsize=(8, 4))
    plt.plot(range(1, len(deltas) + 1), deltas, marker="o", label="score_delta")
    plt.plot(range(1, len(deltas) + 1), moving_average(deltas, window=2), marker="x", label="moving_avg_delta")
    plt.xticks(range(1, len(labels) + 1), labels)
    plt.ylabel("score delta")
    plt.title("FraudShield score improvement by task")
    plt.legend()
    plt.tight_layout()
    plt.savefig(plots_dir / "evaluation_rewards.png")
    plt.close()
    save_json(report, Path(config.training.output_dir) / "evaluation_report.json")
    # Compact summary with artifact paths, written alongside the full report.
    save_json(
        {
            "status": "completed",
            "trainer": config.training.algorithm,
            "baseline": report["baseline"],
            "trained": report["trained"],
            "task_comparison": report["comparison"],
            "score_delta": report["preference_score"],
            "success_rate": report["success_rate"],
            "artifact_urls": {
                "before_after_plot": str(plots_dir / "before_after_scores.png"),
                "reward_plot": str(plots_dir / "evaluation_rewards.png"),
                "comparison_table": str(Path(config.training.output_dir) / "evaluation_report.json"),
            },
        },
        Path("artifacts") / "training_summary.json",
    )


def main() -> None:
    parser = argparse.ArgumentParser(description="Evaluate FraudShield trainable agents.")
    parser.add_argument("--config", default="configs/colab_qlora_grpo.json", help="Path to experiment config JSON.")
    args = parser.parse_args()
    config = ExperimentConfig.load(args.config)
    report = evaluate_agent(config)
    save_evaluation_artifacts(report, config)
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()