"""Evaluation entrypoint for FraudShield trainable agents."""

from __future__ import annotations

import argparse
import json
import os
import subprocess
from pathlib import Path
from typing import Any

import matplotlib.pyplot as plt

from config import ExperimentConfig
from utils import ensure_dir, moving_average, save_json, seed_everything


def _run_inference(extra_env: dict[str, str] | None = None) -> dict[str, Any]:
    """Run ``inference.py`` with a controlled environment and return its report."""

    env_vars = os.environ.copy()
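    # Strip inherited credentials and model overrides so the subprocess sees
    # only the variables passed explicitly via ``extra_env``.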
    for key in ("HF_TOKEN", "HUGGINGFACEHUB_API_TOKEN", "API_KEY", "OPENAI_API_KEY",
                "API_BASE_URL", "MODEL_NAME", "LOCAL_MODEL_PATH"):
        env_vars.pop(key, None)
    if extra_env:
        env_vars.update(extra_env)
    subprocess.run(
        ["python", "inference.py"],
        check=True,
        capture_output=True,
        text=True,
        env=env_vars,
    )
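    # ``inference.py`` is expected to write its report to this fixed path;
    # both the baseline and trained runs read the same file back.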
    with open("fraudshield_baseline_results.json", "r", encoding="utf-8") as handle:
        return json.load(handle)


def evaluate_agent(config: ExperimentConfig) -> dict[str, Any]:
    """Compare heuristic baseline against the trained local checkpoint."""

    seed_everything(config.seed)
    trained_model_path = str(Path(config.training.output_dir) / "trained_policy")
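    # The two runs below differ only in LOCAL_MODEL_PATH: unset for the
    # heuristic baseline, set to the trained checkpoint for the "after" run.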

    baseline_results = _run_inference()
    trained_results = _run_inference({"LOCAL_MODEL_PATH": trained_model_path})

    comparison_rows = []
    win_count = 0
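    # A task counts as a win only when the trained score strictly beats the
    # baseline; ties do not count towards the success rate.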
    for task in config.evaluation.tasks:
        baseline_score = float(baseline_results[task]["score"])
        trained_score = float(trained_results[task]["score"])
        if trained_score > baseline_score:
            win_count += 1
        comparison_rows.append(
            {
                "task": task,
                "baseline_score": baseline_score,
                "trained_score": trained_score,
                "delta": trained_score - baseline_score,
            }
        )
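    # The returned report carries per-task and final scores for both agents,
    # the per-task comparison table, and aggregate before/after deltas.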

    return {
        "baseline": {
            "easy": baseline_results["easy"]["score"],
            "medium": baseline_results["medium"]["score"],
            "hard": baseline_results["hard"]["score"],
            "final_score": baseline_results["final_score"],
            "agent_metadata": baseline_results.get("metadata", {}),
        },
        "trained": {
            "easy": trained_results["easy"]["score"],
            "medium": trained_results["medium"]["score"],
            "hard": trained_results["hard"]["score"],
            "final_score": trained_results["final_score"],
            "agent_metadata": trained_results.get("metadata", {}),
            "local_model_path": trained_model_path,
        },
        "comparison": comparison_rows,
        "success_rate": win_count / max(1, len(config.evaluation.tasks)),
        "preference_score": trained_results["final_score"] - baseline_results["final_score"],
        "before_after": {
            "base_model_final": baseline_results["final_score"],
            "trained_model_final": trained_results["final_score"],
        },
    }


def save_evaluation_artifacts(report: dict[str, Any], config: ExperimentConfig) -> None:
    """Persist evaluation metrics and plots."""

    plots_dir = ensure_dir(config.evaluation.plots_dir)

    baseline_scores = [row["baseline_score"] for row in report["comparison"]]
    trained_scores = [row["trained_score"] for row in report["comparison"]]
    deltas = [row["delta"] for row in report["comparison"]]
    labels = [row["task"] for row in report["comparison"]]
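
    # Figure 1: raw baseline vs trained scores for each evaluation task.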

    plt.figure(figsize=(8, 4))
    plt.plot(range(1, len(baseline_scores) + 1), baseline_scores, marker="o", label="baseline")
    plt.plot(range(1, len(trained_scores) + 1), trained_scores, marker="o", label="trained")
    plt.xticks(range(1, len(labels) + 1), labels)
    plt.ylabel("task score")
    plt.title("FraudShield before vs after")
    plt.legend()
    plt.tight_layout()
    plt.savefig(plots_dir / "before_after_scores.png")
    plt.close()
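
    # Figure 2: per-task score deltas plus a window-2 moving average.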

    plt.figure(figsize=(8, 4))
    plt.plot(range(1, len(deltas) + 1), deltas, marker="o", label="score_delta")
    plt.plot(range(1, len(deltas) + 1), moving_average(deltas, window=2), marker="x", label="moving_avg_delta")
    plt.xticks(range(1, len(labels) + 1), labels)
    plt.ylabel("score delta")
    plt.title("FraudShield score improvement by task")
    plt.legend()
    plt.tight_layout()
    plt.savefig(plots_dir / "evaluation_rewards.png")
    plt.close()
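
    # Persist the full report beside the training outputs, plus a compact
    # run summary with artifact paths under artifacts/.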

    save_json(report, Path(config.training.output_dir) / "evaluation_report.json")
    save_json(
        {
            "status": "completed",
            "trainer": config.training.algorithm,
            "baseline": report["baseline"],
            "trained": report["trained"],
            "task_comparison": report["comparison"],
            "score_delta": report["preference_score"],
            "success_rate": report["success_rate"],
            "artifact_urls": {
                "before_after_plot": str(plots_dir / "before_after_scores.png"),
                "reward_plot": str(plots_dir / "evaluation_rewards.png"),
                "comparison_table": str(Path(config.training.output_dir) / "evaluation_report.json"),
            },
        },
        Path("artifacts") / "training_summary.json",
    )


def main() -> None:
    parser = argparse.ArgumentParser(description="Evaluate FraudShield trainable agents.")
    parser.add_argument("--config", default="configs/colab_qlora_grpo.json", help="Path to experiment config JSON.")
    args = parser.parse_args()
    config = ExperimentConfig.load(args.config)
    report = evaluate_agent(config)
    save_evaluation_artifacts(report, config)
    print(json.dumps(report, indent=2))


if __name__ == "__main__":
    main()