"""
FinePrint Evaluation Script: Runs trained or heuristic models through test
episodes, generates reward curves, and produces before/after comparisons.
"""

import sys
import json
import random
from pathlib import Path
from typing import Dict, List, Optional

import numpy as np

# Put the directory above this script's directory on the import path before
# the project-local imports below (presumably the repository root -- confirm).
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from config import TrainingConfig
from fineprint.env import FinePrintEnv
from fineprint.workflows import get_all_workflow_names
from train_unsloth import run_episode_with_heuristic, collect_metrics


def _print_episode_summary(index: int, result: Dict) -> None:
    """Print the one-line summary for the episode at zero-based ``index``.

    ``result`` must provide the keys ``total_reward``, ``compliance_failures``,
    ``drift_detections``, ``workflows_completed`` and ``user_satisfaction``.
    """
    print(
        f" Episode {index + 1:3d}: "
        f"reward={result['total_reward']:+7.1f} "
        f"failures={result['compliance_failures']} "
        f"detections={result['drift_detections']} "
        f"completed={result['workflows_completed']} "
        f"satisfaction={result['user_satisfaction']:.0%}"
    )


def evaluate(
    env: FinePrintEnv,
    num_episodes: int = 20,
    seed: int = 42,
    verbose: bool = True,
) -> Dict:
    """
    Evaluate the heuristic policy over multiple episodes.

    Args:
        env: Environment handed to ``run_episode_with_heuristic``.
        num_episodes: Number of episodes to run.
        seed: Base seed; episode ``i`` (zero-based) runs with ``seed + i``.
        verbose: When true, print a one-line summary after each episode.

    Returns:
        A dict with ``"metrics"`` (the output of ``collect_metrics`` over all
        episodes) and ``"episodes"`` (the list of per-episode results).
    """
    all_results = []
    for i in range(num_episodes):
        result = run_episode_with_heuristic(env, seed=seed + i)
        all_results.append(result)
        if verbose:
            _print_episode_summary(i, result)

    return {
        "metrics": collect_metrics(all_results),
        "episodes": all_results,
    }


def generate_reward_curve(results: List[Dict], output_path: str) -> None:
    """Save reward curve data to JSON for plotting.

    Args:
        results: Per-episode result dicts; each must provide
            ``total_reward``, ``compliance_failures``, ``drift_detections``
            and ``user_satisfaction``.
        output_path: Destination file; it is overwritten if it exists.
    """
    rewards = [r["total_reward"] for r in results]

    # Running mean in a single pass: prefix sums divided by the prefix length.
    # An empty ``results`` list yields an empty curve.
    running_avg = np.cumsum(rewards, dtype=float) / np.arange(1, len(rewards) + 1)

    data = {
        "episode_rewards": rewards,
        "compliance_failures": [r["compliance_failures"] for r in results],
        "drift_detections": [r["drift_detections"] for r in results],
        "user_satisfaction": [r["user_satisfaction"] for r in results],
        # ``tolist`` converts to plain Python floats so ``json`` can encode them.
        "cumulative_avg_reward": running_avg.tolist(),
    }

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(data, f, indent=2)
    print(f"Reward curve data saved to {output_path}")


def print_comparison(baseline: Dict, trained: Dict) -> None:
    """Print a before/after comparison table.

    Rows follow the key order of ``baseline``. A key missing from ``trained``
    is shown as 0. Float baseline values are printed with two decimals and a
    direction arrow; every other value is printed via ``str`` without an arrow.
    """
    print()
    print("=" * 60)
    print("BEFORE vs AFTER COMPARISON")
    print("=" * 60)
    print(f"{'Metric':<30} {'Baseline':>12} {'Trained':>12}")
    print("-" * 60)

    for key in baseline:
        b_val = baseline[key]
        t_val = trained.get(key, 0)
        if isinstance(b_val, float):
            improvement = t_val - b_val
            if improvement > 0:
                arrow = "↑"
            elif improvement < 0:
                arrow = "↓"
            else:
                arrow = "="
            print(f"{key:<30} {b_val:>12.2f} {t_val:>12.2f} {arrow}")
        else:
            print(f"{key:<30} {str(b_val):>12} {str(t_val):>12}")

    print("=" * 60)


def evaluate_model(
    model,
    tokenizer,
    env: FinePrintEnv,
    config,
    device,
    num_episodes: int = 20,
    seed: int = 42,
    verbose: bool = True,
) -> Dict:
    """
    Evaluate a trained model over multiple episodes.

    Each episode is delegated to ``train_unsloth.run_model_episode``; the
    decoding strategy (described upstream as greedy) is decided there.

    Args:
        model: Model passed through to ``run_model_episode``.
        tokenizer: Tokenizer passed through to ``run_model_episode``.
        env: Environment the episodes run in.
        config: Configuration object passed through to ``run_model_episode``.
        device: Device passed through as the ``device`` keyword.
        num_episodes: Number of episodes to run.
        seed: Base seed; episode ``i`` (zero-based) runs with ``seed + i``.
        verbose: When true, print a one-line summary after each episode.

    Returns:
        A dict with ``"metrics"`` (the output of ``collect_metrics``) and
        ``"episodes"`` (the list of per-episode results).
    """
    # Imported lazily, as in the original; ``collect_metrics`` is already
    # available from the module-level import.
    from train_unsloth import run_model_episode

    all_results = []
    for i in range(num_episodes):
        result = run_model_episode(
            model,
            tokenizer,
            env,
            config,
            seed=seed + i,
            device=device,
        )
        all_results.append(result)
        if verbose:
            _print_episode_summary(i, result)

    return {"metrics": collect_metrics(all_results), "episodes": all_results}


def _find_checkpoint(checkpoint_dir) -> Optional[Path]:
    """Return the ``best`` checkpoint path, else ``final``, else ``None``."""
    for name in ("best", "final"):
        candidate = Path(checkpoint_dir) / name
        if candidate.exists():
            return candidate
    return None


def _evaluate_trained_checkpoint(
    config,
    env: FinePrintEnv,
    ckpt_path: Path,
    output_dir: Path,
    heuristic_metrics: Dict,
) -> None:
    """Load the checkpoint, evaluate it, and compare against the heuristic.

    Best effort: an ``ImportError`` raised anywhere in this path skips the
    trained-model evaluation instead of aborting the run. The underlying
    error is printed so an unrelated import failure is not mistaken for a
    missing Unsloth install.
    """
    try:
        from unsloth import FastLanguageModel
        # Not referenced below; kept so a missing torch still takes the
        # skip path exactly as before.
        import torch  # noqa: F401

        print(f"\nLoading trained model from {ckpt_path}...")
        model, tokenizer = FastLanguageModel.from_pretrained(
            model_name=str(ckpt_path),
            max_seq_length=config.max_seq_length,
            dtype=None,
            load_in_4bit=True,
        )
        FastLanguageModel.for_inference(model)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token
        device = model.device

        print("Running trained-model evaluation...")
        trained_result = evaluate_model(
            model,
            tokenizer,
            env,
            config,
            device,
            num_episodes=config.eval_episodes,
            seed=config.eval_seed,
            verbose=True,
        )
        trained_metrics = trained_result["metrics"]
        generate_reward_curve(
            trained_result["episodes"],
            str(output_dir / "trained_eval_reward_curve.json"),
        )
        print_comparison(heuristic_metrics, trained_metrics)
    except ImportError as exc:
        print("\nUnsloth not available — skipping trained model evaluation.")
        print(f" (import error: {exc})")


def main():
    """Run evaluation.

    Runs the heuristic policy, saves its reward curve under ``config.log_dir``,
    then either evaluates a trained checkpoint (``best`` preferred over
    ``final``) or, when no checkpoint exists, compares the heuristic against a
    saved ``baseline_metrics.json`` if one is present. The environment is
    closed even when an evaluation step raises.
    """
    config = TrainingConfig()
    policies_path = str(Path(__file__).resolve().parent.parent / config.policies_dir)
    env = FinePrintEnv(
        policies_dir=policies_path,
        num_workflows_per_episode=config.num_workflows_per_episode,
        max_episode_steps=config.max_episode_steps,
        drift_probability=config.drift_probability,
        silent_drift_ratio=config.silent_drift_ratio,
    )

    try:
        print("=" * 60)
        print("FINEPRINT EVALUATION")
        print("=" * 60)
        print(f"Episodes: {config.eval_episodes}")
        print(f"Seed: {config.eval_seed}")
        print()

        # ── Heuristic evaluation ──
        print("Running heuristic evaluation...")
        result = evaluate(
            env,
            num_episodes=config.eval_episodes,
            seed=config.eval_seed,
            verbose=True,
        )
        heuristic_metrics = result["metrics"]

        print()
        print("=" * 60)
        print("HEURISTIC AGGREGATE METRICS")
        print("=" * 60)
        for key, val in heuristic_metrics.items():
            if isinstance(val, float):
                print(f" {key}: {val:.4f}")
            else:
                print(f" {key}: {val}")

        # Save results
        output_dir = Path(config.log_dir)
        output_dir.mkdir(parents=True, exist_ok=True)
        generate_reward_curve(
            result["episodes"],
            str(output_dir / "eval_reward_curve.json"),
        )

        # ── Trained model evaluation (if checkpoint exists) ──
        ckpt_path = _find_checkpoint(config.checkpoint_dir)
        if ckpt_path is not None:
            _evaluate_trained_checkpoint(
                config, env, ckpt_path, output_dir, heuristic_metrics
            )
        else:
            # Load baseline if available for comparison
            baseline_path = output_dir / "baseline_metrics.json"
            if baseline_path.exists():
                with open(baseline_path, "r", encoding="utf-8") as f:
                    baseline = json.load(f)
                print_comparison(baseline, heuristic_metrics)
    finally:
        # Release the environment on both the success and the failure path.
        env.close()

    print("\nEvaluation complete.")


if __name__ == "__main__":
    main()