| """ | |
| FinePrint Evaluation Script: Runs trained or heuristic models through test episodes, | |
| generates reward curves, and produces before/after comparisons. | |
| """ | |
| import sys | |
| import json | |
| import random | |
| from pathlib import Path | |
| from typing import Dict, List | |
| import numpy as np | |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) | |
| from config import TrainingConfig | |
| from fineprint.env import FinePrintEnv | |
| from fineprint.workflows import get_all_workflow_names | |
| from train_unsloth import run_episode_with_heuristic, collect_metrics | |

def evaluate(
    env: FinePrintEnv,
    num_episodes: int = 20,
    seed: int = 42,
    verbose: bool = True,
) -> Dict:
    """
    Evaluate the heuristic policy over multiple episodes.

    Returns aggregated metrics and per-episode details.
    """
    all_results = []
    for i in range(num_episodes):
        result = run_episode_with_heuristic(env, seed=seed + i)
        all_results.append(result)
        if verbose:
            print(
                f"  Episode {i+1:3d}: "
                f"reward={result['total_reward']:+7.1f} "
                f"failures={result['compliance_failures']} "
                f"detections={result['drift_detections']} "
                f"completed={result['workflows_completed']} "
                f"satisfaction={result['user_satisfaction']:.0%}"
            )
    metrics = collect_metrics(all_results)
    return {
        "metrics": metrics,
        "episodes": all_results,
    }
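
# Example standalone use of evaluate(), kept as a comment so importing this
# module stays side-effect free. The FinePrintEnv arguments mirror main()
# below; any constructor defaults beyond policies_dir are an assumption here.
#
#   env = FinePrintEnv(policies_dir="policies")
#   summary = evaluate(env, num_episodes=5, seed=0, verbose=False)
#   print(summary["metrics"])
#
# Episode i runs with seed + i, so a fixed base seed makes the whole
# evaluation suite reproducible.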

def generate_reward_curve(results: List[Dict], output_path: str) -> None:
    """Save reward curve data to JSON for plotting."""
    rewards = [r["total_reward"] for r in results]
    failures = [r["compliance_failures"] for r in results]
    detections = [r["drift_detections"] for r in results]
    satisfaction = [r["user_satisfaction"] for r in results]
    data = {
        "episode_rewards": rewards,
        "compliance_failures": failures,
        "drift_detections": detections,
        "user_satisfaction": satisfaction,
        "cumulative_avg_reward": [
            float(np.mean(rewards[: i + 1])) for i in range(len(rewards))
        ],
    }
    with open(output_path, "w") as f:
        json.dump(data, f, indent=2)
    print(f"Reward curve data saved to {output_path}")

def print_comparison(baseline: Dict, trained: Dict) -> None:
    """Print a before/after comparison table."""
    print()
    print("=" * 60)
    print("BEFORE vs AFTER COMPARISON")
    print("=" * 60)
    print(f"{'Metric':<30} {'Baseline':>12} {'Trained':>12}")
    print("-" * 60)
    for key in baseline:
        b_val = baseline[key]
        t_val = trained.get(key, 0)
        if isinstance(b_val, float):
            improvement = t_val - b_val
            arrow = "↑" if improvement > 0 else "↓" if improvement < 0 else "="
            print(f"{key:<30} {b_val:>12.2f} {t_val:>12.2f} {arrow}")
        else:
            print(f"{key:<30} {str(b_val):>12} {str(t_val):>12}")
    print("=" * 60)

def evaluate_model(
    model,
    tokenizer,
    env: FinePrintEnv,
    config,
    device,
    num_episodes: int = 20,
    seed: int = 42,
    verbose: bool = True,
) -> Dict:
    """
    Evaluate a trained model over multiple episodes using greedy decoding.
    """
    from train_unsloth import run_model_episode, collect_metrics

    all_results = []
    for i in range(num_episodes):
        result = run_model_episode(
            model, tokenizer, env, config,
            seed=seed + i, device=device,
        )
        all_results.append(result)
        if verbose:
            print(
                f"  Episode {i+1:3d}: "
                f"reward={result['total_reward']:+7.1f} "
                f"failures={result['compliance_failures']} "
                f"detections={result['drift_detections']} "
                f"completed={result['workflows_completed']} "
                f"satisfaction={result['user_satisfaction']:.0%}"
            )
    metrics = collect_metrics(all_results)
    return {"metrics": metrics, "episodes": all_results}

def main():
    """Run evaluation."""
    config = TrainingConfig()
    policies_path = str(Path(__file__).resolve().parent.parent / config.policies_dir)
    env = FinePrintEnv(
        policies_dir=policies_path,
        num_workflows_per_episode=config.num_workflows_per_episode,
        max_episode_steps=config.max_episode_steps,
        drift_probability=config.drift_probability,
        silent_drift_ratio=config.silent_drift_ratio,
    )
    print("=" * 60)
    print("FINEPRINT EVALUATION")
    print("=" * 60)
    print(f"Episodes: {config.eval_episodes}")
    print(f"Seed: {config.eval_seed}")
    print()

    # ── Heuristic evaluation ──
    print("Running heuristic evaluation...")
    result = evaluate(
        env,
        num_episodes=config.eval_episodes,
        seed=config.eval_seed,
        verbose=True,
    )
    heuristic_metrics = result["metrics"]
    print()
    print("=" * 60)
    print("HEURISTIC AGGREGATE METRICS")
    print("=" * 60)
    for key, val in heuristic_metrics.items():
        if isinstance(val, float):
            print(f"  {key}: {val:.4f}")
        else:
            print(f"  {key}: {val}")

    # Save results
    output_dir = Path(config.log_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    generate_reward_curve(
        result["episodes"],
        str(output_dir / "eval_reward_curve.json"),
    )
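
    # Nothing in this script writes baseline_metrics.json, although the
    # fallback branch below reads it (presumably the training pipeline
    # produces it). A sketch of persisting the heuristic metrics as that
    # baseline, left commented out so the script's behavior is unchanged:
    #
    #   with open(output_dir / "baseline_metrics.json", "w") as f:
    #       json.dump(heuristic_metrics, f, indent=2)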

    # ── Trained model evaluation (if checkpoint exists) ──
    ckpt_path = Path(config.checkpoint_dir) / "best"
    if not ckpt_path.exists():
        ckpt_path = Path(config.checkpoint_dir) / "final"
    if ckpt_path.exists():
        try:
            from unsloth import FastLanguageModel
            import torch

            print(f"\nLoading trained model from {ckpt_path}...")
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name=str(ckpt_path),
                max_seq_length=config.max_seq_length,
                dtype=None,
                load_in_4bit=True,
            )
            FastLanguageModel.for_inference(model)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            device = model.device
            print("Running trained-model evaluation...")
            trained_result = evaluate_model(
                model, tokenizer, env, config, device,
                num_episodes=config.eval_episodes,
                seed=config.eval_seed,
                verbose=True,
            )
            trained_metrics = trained_result["metrics"]
            generate_reward_curve(
                trained_result["episodes"],
                str(output_dir / "trained_eval_reward_curve.json"),
            )
            print_comparison(heuristic_metrics, trained_metrics)
        except ImportError:
            print("\nUnsloth not available; skipping trained model evaluation.")
    else:
        # Load baseline if available for comparison
        baseline_path = output_dir / "baseline_metrics.json"
        if baseline_path.exists():
            with open(baseline_path, "r") as f:
                baseline = json.load(f)
            print_comparison(baseline, heuristic_metrics)

    env.close()
    print("\nEvaluation complete.")


if __name__ == "__main__":
    main()
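
# Typical invocation, assuming the repo layout implied by the sys.path insert
# above (this file under scripts/, config.py at the repo root):
#
#   python scripts/evaluate.py
#
# Outputs land in config.log_dir: eval_reward_curve.json always, plus
# trained_eval_reward_curve.json when a checkpoint is found under
# config.checkpoint_dir/best or config.checkpoint_dir/final.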