"""
FinePrint Evaluation Script: Runs trained or heuristic models through test episodes,
generates reward curves, and produces before/after comparisons.
"""
import sys
import json
import random
from pathlib import Path
from typing import Dict, List
import numpy as np
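# Make the repo root importable so `config` and the `fineprint` package
# resolve when this script is run directly.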
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from config import TrainingConfig
from fineprint.env import FinePrintEnv
from fineprint.workflows import get_all_workflow_names
from train_unsloth import run_episode_with_heuristic, collect_metrics
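# Episode rollout and metric aggregation are reused from the training script
# so evaluation reports the same statistics that training logs.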


def evaluate(
    env: FinePrintEnv,
    num_episodes: int = 20,
    seed: int = 42,
    verbose: bool = True,
) -> Dict:
    """
    Evaluate the heuristic policy over multiple episodes.

    Returns aggregated metrics and per-episode details.
    """
    all_results = []
    for i in range(num_episodes):
        result = run_episode_with_heuristic(env, seed=seed + i)
        all_results.append(result)
        if verbose:
            print(
                f" Episode {i+1:3d}: "
                f"reward={result['total_reward']:+7.1f} "
                f"failures={result['compliance_failures']} "
                f"detections={result['drift_detections']} "
                f"completed={result['workflows_completed']} "
                f"satisfaction={result['user_satisfaction']:.0%}"
            )
    metrics = collect_metrics(all_results)
    return {
        "metrics": metrics,
        "episodes": all_results,
    }
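# Example usage (hypothetical values; any seed/episode count works):
#   summary = evaluate(env, num_episodes=5, seed=0, verbose=False)
#   print(summary["metrics"])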


def generate_reward_curve(results: List[Dict], output_path: str) -> None:
    """Save reward curve data to JSON for plotting."""
    rewards = [r["total_reward"] for r in results]
    failures = [r["compliance_failures"] for r in results]
    detections = [r["drift_detections"] for r in results]
    satisfaction = [r["user_satisfaction"] for r in results]
    data = {
        "episode_rewards": rewards,
        "compliance_failures": failures,
        "drift_detections": detections,
        "user_satisfaction": satisfaction,
        "cumulative_avg_reward": [
            float(np.mean(rewards[: i + 1])) for i in range(len(rewards))
        ],
    }
    with open(output_path, "w") as f:
        json.dump(data, f, indent=2)
    print(f"Reward curve data saved to {output_path}")


def print_comparison(baseline: Dict, trained: Dict) -> None:
    """Print a before/after comparison table."""
    print()
    print("=" * 60)
    print("BEFORE vs AFTER COMPARISON")
    print("=" * 60)
    print(f"{'Metric':<30} {'Baseline':>12} {'Trained':>12}")
    print("-" * 60)
    for key in baseline:
        b_val = baseline[key]
        t_val = trained.get(key, 0)
        if isinstance(b_val, float):
            improvement = t_val - b_val
            arrow = "↑" if improvement > 0 else "↓" if improvement < 0 else "="
            print(f"{key:<30} {b_val:>12.2f} {t_val:>12.2f} {arrow}")
        else:
            print(f"{key:<30} {str(b_val):>12} {str(t_val):>12}")
    print("=" * 60)


def evaluate_model(
    model,
    tokenizer,
    env: FinePrintEnv,
    config,
    device,
    num_episodes: int = 20,
    seed: int = 42,
    verbose: bool = True,
) -> Dict:
    """
    Evaluate a trained model over multiple episodes using greedy decoding.
    """
    # collect_metrics is already available from the module-level import;
    # only run_model_episode needs to be pulled in here.
    from train_unsloth import run_model_episode

    all_results = []
    for i in range(num_episodes):
        result = run_model_episode(
            model, tokenizer, env, config,
            seed=seed + i, device=device,
        )
        all_results.append(result)
        if verbose:
            print(
                f" Episode {i+1:3d}: "
                f"reward={result['total_reward']:+7.1f} "
                f"failures={result['compliance_failures']} "
                f"detections={result['drift_detections']} "
                f"completed={result['workflows_completed']} "
                f"satisfaction={result['user_satisfaction']:.0%}"
            )
    metrics = collect_metrics(all_results)
    return {"metrics": metrics, "episodes": all_results}


def main():
    """Run evaluation."""
    config = TrainingConfig()
    policies_path = str(Path(__file__).resolve().parent.parent / config.policies_dir)
    env = FinePrintEnv(
        policies_dir=policies_path,
        num_workflows_per_episode=config.num_workflows_per_episode,
        max_episode_steps=config.max_episode_steps,
        drift_probability=config.drift_probability,
        silent_drift_ratio=config.silent_drift_ratio,
    )

    print("=" * 60)
    print("FINEPRINT EVALUATION")
    print("=" * 60)
    print(f"Episodes: {config.eval_episodes}")
    print(f"Seed: {config.eval_seed}")
    print()

    # ── Heuristic evaluation ──
    print("Running heuristic evaluation...")
    result = evaluate(
        env,
        num_episodes=config.eval_episodes,
        seed=config.eval_seed,
        verbose=True,
    )
    heuristic_metrics = result["metrics"]

    print()
    print("=" * 60)
    print("HEURISTIC AGGREGATE METRICS")
    print("=" * 60)
    for key, val in heuristic_metrics.items():
        if isinstance(val, float):
            print(f" {key}: {val:.4f}")
        else:
            print(f" {key}: {val}")

    # Save results
    output_dir = Path(config.log_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    generate_reward_curve(
        result["episodes"],
        str(output_dir / "eval_reward_curve.json"),
    )

    # ── Trained model evaluation (if checkpoint exists) ──
    # Prefer the "best" checkpoint; fall back to "final" if it is absent.
    ckpt_path = Path(config.checkpoint_dir) / "best"
    if not ckpt_path.exists():
        ckpt_path = Path(config.checkpoint_dir) / "final"
    if ckpt_path.exists():
        try:
            from unsloth import FastLanguageModel
            import torch

            print(f"\nLoading trained model from {ckpt_path}...")
            model, tokenizer = FastLanguageModel.from_pretrained(
                model_name=str(ckpt_path),
                max_seq_length=config.max_seq_length,
                dtype=None,
                load_in_4bit=True,
            )
            FastLanguageModel.for_inference(model)
            if tokenizer.pad_token is None:
                tokenizer.pad_token = tokenizer.eos_token
            device = model.device

            print("Running trained-model evaluation...")
            trained_result = evaluate_model(
                model, tokenizer, env, config, device,
                num_episodes=config.eval_episodes,
                seed=config.eval_seed,
                verbose=True,
            )
            trained_metrics = trained_result["metrics"]
            generate_reward_curve(
                trained_result["episodes"],
                str(output_dir / "trained_eval_reward_curve.json"),
            )
            print_comparison(heuristic_metrics, trained_metrics)
        except ImportError:
            print("\nUnsloth not available; skipping trained model evaluation.")
    else:
        # No checkpoint: fall back to a saved baseline, if one exists, so a
        # comparison table can still be printed.
        baseline_path = output_dir / "baseline_metrics.json"
        if baseline_path.exists():
            with open(baseline_path, "r") as f:
                baseline = json.load(f)
            print_comparison(baseline, heuristic_metrics)

    env.close()
    print("\nEvaluation complete.")


if __name__ == "__main__":
    main()
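# Typical invocation (file name assumed; it is not fixed by the script):
#   python evaluate.py
# Heuristic results are always produced. Trained-model results additionally
# require an Unsloth checkpoint under config.checkpoint_dir/best or /final.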