"""Evaluation script: compare base model vs trained model on held-out scenarios.

Usage:
    python training/eval.py --base-model Qwen/Qwen2.5-7B \
        --trained-model SidraMiconi/exec-assistant-arena-lora
"""
import json
import os
import sys
import argparse

# Make the repo root importable when this file is executed as a script.
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from exec_assistant_arena import ExecAssistantArenaEnv
from exec_assistant_arena.models import AssistantAction
from training.train_grpo import parse_tool_calls

ENV_URL = "http://localhost:8000"


def evaluate_model(model, tokenizer, scenarios, env_url, label="model"):
    """Run ``model`` through the eval scenarios and collect per-scenario metrics.

    Args:
        model: A loaded (unsloth) causal LM, already on GPU.
        tokenizer: The matching tokenizer.
        scenarios: List of dicts with at least a "prompt" key; optional
            "seed" and "difficulty" override the defaults.
        env_url: Base URL of the running ExecAssistantArena server.
        label: Tag used in progress output only.

    Returns:
        A list of result dicts, one per scenario. On failure a scenario's
        dict contains only ``scenario_idx``, ``total_reward`` (-1.0) and
        ``error``.
    """
    from unsloth import FastLanguageModel

    FastLanguageModel.for_inference(model)

    results = []
    for i, scenario in enumerate(scenarios):
        prompt = scenario["prompt"]
        inputs = tokenizer(
            prompt, return_tensors="pt", truncation=True, max_length=2048
        ).to("cuda")
        # NOTE: sampling (do_sample=True, temperature=0.7) introduces
        # run-to-run variance in eval scores; averaged over scenarios.
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.7,
            do_sample=True,
        )
        # Decode only the newly generated tokens, not the prompt.
        completion = tokenizer.decode(
            outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True
        )

        # Score the completion by replaying its tool calls in a fresh episode.
        try:
            with ExecAssistantArenaEnv(base_url=env_url) as env:
                seed = scenario.get("seed", i + 80)
                difficulty = scenario.get("difficulty", "medium")
                env.reset(seed=seed, difficulty=difficulty)

                actions = parse_tool_calls(completion)
                total_reward = 0.0
                # BUG FIX: previously `result` was referenced after the loop
                # even when `actions` was empty, raising NameError. Track
                # episode termination with an explicit flag instead.
                done = False
                for action in actions:
                    result = env.step(action)
                    total_reward += (result.reward or 0.0)
                    if result.done:
                        done = True
                        break
                if not done:
                    # Terminate the episode explicitly so end-of-episode
                    # rewards are granted.
                    result = env.step(AssistantAction(tool="done"))
                    total_reward += (result.reward or 0.0)

                state = env.state()
                results.append({
                    "scenario_idx": i,
                    "seed": seed,
                    "difficulty": difficulty,
                    "total_reward": total_reward,
                    "conflicts_resolved": state.conflicts_resolved,
                    "total_conflicts": state.total_conflicts,
                    # max(1, ...) guards against division by zero when a
                    # scenario has no conflicts.
                    "conflict_rate": state.conflicts_resolved / max(1, state.total_conflicts),
                    "emails_drafted": state.emails_drafted,
                    "total_emails": state.total_emails,
                    "preferences_inferred": state.preferences_inferred,
                    "deadlines_met": state.deadlines_met,
                    "unnecessary_actions": state.unnecessary_actions,
                    "n_actions": len(actions),
                    # Truncate the completion so the results file stays small.
                    "completion": completion[:500],
                })
        except Exception as e:
            print(f" Error on scenario {i}: {e}")
            results.append({"scenario_idx": i, "total_reward": -1.0, "error": str(e)})

        # BUG FIX: the fallback used to be the string 'err', which would
        # crash the ':.2f' format spec; use a float fallback instead.
        print(f" [{label}] Scenario {i}: reward={results[-1].get('total_reward', -1.0):.2f}")

    return results


def print_comparison(base_results, trained_results):
    """Print a side-by-side metric comparison of base vs trained results.

    Scenarios that errored (dicts containing an "error" key) are excluded
    from the averages.
    """
    print("\n" + "=" * 70)
    print("EVALUATION RESULTS")
    print("=" * 70)
    metrics = ["total_reward", "conflict_rate", "emails_drafted",
               "preferences_inferred", "unnecessary_actions"]
    for metric in metrics:
        base_vals = [r.get(metric, 0) for r in base_results if "error" not in r]
        trained_vals = [r.get(metric, 0) for r in trained_results if "error" not in r]
        if base_vals and trained_vals:
            base_avg = sum(base_vals) / len(base_vals)
            trained_avg = sum(trained_vals) / len(trained_vals)
            delta = trained_avg - base_avg
            print(f" {metric:25s} base={base_avg:7.2f} trained={trained_avg:7.2f} delta={delta:+.2f}")
    print("=" * 70)


def main():
    """Load both models in sequence, evaluate each, compare, and save JSON."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model", default="Qwen/Qwen2.5-7B")
    parser.add_argument("--trained-model", default="SidraMiconi/exec-assistant-arena-lora")
    parser.add_argument("--env-url", default=ENV_URL)
    parser.add_argument("--output", default="training/eval_results.json")
    args = parser.parse_args()

    script_dir = os.path.dirname(os.path.abspath(__file__))
    # Use separate path components rather than an embedded "/" so the join
    # is correct on every platform.
    with open(os.path.join(script_dir, "scenarios", "eval_scenarios.json")) as f:
        scenarios = json.load(f)
    print(f"Evaluating on {len(scenarios)} held-out scenarios\n")

    from unsloth import FastLanguageModel

    # Load and evaluate the base model first.
    print("Loading base model...")
    base_model, base_tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.base_model,
        max_seq_length=2048,
        load_in_4bit=True,
    )
    print("Evaluating base model...")
    base_results = evaluate_model(base_model, base_tokenizer, scenarios, args.env_url, "base")

    # BUG FIX: `del` alone does not release GPU memory held by the model's
    # weights; collect and empty the CUDA cache before loading the second
    # 4-bit model to avoid OOM on smaller GPUs.
    del base_model, base_tokenizer
    import gc
    import torch
    gc.collect()
    torch.cuda.empty_cache()

    # Load and evaluate the trained (LoRA) model.
    print("\nLoading trained model...")
    trained_model, trained_tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.trained_model,
        max_seq_length=2048,
        load_in_4bit=True,
    )
    print("Evaluating trained model...")
    trained_results = evaluate_model(trained_model, trained_tokenizer, scenarios, args.env_url, "trained")

    print_comparison(base_results, trained_results)

    # Persist raw per-scenario results alongside the model identifiers.
    output = {
        "base_model": args.base_model,
        "trained_model": args.trained_model,
        "base_results": base_results,
        "trained_results": trained_results,
    }
    with open(args.output, "w") as f:
        json.dump(output, f, indent=2)
    print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()