| """Evaluation script: compare base model vs trained model on held-out scenarios. | |
| Usage: | |
| python training/eval.py --base-model Qwen/Qwen2.5-7B --trained-model SidraMiconi/exec-assistant-arena-lora | |
| """ | |
| import json | |
| import os | |
| import sys | |
| import argparse | |
| sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from exec_assistant_arena import ExecAssistantArenaEnv | |
| from exec_assistant_arena.models import AssistantAction | |
| from training.train_grpo import parse_tool_calls | |
| ENV_URL = "http://localhost:8000" | |
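
# Illustrative sketch of the expected scenarios/eval_scenarios.json format, inferred from
# how entries are read below; only "prompt" is required, "seed" and "difficulty" fall back
# to per-index defaults in evaluate_model():
#
# [
#   {"prompt": "<scenario prompt text>", "seed": 80, "difficulty": "medium"},
#   {"prompt": "...", "seed": 81, "difficulty": "hard"}
# ]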
def evaluate_model(model, tokenizer, scenarios, env_url, label="model"):
    """Run the model through the eval scenarios and collect per-scenario metrics."""
    from unsloth import FastLanguageModel

    FastLanguageModel.for_inference(model)
    results = []
    for i, scenario in enumerate(scenarios):
        prompt = scenario["prompt"]
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to("cuda")
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.7,
            do_sample=True,
        )
        # Decode only the newly generated tokens (drop the prompt).
        completion = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)

        # Score the completion by replaying its tool calls through the environment.
        try:
            with ExecAssistantArenaEnv(base_url=env_url) as env:
                seed = scenario.get("seed", i + 80)
                difficulty = scenario.get("difficulty", "medium")
                env.reset(seed=seed, difficulty=difficulty)

                actions = parse_tool_calls(completion)
                total_reward = 0.0
                result = None
                for action in actions:
                    result = env.step(action)
                    total_reward += (result.reward or 0.0)
                    if result.done:
                        break
                # If the model produced no actions or never finished, close out the episode.
                if result is None or not result.done:
                    result = env.step(AssistantAction(tool="done"))
                    total_reward += (result.reward or 0.0)

                state = env.state()
                results.append({
                    "scenario_idx": i,
                    "seed": seed,
                    "difficulty": difficulty,
                    "total_reward": total_reward,
                    "conflicts_resolved": state.conflicts_resolved,
                    "total_conflicts": state.total_conflicts,
                    "conflict_rate": state.conflicts_resolved / max(1, state.total_conflicts),
                    "emails_drafted": state.emails_drafted,
                    "total_emails": state.total_emails,
                    "preferences_inferred": state.preferences_inferred,
                    "deadlines_met": state.deadlines_met,
                    "unnecessary_actions": state.unnecessary_actions,
                    "n_actions": len(actions),
                    "completion": completion[:500],
                })
        except Exception as e:
            print(f"  Error on scenario {i}: {e}")
            results.append({"scenario_idx": i, "total_reward": -1.0, "error": str(e)})
        print(f"  [{label}] Scenario {i}: reward={results[-1].get('total_reward', 0.0):.2f}")
    return results
def print_comparison(base_results, trained_results):
    """Print side-by-side comparison."""
    print("\n" + "=" * 70)
    print("EVALUATION RESULTS")
    print("=" * 70)
    metrics = ["total_reward", "conflict_rate", "emails_drafted", "preferences_inferred", "unnecessary_actions"]
    for metric in metrics:
        base_vals = [r.get(metric, 0) for r in base_results if "error" not in r]
        trained_vals = [r.get(metric, 0) for r in trained_results if "error" not in r]
        if base_vals and trained_vals:
            base_avg = sum(base_vals) / len(base_vals)
            trained_avg = sum(trained_vals) / len(trained_vals)
            delta = trained_avg - base_avg
            print(f"  {metric:25s} base={base_avg:7.2f}  trained={trained_avg:7.2f}  delta={delta:+.2f}")
    print("=" * 70)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model", default="Qwen/Qwen2.5-7B")
    parser.add_argument("--trained-model", default="SidraMiconi/exec-assistant-arena-lora")
    parser.add_argument("--env-url", default=ENV_URL)
    parser.add_argument("--output", default="training/eval_results.json")
    args = parser.parse_args()

    script_dir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(script_dir, "scenarios/eval_scenarios.json")) as f:
        scenarios = json.load(f)
    print(f"Evaluating on {len(scenarios)} held-out scenarios\n")

    from unsloth import FastLanguageModel

    # Load and evaluate the base model.
    print("Loading base model...")
    base_model, base_tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.base_model, max_seq_length=2048, load_in_4bit=True,
    )
    print("Evaluating base model...")
    base_results = evaluate_model(base_model, base_tokenizer, scenarios, args.env_url, "base")

    # Release GPU memory before loading the trained model.
    del base_model
    import gc
    import torch
    gc.collect()
    torch.cuda.empty_cache()

    # Load and evaluate the trained model.
    print("\nLoading trained model...")
    trained_model, trained_tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.trained_model, max_seq_length=2048, load_in_4bit=True,
    )
    print("Evaluating trained model...")
    trained_results = evaluate_model(trained_model, trained_tokenizer, scenarios, args.env_url, "trained")

    print_comparison(base_results, trained_results)

    # Save per-scenario results for both models.
    output = {
        "base_model": args.base_model,
        "trained_model": args.trained_model,
        "base_results": base_results,
        "trained_results": trained_results,
    }
    with open(args.output, "w") as f:
        json.dump(output, f, indent=2)
    print(f"\nResults saved to {args.output}")


if __name__ == "__main__":
    main()