"""Quick evaluation: run model on test tasks and score with reward functions. Usage: # Evaluate base model via vLLM python scripts/eval_model.py --vllm-url http://localhost:8001/v1 --model Qwen/Qwen3-8B # Evaluate trained adapter (local inference) python scripts/eval_model.py --adapter-path ./outputs/grpo-support-agent/final_adapter # Run all 8 tasks and save results python scripts/eval_model.py --all-tasks --label vanilla --output-json ./outputs/eval_vanilla.json """ import argparse import json import sys import time from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from server.environment import CustomerSupportEnvironment from scripts.collect_data import ( build_system_prompt, format_initial_obs, format_step_obs, parse_tool_call, run_episode, load_tasks, DATA_DIR, ) from scripts.train_grpo import ( format_reward, tool_validity_reward, reasoning_reward, no_reasoning_leak_reward, _get_text, TOOL_CALL_RE, ) def eval_episode_rewards(steps): """Score an episode's steps with all reward functions.""" scores = {"format": [], "tool_valid": [], "reasoning": [], "no_leak": []} for s in steps: comp = s.get("completion", "") comp_list = [[{"role": "assistant", "content": comp}]] scores["format"].extend(format_reward(comp_list)) scores["tool_valid"].extend(tool_validity_reward(comp_list)) scores["reasoning"].extend(reasoning_reward(comp_list)) scores["no_leak"].extend(no_reasoning_leak_reward(comp_list)) return {k: sum(v) / len(v) if v else 0 for k, v in scores.items()} def main(): parser = argparse.ArgumentParser() parser.add_argument("--vllm-url", default="http://localhost:8001/v1") parser.add_argument("--model", default="Qwen/Qwen3-8B") parser.add_argument("--adapter-path", default=None, help="Path to LoRA adapter (if evaluating trained model)") parser.add_argument("--tasks", type=int, default=4, help="Number of tasks to evaluate") parser.add_argument("--all-tasks", action="store_true", help="Run all 8 tasks") parser.add_argument("--seed", type=int, default=77, help="Random seed") parser.add_argument("--label", default=None, help="Label for this eval run") parser.add_argument("--output-json", default=None, help="Save results to JSON file") args = parser.parse_args() if args.all_tasks: args.tasks = 999 # will be capped by actual task count if args.adapter_path: # Load model with adapter for local inference from transformers import AutoModelForCausalLM, AutoTokenizer from peft import PeftModel import torch print(f"Loading base model {args.model}...") tokenizer = AutoTokenizer.from_pretrained(args.model) model = AutoModelForCausalLM.from_pretrained( args.model, torch_dtype=torch.bfloat16, device_map="auto" ) print(f"Loading adapter from {args.adapter_path}...") model = PeftModel.from_pretrained(model, args.adapter_path) model.eval() def generate_fn(messages): text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) inputs = tokenizer(text, return_tensors="pt").to(model.device) with torch.no_grad(): outputs = model.generate( **inputs, max_new_tokens=512, temperature=0.7, do_sample=True, top_p=0.9, ) new_tokens = outputs[0][inputs["input_ids"].shape[1]:] return tokenizer.decode(new_tokens, skip_special_tokens=True) else: # Use vLLM from openai import OpenAI client = OpenAI(base_url=args.vllm_url, api_key="none") try: models = client.models.list() print(f"Connected to vLLM: {[m.id for m in models.data]}") except Exception as e: print(f"Cannot connect to vLLM: {e}") sys.exit(1) def generate_fn(messages): resp = client.chat.completions.create( model=args.model, messages=messages, temperature=0.7, max_tokens=512, ) return resp.choices[0].message.content or "" # Setup environment env = CustomerSupportEnvironment() system_prompt = build_system_prompt(env) tasks = load_tasks(DATA_DIR / "tasks") task_ids = list(tasks.keys())[:args.tasks] label = args.label or ("adapter" if args.adapter_path else "base") print(f"\n=== Evaluating [{label}] on {len(task_ids)} tasks ===\n") results = [] for task_id in task_ids: t0 = time.time() steps = run_episode(env, generate_fn, system_prompt, task_id=task_id, seed=args.seed) elapsed = time.time() - t0 final = steps[-1] if steps else {} episode_reward = final.get("episode_reward", 0) resolved = final.get("episode_resolved", final.get("resolved", False)) reward_scores = eval_episode_rewards(steps) tools_used = [s.get("tool_name", "?") for s in steps if "tool_name" in s] result = { "task_id": task_id, "steps": len(steps), "episode_reward": episode_reward, "resolved": resolved, "tools": tools_used, "elapsed": round(elapsed, 1), **reward_scores, } results.append(result) status = "RESOLVED" if resolved else "not resolved" print(f"{task_id}:") print(f" Steps: {len(steps)}, Reward: {episode_reward:.3f}, {status}") print(f" Tools: {tools_used}") print(f" Format: {reward_scores['format']:.2f}, Valid: {reward_scores['tool_valid']:.2f}, " f"Reasoning: {reward_scores['reasoning']:.2f}, No-leak: {reward_scores['no_leak']:.2f}") print(f" Time: {elapsed:.1f}s") print() # Summary print(f"=== SUMMARY [{label}] ===") avg_reward = sum(r["episode_reward"] for r in results) / len(results) resolve_rate = sum(1 for r in results if r["resolved"]) / len(results) avg_steps = sum(r["steps"] for r in results) / len(results) avg_format = sum(r["format"] for r in results) / len(results) avg_valid = sum(r["tool_valid"] for r in results) / len(results) avg_reasoning = sum(r["reasoning"] for r in results) / len(results) avg_no_leak = sum(r["no_leak"] for r in results) / len(results) print(f"Avg episode reward: {avg_reward:.3f}") print(f"Resolution rate: {resolve_rate:.0%}") print(f"Avg steps: {avg_steps:.1f}") print(f"Avg format score: {avg_format:.2f}") print(f"Avg tool validity: {avg_valid:.2f}") print(f"Avg reasoning: {avg_reasoning:.2f}") print(f"Avg no-leak: {avg_no_leak:.2f}") # Save results if args.output_json: output = { "label": label, "model": args.model, "adapter_path": args.adapter_path, "seed": args.seed, "summary": { "avg_episode_reward": round(avg_reward, 4), "resolution_rate": round(resolve_rate, 4), "avg_steps": round(avg_steps, 2), "avg_format": round(avg_format, 4), "avg_tool_valid": round(avg_valid, 4), "avg_reasoning": round(avg_reasoning, 4), "avg_no_leak": round(avg_no_leak, 4), }, "tasks": results, } Path(args.output_json).parent.mkdir(parents=True, exist_ok=True) with open(args.output_json, "w") as f: json.dump(output, f, indent=2, default=str) print(f"\nResults saved to {args.output_json}") env.close() if __name__ == "__main__": main()