"""Quick evaluation: run model on test tasks and score with reward functions.
Usage:
# Evaluate base model via vLLM
python scripts/eval_model.py --vllm-url http://localhost:8001/v1 --model Qwen/Qwen3-8B
# Evaluate trained adapter (local inference)
python scripts/eval_model.py --adapter-path ./outputs/grpo-support-agent/final_adapter
# Run all 8 tasks and save results
python scripts/eval_model.py --all-tasks --label vanilla --output-json ./outputs/eval_vanilla.json
"""
import argparse
import json
import sys
import time
from pathlib import Path
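# Make the repository root importable so the `server` and `scripts` packages resolve
# when this file is run directly as a script.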
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from server.environment import CustomerSupportEnvironment
from scripts.collect_data import (
    build_system_prompt, run_episode, load_tasks, DATA_DIR,
)
from scripts.train_grpo import (
    format_reward, tool_validity_reward, reasoning_reward,
    no_reasoning_leak_reward,
)
def eval_episode_rewards(steps):
"""Score an episode's steps with all reward functions."""
scores = {"format": [], "tool_valid": [], "reasoning": [], "no_leak": []}
for s in steps:
comp = s.get("completion", "")
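        # The reward functions score batches of chat completions (a list of message lists),
        # so wrap each single completion as a one-element batch.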
comp_list = [[{"role": "assistant", "content": comp}]]
scores["format"].extend(format_reward(comp_list))
scores["tool_valid"].extend(tool_validity_reward(comp_list))
scores["reasoning"].extend(reasoning_reward(comp_list))
scores["no_leak"].extend(no_reasoning_leak_reward(comp_list))
return {k: sum(v) / len(v) if v else 0 for k, v in scores.items()}
def main():
parser = argparse.ArgumentParser()
parser.add_argument("--vllm-url", default="http://localhost:8001/v1")
parser.add_argument("--model", default="Qwen/Qwen3-8B")
parser.add_argument("--adapter-path", default=None, help="Path to LoRA adapter (if evaluating trained model)")
parser.add_argument("--tasks", type=int, default=4, help="Number of tasks to evaluate")
parser.add_argument("--all-tasks", action="store_true", help="Run all 8 tasks")
parser.add_argument("--seed", type=int, default=77, help="Random seed")
parser.add_argument("--label", default=None, help="Label for this eval run")
parser.add_argument("--output-json", default=None, help="Save results to JSON file")
args = parser.parse_args()
if args.all_tasks:
args.tasks = 999 # will be capped by actual task count
if args.adapter_path:
# Load model with adapter for local inference
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel
import torch
print(f"Loading base model {args.model}...")
tokenizer = AutoTokenizer.from_pretrained(args.model)
model = AutoModelForCausalLM.from_pretrained(
args.model, torch_dtype=torch.bfloat16, device_map="auto"
)
print(f"Loading adapter from {args.adapter_path}...")
model = PeftModel.from_pretrained(model, args.adapter_path)
model.eval()
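        # Local path: generate_fn maps a chat message list to the assistant's raw completion text.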
def generate_fn(messages):
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
inputs = tokenizer(text, return_tensors="pt").to(model.device)
with torch.no_grad():
outputs = model.generate(
**inputs, max_new_tokens=512, temperature=0.7,
do_sample=True, top_p=0.9,
)
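            # Decode only the newly generated tokens, skipping the prompt.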
new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
return tokenizer.decode(new_tokens, skip_special_tokens=True)
else:
# Use vLLM
from openai import OpenAI
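        # vLLM typically ignores the API key, but the OpenAI client requires a non-empty value.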
client = OpenAI(base_url=args.vllm_url, api_key="none")
try:
models = client.models.list()
print(f"Connected to vLLM: {[m.id for m in models.data]}")
except Exception as e:
print(f"Cannot connect to vLLM: {e}")
sys.exit(1)
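        # Remote path: same generate_fn contract, backed by vLLM's OpenAI-compatible chat endpoint.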
def generate_fn(messages):
resp = client.chat.completions.create(
model=args.model, messages=messages,
temperature=0.7, max_tokens=512,
)
return resp.choices[0].message.content or ""
# Setup environment
env = CustomerSupportEnvironment()
system_prompt = build_system_prompt(env)
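    # Reuse the task definitions from the data-collection pipeline so eval covers the same scenarios.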
tasks = load_tasks(DATA_DIR / "tasks")
task_ids = list(tasks.keys())[:args.tasks]
label = args.label or ("adapter" if args.adapter_path else "base")
print(f"\n=== Evaluating [{label}] on {len(task_ids)} tasks ===\n")
results = []
for task_id in task_ids:
t0 = time.time()
steps = run_episode(env, generate_fn, system_prompt, task_id=task_id, seed=args.seed)
elapsed = time.time() - t0
final = steps[-1] if steps else {}
episode_reward = final.get("episode_reward", 0)
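        # The resolution flag may be stored under either key name, so check both.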
resolved = final.get("episode_resolved", final.get("resolved", False))
reward_scores = eval_episode_rewards(steps)
        tools_used = [s["tool_name"] for s in steps if "tool_name" in s]
result = {
"task_id": task_id,
"steps": len(steps),
"episode_reward": episode_reward,
"resolved": resolved,
"tools": tools_used,
"elapsed": round(elapsed, 1),
**reward_scores,
}
results.append(result)
status = "RESOLVED" if resolved else "not resolved"
print(f"{task_id}:")
print(f" Steps: {len(steps)}, Reward: {episode_reward:.3f}, {status}")
print(f" Tools: {tools_used}")
print(f" Format: {reward_scores['format']:.2f}, Valid: {reward_scores['tool_valid']:.2f}, "
f"Reasoning: {reward_scores['reasoning']:.2f}, No-leak: {reward_scores['no_leak']:.2f}")
print(f" Time: {elapsed:.1f}s")
print()
    # Summary (skip if nothing ran, to avoid dividing by zero below)
    if not results:
        print("No tasks were evaluated; nothing to summarize.")
        env.close()
        return
    print(f"=== SUMMARY [{label}] ===")
avg_reward = sum(r["episode_reward"] for r in results) / len(results)
resolve_rate = sum(1 for r in results if r["resolved"]) / len(results)
avg_steps = sum(r["steps"] for r in results) / len(results)
avg_format = sum(r["format"] for r in results) / len(results)
avg_valid = sum(r["tool_valid"] for r in results) / len(results)
avg_reasoning = sum(r["reasoning"] for r in results) / len(results)
avg_no_leak = sum(r["no_leak"] for r in results) / len(results)
print(f"Avg episode reward: {avg_reward:.3f}")
print(f"Resolution rate: {resolve_rate:.0%}")
print(f"Avg steps: {avg_steps:.1f}")
print(f"Avg format score: {avg_format:.2f}")
print(f"Avg tool validity: {avg_valid:.2f}")
print(f"Avg reasoning: {avg_reasoning:.2f}")
print(f"Avg no-leak: {avg_no_leak:.2f}")
# Save results
if args.output_json:
output = {
"label": label,
"model": args.model,
"adapter_path": args.adapter_path,
"seed": args.seed,
"summary": {
"avg_episode_reward": round(avg_reward, 4),
"resolution_rate": round(resolve_rate, 4),
"avg_steps": round(avg_steps, 2),
"avg_format": round(avg_format, 4),
"avg_tool_valid": round(avg_valid, 4),
"avg_reasoning": round(avg_reasoning, 4),
"avg_no_leak": round(avg_no_leak, 4),
},
"tasks": results,
}
Path(args.output_json).parent.mkdir(parents=True, exist_ok=True)
with open(args.output_json, "w") as f:
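            # default=str guards against non-JSON-serializable values (e.g. Path objects).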
json.dump(output, f, indent=2, default=str)
print(f"\nResults saved to {args.output_json}")
env.close()
if __name__ == "__main__":
main()