# SidraMiconi's picture
# Upload folder using huggingface_hub
# f63162c verified
"""Evaluation script: compare base model vs trained model on held-out scenarios.
Usage:
python training/eval.py --base-model Qwen/Qwen2.5-7B --trained-model SidraMiconi/exec-assistant-arena-lora
"""
import json
import os
import sys
import argparse
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from exec_assistant_arena import ExecAssistantArenaEnv
from exec_assistant_arena.models import AssistantAction
from training.train_grpo import parse_tool_calls
ENV_URL = "http://localhost:8000"
def evaluate_model(model, tokenizer, scenarios, env_url, label="model"):
    """Run a model through eval scenarios and collect per-scenario metrics.

    For each scenario: generate a completion, parse it into tool calls, replay
    those calls against a fresh environment episode, and record reward plus
    environment-state counters. Failures are recorded as sentinel rows with
    ``total_reward = -1.0`` and an ``error`` field rather than aborting the run.

    Args:
        model: A model prepared by unsloth's ``FastLanguageModel`` (on CUDA).
        tokenizer: The matching tokenizer.
        scenarios: List of dicts with at least ``"prompt"``; optional
            ``"seed"`` and ``"difficulty"`` override the defaults.
        env_url: Base URL of the running environment server.
        label: Tag used in per-scenario progress output.

    Returns:
        list[dict]: One metrics dict per scenario (or an error sentinel dict).
    """
    from unsloth import FastLanguageModel
    # Switch to inference mode (fused kernels, no dropout/grad bookkeeping).
    FastLanguageModel.for_inference(model)
    results = []
    for i, scenario in enumerate(scenarios):
        prompt = scenario["prompt"]
        inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=2048).to("cuda")
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            temperature=0.7,
            do_sample=True,
        )
        # Decode only the newly generated tokens, not the echoed prompt.
        completion = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
        # Score through environment
        try:
            with ExecAssistantArenaEnv(base_url=env_url) as env:
                # Offset default seed so eval episodes don't overlap training seeds.
                seed = scenario.get("seed", i + 80)
                difficulty = scenario.get("difficulty", "medium")
                env.reset(seed=seed, difficulty=difficulty)
                actions = parse_tool_calls(completion)
                total_reward = 0.0
                # Track completion with a flag: if `actions` is empty the loop
                # never binds `result`, so referencing `result.done` afterwards
                # would raise UnboundLocalError (bug in the original).
                done = False
                for action in actions:
                    result = env.step(action)
                    total_reward += (result.reward or 0.0)
                    if result.done:
                        done = True
                        break
                if not done:
                    # Episode didn't terminate on its own; close it explicitly
                    # so end-of-episode rewards are collected.
                    result = env.step(AssistantAction(tool="done"))
                    total_reward += (result.reward or 0.0)
                state = env.state()
                results.append({
                    "scenario_idx": i,
                    "seed": seed,
                    "difficulty": difficulty,
                    "total_reward": total_reward,
                    "conflicts_resolved": state.conflicts_resolved,
                    "total_conflicts": state.total_conflicts,
                    # max(1, ...) guards against division by zero on conflict-free scenarios.
                    "conflict_rate": state.conflicts_resolved / max(1, state.total_conflicts),
                    "emails_drafted": state.emails_drafted,
                    "total_emails": state.total_emails,
                    "preferences_inferred": state.preferences_inferred,
                    "deadlines_met": state.deadlines_met,
                    "unnecessary_actions": state.unnecessary_actions,
                    "n_actions": len(actions),
                    # Truncate to keep the results JSON readable.
                    "completion": completion[:500],
                })
        except Exception as e:
            print(f" Error on scenario {i}: {e}")
            results.append({"scenario_idx": i, "total_reward": -1.0, "error": str(e)})
        # Format defensively: applying :.2f directly to the .get() fallback
        # string would raise TypeError if 'total_reward' were missing.
        reward = results[-1].get("total_reward")
        reward_str = f"{reward:.2f}" if isinstance(reward, (int, float)) else "err"
        print(f" [{label}] Scenario {i}: reward={reward_str}")
    return results
def print_comparison(base_results, trained_results):
    """Print side-by-side average metrics for base vs trained model.

    Rows that carry an "error" key are excluded from the averages; a metric
    is skipped entirely when either side has no valid rows.
    """
    separator = "=" * 70
    print("\n" + separator)
    print("EVALUATION RESULTS")
    print(separator)
    tracked = ("total_reward", "conflict_rate", "emails_drafted", "preferences_inferred", "unnecessary_actions")
    for metric in tracked:
        base_vals = [row.get(metric, 0) for row in base_results if "error" not in row]
        trained_vals = [row.get(metric, 0) for row in trained_results if "error" not in row]
        if not base_vals or not trained_vals:
            continue
        base_avg = sum(base_vals) / len(base_vals)
        trained_avg = sum(trained_vals) / len(trained_vals)
        delta = trained_avg - base_avg
        print(f" {metric:25s} base={base_avg:7.2f} trained={trained_avg:7.2f} delta={delta:+.2f}")
    print(separator)
def main():
    """Entry point: evaluate base vs trained model on held-out scenarios.

    Loads both models sequentially (4-bit, to fit GPU memory), runs each
    through the eval scenarios, prints a comparison table, and writes the
    full per-scenario results to a JSON file.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--base-model", default="Qwen/Qwen2.5-7B")
    parser.add_argument("--trained-model", default="SidraMiconi/exec-assistant-arena-lora")
    parser.add_argument("--env-url", default=ENV_URL)
    parser.add_argument("--output", default="training/eval_results.json")
    args = parser.parse_args()
    # Resolve scenarios relative to this script so cwd doesn't matter.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    with open(os.path.join(script_dir, "scenarios/eval_scenarios.json")) as f:
        scenarios = json.load(f)
    print(f"Evaluating on {len(scenarios)} held-out scenarios\n")
    from unsloth import FastLanguageModel
    # Load base model
    print("Loading base model...")
    base_model, base_tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.base_model, max_seq_length=2048, load_in_4bit=True,
    )
    print("Evaluating base model...")
    base_results = evaluate_model(base_model, base_tokenizer, scenarios, args.env_url, "base")
    # Drop the base model before loading the trained one to free GPU memory.
    del base_model
    # Load trained model
    print("\nLoading trained model...")
    trained_model, trained_tokenizer = FastLanguageModel.from_pretrained(
        model_name=args.trained_model, max_seq_length=2048, load_in_4bit=True,
    )
    print("Evaluating trained model...")
    trained_results = evaluate_model(trained_model, trained_tokenizer, scenarios, args.env_url, "trained")
    print_comparison(base_results, trained_results)
    # Save results
    output = {
        "base_model": args.base_model,
        "trained_model": args.trained_model,
        "base_results": base_results,
        "trained_results": trained_results,
    }
    # Ensure the output directory exists — otherwise the script would crash
    # here, after two expensive evaluations, when run from a directory
    # without a "training/" subfolder.
    os.makedirs(os.path.dirname(args.output) or ".", exist_ok=True)
    with open(args.output, "w") as f:
        json.dump(output, f, indent=2)
    print(f"\nResults saved to {args.output}")
if __name__ == "__main__":
    main()