"""Compare base vs trained model on the same prompts.""" from __future__ import annotations import argparse import json import random from typing import Dict, List import torch from transformers import AutoModelForCausalLM, AutoTokenizer from training_script import ( SYSTEM_PROMPT, OpenEnvReward, build_prompt_examples, completion_to_text, parse_action_completion, selected_scenarios, ) def generate_completions( model, tokenizer, prompts: List[str], max_new_tokens: int = 220, ) -> List[str]: completions = [] for prompt in prompts: messages = [ {"role": "system", "content": SYSTEM_PROMPT}, {"role": "user", "content": prompt}, ] input_text = tokenizer.apply_chat_template( messages, tokenize=False, add_generation_prompt=True ) inputs = tokenizer(input_text, return_tensors="pt").to(model.device) with torch.no_grad(): output = model.generate( **inputs, max_new_tokens=max_new_tokens, do_sample=True, temperature=0.7, top_p=0.9, ) generated = output[0][inputs["input_ids"].shape[1]:] completions.append(tokenizer.decode(generated, skip_special_tokens=True)) return completions def evaluate_model( model, tokenizer, examples: List[Dict[str, str]], reward_fn: OpenEnvReward, label: str, ) -> Dict[str, float]: prompts = [ex["prompt"] for ex in examples] completions = generate_completions(model, tokenizer, prompts) rewards = [] valid_actions = 0 for comp, ex in zip(completions, examples): reward = reward_fn( completions=[comp], scenario_name=[ex.get("scenario_name")], history_actions=[ex.get("history_actions")], )[0] rewards.append(reward) if parse_action_completion(comp) is not None: valid_actions += 1 avg_reward = sum(rewards) / len(rewards) if rewards else 0 valid_pct = valid_actions / len(completions) * 100 if completions else 0 print(f"\n{'='*50}") print(f" {label}") print(f"{'='*50}") print(f" Samples: {len(completions)}") print(f" Avg reward: {avg_reward:.4f}") print(f" Min reward: {min(rewards):.4f}") print(f" Max reward: {max(rewards):.4f}") print(f" Valid actions: {valid_actions}/{len(completions)} ({valid_pct:.1f}%)") print() # Show a few example completions for i, (comp, r) in enumerate(zip(completions[:3], rewards[:3])): print(f" Example {i+1} (reward={r:.2f}):") print(f" {comp[:200]}") print() return {"avg_reward": avg_reward, "valid_pct": valid_pct, "rewards": rewards} def main(): parser = argparse.ArgumentParser(description="Compare base vs trained model") parser.add_argument("--base-model", default="Qwen/Qwen3.5-0.8B", help="Base model ID from HuggingFace") parser.add_argument("--trained-model", default="./grpo-output", help="Path to trained model (local dir or HF repo)") parser.add_argument("--num-samples", type=int, default=16, help="Number of eval prompts") parser.add_argument("--seed", type=int, default=42) parser.add_argument("--trust-remote-code", action="store_true") args = parser.parse_args() random.seed(args.seed) # Build eval prompts scenarios = selected_scenarios(None) examples = build_prompt_examples( dataset_episodes=args.num_samples, rollout_steps=1, # one prompt per episode collection_policy="heuristic", scenario_names=scenarios, seed=args.seed, domain_randomise=False, ) print(f"Built {len(examples)} eval prompts across {len(scenarios)} scenarios") reward_fn = OpenEnvReward(reward_backend="local", base_url="") # Evaluate base model print(f"\nLoading base model: {args.base_model}") base_tokenizer = AutoTokenizer.from_pretrained( args.base_model, trust_remote_code=args.trust_remote_code ) if base_tokenizer.pad_token is None: base_tokenizer.pad_token = base_tokenizer.eos_token 
    base_model = AutoModelForCausalLM.from_pretrained(
        args.base_model,
        trust_remote_code=args.trust_remote_code,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    base_results = evaluate_model(
        base_model, base_tokenizer, examples, reward_fn, "BASE MODEL"
    )

    # Free GPU memory before loading the second model.
    del base_model
    torch.cuda.empty_cache()

    # Evaluate trained model.
    print(f"\nLoading trained model: {args.trained_model}")
    trained_tokenizer = AutoTokenizer.from_pretrained(
        args.trained_model, trust_remote_code=args.trust_remote_code
    )
    if trained_tokenizer.pad_token is None:
        trained_tokenizer.pad_token = trained_tokenizer.eos_token
    trained_model = AutoModelForCausalLM.from_pretrained(
        args.trained_model,
        trust_remote_code=args.trust_remote_code,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    trained_results = evaluate_model(
        trained_model, trained_tokenizer, examples, reward_fn, "TRAINED MODEL"
    )

    # Summary.
    delta = trained_results["avg_reward"] - base_results["avg_reward"]
    print(f"{'=' * 50}")
    print("  COMPARISON SUMMARY")
    print(f"{'=' * 50}")
    print(f"  Base avg reward:       {base_results['avg_reward']:.4f}")
    print(f"  Trained avg reward:    {trained_results['avg_reward']:.4f}")
    print(f"  Delta:                 {delta:+.4f}")
    print(f"  Base valid actions:    {base_results['valid_pct']:.1f}%")
    print(f"  Trained valid actions: {trained_results['valid_pct']:.1f}%")
    print()


if __name__ == "__main__":
    main()
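# Example invocation (the script filename below is hypothetical; the flags
# match the argparse definitions in main()):
#
#   python compare_models.py \
#       --base-model Qwen/Qwen3-0.6B \
#       --trained-model ./grpo-output \
#       --num-samples 16 --seed 42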