| | """Compare base vs trained model on the same prompts."""
|
| |
|
| | from __future__ import annotations
|
| |
|
| | import argparse
|
| | import json
|
| | import random
|
| | from typing import Dict, List
|
| |
|
| | import torch
|
| | from transformers import AutoModelForCausalLM, AutoTokenizer
|
| |
|
| | from training_script import (
|
| | SYSTEM_PROMPT,
|
| | OpenEnvReward,
|
| | build_prompt_examples,
|
| | completion_to_text,
|
| | parse_action_completion,
|
| | selected_scenarios,
|
| | )
|
| |
|
| |
|
def generate_completions(
    model,
    tokenizer,
    prompts: List[str],
    max_new_tokens: int = 220,
) -> List[str]:
    """Sample one completion from ``model`` for every prompt, in order.

    Each prompt is wrapped in a two-message chat (``SYSTEM_PROMPT`` as the
    system turn, the prompt as the user turn), rendered with the tokenizer's
    chat template, and passed to ``model.generate`` one at a time.

    Args:
        model: Object exposing ``.device`` and ``.generate(...)``.
        tokenizer: Object exposing ``apply_chat_template``, ``__call__`` and
            ``decode``.
        prompts: User-turn texts to complete.
        max_new_tokens: Upper bound forwarded to ``model.generate``.

    Returns:
        Decoded completions (special tokens skipped), one per prompt, in the
        same order as ``prompts``.
    """
    # Sampling settings are the same for every prompt, so build them once.
    sampling_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": True,
        "temperature": 0.7,
        "top_p": 0.9,
    }

    def _sample(user_prompt: str) -> str:
        chat = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_prompt},
        ]
        rendered = tokenizer.apply_chat_template(
            chat, tokenize=False, add_generation_prompt=True
        )
        encoded = tokenizer(rendered, return_tensors="pt").to(model.device)
        prompt_len = encoded["input_ids"].shape[1]
        with torch.no_grad():
            sequences = model.generate(**encoded, **sampling_kwargs)
        # The returned sequence starts with the prompt tokens; keep only what
        # comes after them.
        new_tokens = sequences[0][prompt_len:]
        return tokenizer.decode(new_tokens, skip_special_tokens=True)

    return [_sample(user_prompt) for user_prompt in prompts]
|
| |
|
| |
|
def evaluate_model(
    model,
    tokenizer,
    examples: List[Dict[str, str]],
    reward_fn: OpenEnvReward,
    label: str,
) -> Dict[str, float]:
    """Generate completions for ``examples``, score them, and print a report.

    Args:
        model: Model passed through to ``generate_completions``.
        tokenizer: Tokenizer passed through to ``generate_completions``.
        examples: Dicts that must contain ``"prompt"``; ``"scenario_name"``
            and ``"history_actions"`` are read with ``.get`` and forwarded to
            ``reward_fn`` (``None`` when absent).
        reward_fn: Callable invoked once per completion with single-element
            lists; the first element of its result is used as the reward.
        label: Heading printed above the report.

    Returns:
        A dict with ``"avg_reward"`` (mean reward, 0 when there are no
        samples), ``"valid_pct"`` (percentage of completions for which
        ``parse_action_completion`` returned a non-``None`` value) and
        ``"rewards"`` (the per-sample reward list).
        NOTE(review): the ``"rewards"`` entry is a list, so the declared
        ``Dict[str, float]`` return annotation is looser than the real shape.
    """
    prompts = [ex["prompt"] for ex in examples]
    completions = generate_completions(model, tokenizer, prompts)

    rewards = []
    valid_actions = 0
    for comp, ex in zip(completions, examples):
        # The reward function takes batched keyword arguments; score one
        # completion at a time and unwrap the single result.
        reward = reward_fn(
            completions=[comp],
            scenario_name=[ex.get("scenario_name")],
            history_actions=[ex.get("history_actions")],
        )[0]
        rewards.append(reward)
        if parse_action_completion(comp) is not None:
            valid_actions += 1

    avg_reward = sum(rewards) / len(rewards) if rewards else 0
    valid_pct = valid_actions / len(completions) * 100 if completions else 0
    # min()/max() raise ValueError on an empty sequence; fall back to 0.0 so
    # an empty evaluation set reports zeros like the average does instead of
    # crashing halfway through the report.
    min_reward = min(rewards, default=0.0)
    max_reward = max(rewards, default=0.0)

    print(f"\n{'='*50}")
    print(f" {label}")
    print(f"{'='*50}")
    print(f" Samples: {len(completions)}")
    print(f" Avg reward: {avg_reward:.4f}")
    print(f" Min reward: {min_reward:.4f}")
    print(f" Max reward: {max_reward:.4f}")
    print(f" Valid actions: {valid_actions}/{len(completions)} ({valid_pct:.1f}%)")
    print()

    # Show up to three raw completions (truncated to 200 characters) so the
    # numbers can be sanity-checked by eye.
    for i, (comp, r) in enumerate(zip(completions[:3], rewards[:3])):
        print(f" Example {i+1} (reward={r:.2f}):")
        print(f" {comp[:200]}")
        print()

    return {"avg_reward": avg_reward, "valid_pct": valid_pct, "rewards": rewards}
|
| |
|
| |
|
def _load_model_and_tokenizer(model_id: str, trust_remote_code: bool):
    """Load a tokenizer and causal LM for ``model_id`` with shared settings.

    The tokenizer is loaded first; if it has no pad token, the EOS token is
    used as the pad token. The model is loaded in bfloat16 with
    ``device_map="auto"``.

    Returns:
        A ``(model, tokenizer)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        model_id, trust_remote_code=trust_remote_code
    )
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        trust_remote_code=trust_remote_code,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    return model, tokenizer


def main():
    """Evaluate the base and trained models on identical prompts and print a summary.

    Parses the command line, builds the evaluation prompts, evaluates the base
    model, releases it, evaluates the trained model, and prints the difference
    in average reward together with each model's valid-action percentage.
    """
    parser = argparse.ArgumentParser(description="Compare base vs trained model")
    parser.add_argument("--base-model", default="Qwen/Qwen3.5-0.8B",
                        help="Base model ID from HuggingFace")
    parser.add_argument("--trained-model", default="./grpo-output",
                        help="Path to trained model (local dir or HF repo)")
    parser.add_argument("--num-samples", type=int, default=16,
                        help="Number of eval prompts")
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--trust-remote-code", action="store_true")
    args = parser.parse_args()

    random.seed(args.seed)

    scenarios = selected_scenarios(None)
    examples = build_prompt_examples(
        dataset_episodes=args.num_samples,
        rollout_steps=1,
        collection_policy="heuristic",
        scenario_names=scenarios,
        seed=args.seed,
        domain_randomise=False,
    )
    print(f"Built {len(examples)} eval prompts across {len(scenarios)} scenarios")

    reward_fn = OpenEnvReward(reward_backend="local", base_url="")

    print(f"\nLoading base model: {args.base_model}")
    base_model, base_tokenizer = _load_model_and_tokenizer(
        args.base_model, args.trust_remote_code
    )
    # Generation samples through torch (do_sample=True), which random.seed()
    # does not affect. Seed torch right before each evaluation so --seed makes
    # the run repeatable and both models start from the same RNG state.
    torch.manual_seed(args.seed)
    base_results = evaluate_model(
        base_model, base_tokenizer, examples, reward_fn, "BASE MODEL"
    )
    # Drop the base model before loading the trained one so both do not have
    # to fit in accelerator memory at the same time.
    del base_model
    torch.cuda.empty_cache()

    print(f"\nLoading trained model: {args.trained_model}")
    trained_model, trained_tokenizer = _load_model_and_tokenizer(
        args.trained_model, args.trust_remote_code
    )
    torch.manual_seed(args.seed)
    trained_results = evaluate_model(
        trained_model, trained_tokenizer, examples, reward_fn, "TRAINED MODEL"
    )

    delta = trained_results["avg_reward"] - base_results["avg_reward"]
    print(f"{'='*50}")
    print(" COMPARISON SUMMARY")
    print(f"{'='*50}")
    print(f" Base avg reward: {base_results['avg_reward']:.4f}")
    print(f" Trained avg reward: {trained_results['avg_reward']:.4f}")
    print(f" Delta: {delta:+.4f}")
    print(f" Base valid actions: {base_results['valid_pct']:.1f}%")
    print(f" Trained valid: {trained_results['valid_pct']:.1f}%")
    print()
|
| |
|
| |
|
# Run the comparison only when this file is executed as a script, so importing
# the module does not trigger model loading.
if __name__ == "__main__":
    main()
|
| |
|