Spaces:
Running
Running
| #!/usr/bin/env python3 | |
"""
Re-evaluate the trained GRPO model without re-training.

Usage:
    python eval_trained.py
    python eval_trained.py --checkpoint ./checkpoints/grpo_api_tester
"""
| import argparse | |
| import os | |
| import sys | |
# Put this script's own directory at the front of the import path
# (presumably so project-local packages resolve regardless of the
# directory the script is launched from).
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import logging

# Root logger: timestamped records at INFO and above.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)

# These third-party loggers are chatty at INFO; only let their warnings
# and errors through.
for _noisy in ("httpx", "httpcore", "urllib3", "huggingface_hub", "filelock"):
    logging.getLogger(_noisy).setLevel(logging.WARNING)

logger = logging.getLogger(__name__)
# Task identifiers evaluated by main(), in the order they are reported.
_EVAL_TASKS = ("basic_validation", "edge_cases", "security_workflows")

# Horizontal rule used by the console banners.
_RULE = "=" * 60


def _parse_args():
    """Define the command-line options and return the parsed namespace."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--checkpoint",
        default="./checkpoints/grpo_api_tester",
        help="Path to the trained model checkpoint",
    )
    parser.add_argument(
        "--base-model",
        default="Qwen/Qwen3-1.7B",
        help="Base model (needed if checkpoint is LoRA-only)",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=25,
        help="Max actions per task during evaluation",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=9999,
        help="Random seed for evaluation",
    )
    return parser.parse_args()


def _load_model_and_tokenizer(args):
    """Load the tokenizer and base model, then try to apply the LoRA adapter.

    Args:
        args: Parsed command-line namespace; ``base_model`` and
            ``checkpoint`` are read.

    Returns:
        A ``(model, tokenizer, adapter_loaded)`` tuple. ``adapter_loaded``
        is ``False`` when the adapter could not be loaded or merged and the
        plain base model is returned instead.
    """
    # Heavy third-party imports are deferred so that ``--help`` and the
    # start-up banner do not pay for them.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    # Pick the weight dtype from the available hardware; actual placement
    # is delegated to ``device_map="auto"`` below.
    if torch.cuda.is_available():
        dtype = torch.bfloat16
        print(f" GPU: {torch.cuda.get_device_name(0)}")
    else:
        dtype = torch.float32
        print(" WARNING: No GPU — eval will be slow")

    # Load tokenizer (from base model is fine)
    print(f" Loading tokenizer from {args.base_model}...", flush=True)
    tokenizer = AutoTokenizer.from_pretrained(args.base_model, trust_remote_code=True)
    if tokenizer.pad_token is None:
        # Reuse EOS as the padding token when the tokenizer defines none.
        tokenizer.pad_token = tokenizer.eos_token

    print(f" Loading base model {args.base_model}...", flush=True)
    base_model = AutoModelForCausalLM.from_pretrained(
        args.base_model,
        trust_remote_code=True,
        torch_dtype=dtype,
        device_map="auto",
    )

    print(f" Loading LoRA adapter from {args.checkpoint}...", flush=True)
    try:
        model = PeftModel.from_pretrained(base_model, args.checkpoint)
        # Merge LoRA into base for faster inference
        print(" Merging LoRA into base...", flush=True)
        model = model.merge_and_unload()
        print(" Model loaded successfully.", flush=True)
        adapter_loaded = True
    except Exception as exc:
        # Deliberate best-effort: keep evaluating with the base model, but
        # remember the failure so the final summary can say so.
        print(f" WARNING: Failed to load LoRA adapter: {exc}", flush=True)
        print(" Using base model without LoRA.", flush=True)
        model = base_model
        adapter_loaded = False

    return model, tokenizer, adapter_loaded


def _print_summary(results, adapter_loaded):
    """Print the per-task results table and the average reward.

    Args:
        results: Mapping of task id to the rollout result; each result is
            indexed by ``total_reward``, ``bugs_found``, ``total_bugs`` and
            ``coverage_pct``.
        adapter_loaded: Whether the LoRA adapter was applied to the model
            that produced ``results``.
    """
    print(f"\n{_RULE}")
    print(" RESULTS")
    print(_RULE)
    print(f"{'Task':<25} {'Reward':<10} {'Bugs':<10} {'Coverage':<10}")
    print("-" * 60)
    for task_id, r in results.items():
        # Build the composite cells first and pad them as a whole, so that
        # "found/total" and "NN.N%" stay aligned under their headers.
        bugs = f"{r['bugs_found']}/{r['total_bugs']}"
        coverage = f"{r['coverage_pct']:.1f}%"
        print(f"{task_id:<25} {r['total_reward']:<10.3f} {bugs:<10} {coverage:<10}")
    print(f"{_RULE}\n")

    avg = sum(r["total_reward"] for r in results.values()) / len(results)
    print(f" Average reward: {avg:.3f}")
    if not adapter_loaded:
        print(" NOTE: LoRA adapter was not loaded; results are for the base model.")


def main():
    """Evaluate the checkpoint on every task and print a summary.

    Parses the command line, loads the base model with the LoRA adapter
    merged in (falling back to the base model alone if the adapter cannot
    be loaded), runs one rollout per task in ``_EVAL_TASKS`` and prints a
    results table followed by the average reward.
    """
    args = _parse_args()

    print(f"\n{_RULE}")
    print(" Re-evaluating trained model")
    print(_RULE)
    print(f" Checkpoint: {args.checkpoint}")
    print(f" Base model: {args.base_model}")
    print(f" Max steps: {args.max_steps}")
    print(f" Seed: {args.seed}")
    print(f"{_RULE}\n")

    model, tokenizer, adapter_loaded = _load_model_and_tokenizer(args)

    # Project-local import, resolved through the sys.path entry added at
    # module import time.
    from training.evaluate import run_rollout

    print(f"\n{_RULE}")
    print(" Running evaluation on all tasks...")
    print(f"{_RULE}\n")

    results = {}
    for task_id in _EVAL_TASKS:
        print(f"\n--- Task: {task_id} ---")
        result = run_rollout(
            model, tokenizer,
            task_id=task_id,
            seed=args.seed,
            max_steps=args.max_steps,
        )
        results[task_id] = result
        print(f" reward={result['total_reward']:.3f}, "
              f"bugs={result['bugs_found']}/{result['total_bugs']}, "
              f"coverage={result['coverage_pct']:.1f}%")

    _print_summary(results, adapter_loaded)


if __name__ == "__main__":
    main()