# api-testing-env / eval_trained.py
# Uploaded by Mayank022 via huggingface_hub (commit a4f74f3, verified)
#!/usr/bin/env python3
"""
Re-evaluate the trained GRPO model without re-training.
Usage:
python eval_trained.py
python eval_trained.py --checkpoint ./checkpoints/grpo_api_tester
"""
import argparse
import os
import sys

# Make sibling modules (e.g. training.*) importable when run as a script.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import logging

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

# Quiet down chatty third-party libraries so INFO output stays readable.
for _lib in ("httpx", "httpcore", "urllib3", "huggingface_hub", "filelock"):
    logging.getLogger(_lib).setLevel(logging.WARNING)

logger = logging.getLogger(__name__)
def _parse_args():
    """Parse command-line options for the evaluation run.

    Returns an argparse.Namespace with checkpoint, base_model,
    max_steps and seed attributes.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--checkpoint",
        default="./checkpoints/grpo_api_tester",
        help="Path to the trained model checkpoint",
    )
    parser.add_argument(
        "--base-model",
        default="Qwen/Qwen3-1.7B",
        help="Base model (needed if checkpoint is LoRA-only)",
    )
    parser.add_argument(
        "--max-steps",
        type=int,
        default=25,
        help="Max actions per task during evaluation",
    )
    parser.add_argument(
        "--seed",
        type=int,
        default=9999,
        help="Random seed for evaluation",
    )
    return parser.parse_args()


def _load_model(args):
    """Load tokenizer and base model, then try to apply the LoRA adapter.

    Falls back to the plain base model if the adapter cannot be loaded
    (best-effort, mirroring the original behavior). Returns
    (model, tokenizer).
    """
    # Heavy imports kept local so --help stays fast.
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer
    from peft import PeftModel

    # Only dtype depends on hardware here; actual placement is delegated
    # to device_map="auto" below (the original also assigned an unused
    # `device` variable — removed).
    if torch.cuda.is_available():
        dtype = torch.bfloat16
        print(f" GPU: {torch.cuda.get_device_name(0)}")
    else:
        dtype = torch.float32
        print(" WARNING: No GPU — eval will be slow")

    # Tokenizer from the base model works for LoRA-only checkpoints too.
    print(f" Loading tokenizer from {args.base_model}...", flush=True)
    tokenizer = AutoTokenizer.from_pretrained(args.base_model, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    print(f" Loading base model {args.base_model}...", flush=True)
    base_model = AutoModelForCausalLM.from_pretrained(
        args.base_model,
        trust_remote_code=True,
        torch_dtype=dtype,
        device_map="auto",
    )

    print(f" Loading LoRA adapter from {args.checkpoint}...", flush=True)
    try:
        model = PeftModel.from_pretrained(base_model, args.checkpoint)
        # Merge LoRA weights into the base weights for faster inference.
        print(" Merging LoRA into base...", flush=True)
        model = model.merge_and_unload()
        print(" Model loaded successfully.", flush=True)
    except Exception as exc:
        # Deliberate best-effort: evaluating the raw base model is still
        # meaningful, so warn and continue rather than abort.
        print(f" WARNING: Failed to load LoRA adapter: {exc}", flush=True)
        print(" Using base model without LoRA.", flush=True)
        model = base_model
    return model, tokenizer


def _print_summary(results):
    """Print the per-task results table and the average reward.

    `results` maps task_id -> rollout result dict with total_reward,
    bugs_found, total_bugs and coverage_pct keys.
    """
    print(f"\n{'=' * 60}")
    print(" RESULTS")
    print(f"{'=' * 60}")
    print(f"{'Task':<25} {'Reward':<10} {'Bugs':<10} {'Coverage':<10}")
    print(f"{'-' * 60}")
    for task_id, r in results.items():
        # Build the composite "found/total" string first, then pad it as a
        # whole: the original padded only the denominator, so the Bugs
        # column drifted whenever bugs_found changed digit count.
        bugs = f"{r['bugs_found']}/{r['total_bugs']}"
        coverage = f"{r['coverage_pct']:.1f}%"
        print(f"{task_id:<25} {r['total_reward']:<10.3f} {bugs:<10} {coverage:<10}")
    print(f"{'=' * 60}\n")
    avg = sum(r["total_reward"] for r in results.values()) / len(results)
    print(f" Average reward: {avg:.3f}")


def main():
    """Re-evaluate a trained GRPO checkpoint on all three tasks."""
    args = _parse_args()

    print(f"\n{'=' * 60}")
    print(" Re-evaluating trained model")
    print(f"{'=' * 60}")
    print(f" Checkpoint: {args.checkpoint}")
    print(f" Base model: {args.base_model}")
    print(f" Max steps: {args.max_steps}")
    print(f" Seed: {args.seed}")
    print(f"{'=' * 60}\n")

    model, tokenizer = _load_model(args)

    # Project import deferred until the model is ready (matches original).
    from training.evaluate import run_rollout

    print(f"\n{'=' * 60}")
    print(" Running evaluation on all tasks...")
    print(f"{'=' * 60}\n")
    results = {}
    for task_id in ["basic_validation", "edge_cases", "security_workflows"]:
        print(f"\n--- Task: {task_id} ---")
        result = run_rollout(
            model, tokenizer,
            task_id=task_id,
            seed=args.seed,
            max_steps=args.max_steps,
        )
        results[task_id] = result
        print(f" reward={result['total_reward']:.3f}, "
              f"bugs={result['bugs_found']}/{result['total_bugs']}, "
              f"coverage={result['coverage_pct']:.1f}%")

    _print_summary(results)
# Run only when executed as a script; importing this module stays side-effect
# free beyond logging configuration.
if __name__ == "__main__":
    main()