# stack_doctor/training/eval_stack_doctor.py
# (Hugging Face upload metadata preserved as a comment: uploaded by bledden
# via huggingface_hub, commit 8b92d51, verified.)
"""
Stack Doctor — Evaluation Script
Produces the 4 metrics for judges:
1. Root-cause accuracy
2. Fix-family accuracy
3. Average steps to resolution
4. Mean reward before vs after RL
Can evaluate any model (base or fine-tuned) against held-out eval scenarios.
"""
import json
import os
import sys
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_DIR = os.path.dirname(SCRIPT_DIR)
sys.path.insert(0, PROJECT_DIR)
from server.stack_doctor_environment import StackDoctorEnvironment
from server.scenarios import EVAL_SCENARIOS
from models import StackDoctorAction
from training.train_stack_doctor import (
SYSTEM_PROMPT,
format_scenario_prompt,
extract_actions,
)
def evaluate_model(model, tokenizer, scenarios, label="Model"):
    """Run *model* against *scenarios* and compute the four judge metrics.

    For each scenario the model is prompted once, its response is parsed
    into a list of environment actions, and those actions are replayed
    through a fresh StackDoctorEnvironment to accumulate reward.

    Args:
        model: Causal LM (switched into inference mode here via unsloth).
        tokenizer: Matching tokenizer providing a chat template.
        scenarios: Iterable of eval scenarios; each is expected to expose
            ``.id``, ``.root_cause`` and ``.correct_fix``.
        label: Name printed in the results header.

    Returns:
        dict with keys ``rc_accuracy``, ``fix_accuracy``, ``avg_steps``,
        ``avg_reward``.

    Raises:
        ValueError: if *scenarios* is empty — previously this crashed with
            ZeroDivisionError when computing the averages.
    """
    from unsloth import FastLanguageModel
    FastLanguageModel.for_inference(model)
    total_rc_correct = 0
    total_fix_correct = 0
    total_steps = 0
    total_reward = 0.0
    n = 0
    for sc in scenarios:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": format_scenario_prompt(sc)},
        ]
        prompt = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False,
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.3,
            do_sample=True,
        )
        # Decode only the newly generated tokens (slice off the prompt).
        response = tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
        )
        actions = extract_actions(response)
        if actions is None:
            # Unparseable model output: fixed penalty; the episode still
            # counts toward n so the failure drags down the averages.
            total_reward -= 5.0
            n += 1
            continue
        env = StackDoctorEnvironment()
        env.reset(scenario_id=sc.id)
        cum_reward = 0.0
        steps = 0
        last_submit = None  # most recent "submit" action, judged at episode end
        for action_dict in actions:
            if not isinstance(action_dict, dict):
                continue
            try:
                obs = env.step(StackDoctorAction(message=json.dumps(action_dict)))
                cum_reward += obs.reward
                steps += 1
                if action_dict.get("type") == "submit":
                    last_submit = action_dict
                if obs.done:
                    break
            except Exception:
                # Environment rejected the action; end this episode early.
                break
        # Judge the final submission (if any) against ground truth.
        rc_ok = last_submit is not None and last_submit.get("root_cause") == sc.root_cause
        fix_ok = last_submit is not None and last_submit.get("fix") == sc.correct_fix
        if rc_ok:
            total_rc_correct += 1
        if fix_ok:
            total_fix_correct += 1
        total_steps += steps
        total_reward += cum_reward
        n += 1
        print(f" {sc.id}: rc={'OK' if rc_ok else 'FAIL'} "
              f"fix={'OK' if fix_ok else 'FAIL'} "
              f"steps={steps} reward={cum_reward:.1f}")
    if n == 0:
        # Guard the four divisions below.
        raise ValueError("evaluate_model: received no scenarios to evaluate")
    print(f"\n{'='*50}")
    print(f"{label} Results ({n} episodes):")
    print(f" Root-cause accuracy: {total_rc_correct/n:.1%}")
    print(f" Fix accuracy: {total_fix_correct/n:.1%}")
    print(f" Avg steps: {total_steps/n:.1f}")
    print(f" Avg reward: {total_reward/n:.1f}")
    print(f"{'='*50}")
    return {
        "rc_accuracy": total_rc_correct / n,
        "fix_accuracy": total_fix_correct / n,
        "avg_steps": total_steps / n,
        "avg_reward": total_reward / n,
    }
def main():
    """CLI entry point: load a model (optionally with a LoRA adapter) and
    evaluate it on the held-out EVAL_SCENARIOS."""
    import argparse

    from unsloth import FastLanguageModel

    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--model", default="unsloth/Qwen3-1.7B", help="Model name or path")
    arg_parser.add_argument("--lora", default=None, help="Path to LoRA adapter")
    opts = arg_parser.parse_args()

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=opts.model,
        load_in_4bit=True,
        max_seq_length=2048,
    )
    if opts.lora:
        # Apply the fine-tuned adapter on top of the base weights.
        from peft import PeftModel
        model = PeftModel.from_pretrained(model, opts.lora)

    lora_suffix = f" + {opts.lora}" if opts.lora else ""
    print(f"Evaluating {opts.model}" + lora_suffix)
    print(f"Eval scenarios: {len(EVAL_SCENARIOS)}")
    print()
    evaluate_model(model, tokenizer, EVAL_SCENARIOS, label=opts.model)
# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()