"""
Stack Doctor — Evaluation Script
Produces the 4 metrics for judges:
1. Root-cause accuracy
2. Fix-family accuracy
3. Average steps to resolution
4. Mean reward before vs after RL
Can evaluate any model (base or fine-tuned) against held-out eval scenarios.
"""
import json
import os
import sys
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_DIR = os.path.dirname(SCRIPT_DIR)
sys.path.insert(0, PROJECT_DIR)
from server.stack_doctor_environment import StackDoctorEnvironment
from server.scenarios import EVAL_SCENARIOS
from models import StackDoctorAction
from training.train_stack_doctor import (
SYSTEM_PROMPT,
format_scenario_prompt,
extract_actions,
)
def evaluate_model(model, tokenizer, scenarios, label="Model"):
    """Run *model* against *scenarios* and compute the four judge metrics.

    For each scenario the model is prompted once; the JSON action list is
    extracted from its response and replayed through a fresh
    ``StackDoctorEnvironment``. The final ``submit`` action (if any) is
    scored against the scenario's ground truth.

    Args:
        model: Causal LM (base or LoRA-adapted) in HF/unsloth format.
        tokenizer: Matching tokenizer; must support ``apply_chat_template``.
        scenarios: Iterable of scenario objects exposing ``id``,
            ``root_cause`` and ``correct_fix``.
        label: Name printed in the results header.

    Returns:
        dict with keys ``rc_accuracy``, ``fix_accuracy``, ``avg_steps``
        and ``avg_reward``.

    Raises:
        ValueError: If *scenarios* is empty — the metric averages would
            otherwise divide by zero.
    """
    from unsloth import FastLanguageModel
    scenarios = list(scenarios)
    if not scenarios:
        raise ValueError("evaluate_model requires at least one scenario")
    FastLanguageModel.for_inference(model)  # switch model to inference mode
    total_rc_correct = 0
    total_fix_correct = 0
    total_steps = 0
    total_reward = 0.0
    n = 0
    for sc in scenarios:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": format_scenario_prompt(sc)},
        ]
        prompt = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False,
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.3,
            do_sample=True,
        )
        # Decode only the newly generated tokens (skip the prompt prefix).
        response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        actions = extract_actions(response)
        if actions is None:
            # Unparseable response: apply the flat penalty, count the episode,
            # and report it so the per-scenario lines match the summary counts
            # (previously these episodes were silently omitted from the log).
            total_reward -= 5.0
            n += 1
            print(f" {sc.id}: rc=FAIL fix=FAIL steps=0 reward=-5.0 (unparseable response)")
            continue
        env = StackDoctorEnvironment()
        env.reset(scenario_id=sc.id)
        cum_reward = 0.0
        steps = 0
        last_submit = None  # most recent "submit" action, scored below
        for action_dict in actions:
            if not isinstance(action_dict, dict):
                continue
            try:
                obs = env.step(StackDoctorAction(message=json.dumps(action_dict)))
                cum_reward += obs.reward
                steps += 1
                if action_dict.get("type") == "submit":
                    last_submit = action_dict
                if obs.done:
                    break
            except Exception:
                # Malformed action or environment failure: stop the episode
                # with whatever reward was accumulated so far (best-effort).
                break
        if last_submit:
            if last_submit.get("root_cause") == sc.root_cause:
                total_rc_correct += 1
            if last_submit.get("fix") == sc.correct_fix:
                total_fix_correct += 1
        total_steps += steps
        total_reward += cum_reward
        n += 1
        print(f" {sc.id}: rc={'OK' if last_submit and last_submit.get('root_cause')==sc.root_cause else 'FAIL'} "
              f"fix={'OK' if last_submit and last_submit.get('fix')==sc.correct_fix else 'FAIL'} "
              f"steps={steps} reward={cum_reward:.1f}")
    print(f"\n{'='*50}")
    print(f"{label} Results ({n} episodes):")
    print(f" Root-cause accuracy: {total_rc_correct/n:.1%}")
    print(f" Fix accuracy: {total_fix_correct/n:.1%}")
    print(f" Avg steps: {total_steps/n:.1f}")
    print(f" Avg reward: {total_reward/n:.1f}")
    print(f"{'='*50}")
    return {
        "rc_accuracy": total_rc_correct / n,
        "fix_accuracy": total_fix_correct / n,
        "avg_steps": total_steps / n,
        "avg_reward": total_reward / n,
    }
def main():
    """CLI entry point: load a model (optionally with a LoRA adapter) and evaluate it."""
    import argparse

    from unsloth import FastLanguageModel

    cli = argparse.ArgumentParser()
    cli.add_argument("--model", default="unsloth/Qwen3-1.7B", help="Model name or path")
    cli.add_argument("--lora", default=None, help="Path to LoRA adapter")
    opts = cli.parse_args()

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=opts.model,
        load_in_4bit=True,
        max_seq_length=2048,
    )
    if opts.lora:
        # Attach the fine-tuned adapter on top of the base weights.
        from peft import PeftModel
        model = PeftModel.from_pretrained(model, opts.lora)

    adapter_suffix = f" + {opts.lora}" if opts.lora else ""
    print(f"Evaluating {opts.model}" + adapter_suffix)
    print(f"Eval scenarios: {len(EVAL_SCENARIOS)}")
    print()
    evaluate_model(model, tokenizer, EVAL_SCENARIOS, label=opts.model)
# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()