"""
Stack Doctor — Evaluation Script

Produces the 4 metrics for judges:
1. Root-cause accuracy
2. Fix-family accuracy
3. Average steps to resolution
4. Mean reward before vs after RL

Can evaluate any model (base or fine-tuned) against held-out eval scenarios.
"""
import json
import os
import sys

# Make the project root importable before pulling in local packages.
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_DIR = os.path.dirname(SCRIPT_DIR)
sys.path.insert(0, PROJECT_DIR)

from server.stack_doctor_environment import StackDoctorEnvironment
from server.scenarios import EVAL_SCENARIOS
from models import StackDoctorAction
from training.train_stack_doctor import (
    SYSTEM_PROMPT,
    format_scenario_prompt,
    extract_actions,
)
def evaluate_model(model, tokenizer, scenarios, label="Model"):
    """Run *model* against each scenario and print/return aggregate metrics.

    Args:
        model: Causal LM (optionally LoRA-wrapped); switched to inference mode here.
        tokenizer: Matching tokenizer; must provide a chat template.
        scenarios: Iterable of eval scenarios — each is expected to expose
            ``.id``, ``.root_cause`` and ``.correct_fix`` (see server.scenarios).
        label: Name printed in the results header.

    Returns:
        dict with keys ``rc_accuracy``, ``fix_accuracy``, ``avg_steps``,
        ``avg_reward`` — all averaged over the number of episodes.

    Raises:
        ValueError: if *scenarios* is empty (metrics would be undefined).
    """
    from unsloth import FastLanguageModel
    FastLanguageModel.for_inference(model)

    total_rc_correct = 0
    total_fix_correct = 0
    total_steps = 0
    total_reward = 0.0
    n = 0

    for sc in scenarios:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": format_scenario_prompt(sc)},
        ]
        prompt = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False,
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.3,
            do_sample=True,
        )
        # Decode only the newly generated tokens (everything past the prompt).
        response = tokenizer.decode(
            outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True
        )

        actions = extract_actions(response)
        if actions is None:
            # Unparseable response: flat penalty; episode still counts toward n.
            total_reward -= 5.0
            n += 1
            continue

        env = StackDoctorEnvironment()
        env.reset(scenario_id=sc.id)
        cum_reward = 0.0
        steps = 0
        last_submit = None
        for action_dict in actions:
            if not isinstance(action_dict, dict):
                continue
            try:
                obs = env.step(StackDoctorAction(message=json.dumps(action_dict)))
                cum_reward += obs.reward
                steps += 1
                if action_dict.get("type") == "submit":
                    last_submit = action_dict
                if obs.done:
                    break
            except Exception:
                # Invalid action mid-episode: abandon the episode, keep partial reward.
                break

        # Grade the final submission (if any) against the scenario's answer key.
        rc_ok = last_submit is not None and last_submit.get("root_cause") == sc.root_cause
        fix_ok = last_submit is not None and last_submit.get("fix") == sc.correct_fix
        if rc_ok:
            total_rc_correct += 1
        if fix_ok:
            total_fix_correct += 1
        total_steps += steps
        total_reward += cum_reward
        n += 1
        print(f" {sc.id}: rc={'OK' if rc_ok else 'FAIL'} "
              f"fix={'OK' if fix_ok else 'FAIL'} "
              f"steps={steps} reward={cum_reward:.1f}")

    if n == 0:
        # Guard: averaging below would raise a bare ZeroDivisionError.
        raise ValueError("evaluate_model received an empty scenario list")

    print(f"\n{'='*50}")
    print(f"{label} Results ({n} episodes):")
    print(f" Root-cause accuracy: {total_rc_correct/n:.1%}")
    print(f" Fix accuracy: {total_fix_correct/n:.1%}")
    print(f" Avg steps: {total_steps/n:.1f}")
    print(f" Avg reward: {total_reward/n:.1f}")
    print(f"{'='*50}")

    return {
        "rc_accuracy": total_rc_correct / n,
        "fix_accuracy": total_fix_correct / n,
        "avg_steps": total_steps / n,
        "avg_reward": total_reward / n,
    }
def main():
    """CLI entry point: load a model (optionally LoRA-adapted) and evaluate it."""
    import argparse
    from unsloth import FastLanguageModel

    cli = argparse.ArgumentParser()
    cli.add_argument("--model", default="unsloth/Qwen3-1.7B", help="Model name or path")
    cli.add_argument("--lora", default=None, help="Path to LoRA adapter")
    opts = cli.parse_args()

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=opts.model,
        load_in_4bit=True,
        max_seq_length=2048,
    )
    if opts.lora:
        # Lazily import peft — only needed when an adapter is supplied.
        from peft import PeftModel
        model = PeftModel.from_pretrained(model, opts.lora)

    suffix = f" + {opts.lora}" if opts.lora else ""
    print(f"Evaluating {opts.model}{suffix}")
    print(f"Eval scenarios: {len(EVAL_SCENARIOS)}")
    print()
    evaluate_model(model, tokenizer, EVAL_SCENARIOS, label=opts.model)
# Script entry point: `python evaluate.py --model NAME [--lora PATH]`.
if __name__ == "__main__":
    main()