"""
Stack Doctor — Evaluation Script
Produces the 4 metrics for judges:
1. Root-cause accuracy
2. Fix-family accuracy
3. Average steps to resolution
4. Mean reward before vs after RL
Can evaluate any model (base or fine-tuned) against held-out eval scenarios.
"""
import json
import os
import sys
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_DIR = os.path.dirname(SCRIPT_DIR)
sys.path.insert(0, PROJECT_DIR)
from server.stack_doctor_environment import StackDoctorEnvironment
from server.scenarios import EVAL_SCENARIOS
from models import StackDoctorAction
from training.train_stack_doctor import (
SYSTEM_PROMPT,
format_scenario_prompt,
extract_actions,
)
def evaluate_model(model, tokenizer, scenarios, label="Model"):
    """Run *model* against *scenarios* and compute the four judge metrics.

    For each scenario the model is prompted once; the JSON action list is
    extracted from its response and replayed through a fresh
    ``StackDoctorEnvironment``. The final ``submit`` action (if any) is
    scored against the scenario's ground truth.

    Args:
        model: Causal LM (base or LoRA-adapted) in HF/unsloth format.
        tokenizer: Matching tokenizer; must support ``apply_chat_template``.
        scenarios: Iterable of scenario objects exposing ``id``,
            ``root_cause`` and ``correct_fix``.
        label: Name printed in the results header.

    Returns:
        dict with keys ``rc_accuracy``, ``fix_accuracy``, ``avg_steps``
        and ``avg_reward``.

    Raises:
        ValueError: If *scenarios* is empty — the metric averages would
            otherwise divide by zero.
    """
    from unsloth import FastLanguageModel
    scenarios = list(scenarios)
    if not scenarios:
        raise ValueError("evaluate_model requires at least one scenario")
    FastLanguageModel.for_inference(model)  # switch model to inference mode
    total_rc_correct = 0
    total_fix_correct = 0
    total_steps = 0
    total_reward = 0.0
    n = 0
    for sc in scenarios:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": format_scenario_prompt(sc)},
        ]
        prompt = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False,
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.3,
            do_sample=True,
        )
        # Decode only the newly generated tokens (skip the prompt prefix).
        response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)
        actions = extract_actions(response)
        if actions is None:
            # Unparseable response: apply the flat penalty, count the episode,
            # and report it so the per-scenario lines match the summary counts
            # (previously these episodes were silently omitted from the log).
            total_reward -= 5.0
            n += 1
            print(f" {sc.id}: rc=FAIL fix=FAIL steps=0 reward=-5.0 (unparseable response)")
            continue
        env = StackDoctorEnvironment()
        env.reset(scenario_id=sc.id)
        cum_reward = 0.0
        steps = 0
        last_submit = None  # most recent "submit" action, scored below
        for action_dict in actions:
            if not isinstance(action_dict, dict):
                continue
            try:
                obs = env.step(StackDoctorAction(message=json.dumps(action_dict)))
                cum_reward += obs.reward
                steps += 1
                if action_dict.get("type") == "submit":
                    last_submit = action_dict
                if obs.done:
                    break
            except Exception:
                # Malformed action or environment failure: stop the episode
                # with whatever reward was accumulated so far (best-effort).
                break
        if last_submit:
            if last_submit.get("root_cause") == sc.root_cause:
                total_rc_correct += 1
            if last_submit.get("fix") == sc.correct_fix:
                total_fix_correct += 1
        total_steps += steps
        total_reward += cum_reward
        n += 1
        print(f" {sc.id}: rc={'OK' if last_submit and last_submit.get('root_cause')==sc.root_cause else 'FAIL'} "
              f"fix={'OK' if last_submit and last_submit.get('fix')==sc.correct_fix else 'FAIL'} "
              f"steps={steps} reward={cum_reward:.1f}")
    print(f"\n{'='*50}")
    print(f"{label} Results ({n} episodes):")
    print(f" Root-cause accuracy: {total_rc_correct/n:.1%}")
    print(f" Fix accuracy: {total_fix_correct/n:.1%}")
    print(f" Avg steps: {total_steps/n:.1f}")
    print(f" Avg reward: {total_reward/n:.1f}")
    print(f"{'='*50}")
    return {
        "rc_accuracy": total_rc_correct / n,
        "fix_accuracy": total_fix_correct / n,
        "avg_steps": total_steps / n,
        "avg_reward": total_reward / n,
    }
def main():
    """CLI entry point: load a model (optionally with a LoRA adapter) and evaluate it."""
    import argparse

    from unsloth import FastLanguageModel

    cli = argparse.ArgumentParser()
    cli.add_argument("--model", default="unsloth/Qwen3-1.7B", help="Model name or path")
    cli.add_argument("--lora", default=None, help="Path to LoRA adapter")
    opts = cli.parse_args()

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=opts.model,
        load_in_4bit=True,
        max_seq_length=2048,
    )
    if opts.lora:
        # Attach the fine-tuned adapter on top of the base weights.
        from peft import PeftModel
        model = PeftModel.from_pretrained(model, opts.lora)

    adapter_suffix = f" + {opts.lora}" if opts.lora else ""
    print(f"Evaluating {opts.model}" + adapter_suffix)
    print(f"Eval scenarios: {len(EVAL_SCENARIOS)}")
    print()
    evaluate_model(model, tokenizer, EVAL_SCENARIOS, label=opts.model)
# Run the evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()