"""
Stack Doctor — Evaluation Script

Produces the 4 metrics for judges:
1. Root-cause accuracy
2. Fix-family accuracy
3. Average steps to resolution
4. Mean reward before vs after RL

Can evaluate any model (base or fine-tuned) against held-out eval scenarios.
"""

import json
import os
import sys

SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
PROJECT_DIR = os.path.dirname(SCRIPT_DIR)
sys.path.insert(0, PROJECT_DIR)

from server.stack_doctor_environment import StackDoctorEnvironment
from server.scenarios import EVAL_SCENARIOS
from models import StackDoctorAction
from training.train_stack_doctor import (
    SYSTEM_PROMPT,
    format_scenario_prompt,
    extract_actions,
)


def evaluate_model(model, tokenizer, scenarios, label="Model"):
    """Run *model* against *scenarios* and compute the four judge metrics.

    Args:
        model: Causal LM loaded via Unsloth (optionally LoRA-augmented).
        tokenizer: Matching tokenizer; must support ``apply_chat_template``.
        scenarios: Iterable of scenario objects exposing ``id``,
            ``root_cause`` and ``correct_fix`` attributes.
        label: Human-readable name used in the printed report.

    Returns:
        Dict with ``rc_accuracy``, ``fix_accuracy``, ``avg_steps`` and
        ``avg_reward``. All four are 0.0 when *scenarios* is empty
        (previously this raised ZeroDivisionError).
    """
    from unsloth import FastLanguageModel
    FastLanguageModel.for_inference(model)

    total_rc_correct = 0
    total_fix_correct = 0
    total_steps = 0
    total_reward = 0.0
    n = 0

    for sc in scenarios:
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": format_scenario_prompt(sc)},
        ]
        prompt = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=False,
        )
        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
        outputs = model.generate(
            **inputs,
            max_new_tokens=512,
            temperature=0.3,
            do_sample=True,
        )
        # Decode only the newly generated tokens, not the prompt.
        response = tokenizer.decode(outputs[0][inputs.input_ids.shape[1]:], skip_special_tokens=True)

        actions = extract_actions(response)
        if actions is None:
            # Unparseable output: flat penalty, episode still counts toward n.
            total_reward -= 5.0
            n += 1
            continue

        env = StackDoctorEnvironment()
        env.reset(scenario_id=sc.id)

        cum_reward = 0.0
        steps = 0
        last_submit = None  # most recent "submit" action, used for scoring

        for action_dict in actions:
            if not isinstance(action_dict, dict):
                continue
            try:
                obs = env.step(StackDoctorAction(message=json.dumps(action_dict)))
                cum_reward += obs.reward
                steps += 1
                if action_dict.get("type") == "submit":
                    last_submit = action_dict
                if obs.done:
                    break
            except Exception:
                # Environment rejected the action; stop this episode but keep
                # the reward accumulated so far.
                break

        if last_submit:
            if last_submit.get("root_cause") == sc.root_cause:
                total_rc_correct += 1
            if last_submit.get("fix") == sc.correct_fix:
                total_fix_correct += 1

        total_steps += steps
        total_reward += cum_reward
        n += 1

        print(f"  {sc.id}: rc={'OK' if last_submit and last_submit.get('root_cause')==sc.root_cause else 'FAIL'} "
              f"fix={'OK' if last_submit and last_submit.get('fix')==sc.correct_fix else 'FAIL'} "
              f"steps={steps} reward={cum_reward:.1f}")

    if n == 0:
        # Guard the divisions below against an empty scenario list.
        print(f"\n{label}: no scenarios to evaluate.")
        return {
            "rc_accuracy": 0.0,
            "fix_accuracy": 0.0,
            "avg_steps": 0.0,
            "avg_reward": 0.0,
        }

    print(f"\n{'='*50}")
    print(f"{label} Results ({n} episodes):")
    print(f"  Root-cause accuracy: {total_rc_correct/n:.1%}")
    print(f"  Fix accuracy:        {total_fix_correct/n:.1%}")
    print(f"  Avg steps:           {total_steps/n:.1f}")
    print(f"  Avg reward:          {total_reward/n:.1f}")
    print(f"{'='*50}")

    return {
        "rc_accuracy": total_rc_correct / n,
        "fix_accuracy": total_fix_correct / n,
        "avg_steps": total_steps / n,
        "avg_reward": total_reward / n,
    }


def main():
    """CLI entry point: load a model (optionally applying a LoRA adapter)
    and evaluate it on the held-out EVAL_SCENARIOS."""
    import argparse

    from unsloth import FastLanguageModel

    cli = argparse.ArgumentParser()
    cli.add_argument("--model", default="unsloth/Qwen3-1.7B", help="Model name or path")
    cli.add_argument("--lora", default=None, help="Path to LoRA adapter")
    opts = cli.parse_args()

    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=opts.model,
        load_in_4bit=True,
        max_seq_length=2048,
    )

    # Optionally stack a fine-tuned LoRA adapter on the base model.
    if opts.lora:
        from peft import PeftModel
        model = PeftModel.from_pretrained(model, opts.lora)

    suffix = f" + {opts.lora}" if opts.lora else ""
    print(f"Evaluating {opts.model}{suffix}")
    print(f"Eval scenarios: {len(EVAL_SCENARIOS)}")
    print()

    evaluate_model(model, tokenizer, EVAL_SCENARIOS, label=opts.model)


# Script entry point — run the evaluation when invoked from the command line.
if __name__ == "__main__":
    main()