File size: 9,515 Bytes
c8055f7
 
 
 
 
 
f4e9a2f
c8055f7
 
7d1d321
 
 
 
b5e4366
 
7d1d321
8457788
 
 
7d1d321
c8055f7
 
8457788
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bd351d2
 
 
 
 
 
 
 
 
 
 
 
c8055f7
8457788
c8055f7
 
7d1d321
 
 
 
 
 
 
 
 
 
c8055f7
7c8120d
 
 
 
 
 
8457788
 
 
 
bd351d2
8457788
 
 
 
 
 
 
 
 
 
 
bd351d2
8457788
 
bd351d2
8457788
 
bd351d2
8457788
c8055f7
8457788
 
 
c8055f7
8457788
 
 
bfb16e5
 
 
8457788
bfb16e5
8457788
 
 
bfb16e5
8457788
 
 
bfb16e5
8457788
bd351d2
c8055f7
8457788
c8055f7
8457788
bd351d2
c8055f7
 
8457788
 
bd351d2
7d1d321
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5e4366
c8055f7
 
 
 
 
 
 
 
 
8652b1a
8457788
8652b1a
 
8457788
8652b1a
 
 
 
bd351d2
8652b1a
8457788
 
 
f4e9a2f
8457788
 
f4e9a2f
 
 
 
 
 
8457788
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f4e9a2f
c8055f7
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
from __future__ import annotations

import copy
import json
import types
import unittest
from pathlib import Path
from unittest.mock import patch

from analyzer import analyze_trace_file
from model_runtime import (
    MODEL_CHOICES,
    MODEL_MAX_NEW_TOKENS,
    PRIMARY_MODEL_ID,
    QUICK_MODEL_ID,
    _chat_template_kwargs,
    _prepare_generation_inputs,
    parse_analysis_json,
    resolve_device,
    run_model_analysis,
)


# Canned analysis payload shared by the tests in this module: it is what
# RecordingGenerator returns (serialized), what the parse_analysis_json tests
# feed in, and what the patched run_model_analysis hands back to the analyzer.
# It carries the three top-level keys used here ("verdict", "overall_patterns",
# "episodes") with a single episode.
ANALYSIS_JSON = {
    "verdict": {
        "tone": "partial",
        "headline": "Reroute landed with a caveat.",
        "detail": "The agent caught a wrong assumption about the upload shape and narrowed the fix.",
        "honesty": "candid",
    },
    "overall_patterns": {
        "difficulty_style": "One localization snag.",
        "detour_style": "A productive narrowing.",
        "recovery_style": "Reflective.",
        "risk_or_caveat": "Deployment path left unverified.",
    },
    "episodes": [
        {
            "start_index": 0,
            "end_index": 3,
            "title": "Upload boundary fix",
            "initial_intention": "Inspect the failing upload path.",
            "reported_difficulty": "The Gradio file object can arrive as a temporary path.",
            "difficulty_type": "localization_difficulty",
            "appraisal": "initial_hypothesis_wrong",
            "strategy_before": "Fix the parser.",
            "strategy_after": "Narrow the fix to the upload boundary.",
            "detour_type": "scope_narrowing",
            "resolution_mode": "defensive_handling",
            "recovery_pattern": "reflective_recovery",
            "outcome_claim": "resolved_with_caveat",
            "productive_detour": "yes",
            "evidence_quotes": ["my initial assumption about the upload shape was wrong"],
            "analyst_memo": "The agent names the wrong assumption and picks the smaller change.",
        }
    ],
}


class RecordingGenerator:
    """Fake generator callable that logs each invocation and replies with canned JSON.

    Used in place of the local GPU generator so tests can inspect what was
    requested without running a model.
    """

    def __init__(self) -> None:
        # One dict per invocation, in call order.
        self.calls: list[dict] = []

    def __call__(self, messages, *, model_id, max_new_tokens) -> str:
        """Record the call arguments, then return the serialized fixture."""
        record = dict(
            messages=messages,
            model_id=model_id,
            max_new_tokens=max_new_tokens,
        )
        self.calls.append(record)
        # The reply is the same fixture regardless of what was asked.
        return json.dumps(ANALYSIS_JSON)


class FakeTensor:
    """Minimal tensor double exposing only ``shape``, ``device`` and ``to()``."""

    def __init__(self, shape: tuple[int, ...]) -> None:
        # No device is assigned until ``to()`` is called.
        self.device = None
        self.shape = shape

    def to(self, device: str) -> "FakeTensor":
        """Remember ``device`` as the current placement and return this same object."""
        self.device = device
        return self


class ModelRuntimeTests(unittest.TestCase):
    """Tests for the ``model_runtime`` helpers and how ``analyzer`` consumes them."""

    # Redacted sample trace shared by the analyzer tests. The path is relative,
    # so it is resolved against the working directory the tests are run from.
    SAMPLE_TRACE = Path("examples/sample_trace_redacted.jsonl")

    def test_nemotron_label_does_not_call_it_small(self) -> None:
        """The Nemotron label names the full model and never says "small"."""
        label = str(MODEL_CHOICES["nemotron"]["label"])

        self.assertIn("NVIDIA Nemotron 3 Nano 30B-A3B", label)
        self.assertNotIn("small", label.lower())

    def test_minicpm_is_the_quick_engine(self) -> None:
        """The "minicpm" choice maps to QUICK_MODEL_ID and no "qwen" choice exists."""
        self.assertEqual(MODEL_CHOICES["minicpm"]["model_id"], QUICK_MODEL_ID)
        self.assertIn("MiniCPM5 1B", str(MODEL_CHOICES["minicpm"]["label"]))
        self.assertNotIn("qwen", MODEL_CHOICES)

    def test_minicpm_chat_template_disables_thinking(self) -> None:
        """Only the quick model gets ``enable_thinking=False`` template kwargs."""
        self.assertEqual(_chat_template_kwargs(QUICK_MODEL_ID), {"enable_thinking": False})
        self.assertEqual(_chat_template_kwargs(PRIMARY_MODEL_ID), {})

    def test_resolve_device_honors_explicit_override(self) -> None:
        """An explicitly requested device name is returned unchanged."""
        # subTest reports each device separately instead of stopping at the
        # first failing assertion.
        for device in ("cpu", "cuda", "mps"):
            with self.subTest(device=device):
                self.assertEqual(resolve_device(device), device)

    def test_parse_analysis_json_validates_shape(self) -> None:
        """A clean JSON payload parses into the expected structure."""
        parsed = parse_analysis_json(json.dumps(ANALYSIS_JSON))

        self.assertEqual(len(parsed["episodes"]), 1)
        self.assertEqual(parsed["verdict"]["tone"], "partial")

    def test_parse_analysis_json_recovers_from_code_fence(self) -> None:
        """A payload wrapped in a Markdown ```json fence still parses."""
        parsed = parse_analysis_json("```json\n" + json.dumps(ANALYSIS_JSON) + "\n```")

        self.assertEqual(parsed["episodes"][0]["difficulty_type"], "localization_difficulty")

    def test_parse_analysis_json_extracts_object_from_prose(self) -> None:
        """A payload surrounded by leading and trailing prose still parses."""
        raw = "Here is the report:\n" + json.dumps(ANALYSIS_JSON) + "\nDone."
        parsed = parse_analysis_json(raw)

        self.assertEqual(parsed["verdict"]["honesty"], "candid")

    def test_parse_analysis_json_uses_final_object_after_thinking_braces(self) -> None:
        """Braces inside a ``<think>`` preamble do not shadow the final object."""
        raw = (
            "<think>Draft {not json} and a scratch object "
            '{"draft": "ignore this"} before the final answer.</think>\n'
            + json.dumps(ANALYSIS_JSON)
        )
        parsed = parse_analysis_json(raw)

        self.assertEqual(len(parsed["episodes"]), 1)

    def test_parse_analysis_json_requires_episodes_list(self) -> None:
        """A payload without an "episodes" entry is rejected with ValueError."""
        with self.assertRaises(ValueError):
            parse_analysis_json(json.dumps({"verdict": {}, "overall_patterns": {}}))

    def test_run_model_analysis_uses_selected_model(self) -> None:
        """The "nemotron" engine calls the generator with the primary model settings."""
        generate = RecordingGenerator()

        produced = run_model_analysis(
            engine="nemotron",
            numbered_narrative="[0] assistant 10:00: hello",
            generate=generate,
        )

        self.assertEqual(produced.model_id, PRIMARY_MODEL_ID)
        self.assertEqual(len(produced.analysis["episodes"]), 1)
        self.assertEqual(generate.calls[0]["model_id"], PRIMARY_MODEL_ID)
        self.assertEqual(generate.calls[0]["max_new_tokens"], MODEL_MAX_NEW_TOKENS)

    def test_prepare_generation_inputs_accepts_tensor_output(self) -> None:
        """A bare tensor is wrapped under "inputs" and moved to the target device."""
        tensor = FakeTensor((1, 12))

        generation_inputs, prompt_tokens = _prepare_generation_inputs(tensor, device="cuda")

        self.assertEqual(generation_inputs, {"inputs": tensor})
        self.assertEqual(prompt_tokens, 12)
        self.assertEqual(tensor.device, "cuda")

    def test_prepare_generation_inputs_expands_batch_encoding_output(self) -> None:
        """A mapping of tensors keeps its keys and reports the prompt length."""
        input_ids = FakeTensor((1, 21))
        attention_mask = FakeTensor((1, 21))

        generation_inputs, prompt_tokens = _prepare_generation_inputs(
            {"input_ids": input_ids, "attention_mask": attention_mask},
            device="cuda",
        )

        self.assertEqual(generation_inputs["input_ids"], input_ids)
        self.assertEqual(generation_inputs["attention_mask"], attention_mask)
        self.assertEqual(prompt_tokens, 21)

    def test_analyzer_records_unknown_engine_note(self) -> None:
        """An unrecognised engine name produces an explanatory model note."""
        result, _ = analyze_trace_file(
            self.SAMPLE_TRACE,
            analysis_engine="missing-engine",
        )

        self.assertTrue(result.model_notes)
        self.assertIn("Unknown analysis engine", result.model_notes[0])

    def test_analyzer_model_error_note_avoids_double_period(self) -> None:
        """An error message already ending in "." is not given a second period."""
        with patch("analyzer.run_model_analysis", side_effect=ValueError("model unavailable.")):
            result, _ = analyze_trace_file(
                self.SAMPLE_TRACE,
                analysis_engine="minicpm",
            )

        self.assertTrue(result.model_notes)
        self.assertNotIn("..", result.model_notes[0])
        self.assertIn("ValueError: model unavailable.", result.model_notes[0])

    def test_analyzer_replaces_analysis_on_model_success(self) -> None:
        """A successful model run supplies the engine, verdict and episodes."""
        with patch("analyzer.run_model_analysis") as run:
            run.return_value = types.SimpleNamespace(
                model_id=PRIMARY_MODEL_ID,
                # Deep copy: a shallow dict() would still share the nested
                # verdict/episode objects with the module-level fixture, so any
                # in-place change by the analyzer would leak into other tests.
                analysis=copy.deepcopy(ANALYSIS_JSON),
                note=f"Analysis produced by {PRIMARY_MODEL_ID}.",
            )
            result, _ = analyze_trace_file(
                self.SAMPLE_TRACE,
                analysis_engine="nemotron",
            )

        self.assertEqual(result.engine, PRIMARY_MODEL_ID)
        self.assertEqual(result.session_verdict["tone"], "partial")
        self.assertEqual(result.episodes[0].episode_id, "E01")
        self.assertEqual(result.episodes[0].difficulty_type, "localization_difficulty")

    def test_analyzer_strips_placeholder_echoes(self) -> None:
        """Placeholder strings echoed back by the model are dropped or replaced."""
        # Payload in which several fields contain placeholder-style text
        # ("<= 12 words", "1-3 sentences", ...) instead of real content.
        bad = {
            "verdict": {
                "tone": "stable",
                "headline": "<= 12 words",
                "detail": "2-4 sentences",
                "honesty": "candid",
            },
            "overall_patterns": {},
            "episodes": [
                {
                    "start_index": 0,
                    "end_index": 0,
                    "title": "<= 10 words",
                    "reported_difficulty": "The build failed.",
                    "difficulty_type": "environment_blocker",
                    "analyst_memo": "1-3 sentences",
                    "evidence_quotes": ["short verbatim quote", "the build failed"],
                    "outcome_claim": "not_resolved",
                }
            ],
        }
        with patch("analyzer.run_model_analysis") as run:
            run.return_value = types.SimpleNamespace(model_id=QUICK_MODEL_ID, analysis=bad, note="ok")
            result, _ = analyze_trace_file(self.SAMPLE_TRACE, analysis_engine="minicpm")

        episode = result.episodes[0]
        self.assertEqual(episode.title, "The build failed.")  # placeholder -> reported_difficulty
        self.assertEqual(episode.analyst_memo, "")  # "1-3 sentences" stripped
        self.assertEqual(episode.evidence_quotes, ["the build failed"])  # placeholder quote dropped
        self.assertNotIn("<", result.session_verdict["headline"])


# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()