trace-field-notes / tests /test_model_runtime.py
JacobLinCool's picture
feat: add privacy filtering and execution modes
8457788 verified
Raw
History Blame Contribute Delete
9.52 kB
from __future__ import annotations
import json
import types
import unittest
from pathlib import Path
from unittest.mock import patch
from analyzer import analyze_trace_file
from model_runtime import (
MODEL_CHOICES,
MODEL_MAX_NEW_TOKENS,
PRIMARY_MODEL_ID,
QUICK_MODEL_ID,
_chat_template_kwargs,
_prepare_generation_inputs,
parse_analysis_json,
resolve_device,
run_model_analysis,
)
ANALYSIS_JSON = {
"verdict": {
"tone": "partial",
"headline": "Reroute landed with a caveat.",
"detail": "The agent caught a wrong assumption about the upload shape and narrowed the fix.",
"honesty": "candid",
},
"overall_patterns": {
"difficulty_style": "One localization snag.",
"detour_style": "A productive narrowing.",
"recovery_style": "Reflective.",
"risk_or_caveat": "Deployment path left unverified.",
},
"episodes": [
{
"start_index": 0,
"end_index": 3,
"title": "Upload boundary fix",
"initial_intention": "Inspect the failing upload path.",
"reported_difficulty": "The Gradio file object can arrive as a temporary path.",
"difficulty_type": "localization_difficulty",
"appraisal": "initial_hypothesis_wrong",
"strategy_before": "Fix the parser.",
"strategy_after": "Narrow the fix to the upload boundary.",
"detour_type": "scope_narrowing",
"resolution_mode": "defensive_handling",
"recovery_pattern": "reflective_recovery",
"outcome_claim": "resolved_with_caveat",
"productive_detour": "yes",
"evidence_quotes": ["my initial assumption about the upload shape was wrong"],
"analyst_memo": "The agent names the wrong assumption and picks the smaller change.",
}
],
}
class RecordingGenerator:
"""Stand-in for the local GPU generator that records its call arguments."""
def __init__(self) -> None:
self.calls: list[dict] = []
def __call__(self, messages, *, model_id, max_new_tokens) -> str:
self.calls.append(
{"messages": messages, "model_id": model_id, "max_new_tokens": max_new_tokens}
)
return json.dumps(ANALYSIS_JSON)
class FakeTensor:
def __init__(self, shape: tuple[int, ...]) -> None:
self.shape = shape
self.device = None
def to(self, device: str) -> "FakeTensor":
self.device = device
return self
class ModelRuntimeTests(unittest.TestCase):
def test_nemotron_label_does_not_call_it_small(self) -> None:
label = str(MODEL_CHOICES["nemotron"]["label"])
self.assertIn("NVIDIA Nemotron 3 Nano 30B-A3B", label)
self.assertNotIn("small", label.lower())
def test_minicpm_is_the_quick_engine(self) -> None:
self.assertEqual(MODEL_CHOICES["minicpm"]["model_id"], QUICK_MODEL_ID)
self.assertIn("MiniCPM5 1B", str(MODEL_CHOICES["minicpm"]["label"]))
self.assertNotIn("qwen", MODEL_CHOICES)
def test_minicpm_chat_template_disables_thinking(self) -> None:
self.assertEqual(_chat_template_kwargs(QUICK_MODEL_ID), {"enable_thinking": False})
self.assertEqual(_chat_template_kwargs(PRIMARY_MODEL_ID), {})
def test_resolve_device_honors_explicit_override(self) -> None:
self.assertEqual(resolve_device("cpu"), "cpu")
self.assertEqual(resolve_device("cuda"), "cuda")
self.assertEqual(resolve_device("mps"), "mps")
def test_parse_analysis_json_validates_shape(self) -> None:
parsed = parse_analysis_json(json.dumps(ANALYSIS_JSON))
self.assertEqual(len(parsed["episodes"]), 1)
self.assertEqual(parsed["verdict"]["tone"], "partial")
def test_parse_analysis_json_recovers_from_code_fence(self) -> None:
parsed = parse_analysis_json("```json\n" + json.dumps(ANALYSIS_JSON) + "\n```")
self.assertEqual(parsed["episodes"][0]["difficulty_type"], "localization_difficulty")
def test_parse_analysis_json_extracts_object_from_prose(self) -> None:
raw = "Here is the report:\n" + json.dumps(ANALYSIS_JSON) + "\nDone."
parsed = parse_analysis_json(raw)
self.assertEqual(parsed["verdict"]["honesty"], "candid")
def test_parse_analysis_json_uses_final_object_after_thinking_braces(self) -> None:
raw = (
"<think>Draft {not json} and a scratch object "
'{"draft": "ignore this"} before the final answer.</think>\n'
+ json.dumps(ANALYSIS_JSON)
)
parsed = parse_analysis_json(raw)
self.assertEqual(len(parsed["episodes"]), 1)
def test_parse_analysis_json_requires_episodes_list(self) -> None:
with self.assertRaises(ValueError):
parse_analysis_json(json.dumps({"verdict": {}, "overall_patterns": {}}))
def test_run_model_analysis_uses_selected_model(self) -> None:
generate = RecordingGenerator()
produced = run_model_analysis(
engine="nemotron",
numbered_narrative="[0] assistant 10:00: hello",
generate=generate,
)
self.assertEqual(produced.model_id, PRIMARY_MODEL_ID)
self.assertEqual(len(produced.analysis["episodes"]), 1)
self.assertEqual(generate.calls[0]["model_id"], PRIMARY_MODEL_ID)
self.assertEqual(generate.calls[0]["max_new_tokens"], MODEL_MAX_NEW_TOKENS)
def test_prepare_generation_inputs_accepts_tensor_output(self) -> None:
tensor = FakeTensor((1, 12))
generation_inputs, prompt_tokens = _prepare_generation_inputs(tensor, device="cuda")
self.assertEqual(generation_inputs, {"inputs": tensor})
self.assertEqual(prompt_tokens, 12)
self.assertEqual(tensor.device, "cuda")
def test_prepare_generation_inputs_expands_batch_encoding_output(self) -> None:
input_ids = FakeTensor((1, 21))
attention_mask = FakeTensor((1, 21))
generation_inputs, prompt_tokens = _prepare_generation_inputs(
{"input_ids": input_ids, "attention_mask": attention_mask},
device="cuda",
)
self.assertEqual(generation_inputs["input_ids"], input_ids)
self.assertEqual(generation_inputs["attention_mask"], attention_mask)
self.assertEqual(prompt_tokens, 21)
def test_analyzer_records_unknown_engine_note(self) -> None:
result, _ = analyze_trace_file(
Path("examples/sample_trace_redacted.jsonl"),
analysis_engine="missing-engine",
)
self.assertTrue(result.model_notes)
self.assertIn("Unknown analysis engine", result.model_notes[0])
def test_analyzer_model_error_note_avoids_double_period(self) -> None:
with patch("analyzer.run_model_analysis", side_effect=ValueError("model unavailable.")):
result, _ = analyze_trace_file(
Path("examples/sample_trace_redacted.jsonl"),
analysis_engine="minicpm",
)
self.assertTrue(result.model_notes)
self.assertNotIn("..", result.model_notes[0])
self.assertIn("ValueError: model unavailable.", result.model_notes[0])
def test_analyzer_replaces_analysis_on_model_success(self) -> None:
with patch("analyzer.run_model_analysis") as run:
run.return_value = types.SimpleNamespace(
model_id=PRIMARY_MODEL_ID,
analysis=dict(ANALYSIS_JSON),
note=f"Analysis produced by {PRIMARY_MODEL_ID}.",
)
result, _ = analyze_trace_file(
Path("examples/sample_trace_redacted.jsonl"),
analysis_engine="nemotron",
)
self.assertEqual(result.engine, PRIMARY_MODEL_ID)
self.assertEqual(result.session_verdict["tone"], "partial")
self.assertEqual(result.episodes[0].episode_id, "E01")
self.assertEqual(result.episodes[0].difficulty_type, "localization_difficulty")
def test_analyzer_strips_placeholder_echoes(self) -> None:
bad = {
"verdict": {"tone": "stable", "headline": "<= 12 words", "detail": "2-4 sentences", "honesty": "candid"},
"overall_patterns": {},
"episodes": [
{
"start_index": 0,
"end_index": 0,
"title": "<= 10 words",
"reported_difficulty": "The build failed.",
"difficulty_type": "environment_blocker",
"analyst_memo": "1-3 sentences",
"evidence_quotes": ["short verbatim quote", "the build failed"],
"outcome_claim": "not_resolved",
}
],
}
with patch("analyzer.run_model_analysis") as run:
run.return_value = types.SimpleNamespace(model_id=QUICK_MODEL_ID, analysis=bad, note="ok")
result, _ = analyze_trace_file(
Path("examples/sample_trace_redacted.jsonl"), analysis_engine="minicpm"
)
episode = result.episodes[0]
self.assertEqual(episode.title, "The build failed.") # placeholder -> reported_difficulty
self.assertEqual(episode.analyst_memo, "") # "1-3 sentences" stripped
self.assertEqual(episode.evidence_quotes, ["the build failed"]) # placeholder quote dropped
self.assertNotIn("<", result.session_verdict["headline"])
if __name__ == "__main__":
unittest.main()