Spaces:

build-small-hackathon
/

trace-field-notes

Running on Zero

App Files Community

trace-field-notes / tests / test_model_runtime.py

JacobLinCool

feat: add privacy filtering and execution modes

8457788 verified 20 days ago

Raw

History Blame Contribute Delete

9.52 kB

	from __future__ import annotations

	import json
	import types
	import unittest
	from pathlib import Path
	from unittest.mock import patch

	from analyzer import analyze_trace_file
	from model_runtime import (
	MODEL_CHOICES,
	MODEL_MAX_NEW_TOKENS,
	PRIMARY_MODEL_ID,
	QUICK_MODEL_ID,
	_chat_template_kwargs,
	_prepare_generation_inputs,
	parse_analysis_json,
	resolve_device,
	run_model_analysis,
	)


	# Canonical well-formed analysis payload used as the "model output" fixture.
	# It is assembled from its three top-level sections so each part can be read
	# on its own; key order matches the order the sections are listed in.

	# Session-level verdict section.
	_VERDICT = {
	    "tone": "partial",
	    "headline": "Reroute landed with a caveat.",
	    "detail": "The agent caught a wrong assumption about the upload shape and narrowed the fix.",
	    "honesty": "candid",
	}

	# Cross-episode pattern summary section.
	_OVERALL_PATTERNS = {
	    "difficulty_style": "One localization snag.",
	    "detour_style": "A productive narrowing.",
	    "recovery_style": "Reflective.",
	    "risk_or_caveat": "Deployment path left unverified.",
	}

	# The single episode carried by the fixture (indices 0 through 3).
	_UPLOAD_EPISODE = {
	    "start_index": 0,
	    "end_index": 3,
	    "title": "Upload boundary fix",
	    "initial_intention": "Inspect the failing upload path.",
	    "reported_difficulty": "The Gradio file object can arrive as a temporary path.",
	    "difficulty_type": "localization_difficulty",
	    "appraisal": "initial_hypothesis_wrong",
	    "strategy_before": "Fix the parser.",
	    "strategy_after": "Narrow the fix to the upload boundary.",
	    "detour_type": "scope_narrowing",
	    "resolution_mode": "defensive_handling",
	    "recovery_pattern": "reflective_recovery",
	    "outcome_claim": "resolved_with_caveat",
	    "productive_detour": "yes",
	    "evidence_quotes": ["my initial assumption about the upload shape was wrong"],
	    "analyst_memo": "The agent names the wrong assumption and picks the smaller change.",
	}

	ANALYSIS_JSON = {
	    "verdict": _VERDICT,
	    "overall_patterns": _OVERALL_PATTERNS,
	    "episodes": [_UPLOAD_EPISODE],
	}


	class RecordingGenerator:
	    """Callable test double standing in for the local GPU generator.

	    Each invocation is logged in ``calls`` and answered with the canned
	    ``ANALYSIS_JSON`` payload serialized to a JSON string.
	    """

	    def __init__(self) -> None:
	        # One entry per invocation, in call order.
	        self.calls: list[dict] = []

	    def __call__(self, messages, *, model_id, max_new_tokens) -> str:
	        """Log the arguments of this call and return the canned JSON text."""
	        record = {
	            "messages": messages,
	            "model_id": model_id,
	            "max_new_tokens": max_new_tokens,
	        }
	        self.calls.append(record)
	        return json.dumps(ANALYSIS_JSON)


	class FakeTensor:
	    """Tiny tensor double: carries a ``shape`` and remembers its last ``to`` target."""

	    def __init__(self, shape: tuple[int, ...]) -> None:
	        # No placement is recorded until ``to`` is called.
	        self.device: str | None = None
	        self.shape = shape

	    def to(self, device: str) -> "FakeTensor":
	        """Store ``device`` as the current placement and hand back this same object."""
	        self.device = device
	        return self


	class ModelRuntimeTests(unittest.TestCase):
	    """Tests for the ``model_runtime`` helpers and the analyzer's model integration."""

	    # Sample trace fed to every analyzer-level test. The path is relative, so
	    # it is resolved against the current working directory of the test run.
	    SAMPLE_TRACE = Path("examples/sample_trace_redacted.jsonl")

	    @staticmethod
	    def _fresh_analysis() -> dict:
	        """Return a deep, independent copy of ``ANALYSIS_JSON``.

	        A JSON round-trip is used so that nested dicts and lists are not
	        shared with the module-level fixture; code under test may then
	        modify the payload without affecting other tests.
	        """
	        return json.loads(json.dumps(ANALYSIS_JSON))

	    def test_nemotron_label_does_not_call_it_small(self) -> None:
	        """The nemotron label names the model and avoids the word "small"."""
	        label = str(MODEL_CHOICES["nemotron"]["label"])

	        self.assertIn("NVIDIA Nemotron 3 Nano 30B-A3B", label)
	        self.assertNotIn("small", label.lower())

	    def test_minicpm_is_the_quick_engine(self) -> None:
	        """The minicpm choice maps to the quick model and no qwen choice exists."""
	        self.assertEqual(MODEL_CHOICES["minicpm"]["model_id"], QUICK_MODEL_ID)
	        self.assertIn("MiniCPM5 1B", str(MODEL_CHOICES["minicpm"]["label"]))
	        self.assertNotIn("qwen", MODEL_CHOICES)

	    def test_minicpm_chat_template_disables_thinking(self) -> None:
	        """Only the quick model gets ``enable_thinking=False`` template kwargs."""
	        self.assertEqual(_chat_template_kwargs(QUICK_MODEL_ID), {"enable_thinking": False})
	        self.assertEqual(_chat_template_kwargs(PRIMARY_MODEL_ID), {})

	    def test_resolve_device_honors_explicit_override(self) -> None:
	        """An explicitly requested device name is returned unchanged."""
	        for device in ("cpu", "cuda", "mps"):
	            # subTest reports each device separately instead of stopping at
	            # the first failure.
	            with self.subTest(device=device):
	                self.assertEqual(resolve_device(device), device)

	    def test_parse_analysis_json_validates_shape(self) -> None:
	        """A plain JSON payload parses into the expected structure."""
	        parsed = parse_analysis_json(json.dumps(ANALYSIS_JSON))

	        self.assertEqual(len(parsed["episodes"]), 1)
	        self.assertEqual(parsed["verdict"]["tone"], "partial")

	    def test_parse_analysis_json_recovers_from_code_fence(self) -> None:
	        """A payload wrapped in a Markdown ``json`` code fence still parses."""
	        parsed = parse_analysis_json("```json\n" + json.dumps(ANALYSIS_JSON) + "\n```")

	        self.assertEqual(parsed["episodes"][0]["difficulty_type"], "localization_difficulty")

	    def test_parse_analysis_json_extracts_object_from_prose(self) -> None:
	        """A payload surrounded by prose lines still parses."""
	        raw = "Here is the report:\n" + json.dumps(ANALYSIS_JSON) + "\nDone."
	        parsed = parse_analysis_json(raw)

	        self.assertEqual(parsed["verdict"]["honesty"], "candid")

	    def test_parse_analysis_json_uses_final_object_after_thinking_braces(self) -> None:
	        """Braces inside a leading ``<think>`` section do not hide the final object."""
	        raw = (
	            "<think>Draft {not json} and a scratch object "
	            '{"draft": "ignore this"} before the final answer.</think>\n'
	            + json.dumps(ANALYSIS_JSON)
	        )
	        parsed = parse_analysis_json(raw)

	        self.assertEqual(len(parsed["episodes"]), 1)

	    def test_parse_analysis_json_requires_episodes_list(self) -> None:
	        """A payload without an ``episodes`` entry raises ``ValueError``."""
	        with self.assertRaises(ValueError):
	            parse_analysis_json(json.dumps({"verdict": {}, "overall_patterns": {}}))

	    def test_run_model_analysis_uses_selected_model(self) -> None:
	        """The nemotron engine calls the generator with the primary model and token cap."""
	        generate = RecordingGenerator()

	        produced = run_model_analysis(
	            engine="nemotron",
	            numbered_narrative="[0] assistant 10:00: hello",
	            generate=generate,
	        )

	        self.assertEqual(produced.model_id, PRIMARY_MODEL_ID)
	        self.assertEqual(len(produced.analysis["episodes"]), 1)
	        self.assertEqual(generate.calls[0]["model_id"], PRIMARY_MODEL_ID)
	        self.assertEqual(generate.calls[0]["max_new_tokens"], MODEL_MAX_NEW_TOKENS)

	    def test_prepare_generation_inputs_accepts_tensor_output(self) -> None:
	        """A bare tensor is wrapped under ``inputs`` and moved to the device."""
	        tensor = FakeTensor((1, 12))

	        generation_inputs, prompt_tokens = _prepare_generation_inputs(tensor, device="cuda")

	        self.assertEqual(generation_inputs, {"inputs": tensor})
	        self.assertEqual(prompt_tokens, 12)
	        self.assertEqual(tensor.device, "cuda")

	    def test_prepare_generation_inputs_expands_batch_encoding_output(self) -> None:
	        """A mapping of tensors is passed through key by key."""
	        input_ids = FakeTensor((1, 21))
	        attention_mask = FakeTensor((1, 21))

	        generation_inputs, prompt_tokens = _prepare_generation_inputs(
	            {"input_ids": input_ids, "attention_mask": attention_mask},
	            device="cuda",
	        )

	        # FakeTensor defines no __eq__, so equality here is identity; assertIs
	        # states that intent directly.
	        self.assertIs(generation_inputs["input_ids"], input_ids)
	        self.assertIs(generation_inputs["attention_mask"], attention_mask)
	        self.assertEqual(prompt_tokens, 21)

	    def test_analyzer_records_unknown_engine_note(self) -> None:
	        """An unrecognized engine name produces an explanatory model note."""
	        result, _ = analyze_trace_file(
	            self.SAMPLE_TRACE,
	            analysis_engine="missing-engine",
	        )

	        self.assertTrue(result.model_notes)
	        self.assertIn("Unknown analysis engine", result.model_notes[0])

	    def test_analyzer_model_error_note_avoids_double_period(self) -> None:
	        """A model error ending in a period is reported without a doubled period."""
	        with patch("analyzer.run_model_analysis", side_effect=ValueError("model unavailable.")):
	            result, _ = analyze_trace_file(
	                self.SAMPLE_TRACE,
	                analysis_engine="minicpm",
	            )

	        self.assertTrue(result.model_notes)
	        self.assertNotIn("..", result.model_notes[0])
	        self.assertIn("ValueError: model unavailable.", result.model_notes[0])

	    def test_analyzer_replaces_analysis_on_model_success(self) -> None:
	        """A successful model run replaces the analyzer's verdict and episodes."""
	        with patch("analyzer.run_model_analysis") as run:
	            run.return_value = types.SimpleNamespace(
	                model_id=PRIMARY_MODEL_ID,
	                # Deep copy: the analyzer gets its own payload, so the shared
	                # fixture cannot be altered through nested references.
	                analysis=self._fresh_analysis(),
	                note=f"Analysis produced by {PRIMARY_MODEL_ID}.",
	            )
	            result, _ = analyze_trace_file(
	                self.SAMPLE_TRACE,
	                analysis_engine="nemotron",
	            )

	        self.assertEqual(result.engine, PRIMARY_MODEL_ID)
	        self.assertEqual(result.session_verdict["tone"], "partial")
	        self.assertEqual(result.episodes[0].episode_id, "E01")
	        self.assertEqual(result.episodes[0].difficulty_type, "localization_difficulty")

	    def test_analyzer_strips_placeholder_echoes(self) -> None:
	        """Placeholder-style strings echoed in the model output are removed."""
	        bad = {
	            "verdict": {
	                "tone": "stable",
	                "headline": "<= 12 words",
	                "detail": "2-4 sentences",
	                "honesty": "candid",
	            },
	            "overall_patterns": {},
	            "episodes": [
	                {
	                    "start_index": 0,
	                    "end_index": 0,
	                    "title": "<= 10 words",
	                    "reported_difficulty": "The build failed.",
	                    "difficulty_type": "environment_blocker",
	                    "analyst_memo": "1-3 sentences",
	                    "evidence_quotes": ["short verbatim quote", "the build failed"],
	                    "outcome_claim": "not_resolved",
	                }
	            ],
	        }
	        with patch("analyzer.run_model_analysis") as run:
	            run.return_value = types.SimpleNamespace(model_id=QUICK_MODEL_ID, analysis=bad, note="ok")
	            result, _ = analyze_trace_file(self.SAMPLE_TRACE, analysis_engine="minicpm")

	        episode = result.episodes[0]
	        self.assertEqual(episode.title, "The build failed.")  # placeholder -> reported_difficulty
	        self.assertEqual(episode.analyst_memo, "")  # "1-3 sentences" stripped
	        self.assertEqual(episode.evidence_quotes, ["the build failed"])  # placeholder quote dropped
	        self.assertNotIn("<", result.session_verdict["headline"])


	# Run the suite when this file is executed directly as a script.
	if __name__ == "__main__":
	    unittest.main()