# Spaces: build-small-hackathon / trace-field-notes
# Running on Zero
# File size: 9,515 Bytes

from __future__ import annotations

import json
import types
import unittest
from pathlib import Path
from unittest.mock import patch

from analyzer import analyze_trace_file
from model_runtime import (
    MODEL_CHOICES,
    MODEL_MAX_NEW_TOKENS,
    PRIMARY_MODEL_ID,
    QUICK_MODEL_ID,
    _chat_template_kwargs,
    _prepare_generation_inputs,
    parse_analysis_json,
    resolve_device,
    run_model_analysis,
)


# Canned model report used as the shared fixture throughout this module.
# It is assembled from three named pieces: the session verdict, the overall
# pattern summary, and a single episode.
_VERDICT_FIXTURE = {
    "tone": "partial",
    "headline": "Reroute landed with a caveat.",
    "detail": "The agent caught a wrong assumption about the upload shape and narrowed the fix.",
    "honesty": "candid",
}

_PATTERNS_FIXTURE = {
    "difficulty_style": "One localization snag.",
    "detour_style": "A productive narrowing.",
    "recovery_style": "Reflective.",
    "risk_or_caveat": "Deployment path left unverified.",
}

_EPISODE_FIXTURE = {
    "start_index": 0,
    "end_index": 3,
    "title": "Upload boundary fix",
    "initial_intention": "Inspect the failing upload path.",
    "reported_difficulty": "The Gradio file object can arrive as a temporary path.",
    "difficulty_type": "localization_difficulty",
    "appraisal": "initial_hypothesis_wrong",
    "strategy_before": "Fix the parser.",
    "strategy_after": "Narrow the fix to the upload boundary.",
    "detour_type": "scope_narrowing",
    "resolution_mode": "defensive_handling",
    "recovery_pattern": "reflective_recovery",
    "outcome_claim": "resolved_with_caveat",
    "productive_detour": "yes",
    "evidence_quotes": ["my initial assumption about the upload shape was wrong"],
    "analyst_memo": "The agent names the wrong assumption and picks the smaller change.",
}

ANALYSIS_JSON = {
    "verdict": _VERDICT_FIXTURE,
    "overall_patterns": _PATTERNS_FIXTURE,
    "episodes": [_EPISODE_FIXTURE],
}


class RecordingGenerator:
    """Callable test double used in place of the local GPU generator.

    Every invocation is logged on ``calls`` as a dict holding the
    ``messages``, ``model_id`` and ``max_new_tokens`` it was given, and the
    canned ``ANALYSIS_JSON`` fixture is returned serialized as JSON text.
    """

    def __init__(self) -> None:
        self.calls: list[dict] = []

    def __call__(self, messages, *, model_id, max_new_tokens) -> str:
        invocation = dict(
            messages=messages,
            model_id=model_id,
            max_new_tokens=max_new_tokens,
        )
        self.calls.append(invocation)
        return json.dumps(ANALYSIS_JSON)


class FakeTensor:
    """Bare-bones tensor double exposing ``shape``, ``device`` and ``to``."""

    def __init__(self, shape: tuple[int, ...]) -> None:
        # No device is assigned until ``to`` is called.
        self.device = None
        self.shape = shape

    def to(self, device: str) -> "FakeTensor":
        """Remember the requested device and hand back this same object."""
        self.device = device
        return self


class ModelRuntimeTests(unittest.TestCase):
    """Tests for the ``model_runtime`` helpers and how ``analyzer`` uses them."""

    # Redacted sample trace fed to every analyzer test. The path is relative,
    # so these tests must be run from the directory that contains ``examples/``.
    SAMPLE_TRACE = Path("examples/sample_trace_redacted.jsonl")

    def test_nemotron_label_does_not_call_it_small(self) -> None:
        """The nemotron label names the full model and never says "small"."""
        label = str(MODEL_CHOICES["nemotron"]["label"])

        self.assertIn("NVIDIA Nemotron 3 Nano 30B-A3B", label)
        self.assertNotIn("small", label.lower())

    def test_minicpm_is_the_quick_engine(self) -> None:
        """The minicpm choice maps to the quick model id; no qwen entry exists."""
        self.assertEqual(MODEL_CHOICES["minicpm"]["model_id"], QUICK_MODEL_ID)
        self.assertIn("MiniCPM5 1B", str(MODEL_CHOICES["minicpm"]["label"]))
        self.assertNotIn("qwen", MODEL_CHOICES)

    def test_minicpm_chat_template_disables_thinking(self) -> None:
        """Quick model gets ``enable_thinking=False``; primary model gets no kwargs."""
        self.assertEqual(_chat_template_kwargs(QUICK_MODEL_ID), {"enable_thinking": False})
        self.assertEqual(_chat_template_kwargs(PRIMARY_MODEL_ID), {})

    def test_resolve_device_honors_explicit_override(self) -> None:
        """An explicitly requested device string is returned unchanged."""
        # subTest reports each device independently instead of stopping at the
        # first failing one.
        for device in ("cpu", "cuda", "mps"):
            with self.subTest(device=device):
                self.assertEqual(resolve_device(device), device)

    def test_parse_analysis_json_validates_shape(self) -> None:
        """A plain JSON report parses into a dict with its episodes and verdict."""
        parsed = parse_analysis_json(json.dumps(ANALYSIS_JSON))

        self.assertEqual(len(parsed["episodes"]), 1)
        self.assertEqual(parsed["verdict"]["tone"], "partial")

    def test_parse_analysis_json_recovers_from_code_fence(self) -> None:
        """A report wrapped in a Markdown ``json`` code fence still parses."""
        parsed = parse_analysis_json("```json\n" + json.dumps(ANALYSIS_JSON) + "\n```")

        self.assertEqual(parsed["episodes"][0]["difficulty_type"], "localization_difficulty")

    def test_parse_analysis_json_extracts_object_from_prose(self) -> None:
        """A report surrounded by leading and trailing prose still parses."""
        raw = "Here is the report:\n" + json.dumps(ANALYSIS_JSON) + "\nDone."
        parsed = parse_analysis_json(raw)

        self.assertEqual(parsed["verdict"]["honesty"], "candid")

    def test_parse_analysis_json_uses_final_object_after_thinking_braces(self) -> None:
        """Braces and scratch objects inside ``<think>`` do not shadow the final report."""
        raw = (
            "<think>Draft {not json} and a scratch object "
            '{"draft": "ignore this"} before the final answer.</think>\n'
            + json.dumps(ANALYSIS_JSON)
        )
        parsed = parse_analysis_json(raw)

        self.assertEqual(len(parsed["episodes"]), 1)

    def test_parse_analysis_json_requires_episodes_list(self) -> None:
        """A report without an ``episodes`` key is rejected with ``ValueError``."""
        with self.assertRaises(ValueError):
            parse_analysis_json(json.dumps({"verdict": {}, "overall_patterns": {}}))

    def test_run_model_analysis_uses_selected_model(self) -> None:
        """The nemotron engine calls the generator with the primary model id and token cap."""
        generate = RecordingGenerator()

        produced = run_model_analysis(
            engine="nemotron",
            numbered_narrative="[0] assistant 10:00: hello",
            generate=generate,
        )

        self.assertEqual(produced.model_id, PRIMARY_MODEL_ID)
        self.assertEqual(len(produced.analysis["episodes"]), 1)
        self.assertEqual(generate.calls[0]["model_id"], PRIMARY_MODEL_ID)
        self.assertEqual(generate.calls[0]["max_new_tokens"], MODEL_MAX_NEW_TOKENS)

    def test_prepare_generation_inputs_accepts_tensor_output(self) -> None:
        """A bare tensor is wrapped as ``inputs``, moved to the device, and measured."""
        tensor = FakeTensor((1, 12))

        generation_inputs, prompt_tokens = _prepare_generation_inputs(tensor, device="cuda")

        self.assertEqual(generation_inputs, {"inputs": tensor})
        self.assertEqual(prompt_tokens, 12)
        self.assertEqual(tensor.device, "cuda")

    def test_prepare_generation_inputs_expands_batch_encoding_output(self) -> None:
        """A mapping of tensors is passed through by key and measured via ``input_ids``."""
        input_ids = FakeTensor((1, 21))
        attention_mask = FakeTensor((1, 21))

        generation_inputs, prompt_tokens = _prepare_generation_inputs(
            {"input_ids": input_ids, "attention_mask": attention_mask},
            device="cuda",
        )

        self.assertEqual(generation_inputs["input_ids"], input_ids)
        self.assertEqual(generation_inputs["attention_mask"], attention_mask)
        self.assertEqual(prompt_tokens, 21)

    def test_analyzer_records_unknown_engine_note(self) -> None:
        """An unrecognised engine name yields an explanatory model note."""
        result, _ = analyze_trace_file(
            self.SAMPLE_TRACE,
            analysis_engine="missing-engine",
        )

        self.assertTrue(result.model_notes)
        self.assertIn("Unknown analysis engine", result.model_notes[0])

    def test_analyzer_model_error_note_avoids_double_period(self) -> None:
        """A model error ending in a period is reported without a doubled ".."."""
        with patch("analyzer.run_model_analysis", side_effect=ValueError("model unavailable.")):
            result, _ = analyze_trace_file(
                self.SAMPLE_TRACE,
                analysis_engine="minicpm",
            )

        self.assertTrue(result.model_notes)
        self.assertNotIn("..", result.model_notes[0])
        self.assertIn("ValueError: model unavailable.", result.model_notes[0])

    def test_analyzer_replaces_analysis_on_model_success(self) -> None:
        """A successful model run supplies the engine, verdict and episodes of the result."""
        # Deep-copy the fixture through a JSON round-trip: a shallow ``dict()``
        # copy would still share the nested verdict/episode objects with
        # ANALYSIS_JSON, letting any in-place edit by the analyzer leak into
        # other tests.
        analysis = json.loads(json.dumps(ANALYSIS_JSON))
        with patch("analyzer.run_model_analysis") as run:
            run.return_value = types.SimpleNamespace(
                model_id=PRIMARY_MODEL_ID,
                analysis=analysis,
                note=f"Analysis produced by {PRIMARY_MODEL_ID}.",
            )
            result, _ = analyze_trace_file(
                self.SAMPLE_TRACE,
                analysis_engine="nemotron",
            )

        self.assertEqual(result.engine, PRIMARY_MODEL_ID)
        self.assertEqual(result.session_verdict["tone"], "partial")
        self.assertEqual(result.episodes[0].episode_id, "E01")
        self.assertEqual(result.episodes[0].difficulty_type, "localization_difficulty")

    def test_analyzer_strips_placeholder_echoes(self) -> None:
        """Placeholder strings echoed back by the model are cleaned from the result."""
        bad = {
            "verdict": {
                "tone": "stable",
                "headline": "<= 12 words",
                "detail": "2-4 sentences",
                "honesty": "candid",
            },
            "overall_patterns": {},
            "episodes": [
                {
                    "start_index": 0,
                    "end_index": 0,
                    "title": "<= 10 words",
                    "reported_difficulty": "The build failed.",
                    "difficulty_type": "environment_blocker",
                    "analyst_memo": "1-3 sentences",
                    "evidence_quotes": ["short verbatim quote", "the build failed"],
                    "outcome_claim": "not_resolved",
                }
            ],
        }
        with patch("analyzer.run_model_analysis") as run:
            run.return_value = types.SimpleNamespace(model_id=QUICK_MODEL_ID, analysis=bad, note="ok")
            result, _ = analyze_trace_file(self.SAMPLE_TRACE, analysis_engine="minicpm")

        episode = result.episodes[0]
        self.assertEqual(episode.title, "The build failed.")  # placeholder -> reported_difficulty
        self.assertEqual(episode.analyst_memo, "")  # "1-3 sentences" stripped
        self.assertEqual(episode.evidence_quotes, ["the build failed"])  # placeholder quote dropped
        self.assertNotIn("<", result.session_verdict["headline"])


# Run the whole suite when this file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()