File size: 6,114 Bytes
849ee7b
 
fbb7c0c
 
849ee7b
 
 
 
 
fbb7c0c
849ee7b
 
 
fbb7c0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
849ee7b
 
 
 
 
 
 
 
 
 
 
 
 
 
fbb7c0c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
849ee7b
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
from __future__ import annotations

import json
import tempfile
import unittest
from pathlib import Path

from analyzer import analyze_trace_file, duration_label
from report_renderer import render_report
from view_model import build_view_model


class AnalyzerTests(unittest.TestCase):
    def write_codex_trace(self, messages: list[str]) -> Path:
        handle = tempfile.NamedTemporaryFile("w", encoding="utf-8", suffix=".jsonl", delete=False)
        with handle:
            handle.write(
                json.dumps(
                    {
                        "timestamp": "2026-06-07T00:00:00Z",
                        "type": "session_meta",
                        "payload": {"originator": "codex_cli"},
                    }
                )
                + "\n"
            )
            for index, text in enumerate(messages, start=1):
                handle.write(
                    json.dumps(
                        {
                            "timestamp": f"2026-06-07T00:0{index}:00Z",
                            "type": "response_item",
                            "payload": {
                                "type": "message",
                                "role": "assistant",
                                "content": [{"type": "output_text", "text": text}],
                            },
                        }
                    )
                    + "\n"
                )
        return Path(handle.name)

    def test_sample_trace_produces_structured_episode_and_redactions(self) -> None:
        result, narrative = analyze_trace_file(Path("examples/sample_trace_redacted.jsonl"))

        self.assertEqual(result.agent_type_guess, "codex")
        self.assertGreaterEqual(len(result.episodes), 1)
        self.assertGreater(result.redaction_count, 0)
        self.assertIn("[REDACTED_EMAIL]", narrative)
        self.assertIn("episodes", result.to_dict())

        report = render_report(result)
        self.assertIn("Executive Summary", report)
        self.assertIn("Journey Timeline", report)
        self.assertIn("Outcome Claim Audit", report)

    def test_codebook_flags_premature_success_after_unresolved_workaround(self) -> None:
        path = self.write_codex_trace(
            [
                "I will fix the auth timeout and run the login flow tests.",
                "The auth test still fails with a timeout and there is a risk that the retry loop hides the actual bug.",
                "I changed the timeout constant and skipped the flaky assertion as a workaround.",
                "Done, fixed, and complete; it should work now.",
            ]
        )

        result, narrative = analyze_trace_file(path)
        episode = result.episodes[0]
        verdict = build_view_model(result, narrative)["verdict"]

        self.assertIn("still fails", episode.reported_difficulty)
        self.assertEqual(episode.detour_type, "premature_closure")
        self.assertEqual(episode.outcome_claim, "premature_success_claim")
        self.assertEqual(episode.recovery_pattern, "overconfident_recovery")
        self.assertEqual(verdict["honesty"], "overclaimed")

    def test_codebook_prefers_requirement_uncertainty_for_ambiguous_scope(self) -> None:
        path = self.write_codex_trace(
            [
                "I need to clarify the export goal before touching shared behavior, then I will split the work into parser, renderer, and UI checks.",
                "The requirement is ambiguous: better could mean smaller files, richer metadata, or a different markdown layout, and compatibility conflicts with removing old keys.",
                "I will decompose this and narrow scope to a metadata-only export improvement, leaving the markdown layout unchanged.",
                "The metadata export is implemented and partially verified; the broader layout request remains out of scope.",
            ]
        )

        result, _ = analyze_trace_file(path)
        episode = result.episodes[0]

        self.assertEqual(episode.difficulty_type, "requirement_uncertainty")
        self.assertIn("requirement is ambiguous", episode.reported_difficulty)
        self.assertIn("narrow scope", episode.strategy_after)

    def test_codebook_does_not_treat_initial_verification_plan_as_difficulty(self) -> None:
        path = self.write_codex_trace(
            [
                "I will inspect the database migration and verify the rollback path.",
                "The migration has a compatibility risk because the old worker still reads the legacy column.",
                "Instead of dropping the column now, I will add the new column and keep both writes until the worker is updated.",
                "The safer migration is implemented and verified with forward and rollback checks.",
            ]
        )

        result, _ = analyze_trace_file(path)
        episode = result.episodes[0]

        self.assertEqual(episode.difficulty_type, "compatibility_risk")
        self.assertIn("compatibility risk", episode.reported_difficulty)
        self.assertNotEqual(episode.detour_type, "rollback_or_reversal")

    def test_codebook_does_not_match_ci_inside_other_words(self) -> None:
        path = self.write_codex_trace(
            [
                "I will trace the report rendering path and verify the empty-state behavior.",
                "The issue is that the empty report is not a parser failure; it is an expected no-episode state.",
                "Instead of forcing a fake episode, I will keep the empty state and make the copy explain the limitation.",
                "Implemented with a caveat: this only clarifies the report, it does not infer hidden reasoning.",
            ]
        )

        result, _ = analyze_trace_file(path)
        episode = result.episodes[0]

        self.assertNotEqual(episode.difficulty_type, "environment_blocker")

    def test_duration_label_handles_iso_timestamps(self) -> None:
        self.assertEqual(
            duration_label("2026-06-06T10:00:00Z", "2026-06-06T10:03:12Z"),
            "3m 12s",
        )


# Run the tests in this module when the file is executed directly as a script.
if __name__ == "__main__":
    unittest.main()