""" Unit tests for parse_output() and build_user_prompt() in app.py. These tests NEVER load model weights. The BUREAUCAT_NO_MODEL env var is set before app is imported so the module-scope load_model() call is skipped. Tests run in seconds (pure Python, no GPU required). Test coverage: - Well-formed output → all fields populated, severity int - Transcription isolation from displayed sections - Truncated output (no SEVERITY line) → severity=None, no crash - Non-English body with fixed English headings → language-invariance (Pitfall 3) - Empty raw string → empty fields, severity=None, no exception - SEVERITY regex tolerates trailing whitespace/newline - build_user_prompt() beginner vs standard: structural invariance (D-08 unit guard) """ import os import sys # Set escape hatch BEFORE importing app so model weights are never downloaded. os.environ["BUREAUCAT_NO_MODEL"] = "1" # Ensure the project root is on the path when running from eval/ sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) import app from app import parse_output, build_user_prompt, StructuredResult # --------------------------------------------------------------------------- # Fixtures # --------------------------------------------------------------------------- WELL_FORMED = """\ Faktura 2026-06-15 Belopp: 1 234 kr OCR: 555666777 Bureaucat says: This invoice wants your money, and it's not even polite about it. ## TL;DR You have received an invoice for 1 234 kr due on 2026-06-15. ## Why you got this You purchased a service and this is the bill. ## What you need to do - Pay the invoice by 2026-06-15 - Use OCR number 555666777 ## Deadlines & money - 2026-06-15 - sista betalningsdag - 1 234 kr - fakturabelopp - 555666777 - OCR-nummer SEVERITY: 4""" NON_ENGLISH_BODY = """\ Skatteverket deklaration Bureaucat says: The taxman cometh (and speaketh Swedish). ## TL;DR आपको अपना कर विवरण जमा करना होगा। ## Why you got this आप स्वीडिश करदाता हैं। ## What you need to do - कर फार्म भरें ## Deadlines & money - 15 जून 2026 - अंतिम तारीख SEVERITY: 3""" TRUNCATED = """\ Short letter Bureaucat says: The suspense is killing me. ## TL;DR This letter got cut off before the severity was emitted. ## Why you got this Unknown - truncated.""" ARABIC_BODY = """\ مكتوب Bureaucat says: Even in Arabic, deadlines are stressful. ## TL;DR محتوى عربي ## Why you got this سبب عربي ## What you need to do - إجراء عربي ## Deadlines & money - 2026-07-01 - موعد أخير SEVERITY: 2""" # --------------------------------------------------------------------------- # Tests # --------------------------------------------------------------------------- def test_well_formed_all_fields_populated(): """Well-formed output parses into fully-populated StructuredResult.""" r = parse_output(WELL_FORMED) assert r.transcription, "transcription should be non-empty" assert r.quip, "quip should be non-empty" assert r.tldr, "tldr should be non-empty" assert r.why, "why should be non-empty" assert r.actions, "actions should be non-empty" assert r.deadlines, "deadlines should be non-empty" assert r.severity == 4, f"expected severity=4, got {r.severity}" assert r.raw == WELL_FORMED def test_transcription_isolated_from_sections(): """ The block is stripped from the displayed body (D-04): - result.transcription contains the verbatim OCR text - The XML tags are NOT present in sections or quip - Deadlines section can legitimately contain values that also appear in the transcription (verbatim extraction is the goal); what we test is that the raw transcription BLOCK itself is stripped from the displayed output """ r = parse_output(WELL_FORMED) # Transcription field is populated with OCR text assert "1 234 kr" in r.transcription, "verbatim value must be in transcription field" assert "Faktura" in r.transcription, "OCR text must be in transcription field" # The XML tags must not appear anywhere in the section fields assert "" not in r.tldr assert "" not in r.deadlines assert "" not in r.tldr assert "" not in r.deadlines # The raw transcription block text (e.g., OCR-only content that would not # naturally appear in a section) must not appear verbatim in sections assert "OCR: 555666777" not in r.tldr, ( "raw transcription text must not bleed into tldr section" ) assert "Belopp: 1 234 kr" not in r.tldr, ( "raw transcription text must not bleed into tldr (only extracted values belong)" ) def test_truncated_output_severity_none_no_crash(): """Truncated output (no SEVERITY line) → severity is None, other fields still parse.""" r = parse_output(TRUNCATED) assert r.severity is None, f"expected None for truncated output, got {r.severity}" assert r.tldr, "tldr should still parse even without SEVERITY line" assert r.quip, "quip should parse from truncated output" def test_non_english_body_language_invariance(): """ Hindi body with fixed English headings → all four sections split non-empty. (Pitfall 3: sections must anchor on English headings regardless of prose language.) """ r = parse_output(NON_ENGLISH_BODY) assert r.tldr, f"tldr empty — language invariance broken: {r.tldr!r}" assert r.why, f"why empty — language invariance broken: {r.why!r}" assert r.actions, f"actions empty — language invariance broken: {r.actions!r}" assert r.deadlines, f"deadlines empty — language invariance broken: {r.deadlines!r}" assert r.severity == 3 def test_arabic_body_language_invariance(): """Arabic body with fixed English headings → all four sections split non-empty.""" r = parse_output(ARABIC_BODY) assert r.tldr, "tldr should be non-empty for Arabic body" assert r.why, "why should be non-empty for Arabic body" assert r.actions, "actions should be non-empty for Arabic body" assert r.deadlines, "deadlines should be non-empty for Arabic body" assert r.severity == 2 def test_empty_raw_no_exception(): """Empty raw string → StructuredResult with empty fields and severity=None, no crash.""" r = parse_output("") assert r.severity is None assert r.transcription == "" assert r.quip == "" assert r.tldr == "" assert r.why == "" assert r.actions == "" assert r.deadlines == "" assert r.raw == "" def test_severity_regex_tolerates_trailing_whitespace(): """SEVERITY regex matches with trailing whitespace/newline and only accepts 1-5.""" for sev in range(1, 6): raw = f"x\nBureaucat says: hi\n## TL;DR\na\n## Why you got this\nb\n## What you need to do\nc\n## Deadlines & money\nd\nSEVERITY: {sev} \n" r = parse_output(raw) assert r.severity == sev, f"Expected {sev}, got {r.severity}" # Value 0 and 6 should not match for bad in ("0", "6", "10"): raw = f"x\nBureaucat says: hi\n## TL;DR\na\n## Why you got this\nb\n## What you need to do\nc\n## Deadlines & money\nd\nSEVERITY: {bad}\n" r = parse_output(raw) assert r.severity is None, f"Severity {bad} should not parse to int, got {r.severity}" def test_build_user_prompt_beginner_mode_invariance(): """ D-08 unit guard: beginner and standard prompts differ only by appended inline-explanation guidance. Neither adds/removes sections, alters the SEVERITY line reference, or touches the transcription block. """ standard = build_user_prompt("English", beginner_mode=False) beginner = build_user_prompt("English", beginner_mode=True) # Beginner must be strictly longer (has additional guidance appended) assert len(beginner) > len(standard), "beginner prompt must be longer than standard" # Standard must be a prefix of beginner (beginner only appends, never replaces) assert beginner.startswith(standard), ( "beginner prompt must start with the full standard prompt" ) # The extra beginner content must not mention adding/removing sections extra = beginner[len(standard):] assert "new section" not in extra.lower() or "do not add new sections" in extra.lower(), ( "beginner extra guidance must not instruct adding new sections" ) # Neither prompt should reference SEVERITY or transcription in a way that # would alter those structural elements (those are in SYSTEM_PROMPT only) assert "SEVERITY" not in standard assert "SEVERITY" not in beginner assert "" not in standard assert "" not in beginner def test_build_user_prompt_language_interpolation(): """Language is correctly embedded in the prompt for different languages.""" for lang in ("English", "Hindi", "Arabic", "Spanish", "Swedish"): p = build_user_prompt(lang, beginner_mode=False) assert lang in p, f"Language '{lang}' must appear in prompt" def test_structured_result_is_dataclass(): """StructuredResult can be instantiated directly and has all required fields.""" r = StructuredResult( transcription="t", quip="q", tldr="tl", why="w", actions="a", deadlines="d", severity=3, raw="raw" ) assert r.severity == 3 assert r.transcription == "t" def test_model_is_none_under_no_model_flag(): """BUREAUCAT_NO_MODEL=1 → app.model is None (set in os.environ before import).""" assert app.model is None, "model should be None when BUREAUCAT_NO_MODEL is set" assert app.processor is None, "processor should be None when BUREAUCAT_NO_MODEL is set" if __name__ == "__main__": import pytest pytest.main([__file__, "-v"])