Spaces:
Running on Zero
Running on Zero
| """ | |
| Unit tests for parse_output() and build_user_prompt() in app.py. | |
| These tests NEVER load model weights. The BUREAUCAT_NO_MODEL env var is set | |
| before app is imported so the module-scope load_model() call is skipped. | |
| Tests run in seconds (pure Python, no GPU required). | |
| Test coverage: | |
| - Well-formed output → all fields populated, severity int | |
| - Transcription isolation from displayed sections | |
| - Truncated output (no SEVERITY line) → severity=None, no crash | |
| - Non-English body with fixed English headings → language-invariance (Pitfall 3) | |
| - Empty raw string → empty fields, severity=None, no exception | |
| - SEVERITY regex tolerates trailing whitespace/newline | |
| - build_user_prompt() beginner vs standard: structural invariance (D-08 unit guard) | |
| """ | |
import os
import sys

# Set escape hatch BEFORE importing app so model weights are never downloaded.
# Statement order matters here: the assignment has to run before the
# `import app` line below, so these imports must not be hoisted or re-sorted.
os.environ["BUREAUCAT_NO_MODEL"] = "1"

# Ensure the project root is on the path when running from eval/
# (abspath of this file -> its directory -> that directory's parent).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# `app` is imported as a module (for the app.model / app.processor checks)
# and by name (for the functions and result type under test).
import app
from app import parse_output, build_user_prompt, StructuredResult
| # --------------------------------------------------------------------------- | |
| # Fixtures | |
| # --------------------------------------------------------------------------- | |
# Complete sample response: a <transcription> block, a "Bureaucat says:" quip
# line, the four "## " headed sections, and a final "SEVERITY: 4" line.
# The transcription holds lines ("Belopp: 1 234 kr", "OCR: 555666777") that do
# not occur verbatim in any section, which the isolation test relies on.
WELL_FORMED = """\
<transcription>
Faktura 2026-06-15
Belopp: 1 234 kr
OCR: 555666777
</transcription>
Bureaucat says: This invoice wants your money, and it's not even polite about it.
## TL;DR
You have received an invoice for 1 234 kr due on 2026-06-15.
## Why you got this
You purchased a service and this is the bill.
## What you need to do
- Pay the invoice by 2026-06-15
- Use OCR number 555666777
## Deadlines & money
- 2026-06-15 - sista betalningsdag
- 1 234 kr - fakturabelopp
- 555666777 - OCR-nummer
SEVERITY: 4"""
# Same layout as a complete response, but the section bodies are written in
# Hindi while the "## " headings and the SEVERITY line stay in English.
# Ends with "SEVERITY: 3".
NON_ENGLISH_BODY = """\
<transcription>
Skatteverket deklaration
</transcription>
Bureaucat says: The taxman cometh (and speaketh Swedish).
## TL;DR
आपको अपना कर विवरण जमा करना होगा।
## Why you got this
आप स्वीडिश करदाता हैं।
## What you need to do
- कर फार्म भरें
## Deadlines & money
- 15 जून 2026 - अंतिम तारीख
SEVERITY: 3"""
# Response that stops partway through: it has the transcription block, the
# quip, "## TL;DR" and "## Why you got this", but no "## What you need to do",
# no "## Deadlines & money", and no SEVERITY line.
TRUNCATED = """\
<transcription>
Short letter
</transcription>
Bureaucat says: The suspense is killing me.
## TL;DR
This letter got cut off before the severity was emitted.
## Why you got this
Unknown - truncated."""
# Complete response whose transcription and section bodies are Arabic text,
# with the English "## " headings and an English "SEVERITY: 2" line.
ARABIC_BODY = """\
<transcription>
مكتوب
</transcription>
Bureaucat says: Even in Arabic, deadlines are stressful.
## TL;DR
محتوى عربي
## Why you got this
سبب عربي
## What you need to do
- إجراء عربي
## Deadlines & money
- 2026-07-01 - موعد أخير
SEVERITY: 2"""
| # --------------------------------------------------------------------------- | |
| # Tests | |
| # --------------------------------------------------------------------------- | |
def test_well_formed_all_fields_populated():
    """A complete response parses with every text field set and severity 4.

    Also checks that the unmodified input is kept on ``raw``.
    """
    result = parse_output(WELL_FORMED)

    # Checked in the order: transcription, quip, then the four sections.
    text_fields = ("transcription", "quip", "tldr", "why", "actions", "deadlines")
    for field_name in text_fields:
        assert getattr(result, field_name), f"{field_name} should be non-empty"

    assert result.severity == 4, f"expected severity=4, got {result.severity}"
    assert result.raw == WELL_FORMED
def test_transcription_isolated_from_sections():
    """
    The <transcription> block is stripped from the displayed body (D-04):
    - result.transcription contains the verbatim OCR text
    - The XML <transcription> tags are NOT present in the quip or in ANY of
      the four sections (tldr, why, actions, deadlines)
    - Deadlines section can legitimately contain values that also appear in the
      transcription (verbatim extraction is the goal); what we test is that the
      raw transcription BLOCK itself is stripped from the displayed output
    """
    r = parse_output(WELL_FORMED)

    # Transcription field is populated with OCR text
    assert "1 234 kr" in r.transcription, "verbatim value must be in transcription field"
    assert "Faktura" in r.transcription, "OCR text must be in transcription field"

    # The opening and closing tags must not appear in any displayed field.
    # Every displayed field is checked, not just tldr/deadlines, so a leak
    # into quip, why, or actions is caught too.
    displayed = {
        "quip": r.quip,
        "tldr": r.tldr,
        "why": r.why,
        "actions": r.actions,
        "deadlines": r.deadlines,
    }
    for field_name, text in displayed.items():
        for tag in ("<transcription>", "</transcription>"):
            assert tag not in text, f"{tag} must not appear in {field_name}"

    # The raw transcription block text (e.g., OCR-only content that would not
    # naturally appear in a section) must not appear verbatim in sections
    assert "OCR: 555666777" not in r.tldr, (
        "raw transcription text must not bleed into tldr section"
    )
    assert "Belopp: 1 234 kr" not in r.tldr, (
        "raw transcription text must not bleed into tldr (only extracted values belong)"
    )
def test_truncated_output_severity_none_no_crash():
    """A response with no SEVERITY line parses without error.

    Severity comes back as None while the parts that are present
    (TL;DR and the quip) are still extracted.
    """
    parsed = parse_output(TRUNCATED)

    severity = parsed.severity
    assert severity is None, f"expected None for truncated output, got {severity}"

    assert parsed.tldr, "tldr should still parse even without SEVERITY line"
    assert parsed.quip, "quip should parse from truncated output"
def test_non_english_body_language_invariance():
    """
    Hindi prose under the fixed English headings still splits into four
    non-empty sections, and the severity is read as 3.
    (Pitfall 3: sections must anchor on English headings regardless of prose language.)
    """
    parsed = parse_output(NON_ENGLISH_BODY)

    # One check per section, in document order; the failure message shows the
    # offending value via its repr.
    for section in ("tldr", "why", "actions", "deadlines"):
        content = getattr(parsed, section)
        assert content, f"{section} empty — language invariance broken: {content!r}"

    assert parsed.severity == 3
def test_arabic_body_language_invariance():
    """Arabic prose under the fixed English headings yields four non-empty sections and severity 2."""
    parsed = parse_output(ARABIC_BODY)

    for section in ("tldr", "why", "actions", "deadlines"):
        assert getattr(parsed, section), f"{section} should be non-empty for Arabic body"

    assert parsed.severity == 2
def test_empty_raw_no_exception():
    """Parsing "" must not raise; it returns severity=None and "" for every string field."""
    parsed = parse_output("")

    assert parsed.severity is None

    # Every string-valued field, including the echoed raw input, is exactly "".
    string_fields = (
        "transcription",
        "quip",
        "tldr",
        "why",
        "actions",
        "deadlines",
        "raw",
    )
    for field_name in string_fields:
        assert getattr(parsed, field_name) == ""
def test_severity_regex_tolerates_trailing_whitespace():
    """SEVERITY parsing accepts 1-5 despite trailing whitespace and rejects other values.

    Each case is a minimal complete response that differs only in its final
    SEVERITY line.
    """
    # Everything before the SEVERITY line is identical for every case.
    preamble = (
        "<transcription>x</transcription>\n"
        "Bureaucat says: hi\n"
        "## TL;DR\na\n"
        "## Why you got this\nb\n"
        "## What you need to do\nc\n"
        "## Deadlines & money\nd\n"
    )

    # In-range values, each followed by a trailing space and a newline.
    for level in range(1, 6):
        parsed = parse_output(f"{preamble}SEVERITY: {level} \n")
        assert parsed.severity == level, f"Expected {level}, got {parsed.severity}"

    # Out-of-range values (0, 6, and the two-digit 10) must yield None.
    for rejected in ("0", "6", "10"):
        parsed = parse_output(f"{preamble}SEVERITY: {rejected}\n")
        assert parsed.severity is None, f"Severity {rejected} should not parse to int, got {parsed.severity}"
def test_build_user_prompt_beginner_mode_invariance():
    """
    D-08 unit guard: the beginner prompt is the standard prompt plus appended
    guidance, and nothing else.

    Checked here:
    - beginner is strictly longer than standard and starts with it verbatim
    - the appended tail only mentions "new section" if it also contains
      "do not add new sections"
    - neither prompt contains "SEVERITY" or "<transcription>"
    """
    standard = build_user_prompt("English", beginner_mode=False)
    beginner = build_user_prompt("English", beginner_mode=True)

    # Beginner mode may only append to the standard prompt.
    assert len(beginner) > len(standard), "beginner prompt must be longer than standard"
    assert beginner.startswith(standard), (
        "beginner prompt must start with the full standard prompt"
    )

    # Inspect only the appended guidance, case-insensitively.
    tail = beginner[len(standard):].lower()
    mentions_new_section = "new section" in tail
    forbids_new_sections = "do not add new sections" in tail
    assert (not mentions_new_section) or forbids_new_sections, (
        "beginner extra guidance must not instruct adding new sections"
    )

    # Structural markers must be absent from both user prompts
    # (those are in SYSTEM_PROMPT only).
    for marker in ("SEVERITY", "<transcription>"):
        for prompt in (standard, beginner):
            assert marker not in prompt
def test_build_user_prompt_language_interpolation():
    """The requested language name appears verbatim in the standard-mode prompt."""
    languages = ("English", "Hindi", "Arabic", "Spanish", "Swedish")
    for language in languages:
        prompt = build_user_prompt(language, beginner_mode=False)
        assert language in prompt, f"Language '{language}' must appear in prompt"
def test_structured_result_is_dataclass():
    """StructuredResult can be instantiated directly and exposes all required fields.

    Builds an instance from keyword arguments and checks that every one of the
    eight fields reads back the value it was given. Checking all of them means
    a missing or renamed field fails here rather than in a parser test.

    Note: despite the test name, this does not call
    ``dataclasses.is_dataclass``; it only exercises keyword construction and
    attribute access.
    """
    expected = {
        "transcription": "t",
        "quip": "q",
        "tldr": "tl",
        "why": "w",
        "actions": "a",
        "deadlines": "d",
        "severity": 3,
        "raw": "raw",
    }
    r = StructuredResult(**expected)

    for field_name, value in expected.items():
        actual = getattr(r, field_name)
        assert actual == value, f"{field_name}: expected {value!r}, got {actual!r}"
def test_model_is_none_under_no_model_flag():
    """With BUREAUCAT_NO_MODEL set before import, app.model and app.processor are both None."""
    for attr in ("model", "processor"):
        assert getattr(app, attr) is None, f"{attr} should be None when BUREAUCAT_NO_MODEL is set"
if __name__ == "__main__":
    # Running this file directly hands it to pytest in verbose mode.
    # pytest is imported here rather than at module top.
    import pytest

    cli_args = [__file__, "-v"]
    pytest.main(cli_args)