bureaucat / eval /test_parse_output.py
ravinsingh15's picture
Bureaucat — Build Small Hackathon submission (Qwen3-VL-8B, ZeroGPU, gr.Server)
6b5e47d
Raw
History Blame Contribute Delete
10.1 kB
"""
Unit tests for parse_output() and build_user_prompt() in app.py.
These tests NEVER load model weights. The BUREAUCAT_NO_MODEL env var is set
before app is imported so the module-scope load_model() call is skipped.
Tests run in seconds (pure Python, no GPU required).
Test coverage:
- Well-formed output → all fields populated, severity int
- Transcription isolation from displayed sections
- Truncated output (no SEVERITY line) → severity=None, no crash
- Non-English body (Hindi, Arabic) with fixed English headings → language-invariance (Pitfall 3)
- Empty raw string → empty fields, severity=None, no exception
- SEVERITY regex tolerates trailing whitespace/newline
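- build_user_prompt() beginner vs standard: structural invariance (D-08 unit guard)
- build_user_prompt() embeds the requested language name in the prompt
- StructuredResult can be constructed directly from keyword arguments
- app.model / app.processor are None when BUREAUCAT_NO_MODEL is set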
"""
import os
import sys
# Set escape hatch BEFORE importing app so model weights are never downloaded.
# This assignment must stay above the `import app` line below.
os.environ["BUREAUCAT_NO_MODEL"] = "1"
# Ensure the project root is on the path when running from eval/
# (two dirname() calls: the parent of the directory containing this file).
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# These imports intentionally come after the setup statements above: they rely
# on the environment variable and the sys.path entry already being in place.
import app
from app import parse_output, build_user_prompt, StructuredResult
# ---------------------------------------------------------------------------
# Fixtures
# ---------------------------------------------------------------------------
# Complete sample output: a <transcription> block, the "Bureaucat says:" quip
# line, the four "##" sections, and a final "SEVERITY: 4" line with no
# trailing newline.
WELL_FORMED = """\
<transcription>
Faktura 2026-06-15
Belopp: 1 234 kr
OCR: 555666777
</transcription>
Bureaucat says: This invoice wants your money, and it's not even polite about it.
## TL;DR
You have received an invoice for 1 234 kr due on 2026-06-15.
## Why you got this
You purchased a service and this is the bill.
## What you need to do
- Pay the invoice by 2026-06-15
- Use OCR number 555666777
## Deadlines & money
- 2026-06-15 - sista betalningsdag
- 1 234 kr - fakturabelopp
- 555666777 - OCR-nummer
SEVERITY: 4"""
# Sample output whose section bodies are written in Hindi while the quip
# prefix, the "##" headings, and the SEVERITY line stay in English.
NON_ENGLISH_BODY = """\
<transcription>
Skatteverket deklaration
</transcription>
Bureaucat says: The taxman cometh (and speaketh Swedish).
## TL;DR
आपको अपना कर विवरण जमा करना होगा।
## Why you got this
आप स्वीडिश करदाता हैं।
## What you need to do
- कर फार्म भरें
## Deadlines & money
- 15 जून 2026 - अंतिम तारीख
SEVERITY: 3"""
# Sample output that stops after the "Why you got this" section: it has no
# "What you need to do" section, no "Deadlines & money" section, and no
# SEVERITY line.
TRUNCATED = """\
<transcription>
Short letter
</transcription>
Bureaucat says: The suspense is killing me.
## TL;DR
This letter got cut off before the severity was emitted.
## Why you got this
Unknown - truncated."""
# Sample output whose transcription and section bodies are written in Arabic
# while the quip prefix, the "##" headings, and the SEVERITY line stay in
# English.
ARABIC_BODY = """\
<transcription>
مكتوب
</transcription>
Bureaucat says: Even in Arabic, deadlines are stressful.
## TL;DR
محتوى عربي
## Why you got this
سبب عربي
## What you need to do
- إجراء عربي
## Deadlines & money
- 2026-07-01 - موعد أخير
SEVERITY: 2"""
# ---------------------------------------------------------------------------
# Tests
# ---------------------------------------------------------------------------
def test_well_formed_all_fields_populated():
    """A complete, well-formed response yields a value for every parsed field."""
    result = parse_output(WELL_FORMED)

    # Each textual field must come back truthy (i.e. non-empty).
    text_fields = ("transcription", "quip", "tldr", "why", "actions", "deadlines")
    for field in text_fields:
        assert getattr(result, field), f"{field} should be non-empty"

    # The fixture ends with "SEVERITY: 4".
    parsed_severity = result.severity
    assert parsed_severity == 4, f"expected severity=4, got {parsed_severity}"

    # The unmodified input is kept alongside the parsed fields.
    assert result.raw == WELL_FORMED
def test_transcription_isolated_from_sections():
    """
    The <transcription> block is stripped from the displayed body (D-04).

    Verifies that:
    - result.transcription contains the verbatim OCR text;
    - neither the opening nor the closing <transcription> tag is present in
      the quip or in any of the four section fields;
    - lines that exist only inside the transcription block do not bleed into
      the TL;DR.

    The Deadlines section can legitimately contain values that also appear in
    the transcription (verbatim extraction is the goal), so value overlap is
    not treated as a failure; what is tested is that the raw transcription
    BLOCK itself is stripped from the displayed output.
    """
    r = parse_output(WELL_FORMED)

    # Transcription field is populated with OCR text.
    assert "1 234 kr" in r.transcription, "verbatim value must be in transcription field"
    assert "Faktura" in r.transcription, "OCR text must be in transcription field"

    # The <transcription> XML tags must not appear in ANY displayed field,
    # not just tldr/deadlines: check the quip and all four sections.
    displayed = {
        "quip": r.quip,
        "tldr": r.tldr,
        "why": r.why,
        "actions": r.actions,
        "deadlines": r.deadlines,
    }
    for field_name, text in displayed.items():
        for tag in ("<transcription>", "</transcription>"):
            assert tag not in text, f"{tag} must not appear in {field_name}"

    # The raw transcription block text (e.g., OCR-only content that would not
    # naturally appear in a section) must not appear verbatim in the TL;DR.
    assert "OCR: 555666777" not in r.tldr, (
        "raw transcription text must not bleed into tldr section"
    )
    assert "Belopp: 1 234 kr" not in r.tldr, (
        "raw transcription text must not bleed into tldr (only extracted values belong)"
    )
def test_truncated_output_severity_none_no_crash():
    """Output cut off before the SEVERITY line still parses; severity is None."""
    parsed = parse_output(TRUNCATED)

    # No SEVERITY line in the fixture, so nothing should be extracted for it.
    severity = parsed.severity
    assert severity is None, f"expected None for truncated output, got {severity}"

    # The parts that were emitted before the cut-off must still be captured.
    assert parsed.tldr, "tldr should still parse even without SEVERITY line"
    assert parsed.quip, "quip should parse from truncated output"
def test_non_english_body_language_invariance():
    """
    Hindi prose under the fixed English headings still splits into four sections.

    (Pitfall 3: section splitting must anchor on the English headings,
    whatever language the prose is written in.)
    """
    parsed = parse_output(NON_ENGLISH_BODY)

    # All four sections must be non-empty; show the offending value on failure.
    for section in ("tldr", "why", "actions", "deadlines"):
        content = getattr(parsed, section)
        assert content, f"{section} empty — language invariance broken: {content!r}"

    assert parsed.severity == 3
def test_arabic_body_language_invariance():
    """Arabic prose under the fixed English headings still splits into four sections."""
    parsed = parse_output(ARABIC_BODY)

    for section in ("tldr", "why", "actions", "deadlines"):
        assert getattr(parsed, section), f"{section} should be non-empty for Arabic body"

    # The fixture ends with "SEVERITY: 2".
    assert parsed.severity == 2
def test_empty_raw_no_exception():
    """An empty input string parses without raising: empty fields, severity None."""
    parsed = parse_output("")

    assert parsed.severity is None

    # Every string field, including the stored raw input, must be exactly "".
    string_fields = (
        "transcription",
        "quip",
        "tldr",
        "why",
        "actions",
        "deadlines",
        "raw",
    )
    for field in string_fields:
        assert getattr(parsed, field) == ""
def test_severity_regex_tolerates_trailing_whitespace():
    """SEVERITY is parsed despite trailing whitespace/newline, and only for 1-5."""
    # Everything that precedes the SEVERITY line; shared by both loops.
    preamble = (
        "<transcription>x</transcription>\n"
        "Bureaucat says: hi\n"
        "## TL;DR\na\n"
        "## Why you got this\nb\n"
        "## What you need to do\nc\n"
        "## Deadlines & money\nd\n"
    )

    # Valid values, each followed by a trailing space and a newline.
    for expected in range(1, 6):
        parsed = parse_output(f"{preamble}SEVERITY: {expected} \n")
        assert parsed.severity == expected, f"Expected {expected}, got {parsed.severity}"

    # Values 0, 6 and 10 are outside 1-5 and must not be accepted.
    for rejected in ("0", "6", "10"):
        parsed = parse_output(f"{preamble}SEVERITY: {rejected}\n")
        assert parsed.severity is None, f"Severity {rejected} should not parse to int, got {parsed.severity}"
def test_build_user_prompt_beginner_mode_invariance():
    """
    D-08 unit guard: the beginner prompt is the standard prompt plus appended
    inline-explanation guidance.

    The appended text must not add or remove sections, and neither prompt may
    mention the SEVERITY line or the <transcription> block.
    """
    standard = build_user_prompt("English", beginner_mode=False)
    beginner = build_user_prompt("English", beginner_mode=True)

    # Beginner mode only ever adds text, so it has to be the longer prompt ...
    assert len(beginner) > len(standard), "beginner prompt must be longer than standard"
    # ... and the standard prompt has to be its exact prefix (append-only).
    assert beginner.startswith(standard), (
        "beginner prompt must start with the full standard prompt"
    )

    # Inspect just the appended guidance, case-insensitively.
    appended = beginner[len(standard):].lower()
    mentions_new_section = "new section" in appended
    forbids_new_sections = "do not add new sections" in appended
    # "new section" may only show up as part of an explicit prohibition.
    assert not mentions_new_section or forbids_new_sections, (
        "beginner extra guidance must not instruct adding new sections"
    )

    # Structural markers belong to SYSTEM_PROMPT only, never to the user prompt.
    for marker in ("SEVERITY", "<transcription>"):
        for prompt in (standard, beginner):
            assert marker not in prompt
def test_build_user_prompt_language_interpolation():
    """The requested language name appears in the prompt built for it."""
    languages = ("English", "Hindi", "Arabic", "Spanish", "Swedish")
    for language in languages:
        prompt = build_user_prompt(language, beginner_mode=False)
        assert language in prompt, f"Language '{language}' must appear in prompt"
def test_structured_result_is_dataclass():
    """
    StructuredResult can be instantiated directly and has all required fields.

    Every keyword passed to the constructor is read back as an attribute, so a
    renamed or dropped field fails here rather than somewhere downstream.
    """
    # NOTE(review): despite the test name, only keyword construction and
    # attribute access are verified; the class being a dataclass is not.
    expected = {
        "transcription": "t",
        "quip": "q",
        "tldr": "tl",
        "why": "w",
        "actions": "a",
        "deadlines": "d",
        "severity": 3,
        "raw": "raw",
    }
    r = StructuredResult(**expected)

    for field_name, value in expected.items():
        actual = getattr(r, field_name)
        assert actual == value, (
            f"field {field_name!r} did not round-trip: expected {value!r}, got {actual!r}"
        )
def test_model_is_none_under_no_model_flag():
    """With BUREAUCAT_NO_MODEL=1 set before import, app holds no model or processor."""
    for attribute in ("model", "processor"):
        loaded = getattr(app, attribute)
        assert loaded is None, f"{attribute} should be None when BUREAUCAT_NO_MODEL is set"
if __name__ == "__main__":
    # Allow running this file directly (python test_parse_output.py).
    import pytest

    # pytest.main() returns the run's exit code instead of exiting; propagate
    # it so a failing run yields a non-zero process status for shells and CI.
    raise SystemExit(pytest.main([__file__, "-v"]))