Spaces:
Running
Sprint 13: comprehensive test fixtures and end-to-end hardening
Browse files
6 test fixtures covering all scenarios from §28.5:
- paddle_ocr_sample.json: simple page (existing)
- double_column.json: 4 items in 2 columns — tests reading order inference
- noisy_page.json: negative coords, low confidence, skewed polygons —
tests bbox clamping and repair
- title_body.json: title + body paragraphs
- hyphenation_sample.json: "patri-" + "moine" — tests hyphenation detection
- text_only_blocks.json: structured text without geometry — tests text_only
adapter, ALTO refusal, viewer degraded mode
End-to-end tests run full pipeline for each fixture:
raw → normalize → enrich → validate → ALTO + PAGE + viewer
Bug fix: word_box_json adapter now clamps negative coordinates from noisy
OCR data before creating Geometry objects (real-world PaddleOCR can produce
bbox vertices outside image bounds).
20 new tests. 497 total passing.
https://claude.ai/code/session_01Cuzvc9Pjfo5u46eT3ta2Cg
- src/app/providers/adapters/word_box_json.py +4 -0
- tests/fixtures/double_column.json +18 -0
- tests/fixtures/hyphenation_sample.json +18 -0
- tests/fixtures/noisy_page.json +18 -0
- tests/fixtures/text_only_blocks.json +8 -0
- tests/fixtures/title_body.json +18 -0
- tests/integration/test_fixtures_e2e.py +263 -0
|
@@ -86,6 +86,10 @@ class WordBoxJsonAdapter(BaseAdapter):
|
|
| 86 |
bbox = (bbox[0] * factor, bbox[1] * factor, bbox[2] * factor, bbox[3] * factor)
|
| 87 |
polygon = [(x * factor, y * factor) for x, y in polygon]
|
| 88 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
word_bboxes.append(bbox)
|
| 90 |
word_data.append({
|
| 91 |
"bbox": bbox,
|
|
|
|
| 86 |
bbox = (bbox[0] * factor, bbox[1] * factor, bbox[2] * factor, bbox[3] * factor)
|
| 87 |
polygon = [(x * factor, y * factor) for x, y in polygon]
|
| 88 |
|
| 89 |
+
# Clamp negative coordinates (real OCR data can produce these)
|
| 90 |
+
bbox = (max(0.0, bbox[0]), max(0.0, bbox[1]), bbox[2], bbox[3])
|
| 91 |
+
polygon = [(max(0.0, x), max(0.0, y)) for x, y in polygon]
|
| 92 |
+
|
| 93 |
word_bboxes.append(bbox)
|
| 94 |
word_data.append({
|
| 95 |
"bbox": bbox,
|
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
[
|
| 3 |
+
[[50, 100], [480, 100], [480, 140], [50, 140]],
|
| 4 |
+
["Première colonne ligne un", 0.93]
|
| 5 |
+
],
|
| 6 |
+
[
|
| 7 |
+
[[50, 160], [480, 160], [480, 200], [50, 200]],
|
| 8 |
+
["Première colonne ligne deux", 0.91]
|
| 9 |
+
],
|
| 10 |
+
[
|
| 11 |
+
[[520, 100], [950, 100], [950, 140], [520, 140]],
|
| 12 |
+
["Seconde colonne ligne un", 0.94]
|
| 13 |
+
],
|
| 14 |
+
[
|
| 15 |
+
[[520, 160], [950, 160], [950, 200], [520, 200]],
|
| 16 |
+
["Seconde colonne ligne deux", 0.89]
|
| 17 |
+
]
|
| 18 |
+
]
|
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
[
|
| 3 |
+
[[100, 200], [300, 200], [300, 240], [100, 240]],
|
| 4 |
+
["Le", 0.99]
|
| 5 |
+
],
|
| 6 |
+
[
|
| 7 |
+
[[320, 200], [600, 200], [600, 240], [320, 240]],
|
| 8 |
+
["patri-", 0.92]
|
| 9 |
+
],
|
| 10 |
+
[
|
| 11 |
+
[[100, 260], [300, 260], [300, 300], [100, 300]],
|
| 12 |
+
["moine", 0.91]
|
| 13 |
+
],
|
| 14 |
+
[
|
| 15 |
+
[[320, 260], [700, 260], [700, 300], [320, 300]],
|
| 16 |
+
["est important.", 0.94]
|
| 17 |
+
]
|
| 18 |
+
]
|
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
[
|
| 3 |
+
[[98, 195], [405, 203], [403, 248], [96, 240]],
|
| 4 |
+
["Texte légèrement incliné", 0.45]
|
| 5 |
+
],
|
| 6 |
+
[
|
| 7 |
+
[[-5, 300], [200, 300], [200, 340], [-5, 340]],
|
| 8 |
+
["Déborde à gauche", 0.30]
|
| 9 |
+
],
|
| 10 |
+
[
|
| 11 |
+
[[100, 400], [1200, 400], [1200, 440], [100, 440]],
|
| 12 |
+
["Mot avec confiance très basse", 0.12]
|
| 13 |
+
],
|
| 14 |
+
[
|
| 15 |
+
[[100, 500], [300, 500], [300, 540], [100, 540]],
|
| 16 |
+
["OK", 0.88]
|
| 17 |
+
]
|
| 18 |
+
]
|
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"text": "ignored when blocks present",
|
| 3 |
+
"blocks": [
|
| 4 |
+
{"text": "Première section du document.\nAvec une seconde ligne."},
|
| 5 |
+
{"text": "Deuxième section.\nElle contient aussi deux lignes."},
|
| 6 |
+
{"text": "Conclusion en une ligne."}
|
| 7 |
+
]
|
| 8 |
+
}
|
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
[
|
| 3 |
+
[[200, 50], [800, 50], [800, 110], [200, 110]],
|
| 4 |
+
["Grand Titre du Document", 0.98]
|
| 5 |
+
],
|
| 6 |
+
[
|
| 7 |
+
[[100, 200], [900, 200], [900, 240], [100, 240]],
|
| 8 |
+
["Ceci est le corps du texte.", 0.95]
|
| 9 |
+
],
|
| 10 |
+
[
|
| 11 |
+
[[100, 260], [900, 260], [900, 300], [100, 300]],
|
| 12 |
+
["Deuxième ligne du paragraphe principal.", 0.93]
|
| 13 |
+
],
|
| 14 |
+
[
|
| 15 |
+
[[100, 320], [900, 320], [900, 360], [100, 360]],
|
| 16 |
+
["Troisième ligne avec plus de contenu.", 0.92]
|
| 17 |
+
]
|
| 18 |
+
]
|
|
@@ -0,0 +1,263 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""End-to-end tests with all fixture types (§28.5).
|
| 2 |
+
|
| 3 |
+
Each fixture represents a different document scenario and is run through
|
| 4 |
+
the full pipeline: raw → normalize → enrich → validate → ALTO + PAGE + viewer.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
import json
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
import pytest
|
| 13 |
+
from lxml import etree
|
| 14 |
+
|
| 15 |
+
from src.app.domain.models import CanonicalDocument, RawProviderPayload
|
| 16 |
+
from src.app.domain.models.geometry import GeometryContext
|
| 17 |
+
from src.app.domain.models.status import GeometryStatus, ReadinessLevel
|
| 18 |
+
from src.app.enrichers import EnricherPipeline
|
| 19 |
+
from src.app.enrichers.bbox_repair_light import BboxRepairLightEnricher
|
| 20 |
+
from src.app.enrichers.hyphenation_basic import HyphenationBasicEnricher
|
| 21 |
+
from src.app.enrichers.lang_propagation import LangPropagationEnricher
|
| 22 |
+
from src.app.enrichers.reading_order_simple import ReadingOrderSimpleEnricher
|
| 23 |
+
from src.app.enrichers.text_consistency import TextConsistencyEnricher
|
| 24 |
+
from src.app.normalization.pipeline import normalize
|
| 25 |
+
from src.app.policies.document_policy import DocumentPolicy
|
| 26 |
+
from src.app.policies.export_policy import check_alto_export, check_page_export
|
| 27 |
+
from src.app.serializers.alto_xml import ALTO_NS, serialize_alto
|
| 28 |
+
from src.app.serializers.page_xml import PAGE_NS, serialize_page_xml
|
| 29 |
+
from src.app.validators.export_eligibility_validator import compute_export_eligibility
|
| 30 |
+
from src.app.validators.structural_validator import validate_structure
|
| 31 |
+
from src.app.viewer.projection_builder import build_projection
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
# Geometry context shared by every test in this module; the raw payloads built
# below declare the same 1000x800 image size.
GEO_CTX = GeometryContext(source_width=1000, source_height=800)

# One instance of each enricher, handed to the pipeline in this order.
# NOTE(review): presumably the pipeline applies them in list order — confirm
# against EnricherPipeline.
ENRICHER_PIPELINE = EnricherPipeline(
    [
        enricher_cls()
        for enricher_cls in (
            BboxRepairLightEnricher,
            LangPropagationEnricher,
            ReadingOrderSimpleEnricher,
            HyphenationBasicEnricher,
            TextConsistencyEnricher,
        )
    ]
)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _run_paddle_pipeline(fixture_name: str, fixtures_dir: Path) -> tuple[
    CanonicalDocument, bytes, bytes, dict
]:
    """Run a PaddleOCR-format fixture through the full pipeline.

    Steps: load the JSON fixture, wrap it in a ``RawProviderPayload``,
    normalize it with the ``word_box_json`` adapter, run the enricher
    pipeline, validate, then serialize to ALTO and PAGE XML and build the
    viewer projection.

    Args:
        fixture_name: File name of the fixture inside ``fixtures_dir``.
        fixtures_dir: Directory holding the JSON fixtures.

    Returns:
        A 4-tuple ``(doc, alto_bytes, page_bytes, viewer)`` where ``viewer``
        is the projection dumped with ``model_dump(mode="json")``.
    """
    # The fixtures contain accented French text. Decode explicitly as UTF-8
    # so the result does not depend on the platform's locale encoding.
    with (fixtures_dir / fixture_name).open(encoding="utf-8") as f:
        payload = json.load(f)

    raw = RawProviderPayload(
        provider_id="paddleocr", adapter_id="v1", runtime_type="local",
        payload=payload, image_width=1000, image_height=800,
    )
    doc = normalize(raw, "word_box_json", GEO_CTX, document_id=f"test_{fixture_name}")
    doc = ENRICHER_PIPELINE.run(doc, DocumentPolicy())

    # The report is deliberately discarded: this call only checks that
    # structural validation runs on the enriched document without raising.
    validate_structure(doc, bbox_tolerance=5.0)
    eligibility = compute_export_eligibility(doc)

    alto_bytes = serialize_alto(doc)
    page_bytes = serialize_page_xml(doc)
    vp = build_projection(doc, export_status=eligibility)

    return doc, alto_bytes, page_bytes, vp.model_dump(mode="json")
|
| 67 |
+
|
| 68 |
+
|
| 69 |
+
# -- Simple page (paddle_ocr_sample.json) ------------------------------------
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
class TestSimplePage:
    """End-to-end checks for the simple-page fixture (paddle_ocr_sample.json)."""

    def test_full_pipeline(self, fixtures_dir: Path) -> None:
        """The pipeline yields regions, both exports, and five word overlays."""
        doc, alto, page, viewer = _run_paddle_pipeline("paddle_ocr_sample.json", fixtures_dir)
        assert len(doc.pages[0].text_regions) >= 1
        assert b"Bonjour" in alto
        assert b"Bonjour" in page
        assert len(viewer["word_overlays"]) == 5

    def test_alto_valid_structure(self, fixtures_dir: Path) -> None:
        """Every ALTO String has a digit-only HPOS and a non-empty CONTENT."""
        _, alto, _, _ = _run_paddle_pipeline("paddle_ocr_sample.json", fixtures_dir)
        root = etree.fromstring(alto)
        strings = root.findall(f".//{{{ALTO_NS}}}String")
        # Guard against a vacuous pass: all() over an empty list is True.
        assert strings
        # A missing HPOS must fail the assertion, not raise AttributeError.
        assert all((s.get("HPOS") or "").isdigit() for s in strings)
        assert all(s.get("CONTENT") for s in strings)

    def test_page_has_reading_order(self, fixtures_dir: Path) -> None:
        """The PAGE XML export contains at least one RegionRefIndexed element."""
        _, _, page, _ = _run_paddle_pipeline("paddle_ocr_sample.json", fixtures_dir)
        root = etree.fromstring(page)
        refs = root.findall(f".//{{{PAGE_NS}}}RegionRefIndexed")
        assert len(refs) >= 1
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
# -- Double column -----------------------------------------------------------
|
| 95 |
+
|
| 96 |
+
|
| 97 |
+
class TestDoubleColumn:
    """End-to-end checks for the two-column fixture (double_column.json)."""

    def test_four_items(self, fixtures_dir: Path) -> None:
        """The four fixture entries become four words in the document."""
        doc = _run_paddle_pipeline("double_column.json", fixtures_dir)[0]
        all_words = []
        for region in doc.pages[0].text_regions:
            for line in region.lines:
                all_words.extend(line.words)
        assert len(all_words) == 4

    def test_reading_order_inferred(self, fixtures_dir: Path) -> None:
        """The first page carries a non-empty reading order after enrichment."""
        doc = _run_paddle_pipeline("double_column.json", fixtures_dir)[0]
        first_page = doc.pages[0]
        assert first_page.reading_order  # enricher should have set it

    def test_dual_export(self, fixtures_dir: Path) -> None:
        """ALTO and PAGE exports each contain four word-level elements."""
        _, alto, page, _ = _run_paddle_pipeline("double_column.json", fixtures_dir)
        alto_strings = etree.fromstring(alto).findall(f".//{{{ALTO_NS}}}String")
        page_words = etree.fromstring(page).findall(f".//{{{PAGE_NS}}}Word")
        assert len(alto_strings) == 4
        assert len(page_words) == 4
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# -- Noisy page --------------------------------------------------------------
|
| 116 |
+
|
| 117 |
+
|
| 118 |
+
class TestNoisyPage:
    """End-to-end checks for the noisy fixture (noisy_page.json)."""

    def test_handles_negative_coords(self, fixtures_dir: Path) -> None:
        """No word bbox starts at a negative x or y after the pipeline."""
        doc = _run_paddle_pipeline("noisy_page.json", fixtures_dir)[0]
        page_words = (
            word
            for region in doc.pages[0].text_regions
            for line in region.lines
            for word in line.words
        )
        for word in page_words:
            left, top = word.geometry.bbox[0], word.geometry.bbox[1]
            assert left >= 0
            assert top >= 0

    def test_low_confidence_preserved(self, fixtures_dir: Path) -> None:
        """At least one word keeps a confidence below 0.5."""
        doc = _run_paddle_pipeline("noisy_page.json", fixtures_dir)[0]
        known_confidences = [
            word.confidence
            for region in doc.pages[0].text_regions
            for line in region.lines
            for word in line.words
            if word.confidence is not None
        ]
        assert any(value < 0.5 for value in known_confidences)

    def test_structural_validation(self, fixtures_dir: Path) -> None:
        """Structural validation reports zero errors on the repaired document."""
        doc = _run_paddle_pipeline("noisy_page.json", fixtures_dir)[0]
        # After repair, structural issues should be minimized
        report = validate_structure(doc, bbox_tolerance=5.0)
        assert report.error_count == 0

    def test_viewer_has_all_words(self, fixtures_dir: Path) -> None:
        """The viewer projection exposes one overlay per fixture entry."""
        viewer = _run_paddle_pipeline("noisy_page.json", fixtures_dir)[3]
        overlays = viewer["word_overlays"]
        assert len(overlays) == 4
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
# -- Title + body ------------------------------------------------------------
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class TestTitleBody:
    """End-to-end checks for the title + body fixture (title_body.json)."""

    def test_four_items(self, fixtures_dir: Path) -> None:
        """The four fixture entries become four words in the document."""
        doc = _run_paddle_pipeline("title_body.json", fixtures_dir)[0]
        word_count = sum(
            len(line.words)
            for region in doc.pages[0].text_regions
            for line in region.lines
        )
        assert word_count == 4

    def test_alto_all_strings(self, fixtures_dir: Path) -> None:
        """At least one ALTO String has CONTENT containing "Titre"."""
        alto = _run_paddle_pipeline("title_body.json", fixtures_dir)[1]
        contents = [
            string_el.get("CONTENT", "")
            for string_el in etree.fromstring(alto).findall(f".//{{{ALTO_NS}}}String")
        ]
        assert any("Titre" in content for content in contents)

    def test_page_all_words(self, fixtures_dir: Path) -> None:
        """The PAGE XML export contains four Word elements."""
        page = _run_paddle_pipeline("title_body.json", fixtures_dir)[2]
        word_elements = etree.fromstring(page).findall(f".//{{{PAGE_NS}}}Word")
        assert len(word_elements) == 4
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# -- Hyphenation -------------------------------------------------------------
|
| 169 |
+
|
| 170 |
+
|
| 171 |
+
class TestHyphenationFixture:
    """End-to-end checks for the hyphenation fixture (hyphenation_sample.json)."""

    def test_hyphenation_detected(self, fixtures_dir: Path) -> None:
        """Two words are flagged as hyphenated parts 1 and 2 of "patrimoine"."""
        doc = _run_paddle_pipeline("hyphenation_sample.json", fixtures_dir)[0]
        flagged = []
        for region in doc.pages[0].text_regions:
            for line in region.lines:
                for word in line.words:
                    info = word.hyphenation
                    if info is not None and info.is_hyphenated:
                        flagged.append(word)
        assert len(flagged) == 2
        first_part, second_part = flagged[0], flagged[1]
        assert first_part.hyphenation.full_form == "patrimoine"
        assert first_part.hyphenation.part == 1
        assert second_part.hyphenation.part == 2

    def test_alto_hyphenation(self, fixtures_dir: Path) -> None:
        """ALTO marks two Strings with SUBS_TYPE; the first is HypPart1."""
        alto = _run_paddle_pipeline("hyphenation_sample.json", fixtures_dir)[1]
        tagged = [
            string_el
            for string_el in etree.fromstring(alto).findall(f".//{{{ALTO_NS}}}String")
            if string_el.get("SUBS_TYPE")
        ]
        assert len(tagged) == 2
        leading = tagged[0]
        assert leading.get("SUBS_TYPE") == "HypPart1"
        assert leading.get("SUBS_CONTENT") == "patrimoine"
|
| 189 |
+
|
| 190 |
+
|
| 191 |
+
# -- Text only (no geometry) -------------------------------------------------
|
| 192 |
+
|
| 193 |
+
|
| 194 |
+
class TestTextOnlyFixture:
    """Checks for the geometry-less fixture (text_only_blocks.json).

    Every test starts from the same document: the fixture normalized with
    the ``text_only`` adapter, without running the enricher pipeline.
    """

    @staticmethod
    def _load_text_only_doc(fixtures_dir: Path) -> CanonicalDocument:
        """Load text_only_blocks.json and normalize it with the text_only adapter."""
        # The fixture contains accented French text. Decode explicitly as
        # UTF-8 so the result does not depend on the platform locale.
        with (fixtures_dir / "text_only_blocks.json").open(encoding="utf-8") as f:
            payload = json.load(f)

        raw = RawProviderPayload(
            provider_id="qwen", adapter_id="v1", runtime_type="api",
            payload=payload, image_width=1000, image_height=800,
        )
        return normalize(raw, "text_only", GEO_CTX, document_id="text_test")

    def test_produces_document(self, fixtures_dir: Path) -> None:
        """The three fixture blocks become three text regions."""
        doc = self._load_text_only_doc(fixtures_dir)

        assert len(doc.pages[0].text_regions) == 3

    def test_geometry_is_unknown(self, fixtures_dir: Path) -> None:
        """The first word's geometry status is UNKNOWN."""
        doc = self._load_text_only_doc(fixtures_dir)

        word = doc.pages[0].text_regions[0].lines[0].words[0]
        assert word.geometry.status == GeometryStatus.UNKNOWN

    def test_alto_refused(self, fixtures_dir: Path) -> None:
        """ALTO export is not allowed for this document."""
        doc = self._load_text_only_doc(fixtures_dir)

        eligibility = compute_export_eligibility(doc)
        decision = check_alto_export(eligibility)
        assert decision.allowed is False

    def test_page_export_possible(self, fixtures_dir: Path) -> None:
        """The PAGE export decision always carries a reason."""
        doc = self._load_text_only_doc(fixtures_dir)

        eligibility = compute_export_eligibility(doc)
        # PAGE is more lenient — may still be exportable
        page_decision = check_page_export(eligibility)
        # Even if refused, it should give a clear reason
        assert page_decision.reason

    def test_viewer_renders_degraded(self, fixtures_dir: Path) -> None:
        """The viewer projection still has word overlays."""
        doc = self._load_text_only_doc(fixtures_dir)
        eligibility = compute_export_eligibility(doc)

        vp = build_projection(doc, export_status=eligibility)
        # Should still have overlays even with unknown geometry
        assert len(vp.word_overlays) > 0
|