Claude committed on
Commit
84ced9b
·
unverified ·
1 Parent(s): 0b24cca

Sprint 13: comprehensive test fixtures and end-to-end hardening

Browse files

6 test fixtures covering all scenarios from §28.5:
- paddle_ocr_sample.json: simple page (existing)
- double_column.json: 4 items in 2 columns — tests reading order inference
- noisy_page.json: negative coords, low confidence, skewed polygons —
tests bbox clamping and repair
- title_body.json: title + body paragraphs
- hyphenation_sample.json: "patri-" + "moine" — tests hyphenation detection
- text_only_blocks.json: structured text without geometry — tests text_only
adapter, ALTO refusal, viewer degraded mode

End-to-end tests run full pipeline for each fixture:
raw → normalize → enrich → validate → ALTO + PAGE + viewer

Bug fix: word_box_json adapter now clamps negative coordinates from noisy
OCR data before creating Geometry objects (real-world PaddleOCR can produce
bbox vertices outside image bounds).

20 new tests. 497 total passing.

https://claude.ai/code/session_01Cuzvc9Pjfo5u46eT3ta2Cg

src/app/providers/adapters/word_box_json.py CHANGED
@@ -86,6 +86,10 @@ class WordBoxJsonAdapter(BaseAdapter):
86
  bbox = (bbox[0] * factor, bbox[1] * factor, bbox[2] * factor, bbox[3] * factor)
87
  polygon = [(x * factor, y * factor) for x, y in polygon]
88
 
 
 
 
 
89
  word_bboxes.append(bbox)
90
  word_data.append({
91
  "bbox": bbox,
 
86
  bbox = (bbox[0] * factor, bbox[1] * factor, bbox[2] * factor, bbox[3] * factor)
87
  polygon = [(x * factor, y * factor) for x, y in polygon]
88
 
89
+ # Clamp negative coordinates (real OCR data can produce these)
90
+ bbox = (max(0.0, bbox[0]), max(0.0, bbox[1]), bbox[2], bbox[3])
91
+ polygon = [(max(0.0, x), max(0.0, y)) for x, y in polygon]
92
+
93
  word_bboxes.append(bbox)
94
  word_data.append({
95
  "bbox": bbox,
tests/fixtures/double_column.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ [
3
+ [[50, 100], [480, 100], [480, 140], [50, 140]],
4
+ ["Première colonne ligne un", 0.93]
5
+ ],
6
+ [
7
+ [[50, 160], [480, 160], [480, 200], [50, 200]],
8
+ ["Première colonne ligne deux", 0.91]
9
+ ],
10
+ [
11
+ [[520, 100], [950, 100], [950, 140], [520, 140]],
12
+ ["Seconde colonne ligne un", 0.94]
13
+ ],
14
+ [
15
+ [[520, 160], [950, 160], [950, 200], [520, 200]],
16
+ ["Seconde colonne ligne deux", 0.89]
17
+ ]
18
+ ]
tests/fixtures/hyphenation_sample.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ [
3
+ [[100, 200], [300, 200], [300, 240], [100, 240]],
4
+ ["Le", 0.99]
5
+ ],
6
+ [
7
+ [[320, 200], [600, 200], [600, 240], [320, 240]],
8
+ ["patri-", 0.92]
9
+ ],
10
+ [
11
+ [[100, 260], [300, 260], [300, 300], [100, 300]],
12
+ ["moine", 0.91]
13
+ ],
14
+ [
15
+ [[320, 260], [700, 260], [700, 300], [320, 300]],
16
+ ["est important.", 0.94]
17
+ ]
18
+ ]
tests/fixtures/noisy_page.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ [
3
+ [[98, 195], [405, 203], [403, 248], [96, 240]],
4
+ ["Texte légèrement incliné", 0.45]
5
+ ],
6
+ [
7
+ [[-5, 300], [200, 300], [200, 340], [-5, 340]],
8
+ ["Déborde à gauche", 0.30]
9
+ ],
10
+ [
11
+ [[100, 400], [1200, 400], [1200, 440], [100, 440]],
12
+ ["Mot avec confiance très basse", 0.12]
13
+ ],
14
+ [
15
+ [[100, 500], [300, 500], [300, 540], [100, 540]],
16
+ ["OK", 0.88]
17
+ ]
18
+ ]
tests/fixtures/text_only_blocks.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "text": "ignored when blocks present",
3
+ "blocks": [
4
+ {"text": "Première section du document.\nAvec une seconde ligne."},
5
+ {"text": "Deuxième section.\nElle contient aussi deux lignes."},
6
+ {"text": "Conclusion en une ligne."}
7
+ ]
8
+ }
tests/fixtures/title_body.json ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ [
3
+ [[200, 50], [800, 50], [800, 110], [200, 110]],
4
+ ["Grand Titre du Document", 0.98]
5
+ ],
6
+ [
7
+ [[100, 200], [900, 200], [900, 240], [100, 240]],
8
+ ["Ceci est le corps du texte.", 0.95]
9
+ ],
10
+ [
11
+ [[100, 260], [900, 260], [900, 300], [100, 300]],
12
+ ["Deuxième ligne du paragraphe principal.", 0.93]
13
+ ],
14
+ [
15
+ [[100, 320], [900, 320], [900, 360], [100, 360]],
16
+ ["Troisième ligne avec plus de contenu.", 0.92]
17
+ ]
18
+ ]
tests/integration/test_fixtures_e2e.py ADDED
@@ -0,0 +1,263 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """End-to-end tests with all fixture types (§28.5).
2
+
3
+ Each fixture represents a different document scenario and is run through
4
+ the full pipeline: raw → normalize → enrich → validate → ALTO + PAGE + viewer.
5
+ """
6
+
7
+ from __future__ import annotations
8
+
9
+ import json
10
+ from pathlib import Path
11
+
12
+ import pytest
13
+ from lxml import etree
14
+
15
+ from src.app.domain.models import CanonicalDocument, RawProviderPayload
16
+ from src.app.domain.models.geometry import GeometryContext
17
+ from src.app.domain.models.status import GeometryStatus, ReadinessLevel
18
+ from src.app.enrichers import EnricherPipeline
19
+ from src.app.enrichers.bbox_repair_light import BboxRepairLightEnricher
20
+ from src.app.enrichers.hyphenation_basic import HyphenationBasicEnricher
21
+ from src.app.enrichers.lang_propagation import LangPropagationEnricher
22
+ from src.app.enrichers.reading_order_simple import ReadingOrderSimpleEnricher
23
+ from src.app.enrichers.text_consistency import TextConsistencyEnricher
24
+ from src.app.normalization.pipeline import normalize
25
+ from src.app.policies.document_policy import DocumentPolicy
26
+ from src.app.policies.export_policy import check_alto_export, check_page_export
27
+ from src.app.serializers.alto_xml import ALTO_NS, serialize_alto
28
+ from src.app.serializers.page_xml import PAGE_NS, serialize_page_xml
29
+ from src.app.validators.export_eligibility_validator import compute_export_eligibility
30
+ from src.app.validators.structural_validator import validate_structure
31
+ from src.app.viewer.projection_builder import build_projection
32
+
33
+
34
+ GEO_CTX = GeometryContext(source_width=1000, source_height=800)
35
+
36
+ ENRICHER_PIPELINE = EnricherPipeline([
37
+ BboxRepairLightEnricher(),
38
+ LangPropagationEnricher(),
39
+ ReadingOrderSimpleEnricher(),
40
+ HyphenationBasicEnricher(),
41
+ TextConsistencyEnricher(),
42
+ ])
43
+
44
+
45
def _run_paddle_pipeline(fixture_name: str, fixtures_dir: Path) -> tuple[
    CanonicalDocument, bytes, bytes, dict
]:
    """Run the full pipeline for a PaddleOCR-format fixture.

    Steps: load raw JSON → normalize via the ``word_box_json`` adapter →
    enrich → validate / compute export eligibility → serialize ALTO and
    PAGE-XML → build the viewer projection.

    Args:
        fixture_name: File name of the fixture inside ``fixtures_dir``.
        fixtures_dir: Directory containing the JSON fixtures.

    Returns:
        ``(document, alto_bytes, page_bytes, viewer_projection_dict)``.
    """
    # Explicit UTF-8: fixtures contain accented French text and must load
    # identically on platforms whose locale encoding is not UTF-8.
    payload = json.loads((fixtures_dir / fixture_name).read_text(encoding="utf-8"))

    raw = RawProviderPayload(
        provider_id="paddleocr", adapter_id="v1", runtime_type="local",
        payload=payload, image_width=1000, image_height=800,
    )
    doc = normalize(raw, "word_box_json", GEO_CTX, document_id=f"test_{fixture_name}")
    doc = ENRICHER_PIPELINE.run(doc, DocumentPolicy())

    # Structural validation is exercised for every fixture; tests that need
    # the report re-run it themselves, so the result is not returned here.
    validate_structure(doc, bbox_tolerance=5.0)
    eligibility = compute_export_eligibility(doc)

    alto_bytes = serialize_alto(doc)
    page_bytes = serialize_page_xml(doc)
    vp = build_projection(doc, export_status=eligibility)

    return doc, alto_bytes, page_bytes, vp.model_dump(mode="json")
67
+
68
+
69
+ # -- Simple page (paddle_ocr_sample.json) ------------------------------------
70
+
71
+
72
class TestSimplePage:
    """Happy path: a simple single-column page goes through the whole pipeline."""

    def test_full_pipeline(self, fixtures_dir: Path) -> None:
        doc, alto, page, viewer = _run_paddle_pipeline("paddle_ocr_sample.json", fixtures_dir)
        regions = doc.pages[0].text_regions
        assert len(regions) >= 1
        assert b"Bonjour" in alto
        assert b"Bonjour" in page
        assert len(viewer["word_overlays"]) == 5

    def test_alto_valid_structure(self, fixtures_dir: Path) -> None:
        _, alto, _, _ = _run_paddle_pipeline("paddle_ocr_sample.json", fixtures_dir)
        root = etree.fromstring(alto)
        # Every <String> must carry integer HPOS and non-empty CONTENT.
        for node in root.findall(f".//{{{ALTO_NS}}}String"):
            assert node.get("HPOS").isdigit()
            assert node.get("CONTENT")

    def test_page_has_reading_order(self, fixtures_dir: Path) -> None:
        _, _, page, _ = _run_paddle_pipeline("paddle_ocr_sample.json", fixtures_dir)
        root = etree.fromstring(page)
        assert root.findall(f".//{{{PAGE_NS}}}RegionRefIndexed")
92
+
93
+
94
+ # -- Double column -----------------------------------------------------------
95
+
96
+
97
class TestDoubleColumn:
    """Two-column layout: item count, reading-order inference, dual export."""

    def test_four_items(self, fixtures_dir: Path) -> None:
        doc, _, _, _ = _run_paddle_pipeline("double_column.json", fixtures_dir)
        all_words = [
            word
            for region in doc.pages[0].text_regions
            for line in region.lines
            for word in line.words
        ]
        assert len(all_words) == 4

    def test_reading_order_inferred(self, fixtures_dir: Path) -> None:
        doc, _, _, _ = _run_paddle_pipeline("double_column.json", fixtures_dir)
        # ReadingOrderSimpleEnricher is expected to populate this field.
        assert doc.pages[0].reading_order

    def test_dual_export(self, fixtures_dir: Path) -> None:
        _, alto, page, _ = _run_paddle_pipeline("double_column.json", fixtures_dir)
        alto_strings = etree.fromstring(alto).findall(f".//{{{ALTO_NS}}}String")
        page_words = etree.fromstring(page).findall(f".//{{{PAGE_NS}}}Word")
        assert len(alto_strings) == 4
        assert len(page_words) == 4
113
+
114
+
115
+ # -- Noisy page --------------------------------------------------------------
116
+
117
+
118
class TestNoisyPage:
    """Noisy fixture: negative coordinates, skewed polygons, very low confidence."""

    def test_handles_negative_coords(self, fixtures_dir: Path) -> None:
        doc, _, _, _ = _run_paddle_pipeline("noisy_page.json", fixtures_dir)
        # bbox_repair_light should have clipped every coordinate to >= 0.
        for region in doc.pages[0].text_regions:
            for line in region.lines:
                for word in line.words:
                    x0, y0, _, _ = word.geometry.bbox
                    assert x0 >= 0
                    assert y0 >= 0

    def test_low_confidence_preserved(self, fixtures_dir: Path) -> None:
        doc, _, _, _ = _run_paddle_pipeline("noisy_page.json", fixtures_dir)
        confidences = [
            word.confidence
            for region in doc.pages[0].text_regions
            for line in region.lines
            for word in line.words
            if word.confidence is not None
        ]
        # Low-confidence words must survive the pipeline, not be dropped.
        assert any(value < 0.5 for value in confidences)

    def test_structural_validation(self, fixtures_dir: Path) -> None:
        doc, _, _, _ = _run_paddle_pipeline("noisy_page.json", fixtures_dir)
        report = validate_structure(doc, bbox_tolerance=5.0)
        # After geometry repair, no hard structural errors should remain.
        assert report.error_count == 0

    def test_viewer_has_all_words(self, fixtures_dir: Path) -> None:
        _, _, _, viewer = _run_paddle_pipeline("noisy_page.json", fixtures_dir)
        assert len(viewer["word_overlays"]) == 4
144
+
145
+
146
+ # -- Title + body ------------------------------------------------------------
147
+
148
+
149
class TestTitleBody:
    """Title + body fixture: all four items must survive both exports."""

    def test_four_items(self, fixtures_dir: Path) -> None:
        doc, _, _, _ = _run_paddle_pipeline("title_body.json", fixtures_dir)
        word_count = sum(
            len(line.words)
            for region in doc.pages[0].text_regions
            for line in region.lines
        )
        assert word_count == 4

    def test_alto_all_strings(self, fixtures_dir: Path) -> None:
        _, alto, _, _ = _run_paddle_pipeline("title_body.json", fixtures_dir)
        contents = [
            node.get("CONTENT", "")
            for node in etree.fromstring(alto).findall(f".//{{{ALTO_NS}}}String")
        ]
        assert any("Titre" in text for text in contents)

    def test_page_all_words(self, fixtures_dir: Path) -> None:
        _, _, page, _ = _run_paddle_pipeline("title_body.json", fixtures_dir)
        root = etree.fromstring(page)
        assert len(root.findall(f".//{{{PAGE_NS}}}Word")) == 4
166
+
167
+
168
+ # -- Hyphenation -------------------------------------------------------------
169
+
170
+
171
class TestHyphenationFixture:
    """Hyphenated pair "patri-" / "moine": detection plus ALTO SUBS attributes."""

    def test_hyphenation_detected(self, fixtures_dir: Path) -> None:
        doc, _, _, _ = _run_paddle_pipeline("hyphenation_sample.json", fixtures_dir)
        all_words = [
            word
            for region in doc.pages[0].text_regions
            for line in region.lines
            for word in line.words
        ]
        hyphenated = [
            word
            for word in all_words
            if word.hyphenation is not None and word.hyphenation.is_hyphenated
        ]
        assert len(hyphenated) == 2
        first, second = hyphenated
        assert first.hyphenation.full_form == "patrimoine"
        assert first.hyphenation.part == 1
        assert second.hyphenation.part == 2

    def test_alto_hyphenation(self, fixtures_dir: Path) -> None:
        _, alto, _, _ = _run_paddle_pipeline("hyphenation_sample.json", fixtures_dir)
        root = etree.fromstring(alto)
        flagged = [
            node
            for node in root.findall(f".//{{{ALTO_NS}}}String")
            if node.get("SUBS_TYPE")
        ]
        assert len(flagged) == 2
        assert flagged[0].get("SUBS_TYPE") == "HypPart1"
        assert flagged[0].get("SUBS_CONTENT") == "patrimoine"
189
+
190
+
191
+ # -- Text only (no geometry) -------------------------------------------------
192
+
193
+
194
class TestTextOnlyFixture:
    """Structured text without geometry: the ``text_only`` adapter path.

    Covers document construction, unknown geometry status, ALTO refusal,
    the PAGE export decision, and the viewer's degraded mode.
    """

    @staticmethod
    def _normalized_doc(fixtures_dir: Path) -> CanonicalDocument:
        """Load the text-only fixture and normalize it (shared by all tests).

        This replaces five verbatim copies of the same load + payload +
        normalize boilerplate that previously lived in each test method.
        """
        # Explicit UTF-8: the fixture contains accented French text.
        payload = json.loads(
            (fixtures_dir / "text_only_blocks.json").read_text(encoding="utf-8")
        )
        raw = RawProviderPayload(
            provider_id="qwen", adapter_id="v1", runtime_type="api",
            payload=payload, image_width=1000, image_height=800,
        )
        return normalize(raw, "text_only", GEO_CTX, document_id="text_test")

    def test_produces_document(self, fixtures_dir: Path) -> None:
        doc = self._normalized_doc(fixtures_dir)
        # One region per entry in the fixture's "blocks" array.
        assert len(doc.pages[0].text_regions) == 3

    def test_geometry_is_unknown(self, fixtures_dir: Path) -> None:
        doc = self._normalized_doc(fixtures_dir)
        word = doc.pages[0].text_regions[0].lines[0].words[0]
        assert word.geometry.status == GeometryStatus.UNKNOWN

    def test_alto_refused(self, fixtures_dir: Path) -> None:
        doc = self._normalized_doc(fixtures_dir)
        # Without real geometry, ALTO export must be refused outright.
        decision = check_alto_export(compute_export_eligibility(doc))
        assert decision.allowed is False

    def test_page_export_possible(self, fixtures_dir: Path) -> None:
        doc = self._normalized_doc(fixtures_dir)
        # PAGE is more lenient and may still be exportable; either way the
        # decision must carry a clear human-readable reason.
        page_decision = check_page_export(compute_export_eligibility(doc))
        assert page_decision.reason

    def test_viewer_renders_degraded(self, fixtures_dir: Path) -> None:
        doc = self._normalized_doc(fixtures_dir)
        eligibility = compute_export_eligibility(doc)
        vp = build_projection(doc, export_status=eligibility)
        # Degraded mode: overlays must exist even with unknown geometry.
        assert len(vp.word_overlays) > 0