maribakulj commited on
Commit
024c169
·
unverified ·
2 Parent(s): 38ae153bbbfba8

Merge pull request #5 from maribakulj/claude/repo-audit-deployment-71Dac

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. .github/workflows/ci.yml +29 -0
  2. Dockerfile +7 -0
  3. README.md +12 -0
  4. pyproject.toml +1 -1
  5. src/app/api/__init__.py +5 -1
  6. src/app/api/routes_exports.py +4 -1
  7. src/app/api/routes_jobs.py +6 -3
  8. src/app/domain/errors/__init__.py +6 -3
  9. src/app/domain/models/__init__.py +1 -1
  10. src/app/domain/models/canonical_document.py +3 -4
  11. src/app/domain/models/geometry.py +0 -1
  12. src/app/domain/models/raw_payload.py +2 -2
  13. src/app/domain/models/status.py +11 -12
  14. src/app/enrichers/__init__.py +4 -1
  15. src/app/enrichers/bbox_repair_light.py +6 -2
  16. src/app/enrichers/hyphenation_basic.py +6 -2
  17. src/app/enrichers/lang_propagation.py +6 -2
  18. src/app/enrichers/polygon_to_bbox.py +7 -3
  19. src/app/enrichers/reading_order_simple.py +6 -2
  20. src/app/enrichers/text_consistency.py +6 -2
  21. src/app/geometry/baseline.py +0 -1
  22. src/app/geometry/bbox.py +0 -1
  23. src/app/geometry/normalization.py +5 -1
  24. src/app/geometry/polygon.py +3 -1
  25. src/app/geometry/quantization.py +5 -3
  26. src/app/geometry/transforms.py +4 -3
  27. src/app/jobs/events.py +11 -8
  28. src/app/jobs/models.py +4 -4
  29. src/app/jobs/service.py +17 -14
  30. src/app/main.py +5 -2
  31. src/app/normalization/pipeline.py +6 -2
  32. src/app/persistence/db.py +13 -6
  33. src/app/persistence/file_store.py +19 -9
  34. src/app/policies/document_policy.py +2 -2
  35. src/app/policies/export_policy.py +8 -2
  36. src/app/providers/adapters/base.py +4 -2
  37. src/app/providers/adapters/line_box_json.py +5 -1
  38. src/app/providers/adapters/text_only.py +11 -3
  39. src/app/providers/adapters/word_box_json.py +7 -2
  40. src/app/providers/profiles.py +7 -7
  41. src/app/providers/registry.py +6 -2
  42. src/app/providers/resolver.py +13 -4
  43. src/app/providers/runtimes/api_runtime.py +6 -3
  44. src/app/providers/runtimes/base.py +5 -3
  45. src/app/providers/runtimes/hub_runtime.py +6 -3
  46. src/app/providers/runtimes/local_runtime.py +6 -3
  47. src/app/serializers/alto_xml.py +5 -1
  48. src/app/serializers/page_xml.py +14 -14
  49. src/app/settings.py +5 -3
  50. src/app/validators/export_eligibility_validator.py +8 -4
.github/workflows/ci.yml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: CI
2
+
3
+ on:
4
+ push:
5
+ branches: [main]
6
+ pull_request:
7
+ branches: [main]
8
+
9
+ jobs:
10
+ lint:
11
+ runs-on: ubuntu-latest
12
+ steps:
13
+ - uses: actions/checkout@v4
14
+ - uses: actions/setup-python@v5
15
+ with:
16
+ python-version: "3.11"
17
+ - run: pip install ruff
18
+ - run: ruff check src/ tests/
19
+
20
+ test:
21
+ runs-on: ubuntu-latest
22
+ steps:
23
+ - uses: actions/checkout@v4
24
+ - uses: actions/setup-python@v5
25
+ with:
26
+ python-version: "3.11"
27
+ - run: pip install -e ".[dev]"
28
+ - run: pytest --tb=short -q
29
+ - run: pytest --cov=src --cov-report=term-missing --cov-fail-under=90
Dockerfile CHANGED
@@ -16,6 +16,13 @@ COPY src/ src/
16
  COPY frontend/ frontend/
17
  COPY AGENTS.md ./
18
 
 
 
 
 
 
 
 
19
  # Default storage root — overridden in Space mode via /data
20
  ENV STORAGE_ROOT=/app/data
21
  ENV HOST=0.0.0.0
 
16
  COPY frontend/ frontend/
17
  COPY AGENTS.md ./
18
 
19
+ # HF Spaces requires a non-root user
20
+ RUN useradd -m -u 1000 appuser \
21
+ && mkdir -p /app/data /data \
22
+ && chown -R appuser:appuser /app /data
23
+
24
+ USER appuser
25
+
26
  # Default storage root — overridden in Space mode via /data
27
  ENV STORAGE_ROOT=/app/data
28
  ENV HOST=0.0.0.0
README.md CHANGED
@@ -1,3 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
  # XmLLM
2
 
3
  **Canonical-first document structure engine** that converts OCR/VLM provider outputs into validated **ALTO XML** and **PAGE XML**, via an internal canonical representation.
 
1
+ ---
2
+ title: XmLLM
3
+ emoji: "\U0001F4C4"
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: docker
7
+ app_port: 7860
8
+ pinned: false
9
+ license: apache-2.0
10
+ short_description: "Document structure engine: OCR output to ALTO XML & PAGE XML"
11
+ ---
12
+
13
  # XmLLM
14
 
15
  **Canonical-first document structure engine** that converts OCR/VLM provider outputs into validated **ALTO XML** and **PAGE XML**, via an internal canonical representation.
pyproject.toml CHANGED
@@ -54,7 +54,7 @@ line-length = 100
54
  src = ["src", "tests"]
55
 
56
  [tool.ruff.lint]
57
- select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "TCH"]
58
 
59
  [tool.mypy]
60
  python_version = "3.11"
 
54
  src = ["src", "tests"]
55
 
56
  [tool.ruff.lint]
57
+ select = ["E", "F", "I", "N", "W", "UP", "B", "SIM"]
58
 
59
  [tool.mypy]
60
  python_version = "3.11"
src/app/api/__init__.py CHANGED
@@ -6,10 +6,14 @@ JobService, scoped to the application lifespan.
6
 
7
  from __future__ import annotations
8
 
 
 
9
  from src.app.jobs.service import JobService
10
  from src.app.persistence.db import Database
11
  from src.app.persistence.file_store import FileStore
12
- from src.app.settings import Settings
 
 
13
 
14
  # Module-level singletons, initialized during lifespan startup.
15
  _db: Database | None = None
 
6
 
7
  from __future__ import annotations
8
 
9
+ from typing import TYPE_CHECKING
10
+
11
  from src.app.jobs.service import JobService
12
  from src.app.persistence.db import Database
13
  from src.app.persistence.file_store import FileStore
14
+
15
+ if TYPE_CHECKING:
16
+ from src.app.settings import Settings
17
 
18
  # Module-level singletons, initialized during lifespan startup.
19
  _db: Database | None = None
src/app/api/routes_exports.py CHANGED
@@ -28,7 +28,10 @@ async def get_canonical(job_id: str) -> dict[str, Any]:
28
  store = get_file_store()
29
  data = store.load_canonical(job_id)
30
  if data is None:
31
- raise HTTPException(status_code=404, detail=f"Canonical document not found for job '{job_id}'")
 
 
 
32
  return data
33
 
34
 
 
28
  store = get_file_store()
29
  data = store.load_canonical(job_id)
30
  if data is None:
31
+ raise HTTPException(
32
+ status_code=404,
33
+ detail=f"Canonical document not found for job '{job_id}'",
34
+ )
35
  return data
36
 
37
 
src/app/api/routes_jobs.py CHANGED
@@ -29,12 +29,15 @@ async def create_job(
29
  """
30
  svc = get_job_service()
31
 
32
- # Read raw payload
33
- content = await raw_payload_file.read()
 
 
 
34
  try:
35
  payload_data = json.loads(content)
36
  except json.JSONDecodeError as e:
37
- raise HTTPException(status_code=422, detail=f"Invalid JSON payload: {e}")
38
 
39
  raw = RawProviderPayload(
40
  provider_id=provider_id,
 
29
  """
30
  svc = get_job_service()
31
 
32
+ # Read raw payload (limit to 50 MB to prevent DoS)
33
+ max_size = 52_428_800
34
+ content = await raw_payload_file.read(max_size + 1)
35
+ if len(content) > max_size:
36
+ raise HTTPException(status_code=413, detail="Payload too large (max 50 MB)")
37
  try:
38
  payload_data = json.loads(content)
39
  except json.JSONDecodeError as e:
40
+ raise HTTPException(status_code=422, detail=f"Invalid JSON payload: {e}") from None
41
 
42
  raw = RawProviderPayload(
43
  provider_id=provider_id,
src/app/domain/errors/__init__.py CHANGED
@@ -2,12 +2,12 @@
2
 
3
  from __future__ import annotations
4
 
5
- from enum import Enum
6
 
7
  from pydantic import BaseModel, ConfigDict, Field
8
 
9
 
10
- class Severity(str, Enum):
11
  """Severity level for validation entries."""
12
 
13
  ERROR = "error"
@@ -22,7 +22,10 @@ class ValidationEntry(BaseModel):
22
 
23
  validator: str = Field(min_length=1)
24
  severity: Severity
25
- path: str = Field(min_length=1, description="Path in the document, e.g. pages[0].text_regions[1].lines[3]")
 
 
 
26
  message: str = Field(min_length=1)
27
  code: str | None = None
28
 
 
2
 
3
  from __future__ import annotations
4
 
5
+ from enum import StrEnum
6
 
7
  from pydantic import BaseModel, ConfigDict, Field
8
 
9
 
10
+ class Severity(StrEnum):
11
  """Severity level for validation entries."""
12
 
13
  ERROR = "error"
 
22
 
23
  validator: str = Field(min_length=1)
24
  severity: Severity
25
+ path: str = Field(
26
+ min_length=1,
27
+ description="Path in the document, e.g. pages[0].text_regions[1].lines[3]",
28
+ )
29
  message: str = Field(min_length=1)
30
  code: str | None = None
31
 
src/app/domain/models/__init__.py CHANGED
@@ -17,8 +17,8 @@ from src.app.domain.models.canonical_document import (
17
  Word,
18
  )
19
  from src.app.domain.models.geometry import (
20
- BBox,
21
  Baseline,
 
22
  ClipRect,
23
  Geometry,
24
  GeometryContext,
 
17
  Word,
18
  )
19
  from src.app.domain.models.geometry import (
 
20
  Baseline,
21
+ BBox,
22
  ClipRect,
23
  Geometry,
24
  GeometryContext,
src/app/domain/models/canonical_document.py CHANGED
@@ -11,10 +11,10 @@ Every node carries geometry + provenance. No exceptions.
11
 
12
  from __future__ import annotations
13
 
14
- from datetime import datetime, timezone
15
  from typing import Any
16
 
17
- from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator
18
 
19
  from src.app.domain.models.geometry import Geometry
20
  from src.app.domain.models.provenance import Provenance
@@ -33,7 +33,6 @@ from src.app.domain.models.status import (
33
  Unit,
34
  )
35
 
36
-
37
  # -- Source ------------------------------------------------------------------
38
 
39
 
@@ -238,7 +237,7 @@ class CanonicalDocument(BaseModel):
238
  schema_version: str = Field(default="1.0.0", pattern=r"^\d+\.\d+\.\d+$")
239
  document_id: str = Field(min_length=1)
240
  source: Source
241
- created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
242
 
243
  pages: list[Page] = Field(min_length=1)
244
 
 
11
 
12
  from __future__ import annotations
13
 
14
+ from datetime import UTC, datetime
15
  from typing import Any
16
 
17
+ from pydantic import BaseModel, ConfigDict, Field, model_validator
18
 
19
  from src.app.domain.models.geometry import Geometry
20
  from src.app.domain.models.provenance import Provenance
 
33
  Unit,
34
  )
35
 
 
36
  # -- Source ------------------------------------------------------------------
37
 
38
 
 
237
  schema_version: str = Field(default="1.0.0", pattern=r"^\d+\.\d+\.\d+$")
238
  document_id: str = Field(min_length=1)
239
  source: Source
240
+ created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
241
 
242
  pages: list[Page] = Field(min_length=1)
243
 
src/app/domain/models/geometry.py CHANGED
@@ -15,7 +15,6 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
15
 
16
  from src.app.domain.models.status import CoordinateOrigin, GeometryStatus, Unit
17
 
18
-
19
  # -- Primitives --------------------------------------------------------------
20
 
21
 
 
15
 
16
  from src.app.domain.models.status import CoordinateOrigin, GeometryStatus, Unit
17
 
 
18
  # -- Primitives --------------------------------------------------------------
19
 
20
 
src/app/domain/models/raw_payload.py CHANGED
@@ -7,7 +7,7 @@ It is never used for export or rendering.
7
 
8
  from __future__ import annotations
9
 
10
- from datetime import datetime, timezone
11
  from typing import Any
12
 
13
  from pydantic import BaseModel, ConfigDict, Field
@@ -26,7 +26,7 @@ class RawProviderPayload(BaseModel):
26
  payload: dict[str, Any] | list[Any]
27
  """The raw JSON-serialisable output from the provider."""
28
 
29
- received_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
30
 
31
  image_width: int | None = Field(default=None, gt=0)
32
  image_height: int | None = Field(default=None, gt=0)
 
7
 
8
  from __future__ import annotations
9
 
10
+ from datetime import UTC, datetime
11
  from typing import Any
12
 
13
  from pydantic import BaseModel, ConfigDict, Field
 
26
  payload: dict[str, Any] | list[Any]
27
  """The raw JSON-serialisable output from the provider."""
28
 
29
+ received_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
30
 
31
  image_width: int | None = Field(default=None, gt=0)
32
  image_height: int | None = Field(default=None, gt=0)
src/app/domain/models/status.py CHANGED
@@ -6,13 +6,12 @@ and viewer projection models. They carry no logic — only values.
6
 
7
  from __future__ import annotations
8
 
9
- from enum import Enum
10
-
11
 
12
  # -- Geometry ----------------------------------------------------------------
13
 
14
 
15
- class GeometryStatus(str, Enum):
16
  """How a piece of geometry was obtained."""
17
 
18
  EXACT = "exact"
@@ -21,13 +20,13 @@ class GeometryStatus(str, Enum):
21
  UNKNOWN = "unknown"
22
 
23
 
24
- class CoordinateOrigin(str, Enum):
25
  """Origin of the coordinate system. Always top_left in canonical model."""
26
 
27
  TOP_LEFT = "top_left"
28
 
29
 
30
- class Unit(str, Enum):
31
  """Measurement unit. Always px in canonical model."""
32
 
33
  PX = "px"
@@ -36,7 +35,7 @@ class Unit(str, Enum):
36
  # -- Provenance --------------------------------------------------------------
37
 
38
 
39
- class EvidenceType(str, Enum):
40
  """How a piece of data was produced."""
41
 
42
  PROVIDER_NATIVE = "provider_native"
@@ -48,7 +47,7 @@ class EvidenceType(str, Enum):
48
  # -- Document structure ------------------------------------------------------
49
 
50
 
51
- class BlockRole(str, Enum):
52
  """Semantic role of a text block within the page."""
53
 
54
  BODY = "body"
@@ -62,7 +61,7 @@ class BlockRole(str, Enum):
62
  OTHER = "other"
63
 
64
 
65
- class NonTextKind(str, Enum):
66
  """Type of non-textual region."""
67
 
68
  ILLUSTRATION = "illustration"
@@ -76,7 +75,7 @@ class NonTextKind(str, Enum):
76
  # -- Source ------------------------------------------------------------------
77
 
78
 
79
- class InputType(str, Enum):
80
  """Type of the original input document."""
81
 
82
  IMAGE = "image"
@@ -92,7 +91,7 @@ class InputType(str, Enum):
92
  # -- Readiness ---------------------------------------------------------------
93
 
94
 
95
- class ReadinessLevel(str, Enum):
96
  """How ready a document / page / element is for export."""
97
 
98
  FULL = "full"
@@ -101,7 +100,7 @@ class ReadinessLevel(str, Enum):
101
  NONE = "none"
102
 
103
 
104
- class MissingCapability(str, Enum):
105
  """Specific capabilities that may be missing for export readiness."""
106
 
107
  PAGE_DIMENSIONS = "page_dimensions"
@@ -117,7 +116,7 @@ class MissingCapability(str, Enum):
117
  # -- Overlay (viewer) --------------------------------------------------------
118
 
119
 
120
- class OverlayLevel(str, Enum):
121
  """Granularity level for viewer overlays."""
122
 
123
  BLOCK = "block"
 
6
 
7
  from __future__ import annotations
8
 
9
+ from enum import StrEnum
 
10
 
11
  # -- Geometry ----------------------------------------------------------------
12
 
13
 
14
+ class GeometryStatus(StrEnum):
15
  """How a piece of geometry was obtained."""
16
 
17
  EXACT = "exact"
 
20
  UNKNOWN = "unknown"
21
 
22
 
23
+ class CoordinateOrigin(StrEnum):
24
  """Origin of the coordinate system. Always top_left in canonical model."""
25
 
26
  TOP_LEFT = "top_left"
27
 
28
 
29
+ class Unit(StrEnum):
30
  """Measurement unit. Always px in canonical model."""
31
 
32
  PX = "px"
 
35
  # -- Provenance --------------------------------------------------------------
36
 
37
 
38
+ class EvidenceType(StrEnum):
39
  """How a piece of data was produced."""
40
 
41
  PROVIDER_NATIVE = "provider_native"
 
47
  # -- Document structure ------------------------------------------------------
48
 
49
 
50
+ class BlockRole(StrEnum):
51
  """Semantic role of a text block within the page."""
52
 
53
  BODY = "body"
 
61
  OTHER = "other"
62
 
63
 
64
+ class NonTextKind(StrEnum):
65
  """Type of non-textual region."""
66
 
67
  ILLUSTRATION = "illustration"
 
75
  # -- Source ------------------------------------------------------------------
76
 
77
 
78
+ class InputType(StrEnum):
79
  """Type of the original input document."""
80
 
81
  IMAGE = "image"
 
91
  # -- Readiness ---------------------------------------------------------------
92
 
93
 
94
+ class ReadinessLevel(StrEnum):
95
  """How ready a document / page / element is for export."""
96
 
97
  FULL = "full"
 
100
  NONE = "none"
101
 
102
 
103
+ class MissingCapability(StrEnum):
104
  """Specific capabilities that may be missing for export readiness."""
105
 
106
  PAGE_DIMENSIONS = "page_dimensions"
 
116
  # -- Overlay (viewer) --------------------------------------------------------
117
 
118
 
119
+ class OverlayLevel(StrEnum):
120
  """Granularity level for viewer overlays."""
121
 
122
  BLOCK = "block"
src/app/enrichers/__init__.py CHANGED
@@ -13,10 +13,13 @@ Every enricher MUST:
13
  from __future__ import annotations
14
 
15
  from abc import ABC, abstractmethod
 
16
 
17
- from src.app.domain.models import CanonicalDocument
18
  from src.app.policies.document_policy import DocumentPolicy
19
 
 
 
 
20
 
21
  class BaseEnricher(ABC):
22
  """Abstract base for enrichers."""
 
13
  from __future__ import annotations
14
 
15
  from abc import ABC, abstractmethod
16
+ from typing import TYPE_CHECKING
17
 
 
18
  from src.app.policies.document_policy import DocumentPolicy
19
 
20
+ if TYPE_CHECKING:
21
+ from src.app.domain.models import CanonicalDocument
22
+
23
 
24
  class BaseEnricher(ABC):
25
  """Abstract base for enrichers."""
src/app/enrichers/bbox_repair_light.py CHANGED
@@ -8,12 +8,16 @@ Light repair only:
8
 
9
  from __future__ import annotations
10
 
11
- from src.app.domain.models import CanonicalDocument
 
12
  from src.app.domain.models.status import GeometryStatus
13
  from src.app.enrichers import BaseEnricher
14
  from src.app.geometry.bbox import contains
15
  from src.app.geometry.transforms import clip_bbox_to_page
16
- from src.app.policies.document_policy import DocumentPolicy
 
 
 
17
 
18
 
19
  class BboxRepairLightEnricher(BaseEnricher):
 
8
 
9
  from __future__ import annotations
10
 
11
+ from typing import TYPE_CHECKING
12
+
13
  from src.app.domain.models.status import GeometryStatus
14
  from src.app.enrichers import BaseEnricher
15
  from src.app.geometry.bbox import contains
16
  from src.app.geometry.transforms import clip_bbox_to_page
17
+
18
+ if TYPE_CHECKING:
19
+ from src.app.domain.models import CanonicalDocument
20
+ from src.app.policies.document_policy import DocumentPolicy
21
 
22
 
23
  class BboxRepairLightEnricher(BaseEnricher):
src/app/enrichers/hyphenation_basic.py CHANGED
@@ -6,9 +6,13 @@ a lowercase word, marks both as hyphenated with the combined full_form.
6
 
7
  from __future__ import annotations
8
 
 
 
9
  from src.app.domain.models import CanonicalDocument, Hyphenation, TextLine
10
  from src.app.enrichers import BaseEnricher
11
- from src.app.policies.document_policy import DocumentPolicy
 
 
12
 
13
 
14
  class HyphenationBasicEnricher(BaseEnricher):
@@ -52,7 +56,7 @@ class HyphenationBasicEnricher(BaseEnricher):
52
  continue
53
 
54
  last_word = line_a.words[-1]
55
- first_word = line_b.words[-0]
56
 
57
  # Skip if already hyphenated
58
  if last_word.hyphenation is not None:
 
6
 
7
  from __future__ import annotations
8
 
9
+ from typing import TYPE_CHECKING
10
+
11
  from src.app.domain.models import CanonicalDocument, Hyphenation, TextLine
12
  from src.app.enrichers import BaseEnricher
13
+
14
+ if TYPE_CHECKING:
15
+ from src.app.policies.document_policy import DocumentPolicy
16
 
17
 
18
  class HyphenationBasicEnricher(BaseEnricher):
 
56
  continue
57
 
58
  last_word = line_a.words[-1]
59
+ first_word = line_b.words[0]
60
 
61
  # Skip if already hyphenated
62
  if last_word.hyphenation is not None:
src/app/enrichers/lang_propagation.py CHANGED
@@ -6,9 +6,13 @@ propagated downward. The word's provenance is not changed — only lang is set.
6
 
7
  from __future__ import annotations
8
 
9
- from src.app.domain.models import CanonicalDocument
 
10
  from src.app.enrichers import BaseEnricher
11
- from src.app.policies.document_policy import DocumentPolicy
 
 
 
12
 
13
 
14
  class LangPropagationEnricher(BaseEnricher):
 
6
 
7
  from __future__ import annotations
8
 
9
+ from typing import TYPE_CHECKING
10
+
11
  from src.app.enrichers import BaseEnricher
12
+
13
+ if TYPE_CHECKING:
14
+ from src.app.domain.models import CanonicalDocument
15
+ from src.app.policies.document_policy import DocumentPolicy
16
 
17
 
18
  class LangPropagationEnricher(BaseEnricher):
src/app/enrichers/polygon_to_bbox.py CHANGED
@@ -6,11 +6,15 @@ axis-aligned bounding box and marks the geometry as 'inferred'.
6
 
7
  from __future__ import annotations
8
 
9
- from src.app.domain.models import CanonicalDocument, Geometry, Provenance
10
- from src.app.domain.models.status import EvidenceType, GeometryStatus
 
11
  from src.app.enrichers import BaseEnricher
12
  from src.app.geometry.polygon import polygon_to_bbox as _poly_to_bbox
13
- from src.app.policies.document_policy import DocumentPolicy
 
 
 
14
 
15
 
16
  class PolygonToBboxEnricher(BaseEnricher):
 
6
 
7
  from __future__ import annotations
8
 
9
+ from typing import TYPE_CHECKING
10
+
11
+ from src.app.domain.models.status import GeometryStatus
12
  from src.app.enrichers import BaseEnricher
13
  from src.app.geometry.polygon import polygon_to_bbox as _poly_to_bbox
14
+
15
+ if TYPE_CHECKING:
16
+ from src.app.domain.models import CanonicalDocument
17
+ from src.app.policies.document_policy import DocumentPolicy
18
 
19
 
20
  class PolygonToBboxEnricher(BaseEnricher):
src/app/enrichers/reading_order_simple.py CHANGED
@@ -6,10 +6,14 @@ left-to-right using the center of each region's bbox.
6
 
7
  from __future__ import annotations
8
 
9
- from src.app.domain.models import CanonicalDocument
 
10
  from src.app.enrichers import BaseEnricher
11
  from src.app.geometry.bbox import center
12
- from src.app.policies.document_policy import DocumentPolicy
 
 
 
13
 
14
 
15
  class ReadingOrderSimpleEnricher(BaseEnricher):
 
6
 
7
  from __future__ import annotations
8
 
9
+ from typing import TYPE_CHECKING
10
+
11
  from src.app.enrichers import BaseEnricher
12
  from src.app.geometry.bbox import center
13
+
14
+ if TYPE_CHECKING:
15
+ from src.app.domain.models import CanonicalDocument
16
+ from src.app.policies.document_policy import DocumentPolicy
17
 
18
 
19
  class ReadingOrderSimpleEnricher(BaseEnricher):
src/app/enrichers/text_consistency.py CHANGED
@@ -7,9 +7,13 @@ This enricher does NOT modify text — it only adds warnings.
7
 
8
  from __future__ import annotations
9
 
10
- from src.app.domain.models import CanonicalDocument
 
11
  from src.app.enrichers import BaseEnricher
12
- from src.app.policies.document_policy import DocumentPolicy
 
 
 
13
 
14
 
15
  class TextConsistencyEnricher(BaseEnricher):
 
7
 
8
  from __future__ import annotations
9
 
10
+ from typing import TYPE_CHECKING
11
+
12
  from src.app.enrichers import BaseEnricher
13
+
14
+ if TYPE_CHECKING:
15
+ from src.app.domain.models import CanonicalDocument
16
+ from src.app.policies.document_policy import DocumentPolicy
17
 
18
 
19
  class TextConsistencyEnricher(BaseEnricher):
src/app/geometry/baseline.py CHANGED
@@ -5,7 +5,6 @@ A baseline is defined by a sequence of points, typically from left to right.
5
 
6
  from __future__ import annotations
7
 
8
-
9
  BaselinePoints = list[tuple[float, float]]
10
 
11
 
 
5
 
6
  from __future__ import annotations
7
 
 
8
  BaselinePoints = list[tuple[float, float]]
9
 
10
 
src/app/geometry/bbox.py CHANGED
@@ -6,7 +6,6 @@ x = left edge, y = top edge, width > 0, height > 0.
6
 
7
  from __future__ import annotations
8
 
9
-
10
  # Type alias for the canonical bbox tuple
11
  BBoxTuple = tuple[float, float, float, float]
12
 
 
6
 
7
  from __future__ import annotations
8
 
 
9
  # Type alias for the canonical bbox tuple
10
  BBoxTuple = tuple[float, float, float, float]
11
 
src/app/geometry/normalization.py CHANGED
@@ -7,9 +7,13 @@ always (x, y, width, height) with origin at top_left, unit px.
7
 
8
  from __future__ import annotations
9
 
10
- from src.app.geometry.bbox import BBoxTuple
 
11
  from src.app.geometry.polygon import PolygonPoints, polygon_to_bbox
12
 
 
 
 
13
 
14
  def xyxy_to_xywh(xyxy: tuple[float, float, float, float]) -> BBoxTuple:
15
  """Convert (x1, y1, x2, y2) to canonical (x, y, width, height).
 
7
 
8
  from __future__ import annotations
9
 
10
+ from typing import TYPE_CHECKING
11
+
12
  from src.app.geometry.polygon import PolygonPoints, polygon_to_bbox
13
 
14
+ if TYPE_CHECKING:
15
+ from src.app.geometry.bbox import BBoxTuple
16
+
17
 
18
  def xyxy_to_xywh(xyxy: tuple[float, float, float, float]) -> BBoxTuple:
19
  """Convert (x1, y1, x2, y2) to canonical (x, y, width, height).
src/app/geometry/polygon.py CHANGED
@@ -5,8 +5,10 @@ Polygons are represented as list[tuple[float, float]] — ordered (x, y) vertice
5
 
6
  from __future__ import annotations
7
 
8
- from src.app.geometry.bbox import BBoxTuple
9
 
 
 
10
 
11
  PolygonPoints = list[tuple[float, float]]
12
 
 
5
 
6
  from __future__ import annotations
7
 
8
+ from typing import TYPE_CHECKING
9
 
10
+ if TYPE_CHECKING:
11
+ from src.app.geometry.bbox import BBoxTuple
12
 
13
  PolygonPoints = list[tuple[float, float]]
14
 
src/app/geometry/quantization.py CHANGED
@@ -8,12 +8,14 @@ rounding and tolerance for containment checks.
8
  from __future__ import annotations
9
 
10
  import math
11
- from enum import Enum
 
12
 
13
- from src.app.geometry.bbox import BBoxTuple
 
14
 
15
 
16
- class RoundingStrategy(str, Enum):
17
  """How to round float coordinates to integers."""
18
 
19
  ROUND = "round"
 
8
  from __future__ import annotations
9
 
10
  import math
11
+ from enum import StrEnum
12
+ from typing import TYPE_CHECKING
13
 
14
+ if TYPE_CHECKING:
15
+ from src.app.geometry.bbox import BBoxTuple
16
 
17
 
18
+ class RoundingStrategy(StrEnum):
19
  """How to round float coordinates to integers."""
20
 
21
  ROUND = "round"
src/app/geometry/transforms.py CHANGED
@@ -6,10 +6,11 @@ is allowed in serializers (see AGENTS.md rule §5).
6
 
7
  from __future__ import annotations
8
 
9
- import math
10
 
11
- from src.app.geometry.bbox import BBoxTuple
12
- from src.app.geometry.polygon import PolygonPoints
 
13
 
14
 
15
  def rescale_bbox(bbox: BBoxTuple, factor: float) -> BBoxTuple:
 
6
 
7
  from __future__ import annotations
8
 
9
+ from typing import TYPE_CHECKING
10
 
11
+ if TYPE_CHECKING:
12
+ from src.app.geometry.bbox import BBoxTuple
13
+ from src.app.geometry.polygon import PolygonPoints
14
 
15
 
16
  def rescale_bbox(bbox: BBoxTuple, factor: float) -> BBoxTuple:
src/app/jobs/events.py CHANGED
@@ -4,14 +4,17 @@ from __future__ import annotations
4
 
5
  import time
6
  from contextlib import contextmanager
7
- from datetime import datetime, timezone
8
- from enum import Enum
9
- from typing import Generator
10
 
11
  from pydantic import BaseModel, Field
12
 
 
 
13
 
14
- class JobStep(str, Enum):
 
15
  """Named steps in the processing pipeline."""
16
 
17
  RECEIVE_FILE = "receive_file"
@@ -34,7 +37,7 @@ class JobEvent(BaseModel):
34
 
35
  step: JobStep
36
  status: str = "started" # started, completed, failed, skipped
37
- started_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
38
  completed_at: datetime | None = None
39
  duration_ms: float | None = None
40
  message: str | None = None
@@ -67,7 +70,7 @@ class EventLog:
67
  idx = len(self._events) - 1
68
  self._events[idx] = event.model_copy(update={
69
  "status": "completed",
70
- "completed_at": datetime.now(timezone.utc),
71
  "duration_ms": elapsed,
72
  })
73
  except Exception as exc:
@@ -75,7 +78,7 @@ class EventLog:
75
  idx = len(self._events) - 1
76
  self._events[idx] = event.model_copy(update={
77
  "status": "failed",
78
- "completed_at": datetime.now(timezone.utc),
79
  "duration_ms": elapsed,
80
  "error": str(exc),
81
  })
@@ -86,7 +89,7 @@ class EventLog:
86
  self._events.append(JobEvent(
87
  step=step,
88
  status="skipped",
89
- completed_at=datetime.now(timezone.utc),
90
  message=reason,
91
  ))
92
 
 
4
 
5
  import time
6
  from contextlib import contextmanager
7
+ from datetime import UTC, datetime
8
+ from enum import StrEnum
9
+ from typing import TYPE_CHECKING
10
 
11
  from pydantic import BaseModel, Field
12
 
13
+ if TYPE_CHECKING:
14
+ from collections.abc import Generator
15
 
16
+
17
+ class JobStep(StrEnum):
18
  """Named steps in the processing pipeline."""
19
 
20
  RECEIVE_FILE = "receive_file"
 
37
 
38
  step: JobStep
39
  status: str = "started" # started, completed, failed, skipped
40
+ started_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
41
  completed_at: datetime | None = None
42
  duration_ms: float | None = None
43
  message: str | None = None
 
70
  idx = len(self._events) - 1
71
  self._events[idx] = event.model_copy(update={
72
  "status": "completed",
73
+ "completed_at": datetime.now(UTC),
74
  "duration_ms": elapsed,
75
  })
76
  except Exception as exc:
 
78
  idx = len(self._events) - 1
79
  self._events[idx] = event.model_copy(update={
80
  "status": "failed",
81
+ "completed_at": datetime.now(UTC),
82
  "duration_ms": elapsed,
83
  "error": str(exc),
84
  })
 
89
  self._events.append(JobEvent(
90
  step=step,
91
  status="skipped",
92
+ completed_at=datetime.now(UTC),
93
  message=reason,
94
  ))
95
 
src/app/jobs/models.py CHANGED
@@ -2,14 +2,14 @@
2
 
3
  from __future__ import annotations
4
 
5
- from datetime import datetime, timezone
6
- from enum import Enum
7
  from typing import Any
8
 
9
  from pydantic import BaseModel, Field
10
 
11
 
12
- class JobStatus(str, Enum):
13
  """State machine for job lifecycle."""
14
 
15
  QUEUED = "queued"
@@ -40,7 +40,7 @@ class Job(BaseModel):
40
  has_viewer: bool = False
41
 
42
  # Timing
43
- created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
44
  started_at: datetime | None = None
45
  completed_at: datetime | None = None
46
 
 
2
 
3
  from __future__ import annotations
4
 
5
+ from datetime import UTC, datetime
6
+ from enum import StrEnum
7
  from typing import Any
8
 
9
  from pydantic import BaseModel, Field
10
 
11
 
12
+ class JobStatus(StrEnum):
13
  """State machine for job lifecycle."""
14
 
15
  QUEUED = "queued"
 
40
  has_viewer: bool = False
41
 
42
  # Timing
43
+ created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
44
  started_at: datetime | None = None
45
  completed_at: datetime | None = None
46
 
src/app/jobs/service.py CHANGED
@@ -19,10 +19,9 @@ Pipeline steps (§23.1):
19
  from __future__ import annotations
20
 
21
  import uuid
22
- from datetime import datetime, timezone
23
- from pathlib import Path
24
 
25
- from src.app.domain.models import CanonicalDocument, RawProviderPayload
26
  from src.app.domain.models.geometry import GeometryContext
27
  from src.app.enrichers import EnricherPipeline
28
  from src.app.enrichers.bbox_repair_light import BboxRepairLightEnricher
@@ -34,8 +33,6 @@ from src.app.enrichers.text_consistency import TextConsistencyEnricher
34
  from src.app.jobs.events import EventLog, JobStep
35
  from src.app.jobs.models import Job, JobStatus
36
  from src.app.normalization.pipeline import normalize
37
- from src.app.persistence.db import Database
38
- from src.app.persistence.file_store import FileStore
39
  from src.app.policies.document_policy import DocumentPolicy
40
  from src.app.policies.export_policy import check_alto_export, check_page_export
41
  from src.app.serializers.alto_xml import serialize_alto
@@ -44,6 +41,13 @@ from src.app.validators.export_eligibility_validator import compute_export_eligi
44
  from src.app.validators.structural_validator import validate_structure
45
  from src.app.viewer.projection_builder import build_projection
46
 
 
 
 
 
 
 
 
47
 
48
  def _default_enricher_pipeline() -> EnricherPipeline:
49
  return EnricherPipeline([
@@ -103,7 +107,7 @@ class JobService:
103
  events = EventLog()
104
  job = job.model_copy(update={
105
  "status": JobStatus.RUNNING,
106
- "started_at": datetime.now(timezone.utc),
107
  "image_width": image_width,
108
  "image_height": image_height,
109
  })
@@ -193,24 +197,23 @@ class JobService:
193
  self._store.save_events(job.job_id, events.to_dicts())
194
 
195
  # Determine final status
196
- if job.has_alto or job.has_page_xml:
197
- if job.has_alto and job.has_page_xml:
198
- final_status = JobStatus.SUCCEEDED
199
- else:
200
- final_status = JobStatus.PARTIAL_SUCCESS
201
- else:
202
  final_status = JobStatus.PARTIAL_SUCCESS
 
 
203
 
204
  job = job.model_copy(update={
205
  "status": final_status,
206
- "completed_at": datetime.now(timezone.utc),
207
  "warnings": warnings,
208
  })
209
 
210
  except Exception as exc:
211
  job = job.model_copy(update={
212
  "status": JobStatus.FAILED,
213
- "completed_at": datetime.now(timezone.utc),
214
  "error": str(exc),
215
  "warnings": warnings,
216
  })
 
19
  from __future__ import annotations
20
 
21
  import uuid
22
+ from datetime import UTC, datetime
23
+ from typing import TYPE_CHECKING
24
 
 
25
  from src.app.domain.models.geometry import GeometryContext
26
  from src.app.enrichers import EnricherPipeline
27
  from src.app.enrichers.bbox_repair_light import BboxRepairLightEnricher
 
33
  from src.app.jobs.events import EventLog, JobStep
34
  from src.app.jobs.models import Job, JobStatus
35
  from src.app.normalization.pipeline import normalize
 
 
36
  from src.app.policies.document_policy import DocumentPolicy
37
  from src.app.policies.export_policy import check_alto_export, check_page_export
38
  from src.app.serializers.alto_xml import serialize_alto
 
41
  from src.app.validators.structural_validator import validate_structure
42
  from src.app.viewer.projection_builder import build_projection
43
 
44
+ if TYPE_CHECKING:
45
+ from pathlib import Path
46
+
47
+ from src.app.domain.models import CanonicalDocument, RawProviderPayload
48
+ from src.app.persistence.db import Database
49
+ from src.app.persistence.file_store import FileStore
50
+
51
 
52
  def _default_enricher_pipeline() -> EnricherPipeline:
53
  return EnricherPipeline([
 
107
  events = EventLog()
108
  job = job.model_copy(update={
109
  "status": JobStatus.RUNNING,
110
+ "started_at": datetime.now(UTC),
111
  "image_width": image_width,
112
  "image_height": image_height,
113
  })
 
197
  self._store.save_events(job.job_id, events.to_dicts())
198
 
199
  # Determine final status
200
+ if job.has_alto and job.has_page_xml:
201
+ final_status = JobStatus.SUCCEEDED
202
+ elif job.has_alto or job.has_page_xml:
 
 
 
203
  final_status = JobStatus.PARTIAL_SUCCESS
204
+ else:
205
+ final_status = JobStatus.FAILED
206
 
207
  job = job.model_copy(update={
208
  "status": final_status,
209
+ "completed_at": datetime.now(UTC),
210
  "warnings": warnings,
211
  })
212
 
213
  except Exception as exc:
214
  job = job.model_copy(update={
215
  "status": JobStatus.FAILED,
216
+ "completed_at": datetime.now(UTC),
217
  "error": str(exc),
218
  "warnings": warnings,
219
  })
src/app/main.py CHANGED
@@ -7,7 +7,7 @@ from __future__ import annotations
7
 
8
  from contextlib import asynccontextmanager
9
  from pathlib import Path
10
- from typing import AsyncIterator
11
 
12
  from fastapi import FastAPI
13
  from fastapi.middleware.cors import CORSMiddleware
@@ -22,6 +22,9 @@ from src.app.api.routes_providers import router as providers_router
22
  from src.app.api.routes_viewer import router as viewer_router
23
  from src.app.settings import get_settings
24
 
 
 
 
25
 
26
  @asynccontextmanager
27
  async def lifespan(app: FastAPI) -> AsyncIterator[None]:
@@ -43,7 +46,7 @@ app = FastAPI(
43
  app.add_middleware(
44
  CORSMiddleware,
45
  allow_origins=["*"],
46
- allow_credentials=True,
47
  allow_methods=["*"],
48
  allow_headers=["*"],
49
  )
 
7
 
8
  from contextlib import asynccontextmanager
9
  from pathlib import Path
10
+ from typing import TYPE_CHECKING
11
 
12
  from fastapi import FastAPI
13
  from fastapi.middleware.cors import CORSMiddleware
 
22
  from src.app.api.routes_viewer import router as viewer_router
23
  from src.app.settings import get_settings
24
 
25
+ if TYPE_CHECKING:
26
+ from collections.abc import AsyncIterator
27
+
28
 
29
  @asynccontextmanager
30
  async def lifespan(app: FastAPI) -> AsyncIterator[None]:
 
46
  app.add_middleware(
47
  CORSMiddleware,
48
  allow_origins=["*"],
49
+ allow_credentials=False,
50
  allow_methods=["*"],
51
  allow_headers=["*"],
52
  )
src/app/normalization/pipeline.py CHANGED
@@ -8,10 +8,14 @@ The pipeline:
8
 
9
  from __future__ import annotations
10
 
11
- from src.app.domain.models import CanonicalDocument, RawProviderPayload
12
- from src.app.domain.models.geometry import GeometryContext
13
  from src.app.providers.registry import get_adapter
14
 
 
 
 
 
15
 
16
  def normalize(
17
  raw: RawProviderPayload,
 
8
 
9
  from __future__ import annotations
10
 
11
+ from typing import TYPE_CHECKING
12
+
13
  from src.app.providers.registry import get_adapter
14
 
15
+ if TYPE_CHECKING:
16
+ from src.app.domain.models import CanonicalDocument, RawProviderPayload
17
+ from src.app.domain.models.geometry import GeometryContext
18
+
19
 
20
  def normalize(
21
  raw: RawProviderPayload,
src/app/persistence/db.py CHANGED
@@ -8,11 +8,14 @@ from __future__ import annotations
8
 
9
  import json
10
  import sqlite3
11
- from pathlib import Path
12
- from typing import Any
13
 
14
  from src.app.jobs.models import Job, JobStatus
15
 
 
 
 
16
  _SCHEMA = """
17
  CREATE TABLE IF NOT EXISTS jobs (
18
  job_id TEXT PRIMARY KEY,
@@ -146,12 +149,16 @@ class Database:
146
  # -- Providers ------------------------------------------------------------
147
 
148
  def save_provider_record(self, provider_id: str, data: dict) -> None:
149
- from datetime import datetime, timezone
150
 
151
- now = datetime.now(timezone.utc).isoformat()
152
  self.conn.execute(
153
- """INSERT OR REPLACE INTO providers (provider_id, data, created_at, updated_at)
154
- VALUES (?, ?, COALESCE((SELECT created_at FROM providers WHERE provider_id = ?), ?), ?)""",
 
 
 
 
155
  (provider_id, json.dumps(data, default=str), provider_id, now, now),
156
  )
157
  self.conn.commit()
 
8
 
9
  import json
10
  import sqlite3
11
+ from datetime import UTC
12
+ from typing import TYPE_CHECKING, Any
13
 
14
  from src.app.jobs.models import Job, JobStatus
15
 
16
+ if TYPE_CHECKING:
17
+ from pathlib import Path
18
+
19
  _SCHEMA = """
20
  CREATE TABLE IF NOT EXISTS jobs (
21
  job_id TEXT PRIMARY KEY,
 
149
  # -- Providers ------------------------------------------------------------
150
 
151
  def save_provider_record(self, provider_id: str, data: dict) -> None:
152
+ from datetime import datetime
153
 
154
+ now = datetime.now(UTC).isoformat()
155
  self.conn.execute(
156
+ """INSERT OR REPLACE INTO providers
157
+ (provider_id, data, created_at, updated_at)
158
+ VALUES (?, ?, COALESCE(
159
+ (SELECT created_at FROM providers WHERE provider_id = ?),
160
+ ?
161
+ ), ?)""",
162
  (provider_id, json.dumps(data, default=str), provider_id, now, now),
163
  )
164
  self.conn.commit()
src/app/persistence/file_store.py CHANGED
@@ -36,8 +36,15 @@ class FileStore:
36
 
37
  # -- Job directory --------------------------------------------------------
38
 
 
 
 
 
 
 
 
39
  def job_dir(self, job_id: str) -> Path:
40
- d = self._jobs_dir / job_id
41
  d.mkdir(parents=True, exist_ok=True)
42
  return d
43
 
@@ -52,7 +59,8 @@ class FileStore:
52
  def save_json(self, job_id: str, filename: str, data: Any) -> Path:
53
  """Save a JSON-serializable object."""
54
  dest = self.job_dir(job_id) / filename
55
- dest.write_text(json.dumps(data, ensure_ascii=False, indent=2, default=str), encoding="utf-8")
 
56
  return dest
57
 
58
  def save_bytes(self, job_id: str, filename: str, data: bytes) -> Path:
@@ -83,14 +91,14 @@ class FileStore:
83
 
84
  def load_json(self, job_id: str, filename: str) -> Any:
85
  """Load a JSON file from the job directory. Returns None if not found."""
86
- path = self._jobs_dir / job_id / filename
87
  if not path.exists():
88
  return None
89
  return json.loads(path.read_text(encoding="utf-8"))
90
 
91
  def load_bytes(self, job_id: str, filename: str) -> bytes | None:
92
  """Load raw bytes. Returns None if not found."""
93
- path = self._jobs_dir / job_id / filename
94
  if not path.exists():
95
  return None
96
  return path.read_bytes()
@@ -121,19 +129,21 @@ class FileStore:
121
  if not d.exists():
122
  return None
123
  for f in d.iterdir():
124
- if f.stem == "input" and f.suffix in (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".webp"):
 
125
  return f
126
  return None
127
 
128
  # -- Provider profiles ----------------------------------------------------
129
 
130
  def save_provider(self, provider_id: str, data: dict) -> Path:
131
- dest = self._providers_dir / f"{provider_id}.json"
132
- dest.write_text(json.dumps(data, ensure_ascii=False, indent=2, default=str), encoding="utf-8")
 
133
  return dest
134
 
135
  def load_provider(self, provider_id: str) -> dict | None:
136
- path = self._providers_dir / f"{provider_id}.json"
137
  if not path.exists():
138
  return None
139
  return json.loads(path.read_text(encoding="utf-8"))
@@ -144,7 +154,7 @@ class FileStore:
144
  return [f.stem for f in self._providers_dir.glob("*.json")]
145
 
146
  def delete_provider(self, provider_id: str) -> bool:
147
- path = self._providers_dir / f"{provider_id}.json"
148
  if path.exists():
149
  path.unlink()
150
  return True
 
36
 
37
  # -- Job directory --------------------------------------------------------
38
 
39
+ @staticmethod
40
+ def _sanitize_id(value: str) -> str:
41
+ """Reject path-traversal attempts in identifiers."""
42
+ if not value or "/" in value or "\\" in value or ".." in value:
43
+ raise ValueError(f"Invalid identifier (path traversal rejected): {value!r}")
44
+ return value
45
+
46
  def job_dir(self, job_id: str) -> Path:
47
+ d = self._jobs_dir / self._sanitize_id(job_id)
48
  d.mkdir(parents=True, exist_ok=True)
49
  return d
50
 
 
59
  def save_json(self, job_id: str, filename: str, data: Any) -> Path:
60
  """Save a JSON-serializable object."""
61
  dest = self.job_dir(job_id) / filename
62
+ content = json.dumps(data, ensure_ascii=False, indent=2, default=str)
63
+ dest.write_text(content, encoding="utf-8")
64
  return dest
65
 
66
  def save_bytes(self, job_id: str, filename: str, data: bytes) -> Path:
 
91
 
92
  def load_json(self, job_id: str, filename: str) -> Any:
93
  """Load a JSON file from the job directory. Returns None if not found."""
94
+ path = self._jobs_dir / self._sanitize_id(job_id) / filename
95
  if not path.exists():
96
  return None
97
  return json.loads(path.read_text(encoding="utf-8"))
98
 
99
  def load_bytes(self, job_id: str, filename: str) -> bytes | None:
100
  """Load raw bytes. Returns None if not found."""
101
+ path = self._jobs_dir / self._sanitize_id(job_id) / filename
102
  if not path.exists():
103
  return None
104
  return path.read_bytes()
 
129
  if not d.exists():
130
  return None
131
  for f in d.iterdir():
132
+ valid = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".webp")
133
+ if f.stem == "input" and f.suffix in valid:
134
  return f
135
  return None
136
 
137
  # -- Provider profiles ----------------------------------------------------
138
 
139
  def save_provider(self, provider_id: str, data: dict) -> Path:
140
+ dest = self._providers_dir / f"{self._sanitize_id(provider_id)}.json"
141
+ content = json.dumps(data, ensure_ascii=False, indent=2, default=str)
142
+ dest.write_text(content, encoding="utf-8")
143
  return dest
144
 
145
  def load_provider(self, provider_id: str) -> dict | None:
146
+ path = self._providers_dir / f"{self._sanitize_id(provider_id)}.json"
147
  if not path.exists():
148
  return None
149
  return json.loads(path.read_text(encoding="utf-8"))
 
154
  return [f.stem for f in self._providers_dir.glob("*.json")]
155
 
156
  def delete_provider(self, provider_id: str) -> bool:
157
+ path = self._providers_dir / f"{self._sanitize_id(provider_id)}.json"
158
  if path.exists():
159
  path.unlink()
160
  return True
src/app/policies/document_policy.py CHANGED
@@ -7,12 +7,12 @@ that controls what the system may or may not do.
7
 
8
  from __future__ import annotations
9
 
10
- from enum import Enum
11
 
12
  from pydantic import BaseModel, ConfigDict
13
 
14
 
15
- class PolicyMode(str, Enum):
16
  """Named policy presets."""
17
 
18
  STRICT = "strict"
 
7
 
8
  from __future__ import annotations
9
 
10
+ from enum import StrEnum
11
 
12
  from pydantic import BaseModel, ConfigDict
13
 
14
 
15
+ class PolicyMode(StrEnum):
16
  """Named policy presets."""
17
 
18
  STRICT = "strict"
src/app/policies/export_policy.py CHANGED
@@ -7,11 +7,14 @@ decision for each export format.
7
  from __future__ import annotations
8
 
9
  from dataclasses import dataclass
 
10
 
11
- from src.app.domain.models.readiness import ExportEligibility
12
  from src.app.domain.models.status import ReadinessLevel
13
  from src.app.policies.document_policy import DocumentPolicy
14
 
 
 
 
15
 
16
  @dataclass(frozen=True)
17
  class ExportDecision:
@@ -36,7 +39,10 @@ def check_alto_export(
36
  return ExportDecision(
37
  allowed=False,
38
  level=level,
39
- reason="ALTO export not possible: missing required data (word text/geometry or line geometry)",
 
 
 
40
  )
41
 
42
  if level == ReadinessLevel.PARTIAL and not policy.allow_partial_alto:
 
7
  from __future__ import annotations
8
 
9
  from dataclasses import dataclass
10
+ from typing import TYPE_CHECKING
11
 
 
12
  from src.app.domain.models.status import ReadinessLevel
13
  from src.app.policies.document_policy import DocumentPolicy
14
 
15
+ if TYPE_CHECKING:
16
+ from src.app.domain.models.readiness import ExportEligibility
17
+
18
 
19
  @dataclass(frozen=True)
20
  class ExportDecision:
 
39
  return ExportDecision(
40
  allowed=False,
41
  level=level,
42
+ reason=(
43
+ "ALTO export not possible: missing required data"
44
+ " (word text/geometry or line geometry)"
45
+ ),
46
  )
47
 
48
  if level == ReadinessLevel.PARTIAL and not policy.allow_partial_alto:
src/app/providers/adapters/base.py CHANGED
@@ -3,9 +3,11 @@
3
  from __future__ import annotations
4
 
5
  from abc import ABC, abstractmethod
 
6
 
7
- from src.app.domain.models import CanonicalDocument, RawProviderPayload
8
- from src.app.domain.models.geometry import GeometryContext
 
9
 
10
 
11
  class BaseAdapter(ABC):
 
3
  from __future__ import annotations
4
 
5
  from abc import ABC, abstractmethod
6
+ from typing import TYPE_CHECKING
7
 
8
+ if TYPE_CHECKING:
9
+ from src.app.domain.models import CanonicalDocument, RawProviderPayload
10
+ from src.app.domain.models.geometry import GeometryContext
11
 
12
 
13
  class BaseAdapter(ABC):
src/app/providers/adapters/line_box_json.py CHANGED
@@ -17,19 +17,23 @@ since the provider doesn't segment words. The block is inferred.
17
 
18
  from __future__ import annotations
19
 
 
 
20
  from src.app.domain.models import (
21
  CanonicalDocument,
22
  Geometry,
23
  Provenance,
24
  RawProviderPayload,
25
  )
26
- from src.app.domain.models.geometry import GeometryContext
27
  from src.app.domain.models.status import EvidenceType, GeometryStatus, InputType
28
  from src.app.geometry.bbox import union_all
29
  from src.app.geometry.normalization import xyxy_to_xywh
30
  from src.app.normalization.canonical_builder import CanonicalBuilder
31
  from src.app.providers.adapters.base import BaseAdapter
32
 
 
 
 
33
 
34
  class LineBoxJsonAdapter(BaseAdapter):
35
  """Adapter for the line_box_json family."""
 
17
 
18
  from __future__ import annotations
19
 
20
+ from typing import TYPE_CHECKING
21
+
22
  from src.app.domain.models import (
23
  CanonicalDocument,
24
  Geometry,
25
  Provenance,
26
  RawProviderPayload,
27
  )
 
28
  from src.app.domain.models.status import EvidenceType, GeometryStatus, InputType
29
  from src.app.geometry.bbox import union_all
30
  from src.app.geometry.normalization import xyxy_to_xywh
31
  from src.app.normalization.canonical_builder import CanonicalBuilder
32
  from src.app.providers.adapters.base import BaseAdapter
33
 
34
+ if TYPE_CHECKING:
35
+ from src.app.domain.models.geometry import GeometryContext
36
+
37
 
38
  class LineBoxJsonAdapter(BaseAdapter):
39
  """Adapter for the line_box_json family."""
src/app/providers/adapters/text_only.py CHANGED
@@ -19,17 +19,21 @@ will show text without positioned overlays.
19
 
20
  from __future__ import annotations
21
 
 
 
22
  from src.app.domain.models import (
23
  CanonicalDocument,
24
  Geometry,
25
  Provenance,
26
  RawProviderPayload,
27
  )
28
- from src.app.domain.models.geometry import GeometryContext
29
  from src.app.domain.models.status import EvidenceType, GeometryStatus, InputType
30
  from src.app.normalization.canonical_builder import CanonicalBuilder
31
  from src.app.providers.adapters.base import BaseAdapter
32
 
 
 
 
33
 
34
  class TextOnlyAdapter(BaseAdapter):
35
  """Adapter for the text_only family (mLLM without geometry)."""
@@ -76,7 +80,11 @@ class TextOnlyAdapter(BaseAdapter):
76
  # Extract text blocks
77
  blocks = payload.get("blocks")
78
  if blocks and isinstance(blocks, list):
79
- texts = [str(b.get("text", "")) for b in blocks if isinstance(b, dict) and b.get("text")]
 
 
 
 
80
  else:
81
  # Single text blob — split into paragraphs
82
  full_text = str(payload.get("text", ""))
@@ -102,7 +110,7 @@ class TextOnlyAdapter(BaseAdapter):
102
  )
103
 
104
  # Split block into lines
105
- lines = [l.strip() for l in block_text.split("\n") if l.strip()]
106
  if not lines:
107
  lines = [block_text]
108
 
 
19
 
20
  from __future__ import annotations
21
 
22
+ from typing import TYPE_CHECKING
23
+
24
  from src.app.domain.models import (
25
  CanonicalDocument,
26
  Geometry,
27
  Provenance,
28
  RawProviderPayload,
29
  )
 
30
  from src.app.domain.models.status import EvidenceType, GeometryStatus, InputType
31
  from src.app.normalization.canonical_builder import CanonicalBuilder
32
  from src.app.providers.adapters.base import BaseAdapter
33
 
34
+ if TYPE_CHECKING:
35
+ from src.app.domain.models.geometry import GeometryContext
36
+
37
 
38
  class TextOnlyAdapter(BaseAdapter):
39
  """Adapter for the text_only family (mLLM without geometry)."""
 
80
  # Extract text blocks
81
  blocks = payload.get("blocks")
82
  if blocks and isinstance(blocks, list):
83
+ texts = [
84
+ str(b.get("text", ""))
85
+ for b in blocks
86
+ if isinstance(b, dict) and b.get("text")
87
+ ]
88
  else:
89
  # Single text blob — split into paragraphs
90
  full_text = str(payload.get("text", ""))
 
110
  )
111
 
112
  # Split block into lines
113
+ lines = [ln.strip() for ln in block_text.split("\n") if ln.strip()]
114
  if not lines:
115
  lines = [block_text]
116
 
src/app/providers/adapters/word_box_json.py CHANGED
@@ -13,13 +13,14 @@ within a single inferred text block.
13
 
14
  from __future__ import annotations
15
 
 
 
16
  from src.app.domain.models import (
17
  CanonicalDocument,
18
  Geometry,
19
  Provenance,
20
  RawProviderPayload,
21
  )
22
- from src.app.domain.models.geometry import GeometryContext
23
  from src.app.domain.models.status import (
24
  EvidenceType,
25
  GeometryStatus,
@@ -30,6 +31,9 @@ from src.app.geometry.normalization import four_point_to_polygon, four_point_to_
30
  from src.app.normalization.canonical_builder import CanonicalBuilder
31
  from src.app.providers.adapters.base import BaseAdapter
32
 
 
 
 
33
 
34
  class WordBoxJsonAdapter(BaseAdapter):
35
  """Adapter for the word_box_json family (PaddleOCR, etc.)."""
@@ -163,8 +167,9 @@ class WordBoxJsonAdapter(BaseAdapter):
163
  points = item[0]
164
  text_conf = item[1]
165
  if not isinstance(points, list) or len(points) != 4:
 
166
  raise ValueError(
167
- f"Item {idx}: expected 4 polygon points, got {len(points) if isinstance(points, list) else type(points).__name__}"
168
  )
169
  return points, text_conf
170
 
 
13
 
14
  from __future__ import annotations
15
 
16
+ from typing import TYPE_CHECKING
17
+
18
  from src.app.domain.models import (
19
  CanonicalDocument,
20
  Geometry,
21
  Provenance,
22
  RawProviderPayload,
23
  )
 
24
  from src.app.domain.models.status import (
25
  EvidenceType,
26
  GeometryStatus,
 
31
  from src.app.normalization.canonical_builder import CanonicalBuilder
32
  from src.app.providers.adapters.base import BaseAdapter
33
 
34
+ if TYPE_CHECKING:
35
+ from src.app.domain.models.geometry import GeometryContext
36
+
37
 
38
  class WordBoxJsonAdapter(BaseAdapter):
39
  """Adapter for the word_box_json family (PaddleOCR, etc.)."""
 
167
  points = item[0]
168
  text_conf = item[1]
169
  if not isinstance(points, list) or len(points) != 4:
170
+ got = len(points) if isinstance(points, list) else type(points).__name__
171
  raise ValueError(
172
+ f"Item {idx}: expected 4 polygon points, got {got}"
173
  )
174
  return points, text_conf
175
 
src/app/providers/profiles.py CHANGED
@@ -2,28 +2,28 @@
2
 
3
  from __future__ import annotations
4
 
5
- from datetime import datetime, timezone
6
- from enum import Enum
7
 
8
  from pydantic import BaseModel, ConfigDict, Field
9
 
10
  from src.app.providers.capabilities import CapabilityMatrix
11
 
12
 
13
- class RuntimeType(str, Enum):
14
  LOCAL = "local"
15
  HUB = "hub"
16
  API = "api"
17
 
18
 
19
- class ProviderFamily(str, Enum):
20
  WORD_BOX_JSON = "word_box_json"
21
  LINE_BOX_JSON = "line_box_json"
22
  REGION_LINE_WORD_POLYGON = "region_line_word_polygon"
23
  TEXT_ONLY = "text_only"
24
 
25
 
26
- class AuthMode(str, Enum):
27
  NONE = "none"
28
  API_KEY = "api_key"
29
  BEARER = "bearer"
@@ -51,5 +51,5 @@ class ProviderProfile(BaseModel):
51
  prompt_template: str | None = None
52
 
53
  last_test_status: str | None = None
54
- created_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
55
- updated_at: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))
 
2
 
3
  from __future__ import annotations
4
 
5
+ from datetime import UTC, datetime
6
+ from enum import StrEnum
7
 
8
  from pydantic import BaseModel, ConfigDict, Field
9
 
10
  from src.app.providers.capabilities import CapabilityMatrix
11
 
12
 
13
+ class RuntimeType(StrEnum):
14
  LOCAL = "local"
15
  HUB = "hub"
16
  API = "api"
17
 
18
 
19
+ class ProviderFamily(StrEnum):
20
  WORD_BOX_JSON = "word_box_json"
21
  LINE_BOX_JSON = "line_box_json"
22
  REGION_LINE_WORD_POLYGON = "region_line_word_polygon"
23
  TEXT_ONLY = "text_only"
24
 
25
 
26
+ class AuthMode(StrEnum):
27
  NONE = "none"
28
  API_KEY = "api_key"
29
  BEARER = "bearer"
 
51
  prompt_template: str | None = None
52
 
53
  last_test_status: str | None = None
54
+ created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
55
+ updated_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
src/app/providers/registry.py CHANGED
@@ -2,16 +2,20 @@
2
 
3
  from __future__ import annotations
4
 
5
- from src.app.providers.adapters.base import BaseAdapter
 
6
  from src.app.providers.adapters.line_box_json import LineBoxJsonAdapter
7
  from src.app.providers.adapters.text_only import TextOnlyAdapter
8
  from src.app.providers.adapters.word_box_json import WordBoxJsonAdapter
9
  from src.app.providers.profiles import ProviderFamily, RuntimeType
10
  from src.app.providers.runtimes.api_runtime import ApiRuntime
11
- from src.app.providers.runtimes.base import BaseRuntime
12
  from src.app.providers.runtimes.hub_runtime import HubRuntime
13
  from src.app.providers.runtimes.local_runtime import LocalRuntime
14
 
 
 
 
 
15
  # -- Adapter registry ---------------------------------------------------------
16
 
17
  _ADAPTER_REGISTRY: dict[str, type[BaseAdapter]] = {
 
2
 
3
  from __future__ import annotations
4
 
5
+ from typing import TYPE_CHECKING
6
+
7
  from src.app.providers.adapters.line_box_json import LineBoxJsonAdapter
8
  from src.app.providers.adapters.text_only import TextOnlyAdapter
9
  from src.app.providers.adapters.word_box_json import WordBoxJsonAdapter
10
  from src.app.providers.profiles import ProviderFamily, RuntimeType
11
  from src.app.providers.runtimes.api_runtime import ApiRuntime
 
12
  from src.app.providers.runtimes.hub_runtime import HubRuntime
13
  from src.app.providers.runtimes.local_runtime import LocalRuntime
14
 
15
+ if TYPE_CHECKING:
16
+ from src.app.providers.adapters.base import BaseAdapter
17
+ from src.app.providers.runtimes.base import BaseRuntime
18
+
19
  # -- Adapter registry ---------------------------------------------------------
20
 
21
  _ADAPTER_REGISTRY: dict[str, type[BaseAdapter]] = {
src/app/providers/resolver.py CHANGED
@@ -2,16 +2,25 @@
2
 
3
  from __future__ import annotations
4
 
5
- from src.app.providers.adapters.base import BaseAdapter
6
- from src.app.providers.profiles import ProviderProfile
7
  from src.app.providers.registry import get_adapter, get_runtime
8
- from src.app.providers.runtimes.base import BaseRuntime
 
 
 
 
9
 
10
 
11
  class ResolvedProvider:
12
  """A fully resolved provider: runtime + adapter, ready to execute."""
13
 
14
- def __init__(self, profile: ProviderProfile, runtime: BaseRuntime, adapter: BaseAdapter) -> None:
 
 
 
 
 
15
  self.profile = profile
16
  self.runtime = runtime
17
  self.adapter = adapter
 
2
 
3
  from __future__ import annotations
4
 
5
+ from typing import TYPE_CHECKING
6
+
7
  from src.app.providers.registry import get_adapter, get_runtime
8
+
9
+ if TYPE_CHECKING:
10
+ from src.app.providers.adapters.base import BaseAdapter
11
+ from src.app.providers.profiles import ProviderProfile
12
+ from src.app.providers.runtimes.base import BaseRuntime
13
 
14
 
15
  class ResolvedProvider:
16
  """A fully resolved provider: runtime + adapter, ready to execute."""
17
 
18
+ def __init__(
19
+ self,
20
+ profile: ProviderProfile,
21
+ runtime: BaseRuntime,
22
+ adapter: BaseAdapter,
23
+ ) -> None:
24
  self.profile = profile
25
  self.runtime = runtime
26
  self.adapter = adapter
src/app/providers/runtimes/api_runtime.py CHANGED
@@ -2,12 +2,15 @@
2
 
3
  from __future__ import annotations
4
 
5
- from pathlib import Path
6
- from typing import Any
7
 
8
- from src.app.domain.models import RawProviderPayload
9
  from src.app.providers.runtimes.base import BaseRuntime
10
 
 
 
 
 
 
11
 
12
  class ApiRuntime(BaseRuntime):
13
  """Runtime that calls an external API endpoint.
 
2
 
3
  from __future__ import annotations
4
 
5
+ from typing import TYPE_CHECKING, Any
 
6
 
 
7
  from src.app.providers.runtimes.base import BaseRuntime
8
 
9
+ if TYPE_CHECKING:
10
+ from pathlib import Path
11
+
12
+ from src.app.domain.models import RawProviderPayload
13
+
14
 
15
  class ApiRuntime(BaseRuntime):
16
  """Runtime that calls an external API endpoint.
src/app/providers/runtimes/base.py CHANGED
@@ -3,10 +3,12 @@
3
  from __future__ import annotations
4
 
5
  from abc import ABC, abstractmethod
6
- from pathlib import Path
7
- from typing import Any
8
 
9
- from src.app.domain.models import RawProviderPayload
 
 
 
10
 
11
 
12
  class BaseRuntime(ABC):
 
3
  from __future__ import annotations
4
 
5
  from abc import ABC, abstractmethod
6
+ from typing import TYPE_CHECKING, Any
 
7
 
8
+ if TYPE_CHECKING:
9
+ from pathlib import Path
10
+
11
+ from src.app.domain.models import RawProviderPayload
12
 
13
 
14
  class BaseRuntime(ABC):
src/app/providers/runtimes/hub_runtime.py CHANGED
@@ -2,12 +2,15 @@
2
 
3
  from __future__ import annotations
4
 
5
- from pathlib import Path
6
- from typing import Any
7
 
8
- from src.app.domain.models import RawProviderPayload
9
  from src.app.providers.runtimes.base import BaseRuntime
10
 
 
 
 
 
 
11
 
12
  class HubRuntime(BaseRuntime):
13
  """Runtime for models loaded from the Hugging Face Hub.
 
2
 
3
  from __future__ import annotations
4
 
5
+ from typing import TYPE_CHECKING, Any
 
6
 
 
7
  from src.app.providers.runtimes.base import BaseRuntime
8
 
9
+ if TYPE_CHECKING:
10
+ from pathlib import Path
11
+
12
+ from src.app.domain.models import RawProviderPayload
13
+
14
 
15
  class HubRuntime(BaseRuntime):
16
  """Runtime for models loaded from the Hugging Face Hub.
src/app/providers/runtimes/local_runtime.py CHANGED
@@ -2,12 +2,15 @@
2
 
3
  from __future__ import annotations
4
 
5
- from pathlib import Path
6
- from typing import Any
7
 
8
- from src.app.domain.models import RawProviderPayload
9
  from src.app.providers.runtimes.base import BaseRuntime
10
 
 
 
 
 
 
11
 
12
  class LocalRuntime(BaseRuntime):
13
  """Runtime for locally installed models.
 
2
 
3
  from __future__ import annotations
4
 
5
+ from typing import TYPE_CHECKING, Any
 
6
 
 
7
  from src.app.providers.runtimes.base import BaseRuntime
8
 
9
+ if TYPE_CHECKING:
10
+ from pathlib import Path
11
+
12
+ from src.app.domain.models import RawProviderPayload
13
+
14
 
15
  class LocalRuntime(BaseRuntime):
16
  """Runtime for locally installed models.
src/app/serializers/alto_xml.py CHANGED
@@ -29,11 +29,15 @@ Coordinate mapping:
29
 
30
  from __future__ import annotations
31
 
 
 
32
  from lxml import etree
33
 
34
- from src.app.domain.models import CanonicalDocument, Page, TextLine, TextRegion, Word
35
  from src.app.geometry.quantization import RoundingStrategy, quantize_bbox
36
 
 
 
 
37
  ALTO_NS = "http://www.loc.gov/standards/alto/ns-v4#"
38
  XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
39
  SCHEMA_LOCATION = (
 
29
 
30
  from __future__ import annotations
31
 
32
+ from typing import TYPE_CHECKING
33
+
34
  from lxml import etree
35
 
 
36
  from src.app.geometry.quantization import RoundingStrategy, quantize_bbox
37
 
38
+ if TYPE_CHECKING:
39
+ from src.app.domain.models import CanonicalDocument, Page, TextLine, TextRegion, Word
40
+
41
  ALTO_NS = "http://www.loc.gov/standards/alto/ns-v4#"
42
  XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
43
  SCHEMA_LOCATION = (
src/app/serializers/page_xml.py CHANGED
@@ -26,21 +26,24 @@ Namespace: http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15
26
 
27
  from __future__ import annotations
28
 
29
- from datetime import datetime, timezone
 
30
 
31
  from lxml import etree
32
 
33
- from src.app.domain.models import (
34
- CanonicalDocument,
35
- Page,
36
- TextLine,
37
- TextRegion,
38
- Word,
39
- )
40
  from src.app.domain.models.status import BlockRole
41
  from src.app.geometry.polygon import bbox_to_polygon
42
  from src.app.geometry.quantization import RoundingStrategy, quantize_value
43
 
 
 
 
 
 
 
 
 
 
44
  PAGE_NS = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
45
  XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
46
  SCHEMA_LOCATION = (
@@ -119,9 +122,9 @@ def _build_page_tree(
119
  creator = etree.SubElement(metadata, f"{{{PAGE_NS}}}Creator")
120
  creator.text = "XmLLM"
121
  created = etree.SubElement(metadata, f"{{{PAGE_NS}}}Created")
122
- created.text = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")
123
  last_change = etree.SubElement(metadata, f"{{{PAGE_NS}}}LastChange")
124
- last_change.text = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%S")
125
 
126
  # One <Page> per canonical page (PAGE XML is per-page, but we handle multi-page)
127
  for page in doc.pages:
@@ -226,10 +229,7 @@ def _add_coords(
226
  """
227
  coords = etree.SubElement(parent, f"{{{PAGE_NS}}}Coords")
228
 
229
- if polygon and len(polygon) >= 3:
230
- points = polygon
231
- else:
232
- points = bbox_to_polygon(bbox)
233
 
234
  points_str = " ".join(
235
  f"{quantize_value(x, rounding)},{quantize_value(y, rounding)}"
 
26
 
27
  from __future__ import annotations
28
 
29
+ from datetime import UTC, datetime
30
+ from typing import TYPE_CHECKING
31
 
32
  from lxml import etree
33
 
 
 
 
 
 
 
 
34
  from src.app.domain.models.status import BlockRole
35
  from src.app.geometry.polygon import bbox_to_polygon
36
  from src.app.geometry.quantization import RoundingStrategy, quantize_value
37
 
38
+ if TYPE_CHECKING:
39
+ from src.app.domain.models import (
40
+ CanonicalDocument,
41
+ Page,
42
+ TextLine,
43
+ TextRegion,
44
+ Word,
45
+ )
46
+
47
  PAGE_NS = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
48
  XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
49
  SCHEMA_LOCATION = (
 
122
  creator = etree.SubElement(metadata, f"{{{PAGE_NS}}}Creator")
123
  creator.text = "XmLLM"
124
  created = etree.SubElement(metadata, f"{{{PAGE_NS}}}Created")
125
+ created.text = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%S")
126
  last_change = etree.SubElement(metadata, f"{{{PAGE_NS}}}LastChange")
127
+ last_change.text = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%S")
128
 
129
  # One <Page> per canonical page (PAGE XML is per-page, but we handle multi-page)
130
  for page in doc.pages:
 
229
  """
230
  coords = etree.SubElement(parent, f"{{{PAGE_NS}}}Coords")
231
 
232
+ points = polygon if polygon and len(polygon) >= 3 else bbox_to_polygon(bbox)
 
 
 
233
 
234
  points_str = " ".join(
235
  f"{quantize_value(x, rounding)},{quantize_value(y, rounding)}"
src/app/settings.py CHANGED
@@ -7,14 +7,15 @@ via the SPACE_ID environment variable.
7
  from __future__ import annotations
8
 
9
  import os
10
- from enum import Enum
 
11
  from pathlib import Path
12
 
13
  from pydantic import Field
14
  from pydantic_settings import BaseSettings, SettingsConfigDict
15
 
16
 
17
- class AppMode(str, Enum):
18
  LOCAL = "local"
19
  SPACE = "space"
20
 
@@ -113,6 +114,7 @@ class Settings(BaseSettings):
113
  os.environ.setdefault("HF_HOME", str(self.hf_home))
114
 
115
 
 
116
  def get_settings() -> Settings:
117
- """Factory for dependency injection (FastAPI Depends)."""
118
  return Settings()
 
7
  from __future__ import annotations
8
 
9
  import os
10
+ from enum import StrEnum
11
+ from functools import lru_cache
12
  from pathlib import Path
13
 
14
  from pydantic import Field
15
  from pydantic_settings import BaseSettings, SettingsConfigDict
16
 
17
 
18
+ class AppMode(StrEnum):
19
  LOCAL = "local"
20
  SPACE = "space"
21
 
 
114
  os.environ.setdefault("HF_HOME", str(self.hf_home))
115
 
116
 
117
+ @lru_cache(maxsize=1)
118
  def get_settings() -> Settings:
119
+ """Factory for dependency injection (FastAPI Depends). Cached singleton."""
120
  return Settings()
src/app/validators/export_eligibility_validator.py CHANGED
@@ -6,7 +6,8 @@ an ExportEligibility decision for the whole document.
6
 
7
  from __future__ import annotations
8
 
9
- from src.app.domain.models import CanonicalDocument
 
10
  from src.app.domain.models.readiness import ExportEligibility
11
  from src.app.domain.models.status import ReadinessLevel
12
  from src.app.policies.document_policy import DocumentPolicy
@@ -15,6 +16,9 @@ from src.app.validators.readiness_validator import (
15
  compute_page_pagexml_readiness,
16
  )
17
 
 
 
 
18
 
19
  def compute_export_eligibility(
20
  doc: CanonicalDocument,
@@ -70,10 +74,10 @@ def _aggregate_levels(levels: list[ReadinessLevel]) -> ReadinessLevel:
70
  if not levels:
71
  return ReadinessLevel.NONE
72
 
73
- if all(l == ReadinessLevel.FULL for l in levels):
74
  return ReadinessLevel.FULL
75
- if all(l == ReadinessLevel.NONE for l in levels):
76
  return ReadinessLevel.NONE
77
- if any(l in (ReadinessLevel.FULL, ReadinessLevel.PARTIAL) for l in levels):
78
  return ReadinessLevel.PARTIAL
79
  return ReadinessLevel.DEGRADED
 
6
 
7
  from __future__ import annotations
8
 
9
+ from typing import TYPE_CHECKING
10
+
11
  from src.app.domain.models.readiness import ExportEligibility
12
  from src.app.domain.models.status import ReadinessLevel
13
  from src.app.policies.document_policy import DocumentPolicy
 
16
  compute_page_pagexml_readiness,
17
  )
18
 
19
+ if TYPE_CHECKING:
20
+ from src.app.domain.models import CanonicalDocument
21
+
22
 
23
  def compute_export_eligibility(
24
  doc: CanonicalDocument,
 
74
  if not levels:
75
  return ReadinessLevel.NONE
76
 
77
+ if all(lv == ReadinessLevel.FULL for lv in levels):
78
  return ReadinessLevel.FULL
79
+ if all(lv == ReadinessLevel.NONE for lv in levels):
80
  return ReadinessLevel.NONE
81
+ if any(lv in (ReadinessLevel.FULL, ReadinessLevel.PARTIAL) for lv in levels):
82
  return ReadinessLevel.PARTIAL
83
  return ReadinessLevel.DEGRADED