Spaces:
Running
Running
Merge pull request #5 from maribakulj/claude/repo-audit-deployment-71Dac
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .github/workflows/ci.yml +29 -0
- Dockerfile +7 -0
- README.md +12 -0
- pyproject.toml +1 -1
- src/app/api/__init__.py +5 -1
- src/app/api/routes_exports.py +4 -1
- src/app/api/routes_jobs.py +6 -3
- src/app/domain/errors/__init__.py +6 -3
- src/app/domain/models/__init__.py +1 -1
- src/app/domain/models/canonical_document.py +3 -4
- src/app/domain/models/geometry.py +0 -1
- src/app/domain/models/raw_payload.py +2 -2
- src/app/domain/models/status.py +11 -12
- src/app/enrichers/__init__.py +4 -1
- src/app/enrichers/bbox_repair_light.py +6 -2
- src/app/enrichers/hyphenation_basic.py +6 -2
- src/app/enrichers/lang_propagation.py +6 -2
- src/app/enrichers/polygon_to_bbox.py +7 -3
- src/app/enrichers/reading_order_simple.py +6 -2
- src/app/enrichers/text_consistency.py +6 -2
- src/app/geometry/baseline.py +0 -1
- src/app/geometry/bbox.py +0 -1
- src/app/geometry/normalization.py +5 -1
- src/app/geometry/polygon.py +3 -1
- src/app/geometry/quantization.py +5 -3
- src/app/geometry/transforms.py +4 -3
- src/app/jobs/events.py +11 -8
- src/app/jobs/models.py +4 -4
- src/app/jobs/service.py +17 -14
- src/app/main.py +5 -2
- src/app/normalization/pipeline.py +6 -2
- src/app/persistence/db.py +13 -6
- src/app/persistence/file_store.py +19 -9
- src/app/policies/document_policy.py +2 -2
- src/app/policies/export_policy.py +8 -2
- src/app/providers/adapters/base.py +4 -2
- src/app/providers/adapters/line_box_json.py +5 -1
- src/app/providers/adapters/text_only.py +11 -3
- src/app/providers/adapters/word_box_json.py +7 -2
- src/app/providers/profiles.py +7 -7
- src/app/providers/registry.py +6 -2
- src/app/providers/resolver.py +13 -4
- src/app/providers/runtimes/api_runtime.py +6 -3
- src/app/providers/runtimes/base.py +5 -3
- src/app/providers/runtimes/hub_runtime.py +6 -3
- src/app/providers/runtimes/local_runtime.py +6 -3
- src/app/serializers/alto_xml.py +5 -1
- src/app/serializers/page_xml.py +14 -14
- src/app/settings.py +5 -3
- src/app/validators/export_eligibility_validator.py +8 -4
.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: CI
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [main]
|
| 6 |
+
pull_request:
|
| 7 |
+
branches: [main]
|
| 8 |
+
|
| 9 |
+
jobs:
|
| 10 |
+
lint:
|
| 11 |
+
runs-on: ubuntu-latest
|
| 12 |
+
steps:
|
| 13 |
+
- uses: actions/checkout@v4
|
| 14 |
+
- uses: actions/setup-python@v5
|
| 15 |
+
with:
|
| 16 |
+
python-version: "3.11"
|
| 17 |
+
- run: pip install ruff
|
| 18 |
+
- run: ruff check src/ tests/
|
| 19 |
+
|
| 20 |
+
test:
|
| 21 |
+
runs-on: ubuntu-latest
|
| 22 |
+
steps:
|
| 23 |
+
- uses: actions/checkout@v4
|
| 24 |
+
- uses: actions/setup-python@v5
|
| 25 |
+
with:
|
| 26 |
+
python-version: "3.11"
|
| 27 |
+
- run: pip install -e ".[dev]"
|
| 28 |
+
- run: pytest --tb=short -q
|
| 29 |
+
- run: pytest --cov=src --cov-report=term-missing --cov-fail-under=90
|
Dockerfile
CHANGED
|
@@ -16,6 +16,13 @@ COPY src/ src/
|
|
| 16 |
COPY frontend/ frontend/
|
| 17 |
COPY AGENTS.md ./
|
| 18 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
# Default storage root — overridden in Space mode via /data
|
| 20 |
ENV STORAGE_ROOT=/app/data
|
| 21 |
ENV HOST=0.0.0.0
|
|
|
|
| 16 |
COPY frontend/ frontend/
|
| 17 |
COPY AGENTS.md ./
|
| 18 |
|
| 19 |
+
# HF Spaces requires a non-root user
|
| 20 |
+
RUN useradd -m -u 1000 appuser \
|
| 21 |
+
&& mkdir -p /app/data /data \
|
| 22 |
+
&& chown -R appuser:appuser /app /data
|
| 23 |
+
|
| 24 |
+
USER appuser
|
| 25 |
+
|
| 26 |
# Default storage root — overridden in Space mode via /data
|
| 27 |
ENV STORAGE_ROOT=/app/data
|
| 28 |
ENV HOST=0.0.0.0
|
README.md
CHANGED
|
@@ -1,3 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
# XmLLM
|
| 2 |
|
| 3 |
**Canonical-first document structure engine** that converts OCR/VLM provider outputs into validated **ALTO XML** and **PAGE XML**, via an internal canonical representation.
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: XmLLM
|
| 3 |
+
emoji: "\U0001F4C4"
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
+
pinned: false
|
| 9 |
+
license: apache-2.0
|
| 10 |
+
short_description: "Document structure engine: OCR output to ALTO XML & PAGE XML"
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
# XmLLM
|
| 14 |
|
| 15 |
**Canonical-first document structure engine** that converts OCR/VLM provider outputs into validated **ALTO XML** and **PAGE XML**, via an internal canonical representation.
|
pyproject.toml
CHANGED
|
@@ -54,7 +54,7 @@ line-length = 100
|
|
| 54 |
src = ["src", "tests"]
|
| 55 |
|
| 56 |
[tool.ruff.lint]
|
| 57 |
-
select = ["E", "F", "I", "N", "W", "UP", "B", "SIM"
|
| 58 |
|
| 59 |
[tool.mypy]
|
| 60 |
python_version = "3.11"
|
|
|
|
| 54 |
src = ["src", "tests"]
|
| 55 |
|
| 56 |
[tool.ruff.lint]
|
| 57 |
+
select = ["E", "F", "I", "N", "W", "UP", "B", "SIM"]
|
| 58 |
|
| 59 |
[tool.mypy]
|
| 60 |
python_version = "3.11"
|
src/app/api/__init__.py
CHANGED
|
@@ -6,10 +6,14 @@ JobService, scoped to the application lifespan.
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
|
|
|
|
|
|
| 9 |
from src.app.jobs.service import JobService
|
| 10 |
from src.app.persistence.db import Database
|
| 11 |
from src.app.persistence.file_store import FileStore
|
| 12 |
-
|
|
|
|
|
|
|
| 13 |
|
| 14 |
# Module-level singletons, initialized during lifespan startup.
|
| 15 |
_db: Database | None = None
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
+
|
| 11 |
from src.app.jobs.service import JobService
|
| 12 |
from src.app.persistence.db import Database
|
| 13 |
from src.app.persistence.file_store import FileStore
|
| 14 |
+
|
| 15 |
+
if TYPE_CHECKING:
|
| 16 |
+
from src.app.settings import Settings
|
| 17 |
|
| 18 |
# Module-level singletons, initialized during lifespan startup.
|
| 19 |
_db: Database | None = None
|
src/app/api/routes_exports.py
CHANGED
|
@@ -28,7 +28,10 @@ async def get_canonical(job_id: str) -> dict[str, Any]:
|
|
| 28 |
store = get_file_store()
|
| 29 |
data = store.load_canonical(job_id)
|
| 30 |
if data is None:
|
| 31 |
-
raise HTTPException(
|
|
|
|
|
|
|
|
|
|
| 32 |
return data
|
| 33 |
|
| 34 |
|
|
|
|
| 28 |
store = get_file_store()
|
| 29 |
data = store.load_canonical(job_id)
|
| 30 |
if data is None:
|
| 31 |
+
raise HTTPException(
|
| 32 |
+
status_code=404,
|
| 33 |
+
detail=f"Canonical document not found for job '{job_id}'",
|
| 34 |
+
)
|
| 35 |
return data
|
| 36 |
|
| 37 |
|
src/app/api/routes_jobs.py
CHANGED
|
@@ -29,12 +29,15 @@ async def create_job(
|
|
| 29 |
"""
|
| 30 |
svc = get_job_service()
|
| 31 |
|
| 32 |
-
# Read raw payload
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
| 34 |
try:
|
| 35 |
payload_data = json.loads(content)
|
| 36 |
except json.JSONDecodeError as e:
|
| 37 |
-
raise HTTPException(status_code=422, detail=f"Invalid JSON payload: {e}")
|
| 38 |
|
| 39 |
raw = RawProviderPayload(
|
| 40 |
provider_id=provider_id,
|
|
|
|
| 29 |
"""
|
| 30 |
svc = get_job_service()
|
| 31 |
|
| 32 |
+
# Read raw payload (limit to 50 MB to prevent DoS)
|
| 33 |
+
max_size = 52_428_800
|
| 34 |
+
content = await raw_payload_file.read(max_size + 1)
|
| 35 |
+
if len(content) > max_size:
|
| 36 |
+
raise HTTPException(status_code=413, detail="Payload too large (max 50 MB)")
|
| 37 |
try:
|
| 38 |
payload_data = json.loads(content)
|
| 39 |
except json.JSONDecodeError as e:
|
| 40 |
+
raise HTTPException(status_code=422, detail=f"Invalid JSON payload: {e}") from None
|
| 41 |
|
| 42 |
raw = RawProviderPayload(
|
| 43 |
provider_id=provider_id,
|
src/app/domain/errors/__init__.py
CHANGED
|
@@ -2,12 +2,12 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from enum import
|
| 6 |
|
| 7 |
from pydantic import BaseModel, ConfigDict, Field
|
| 8 |
|
| 9 |
|
| 10 |
-
class Severity(
|
| 11 |
"""Severity level for validation entries."""
|
| 12 |
|
| 13 |
ERROR = "error"
|
|
@@ -22,7 +22,10 @@ class ValidationEntry(BaseModel):
|
|
| 22 |
|
| 23 |
validator: str = Field(min_length=1)
|
| 24 |
severity: Severity
|
| 25 |
-
path: str = Field(
|
|
|
|
|
|
|
|
|
|
| 26 |
message: str = Field(min_length=1)
|
| 27 |
code: str | None = None
|
| 28 |
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from enum import StrEnum
|
| 6 |
|
| 7 |
from pydantic import BaseModel, ConfigDict, Field
|
| 8 |
|
| 9 |
|
| 10 |
+
class Severity(StrEnum):
|
| 11 |
"""Severity level for validation entries."""
|
| 12 |
|
| 13 |
ERROR = "error"
|
|
|
|
| 22 |
|
| 23 |
validator: str = Field(min_length=1)
|
| 24 |
severity: Severity
|
| 25 |
+
path: str = Field(
|
| 26 |
+
min_length=1,
|
| 27 |
+
description="Path in the document, e.g. pages[0].text_regions[1].lines[3]",
|
| 28 |
+
)
|
| 29 |
message: str = Field(min_length=1)
|
| 30 |
code: str | None = None
|
| 31 |
|
src/app/domain/models/__init__.py
CHANGED
|
@@ -17,8 +17,8 @@ from src.app.domain.models.canonical_document import (
|
|
| 17 |
Word,
|
| 18 |
)
|
| 19 |
from src.app.domain.models.geometry import (
|
| 20 |
-
BBox,
|
| 21 |
Baseline,
|
|
|
|
| 22 |
ClipRect,
|
| 23 |
Geometry,
|
| 24 |
GeometryContext,
|
|
|
|
| 17 |
Word,
|
| 18 |
)
|
| 19 |
from src.app.domain.models.geometry import (
|
|
|
|
| 20 |
Baseline,
|
| 21 |
+
BBox,
|
| 22 |
ClipRect,
|
| 23 |
Geometry,
|
| 24 |
GeometryContext,
|
src/app/domain/models/canonical_document.py
CHANGED
|
@@ -11,10 +11,10 @@ Every node carries geometry + provenance. No exceptions.
|
|
| 11 |
|
| 12 |
from __future__ import annotations
|
| 13 |
|
| 14 |
-
from datetime import
|
| 15 |
from typing import Any
|
| 16 |
|
| 17 |
-
from pydantic import BaseModel, ConfigDict, Field,
|
| 18 |
|
| 19 |
from src.app.domain.models.geometry import Geometry
|
| 20 |
from src.app.domain.models.provenance import Provenance
|
|
@@ -33,7 +33,6 @@ from src.app.domain.models.status import (
|
|
| 33 |
Unit,
|
| 34 |
)
|
| 35 |
|
| 36 |
-
|
| 37 |
# -- Source ------------------------------------------------------------------
|
| 38 |
|
| 39 |
|
|
@@ -238,7 +237,7 @@ class CanonicalDocument(BaseModel):
|
|
| 238 |
schema_version: str = Field(default="1.0.0", pattern=r"^\d+\.\d+\.\d+$")
|
| 239 |
document_id: str = Field(min_length=1)
|
| 240 |
source: Source
|
| 241 |
-
created_at: datetime = Field(default_factory=lambda: datetime.now(
|
| 242 |
|
| 243 |
pages: list[Page] = Field(min_length=1)
|
| 244 |
|
|
|
|
| 11 |
|
| 12 |
from __future__ import annotations
|
| 13 |
|
| 14 |
+
from datetime import UTC, datetime
|
| 15 |
from typing import Any
|
| 16 |
|
| 17 |
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
| 18 |
|
| 19 |
from src.app.domain.models.geometry import Geometry
|
| 20 |
from src.app.domain.models.provenance import Provenance
|
|
|
|
| 33 |
Unit,
|
| 34 |
)
|
| 35 |
|
|
|
|
| 36 |
# -- Source ------------------------------------------------------------------
|
| 37 |
|
| 38 |
|
|
|
|
| 237 |
schema_version: str = Field(default="1.0.0", pattern=r"^\d+\.\d+\.\d+$")
|
| 238 |
document_id: str = Field(min_length=1)
|
| 239 |
source: Source
|
| 240 |
+
created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
| 241 |
|
| 242 |
pages: list[Page] = Field(min_length=1)
|
| 243 |
|
src/app/domain/models/geometry.py
CHANGED
|
@@ -15,7 +15,6 @@ from pydantic import BaseModel, ConfigDict, Field, model_validator
|
|
| 15 |
|
| 16 |
from src.app.domain.models.status import CoordinateOrigin, GeometryStatus, Unit
|
| 17 |
|
| 18 |
-
|
| 19 |
# -- Primitives --------------------------------------------------------------
|
| 20 |
|
| 21 |
|
|
|
|
| 15 |
|
| 16 |
from src.app.domain.models.status import CoordinateOrigin, GeometryStatus, Unit
|
| 17 |
|
|
|
|
| 18 |
# -- Primitives --------------------------------------------------------------
|
| 19 |
|
| 20 |
|
src/app/domain/models/raw_payload.py
CHANGED
|
@@ -7,7 +7,7 @@ It is never used for export or rendering.
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
-
from datetime import
|
| 11 |
from typing import Any
|
| 12 |
|
| 13 |
from pydantic import BaseModel, ConfigDict, Field
|
|
@@ -26,7 +26,7 @@ class RawProviderPayload(BaseModel):
|
|
| 26 |
payload: dict[str, Any] | list[Any]
|
| 27 |
"""The raw JSON-serialisable output from the provider."""
|
| 28 |
|
| 29 |
-
received_at: datetime = Field(default_factory=lambda: datetime.now(
|
| 30 |
|
| 31 |
image_width: int | None = Field(default=None, gt=0)
|
| 32 |
image_height: int | None = Field(default=None, gt=0)
|
|
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
+
from datetime import UTC, datetime
|
| 11 |
from typing import Any
|
| 12 |
|
| 13 |
from pydantic import BaseModel, ConfigDict, Field
|
|
|
|
| 26 |
payload: dict[str, Any] | list[Any]
|
| 27 |
"""The raw JSON-serialisable output from the provider."""
|
| 28 |
|
| 29 |
+
received_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
| 30 |
|
| 31 |
image_width: int | None = Field(default=None, gt=0)
|
| 32 |
image_height: int | None = Field(default=None, gt=0)
|
src/app/domain/models/status.py
CHANGED
|
@@ -6,13 +6,12 @@ and viewer projection models. They carry no logic — only values.
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
-
from enum import
|
| 10 |
-
|
| 11 |
|
| 12 |
# -- Geometry ----------------------------------------------------------------
|
| 13 |
|
| 14 |
|
| 15 |
-
class GeometryStatus(
|
| 16 |
"""How a piece of geometry was obtained."""
|
| 17 |
|
| 18 |
EXACT = "exact"
|
|
@@ -21,13 +20,13 @@ class GeometryStatus(str, Enum):
|
|
| 21 |
UNKNOWN = "unknown"
|
| 22 |
|
| 23 |
|
| 24 |
-
class CoordinateOrigin(
|
| 25 |
"""Origin of the coordinate system. Always top_left in canonical model."""
|
| 26 |
|
| 27 |
TOP_LEFT = "top_left"
|
| 28 |
|
| 29 |
|
| 30 |
-
class Unit(
|
| 31 |
"""Measurement unit. Always px in canonical model."""
|
| 32 |
|
| 33 |
PX = "px"
|
|
@@ -36,7 +35,7 @@ class Unit(str, Enum):
|
|
| 36 |
# -- Provenance --------------------------------------------------------------
|
| 37 |
|
| 38 |
|
| 39 |
-
class EvidenceType(
|
| 40 |
"""How a piece of data was produced."""
|
| 41 |
|
| 42 |
PROVIDER_NATIVE = "provider_native"
|
|
@@ -48,7 +47,7 @@ class EvidenceType(str, Enum):
|
|
| 48 |
# -- Document structure ------------------------------------------------------
|
| 49 |
|
| 50 |
|
| 51 |
-
class BlockRole(
|
| 52 |
"""Semantic role of a text block within the page."""
|
| 53 |
|
| 54 |
BODY = "body"
|
|
@@ -62,7 +61,7 @@ class BlockRole(str, Enum):
|
|
| 62 |
OTHER = "other"
|
| 63 |
|
| 64 |
|
| 65 |
-
class NonTextKind(
|
| 66 |
"""Type of non-textual region."""
|
| 67 |
|
| 68 |
ILLUSTRATION = "illustration"
|
|
@@ -76,7 +75,7 @@ class NonTextKind(str, Enum):
|
|
| 76 |
# -- Source ------------------------------------------------------------------
|
| 77 |
|
| 78 |
|
| 79 |
-
class InputType(
|
| 80 |
"""Type of the original input document."""
|
| 81 |
|
| 82 |
IMAGE = "image"
|
|
@@ -92,7 +91,7 @@ class InputType(str, Enum):
|
|
| 92 |
# -- Readiness ---------------------------------------------------------------
|
| 93 |
|
| 94 |
|
| 95 |
-
class ReadinessLevel(
|
| 96 |
"""How ready a document / page / element is for export."""
|
| 97 |
|
| 98 |
FULL = "full"
|
|
@@ -101,7 +100,7 @@ class ReadinessLevel(str, Enum):
|
|
| 101 |
NONE = "none"
|
| 102 |
|
| 103 |
|
| 104 |
-
class MissingCapability(
|
| 105 |
"""Specific capabilities that may be missing for export readiness."""
|
| 106 |
|
| 107 |
PAGE_DIMENSIONS = "page_dimensions"
|
|
@@ -117,7 +116,7 @@ class MissingCapability(str, Enum):
|
|
| 117 |
# -- Overlay (viewer) --------------------------------------------------------
|
| 118 |
|
| 119 |
|
| 120 |
-
class OverlayLevel(
|
| 121 |
"""Granularity level for viewer overlays."""
|
| 122 |
|
| 123 |
BLOCK = "block"
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
+
from enum import StrEnum
|
|
|
|
| 10 |
|
| 11 |
# -- Geometry ----------------------------------------------------------------
|
| 12 |
|
| 13 |
|
| 14 |
+
class GeometryStatus(StrEnum):
|
| 15 |
"""How a piece of geometry was obtained."""
|
| 16 |
|
| 17 |
EXACT = "exact"
|
|
|
|
| 20 |
UNKNOWN = "unknown"
|
| 21 |
|
| 22 |
|
| 23 |
+
class CoordinateOrigin(StrEnum):
|
| 24 |
"""Origin of the coordinate system. Always top_left in canonical model."""
|
| 25 |
|
| 26 |
TOP_LEFT = "top_left"
|
| 27 |
|
| 28 |
|
| 29 |
+
class Unit(StrEnum):
|
| 30 |
"""Measurement unit. Always px in canonical model."""
|
| 31 |
|
| 32 |
PX = "px"
|
|
|
|
| 35 |
# -- Provenance --------------------------------------------------------------
|
| 36 |
|
| 37 |
|
| 38 |
+
class EvidenceType(StrEnum):
|
| 39 |
"""How a piece of data was produced."""
|
| 40 |
|
| 41 |
PROVIDER_NATIVE = "provider_native"
|
|
|
|
| 47 |
# -- Document structure ------------------------------------------------------
|
| 48 |
|
| 49 |
|
| 50 |
+
class BlockRole(StrEnum):
|
| 51 |
"""Semantic role of a text block within the page."""
|
| 52 |
|
| 53 |
BODY = "body"
|
|
|
|
| 61 |
OTHER = "other"
|
| 62 |
|
| 63 |
|
| 64 |
+
class NonTextKind(StrEnum):
|
| 65 |
"""Type of non-textual region."""
|
| 66 |
|
| 67 |
ILLUSTRATION = "illustration"
|
|
|
|
| 75 |
# -- Source ------------------------------------------------------------------
|
| 76 |
|
| 77 |
|
| 78 |
+
class InputType(StrEnum):
|
| 79 |
"""Type of the original input document."""
|
| 80 |
|
| 81 |
IMAGE = "image"
|
|
|
|
| 91 |
# -- Readiness ---------------------------------------------------------------
|
| 92 |
|
| 93 |
|
| 94 |
+
class ReadinessLevel(StrEnum):
|
| 95 |
"""How ready a document / page / element is for export."""
|
| 96 |
|
| 97 |
FULL = "full"
|
|
|
|
| 100 |
NONE = "none"
|
| 101 |
|
| 102 |
|
| 103 |
+
class MissingCapability(StrEnum):
|
| 104 |
"""Specific capabilities that may be missing for export readiness."""
|
| 105 |
|
| 106 |
PAGE_DIMENSIONS = "page_dimensions"
|
|
|
|
| 116 |
# -- Overlay (viewer) --------------------------------------------------------
|
| 117 |
|
| 118 |
|
| 119 |
+
class OverlayLevel(StrEnum):
|
| 120 |
"""Granularity level for viewer overlays."""
|
| 121 |
|
| 122 |
BLOCK = "block"
|
src/app/enrichers/__init__.py
CHANGED
|
@@ -13,10 +13,13 @@ Every enricher MUST:
|
|
| 13 |
from __future__ import annotations
|
| 14 |
|
| 15 |
from abc import ABC, abstractmethod
|
|
|
|
| 16 |
|
| 17 |
-
from src.app.domain.models import CanonicalDocument
|
| 18 |
from src.app.policies.document_policy import DocumentPolicy
|
| 19 |
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
class BaseEnricher(ABC):
|
| 22 |
"""Abstract base for enrichers."""
|
|
|
|
| 13 |
from __future__ import annotations
|
| 14 |
|
| 15 |
from abc import ABC, abstractmethod
|
| 16 |
+
from typing import TYPE_CHECKING
|
| 17 |
|
|
|
|
| 18 |
from src.app.policies.document_policy import DocumentPolicy
|
| 19 |
|
| 20 |
+
if TYPE_CHECKING:
|
| 21 |
+
from src.app.domain.models import CanonicalDocument
|
| 22 |
+
|
| 23 |
|
| 24 |
class BaseEnricher(ABC):
|
| 25 |
"""Abstract base for enrichers."""
|
src/app/enrichers/bbox_repair_light.py
CHANGED
|
@@ -8,12 +8,16 @@ Light repair only:
|
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
-
from
|
|
|
|
| 12 |
from src.app.domain.models.status import GeometryStatus
|
| 13 |
from src.app.enrichers import BaseEnricher
|
| 14 |
from src.app.geometry.bbox import contains
|
| 15 |
from src.app.geometry.transforms import clip_bbox_to_page
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
class BboxRepairLightEnricher(BaseEnricher):
|
|
|
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
+
from typing import TYPE_CHECKING
|
| 12 |
+
|
| 13 |
from src.app.domain.models.status import GeometryStatus
|
| 14 |
from src.app.enrichers import BaseEnricher
|
| 15 |
from src.app.geometry.bbox import contains
|
| 16 |
from src.app.geometry.transforms import clip_bbox_to_page
|
| 17 |
+
|
| 18 |
+
if TYPE_CHECKING:
|
| 19 |
+
from src.app.domain.models import CanonicalDocument
|
| 20 |
+
from src.app.policies.document_policy import DocumentPolicy
|
| 21 |
|
| 22 |
|
| 23 |
class BboxRepairLightEnricher(BaseEnricher):
|
src/app/enrichers/hyphenation_basic.py
CHANGED
|
@@ -6,9 +6,13 @@ a lowercase word, marks both as hyphenated with the combined full_form.
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
|
|
|
|
|
|
| 9 |
from src.app.domain.models import CanonicalDocument, Hyphenation, TextLine
|
| 10 |
from src.app.enrichers import BaseEnricher
|
| 11 |
-
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class HyphenationBasicEnricher(BaseEnricher):
|
|
@@ -52,7 +56,7 @@ class HyphenationBasicEnricher(BaseEnricher):
|
|
| 52 |
continue
|
| 53 |
|
| 54 |
last_word = line_a.words[-1]
|
| 55 |
-
first_word = line_b.words[
|
| 56 |
|
| 57 |
# Skip if already hyphenated
|
| 58 |
if last_word.hyphenation is not None:
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
+
|
| 11 |
from src.app.domain.models import CanonicalDocument, Hyphenation, TextLine
|
| 12 |
from src.app.enrichers import BaseEnricher
|
| 13 |
+
|
| 14 |
+
if TYPE_CHECKING:
|
| 15 |
+
from src.app.policies.document_policy import DocumentPolicy
|
| 16 |
|
| 17 |
|
| 18 |
class HyphenationBasicEnricher(BaseEnricher):
|
|
|
|
| 56 |
continue
|
| 57 |
|
| 58 |
last_word = line_a.words[-1]
|
| 59 |
+
first_word = line_b.words[0]
|
| 60 |
|
| 61 |
# Skip if already hyphenated
|
| 62 |
if last_word.hyphenation is not None:
|
src/app/enrichers/lang_propagation.py
CHANGED
|
@@ -6,9 +6,13 @@ propagated downward. The word's provenance is not changed — only lang is set.
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
-
from
|
|
|
|
| 10 |
from src.app.enrichers import BaseEnricher
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
class LangPropagationEnricher(BaseEnricher):
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
+
|
| 11 |
from src.app.enrichers import BaseEnricher
|
| 12 |
+
|
| 13 |
+
if TYPE_CHECKING:
|
| 14 |
+
from src.app.domain.models import CanonicalDocument
|
| 15 |
+
from src.app.policies.document_policy import DocumentPolicy
|
| 16 |
|
| 17 |
|
| 18 |
class LangPropagationEnricher(BaseEnricher):
|
src/app/enrichers/polygon_to_bbox.py
CHANGED
|
@@ -6,11 +6,15 @@ axis-aligned bounding box and marks the geometry as 'inferred'.
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
-
from
|
| 10 |
-
|
|
|
|
| 11 |
from src.app.enrichers import BaseEnricher
|
| 12 |
from src.app.geometry.polygon import polygon_to_bbox as _poly_to_bbox
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
class PolygonToBboxEnricher(BaseEnricher):
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
+
|
| 11 |
+
from src.app.domain.models.status import GeometryStatus
|
| 12 |
from src.app.enrichers import BaseEnricher
|
| 13 |
from src.app.geometry.polygon import polygon_to_bbox as _poly_to_bbox
|
| 14 |
+
|
| 15 |
+
if TYPE_CHECKING:
|
| 16 |
+
from src.app.domain.models import CanonicalDocument
|
| 17 |
+
from src.app.policies.document_policy import DocumentPolicy
|
| 18 |
|
| 19 |
|
| 20 |
class PolygonToBboxEnricher(BaseEnricher):
|
src/app/enrichers/reading_order_simple.py
CHANGED
|
@@ -6,10 +6,14 @@ left-to-right using the center of each region's bbox.
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
-
from
|
|
|
|
| 10 |
from src.app.enrichers import BaseEnricher
|
| 11 |
from src.app.geometry.bbox import center
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class ReadingOrderSimpleEnricher(BaseEnricher):
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
+
|
| 11 |
from src.app.enrichers import BaseEnricher
|
| 12 |
from src.app.geometry.bbox import center
|
| 13 |
+
|
| 14 |
+
if TYPE_CHECKING:
|
| 15 |
+
from src.app.domain.models import CanonicalDocument
|
| 16 |
+
from src.app.policies.document_policy import DocumentPolicy
|
| 17 |
|
| 18 |
|
| 19 |
class ReadingOrderSimpleEnricher(BaseEnricher):
|
src/app/enrichers/text_consistency.py
CHANGED
|
@@ -7,9 +7,13 @@ This enricher does NOT modify text — it only adds warnings.
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
-
from
|
|
|
|
| 11 |
from src.app.enrichers import BaseEnricher
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
class TextConsistencyEnricher(BaseEnricher):
|
|
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
+
from typing import TYPE_CHECKING
|
| 11 |
+
|
| 12 |
from src.app.enrichers import BaseEnricher
|
| 13 |
+
|
| 14 |
+
if TYPE_CHECKING:
|
| 15 |
+
from src.app.domain.models import CanonicalDocument
|
| 16 |
+
from src.app.policies.document_policy import DocumentPolicy
|
| 17 |
|
| 18 |
|
| 19 |
class TextConsistencyEnricher(BaseEnricher):
|
src/app/geometry/baseline.py
CHANGED
|
@@ -5,7 +5,6 @@ A baseline is defined by a sequence of points, typically from left to right.
|
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
|
| 8 |
-
|
| 9 |
BaselinePoints = list[tuple[float, float]]
|
| 10 |
|
| 11 |
|
|
|
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
|
|
|
|
| 8 |
BaselinePoints = list[tuple[float, float]]
|
| 9 |
|
| 10 |
|
src/app/geometry/bbox.py
CHANGED
|
@@ -6,7 +6,6 @@ x = left edge, y = top edge, width > 0, height > 0.
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
-
|
| 10 |
# Type alias for the canonical bbox tuple
|
| 11 |
BBoxTuple = tuple[float, float, float, float]
|
| 12 |
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
|
|
|
| 9 |
# Type alias for the canonical bbox tuple
|
| 10 |
BBoxTuple = tuple[float, float, float, float]
|
| 11 |
|
src/app/geometry/normalization.py
CHANGED
|
@@ -7,9 +7,13 @@ always (x, y, width, height) with origin at top_left, unit px.
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
-
from
|
|
|
|
| 11 |
from src.app.geometry.polygon import PolygonPoints, polygon_to_bbox
|
| 12 |
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
def xyxy_to_xywh(xyxy: tuple[float, float, float, float]) -> BBoxTuple:
|
| 15 |
"""Convert (x1, y1, x2, y2) to canonical (x, y, width, height).
|
|
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
+
from typing import TYPE_CHECKING
|
| 11 |
+
|
| 12 |
from src.app.geometry.polygon import PolygonPoints, polygon_to_bbox
|
| 13 |
|
| 14 |
+
if TYPE_CHECKING:
|
| 15 |
+
from src.app.geometry.bbox import BBoxTuple
|
| 16 |
+
|
| 17 |
|
| 18 |
def xyxy_to_xywh(xyxy: tuple[float, float, float, float]) -> BBoxTuple:
|
| 19 |
"""Convert (x1, y1, x2, y2) to canonical (x, y, width, height).
|
src/app/geometry/polygon.py
CHANGED
|
@@ -5,8 +5,10 @@ Polygons are represented as list[tuple[float, float]] — ordered (x, y) vertice
|
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
|
| 8 |
-
from
|
| 9 |
|
|
|
|
|
|
|
| 10 |
|
| 11 |
PolygonPoints = list[tuple[float, float]]
|
| 12 |
|
|
|
|
| 5 |
|
| 6 |
from __future__ import annotations
|
| 7 |
|
| 8 |
+
from typing import TYPE_CHECKING
|
| 9 |
|
| 10 |
+
if TYPE_CHECKING:
|
| 11 |
+
from src.app.geometry.bbox import BBoxTuple
|
| 12 |
|
| 13 |
PolygonPoints = list[tuple[float, float]]
|
| 14 |
|
src/app/geometry/quantization.py
CHANGED
|
@@ -8,12 +8,14 @@ rounding and tolerance for containment checks.
|
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
import math
|
| 11 |
-
from enum import
|
|
|
|
| 12 |
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
|
| 16 |
-
class RoundingStrategy(
|
| 17 |
"""How to round float coordinates to integers."""
|
| 18 |
|
| 19 |
ROUND = "round"
|
|
|
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
import math
|
| 11 |
+
from enum import StrEnum
|
| 12 |
+
from typing import TYPE_CHECKING
|
| 13 |
|
| 14 |
+
if TYPE_CHECKING:
|
| 15 |
+
from src.app.geometry.bbox import BBoxTuple
|
| 16 |
|
| 17 |
|
| 18 |
+
class RoundingStrategy(StrEnum):
|
| 19 |
"""How to round float coordinates to integers."""
|
| 20 |
|
| 21 |
ROUND = "round"
|
src/app/geometry/transforms.py
CHANGED
|
@@ -6,10 +6,11 @@ is allowed in serializers (see AGENTS.md rule §5).
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
-
import
|
| 10 |
|
| 11 |
-
|
| 12 |
-
from src.app.geometry.
|
|
|
|
| 13 |
|
| 14 |
|
| 15 |
def rescale_bbox(bbox: BBoxTuple, factor: float) -> BBoxTuple:
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
|
| 11 |
+
if TYPE_CHECKING:
|
| 12 |
+
from src.app.geometry.bbox import BBoxTuple
|
| 13 |
+
from src.app.geometry.polygon import PolygonPoints
|
| 14 |
|
| 15 |
|
| 16 |
def rescale_bbox(bbox: BBoxTuple, factor: float) -> BBoxTuple:
|
src/app/jobs/events.py
CHANGED
|
@@ -4,14 +4,17 @@ from __future__ import annotations
|
|
| 4 |
|
| 5 |
import time
|
| 6 |
from contextlib import contextmanager
|
| 7 |
-
from datetime import
|
| 8 |
-
from enum import
|
| 9 |
-
from typing import
|
| 10 |
|
| 11 |
from pydantic import BaseModel, Field
|
| 12 |
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
|
|
|
| 15 |
"""Named steps in the processing pipeline."""
|
| 16 |
|
| 17 |
RECEIVE_FILE = "receive_file"
|
|
@@ -34,7 +37,7 @@ class JobEvent(BaseModel):
|
|
| 34 |
|
| 35 |
step: JobStep
|
| 36 |
status: str = "started" # started, completed, failed, skipped
|
| 37 |
-
started_at: datetime = Field(default_factory=lambda: datetime.now(
|
| 38 |
completed_at: datetime | None = None
|
| 39 |
duration_ms: float | None = None
|
| 40 |
message: str | None = None
|
|
@@ -67,7 +70,7 @@ class EventLog:
|
|
| 67 |
idx = len(self._events) - 1
|
| 68 |
self._events[idx] = event.model_copy(update={
|
| 69 |
"status": "completed",
|
| 70 |
-
"completed_at": datetime.now(
|
| 71 |
"duration_ms": elapsed,
|
| 72 |
})
|
| 73 |
except Exception as exc:
|
|
@@ -75,7 +78,7 @@ class EventLog:
|
|
| 75 |
idx = len(self._events) - 1
|
| 76 |
self._events[idx] = event.model_copy(update={
|
| 77 |
"status": "failed",
|
| 78 |
-
"completed_at": datetime.now(
|
| 79 |
"duration_ms": elapsed,
|
| 80 |
"error": str(exc),
|
| 81 |
})
|
|
@@ -86,7 +89,7 @@ class EventLog:
|
|
| 86 |
self._events.append(JobEvent(
|
| 87 |
step=step,
|
| 88 |
status="skipped",
|
| 89 |
-
completed_at=datetime.now(
|
| 90 |
message=reason,
|
| 91 |
))
|
| 92 |
|
|
|
|
| 4 |
|
| 5 |
import time
|
| 6 |
from contextlib import contextmanager
|
| 7 |
+
from datetime import UTC, datetime
|
| 8 |
+
from enum import StrEnum
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
|
| 11 |
from pydantic import BaseModel, Field
|
| 12 |
|
| 13 |
+
if TYPE_CHECKING:
|
| 14 |
+
from collections.abc import Generator
|
| 15 |
|
| 16 |
+
|
| 17 |
+
class JobStep(StrEnum):
|
| 18 |
"""Named steps in the processing pipeline."""
|
| 19 |
|
| 20 |
RECEIVE_FILE = "receive_file"
|
|
|
|
| 37 |
|
| 38 |
step: JobStep
|
| 39 |
status: str = "started" # started, completed, failed, skipped
|
| 40 |
+
started_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
| 41 |
completed_at: datetime | None = None
|
| 42 |
duration_ms: float | None = None
|
| 43 |
message: str | None = None
|
|
|
|
| 70 |
idx = len(self._events) - 1
|
| 71 |
self._events[idx] = event.model_copy(update={
|
| 72 |
"status": "completed",
|
| 73 |
+
"completed_at": datetime.now(UTC),
|
| 74 |
"duration_ms": elapsed,
|
| 75 |
})
|
| 76 |
except Exception as exc:
|
|
|
|
| 78 |
idx = len(self._events) - 1
|
| 79 |
self._events[idx] = event.model_copy(update={
|
| 80 |
"status": "failed",
|
| 81 |
+
"completed_at": datetime.now(UTC),
|
| 82 |
"duration_ms": elapsed,
|
| 83 |
"error": str(exc),
|
| 84 |
})
|
|
|
|
| 89 |
self._events.append(JobEvent(
|
| 90 |
step=step,
|
| 91 |
status="skipped",
|
| 92 |
+
completed_at=datetime.now(UTC),
|
| 93 |
message=reason,
|
| 94 |
))
|
| 95 |
|
src/app/jobs/models.py
CHANGED
|
@@ -2,14 +2,14 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from datetime import
|
| 6 |
-
from enum import
|
| 7 |
from typing import Any
|
| 8 |
|
| 9 |
from pydantic import BaseModel, Field
|
| 10 |
|
| 11 |
|
| 12 |
-
class JobStatus(
|
| 13 |
"""State machine for job lifecycle."""
|
| 14 |
|
| 15 |
QUEUED = "queued"
|
|
@@ -40,7 +40,7 @@ class Job(BaseModel):
|
|
| 40 |
has_viewer: bool = False
|
| 41 |
|
| 42 |
# Timing
|
| 43 |
-
created_at: datetime = Field(default_factory=lambda: datetime.now(
|
| 44 |
started_at: datetime | None = None
|
| 45 |
completed_at: datetime | None = None
|
| 46 |
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from datetime import UTC, datetime
|
| 6 |
+
from enum import StrEnum
|
| 7 |
from typing import Any
|
| 8 |
|
| 9 |
from pydantic import BaseModel, Field
|
| 10 |
|
| 11 |
|
| 12 |
+
class JobStatus(StrEnum):
|
| 13 |
"""State machine for job lifecycle."""
|
| 14 |
|
| 15 |
QUEUED = "queued"
|
|
|
|
| 40 |
has_viewer: bool = False
|
| 41 |
|
| 42 |
# Timing
|
| 43 |
+
created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
| 44 |
started_at: datetime | None = None
|
| 45 |
completed_at: datetime | None = None
|
| 46 |
|
src/app/jobs/service.py
CHANGED
|
@@ -19,10 +19,9 @@ Pipeline steps (§23.1):
|
|
| 19 |
from __future__ import annotations
|
| 20 |
|
| 21 |
import uuid
|
| 22 |
-
from datetime import
|
| 23 |
-
from
|
| 24 |
|
| 25 |
-
from src.app.domain.models import CanonicalDocument, RawProviderPayload
|
| 26 |
from src.app.domain.models.geometry import GeometryContext
|
| 27 |
from src.app.enrichers import EnricherPipeline
|
| 28 |
from src.app.enrichers.bbox_repair_light import BboxRepairLightEnricher
|
|
@@ -34,8 +33,6 @@ from src.app.enrichers.text_consistency import TextConsistencyEnricher
|
|
| 34 |
from src.app.jobs.events import EventLog, JobStep
|
| 35 |
from src.app.jobs.models import Job, JobStatus
|
| 36 |
from src.app.normalization.pipeline import normalize
|
| 37 |
-
from src.app.persistence.db import Database
|
| 38 |
-
from src.app.persistence.file_store import FileStore
|
| 39 |
from src.app.policies.document_policy import DocumentPolicy
|
| 40 |
from src.app.policies.export_policy import check_alto_export, check_page_export
|
| 41 |
from src.app.serializers.alto_xml import serialize_alto
|
|
@@ -44,6 +41,13 @@ from src.app.validators.export_eligibility_validator import compute_export_eligi
|
|
| 44 |
from src.app.validators.structural_validator import validate_structure
|
| 45 |
from src.app.viewer.projection_builder import build_projection
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
def _default_enricher_pipeline() -> EnricherPipeline:
|
| 49 |
return EnricherPipeline([
|
|
@@ -103,7 +107,7 @@ class JobService:
|
|
| 103 |
events = EventLog()
|
| 104 |
job = job.model_copy(update={
|
| 105 |
"status": JobStatus.RUNNING,
|
| 106 |
-
"started_at": datetime.now(
|
| 107 |
"image_width": image_width,
|
| 108 |
"image_height": image_height,
|
| 109 |
})
|
|
@@ -193,24 +197,23 @@ class JobService:
|
|
| 193 |
self._store.save_events(job.job_id, events.to_dicts())
|
| 194 |
|
| 195 |
# Determine final status
|
| 196 |
-
if job.has_alto
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
else:
|
| 200 |
-
final_status = JobStatus.PARTIAL_SUCCESS
|
| 201 |
-
else:
|
| 202 |
final_status = JobStatus.PARTIAL_SUCCESS
|
|
|
|
|
|
|
| 203 |
|
| 204 |
job = job.model_copy(update={
|
| 205 |
"status": final_status,
|
| 206 |
-
"completed_at": datetime.now(
|
| 207 |
"warnings": warnings,
|
| 208 |
})
|
| 209 |
|
| 210 |
except Exception as exc:
|
| 211 |
job = job.model_copy(update={
|
| 212 |
"status": JobStatus.FAILED,
|
| 213 |
-
"completed_at": datetime.now(
|
| 214 |
"error": str(exc),
|
| 215 |
"warnings": warnings,
|
| 216 |
})
|
|
|
|
| 19 |
from __future__ import annotations
|
| 20 |
|
| 21 |
import uuid
|
| 22 |
+
from datetime import UTC, datetime
|
| 23 |
+
from typing import TYPE_CHECKING
|
| 24 |
|
|
|
|
| 25 |
from src.app.domain.models.geometry import GeometryContext
|
| 26 |
from src.app.enrichers import EnricherPipeline
|
| 27 |
from src.app.enrichers.bbox_repair_light import BboxRepairLightEnricher
|
|
|
|
| 33 |
from src.app.jobs.events import EventLog, JobStep
|
| 34 |
from src.app.jobs.models import Job, JobStatus
|
| 35 |
from src.app.normalization.pipeline import normalize
|
|
|
|
|
|
|
| 36 |
from src.app.policies.document_policy import DocumentPolicy
|
| 37 |
from src.app.policies.export_policy import check_alto_export, check_page_export
|
| 38 |
from src.app.serializers.alto_xml import serialize_alto
|
|
|
|
| 41 |
from src.app.validators.structural_validator import validate_structure
|
| 42 |
from src.app.viewer.projection_builder import build_projection
|
| 43 |
|
| 44 |
+
if TYPE_CHECKING:
|
| 45 |
+
from pathlib import Path
|
| 46 |
+
|
| 47 |
+
from src.app.domain.models import CanonicalDocument, RawProviderPayload
|
| 48 |
+
from src.app.persistence.db import Database
|
| 49 |
+
from src.app.persistence.file_store import FileStore
|
| 50 |
+
|
| 51 |
|
| 52 |
def _default_enricher_pipeline() -> EnricherPipeline:
|
| 53 |
return EnricherPipeline([
|
|
|
|
| 107 |
events = EventLog()
|
| 108 |
job = job.model_copy(update={
|
| 109 |
"status": JobStatus.RUNNING,
|
| 110 |
+
"started_at": datetime.now(UTC),
|
| 111 |
"image_width": image_width,
|
| 112 |
"image_height": image_height,
|
| 113 |
})
|
|
|
|
| 197 |
self._store.save_events(job.job_id, events.to_dicts())
|
| 198 |
|
| 199 |
# Determine final status
|
| 200 |
+
if job.has_alto and job.has_page_xml:
|
| 201 |
+
final_status = JobStatus.SUCCEEDED
|
| 202 |
+
elif job.has_alto or job.has_page_xml:
|
|
|
|
|
|
|
|
|
|
| 203 |
final_status = JobStatus.PARTIAL_SUCCESS
|
| 204 |
+
else:
|
| 205 |
+
final_status = JobStatus.FAILED
|
| 206 |
|
| 207 |
job = job.model_copy(update={
|
| 208 |
"status": final_status,
|
| 209 |
+
"completed_at": datetime.now(UTC),
|
| 210 |
"warnings": warnings,
|
| 211 |
})
|
| 212 |
|
| 213 |
except Exception as exc:
|
| 214 |
job = job.model_copy(update={
|
| 215 |
"status": JobStatus.FAILED,
|
| 216 |
+
"completed_at": datetime.now(UTC),
|
| 217 |
"error": str(exc),
|
| 218 |
"warnings": warnings,
|
| 219 |
})
|
src/app/main.py
CHANGED
|
@@ -7,7 +7,7 @@ from __future__ import annotations
|
|
| 7 |
|
| 8 |
from contextlib import asynccontextmanager
|
| 9 |
from pathlib import Path
|
| 10 |
-
from typing import
|
| 11 |
|
| 12 |
from fastapi import FastAPI
|
| 13 |
from fastapi.middleware.cors import CORSMiddleware
|
|
@@ -22,6 +22,9 @@ from src.app.api.routes_providers import router as providers_router
|
|
| 22 |
from src.app.api.routes_viewer import router as viewer_router
|
| 23 |
from src.app.settings import get_settings
|
| 24 |
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
@asynccontextmanager
|
| 27 |
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
|
|
@@ -43,7 +46,7 @@ app = FastAPI(
|
|
| 43 |
app.add_middleware(
|
| 44 |
CORSMiddleware,
|
| 45 |
allow_origins=["*"],
|
| 46 |
-
allow_credentials=
|
| 47 |
allow_methods=["*"],
|
| 48 |
allow_headers=["*"],
|
| 49 |
)
|
|
|
|
| 7 |
|
| 8 |
from contextlib import asynccontextmanager
|
| 9 |
from pathlib import Path
|
| 10 |
+
from typing import TYPE_CHECKING
|
| 11 |
|
| 12 |
from fastapi import FastAPI
|
| 13 |
from fastapi.middleware.cors import CORSMiddleware
|
|
|
|
| 22 |
from src.app.api.routes_viewer import router as viewer_router
|
| 23 |
from src.app.settings import get_settings
|
| 24 |
|
| 25 |
+
if TYPE_CHECKING:
|
| 26 |
+
from collections.abc import AsyncIterator
|
| 27 |
+
|
| 28 |
|
| 29 |
@asynccontextmanager
|
| 30 |
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
|
|
|
|
| 46 |
app.add_middleware(
|
| 47 |
CORSMiddleware,
|
| 48 |
allow_origins=["*"],
|
| 49 |
+
allow_credentials=False,
|
| 50 |
allow_methods=["*"],
|
| 51 |
allow_headers=["*"],
|
| 52 |
)
|
src/app/normalization/pipeline.py
CHANGED
|
@@ -8,10 +8,14 @@ The pipeline:
|
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
-
from
|
| 12 |
-
|
| 13 |
from src.app.providers.registry import get_adapter
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
def normalize(
|
| 17 |
raw: RawProviderPayload,
|
|
|
|
| 8 |
|
| 9 |
from __future__ import annotations
|
| 10 |
|
| 11 |
+
from typing import TYPE_CHECKING
|
| 12 |
+
|
| 13 |
from src.app.providers.registry import get_adapter
|
| 14 |
|
| 15 |
+
if TYPE_CHECKING:
|
| 16 |
+
from src.app.domain.models import CanonicalDocument, RawProviderPayload
|
| 17 |
+
from src.app.domain.models.geometry import GeometryContext
|
| 18 |
+
|
| 19 |
|
| 20 |
def normalize(
|
| 21 |
raw: RawProviderPayload,
|
src/app/persistence/db.py
CHANGED
|
@@ -8,11 +8,14 @@ from __future__ import annotations
|
|
| 8 |
|
| 9 |
import json
|
| 10 |
import sqlite3
|
| 11 |
-
from
|
| 12 |
-
from typing import Any
|
| 13 |
|
| 14 |
from src.app.jobs.models import Job, JobStatus
|
| 15 |
|
|
|
|
|
|
|
|
|
|
| 16 |
_SCHEMA = """
|
| 17 |
CREATE TABLE IF NOT EXISTS jobs (
|
| 18 |
job_id TEXT PRIMARY KEY,
|
|
@@ -146,12 +149,16 @@ class Database:
|
|
| 146 |
# -- Providers ------------------------------------------------------------
|
| 147 |
|
| 148 |
def save_provider_record(self, provider_id: str, data: dict) -> None:
|
| 149 |
-
from datetime import datetime
|
| 150 |
|
| 151 |
-
now = datetime.now(
|
| 152 |
self.conn.execute(
|
| 153 |
-
"""INSERT OR REPLACE INTO providers
|
| 154 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 155 |
(provider_id, json.dumps(data, default=str), provider_id, now, now),
|
| 156 |
)
|
| 157 |
self.conn.commit()
|
|
|
|
| 8 |
|
| 9 |
import json
|
| 10 |
import sqlite3
|
| 11 |
+
from datetime import UTC
|
| 12 |
+
from typing import TYPE_CHECKING, Any
|
| 13 |
|
| 14 |
from src.app.jobs.models import Job, JobStatus
|
| 15 |
|
| 16 |
+
if TYPE_CHECKING:
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
_SCHEMA = """
|
| 20 |
CREATE TABLE IF NOT EXISTS jobs (
|
| 21 |
job_id TEXT PRIMARY KEY,
|
|
|
|
| 149 |
# -- Providers ------------------------------------------------------------
|
| 150 |
|
| 151 |
def save_provider_record(self, provider_id: str, data: dict) -> None:
|
| 152 |
+
from datetime import datetime
|
| 153 |
|
| 154 |
+
now = datetime.now(UTC).isoformat()
|
| 155 |
self.conn.execute(
|
| 156 |
+
"""INSERT OR REPLACE INTO providers
|
| 157 |
+
(provider_id, data, created_at, updated_at)
|
| 158 |
+
VALUES (?, ?, COALESCE(
|
| 159 |
+
(SELECT created_at FROM providers WHERE provider_id = ?),
|
| 160 |
+
?
|
| 161 |
+
), ?)""",
|
| 162 |
(provider_id, json.dumps(data, default=str), provider_id, now, now),
|
| 163 |
)
|
| 164 |
self.conn.commit()
|
src/app/persistence/file_store.py
CHANGED
|
@@ -36,8 +36,15 @@ class FileStore:
|
|
| 36 |
|
| 37 |
# -- Job directory --------------------------------------------------------
|
| 38 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
def job_dir(self, job_id: str) -> Path:
|
| 40 |
-
d = self._jobs_dir / job_id
|
| 41 |
d.mkdir(parents=True, exist_ok=True)
|
| 42 |
return d
|
| 43 |
|
|
@@ -52,7 +59,8 @@ class FileStore:
|
|
| 52 |
def save_json(self, job_id: str, filename: str, data: Any) -> Path:
|
| 53 |
"""Save a JSON-serializable object."""
|
| 54 |
dest = self.job_dir(job_id) / filename
|
| 55 |
-
|
|
|
|
| 56 |
return dest
|
| 57 |
|
| 58 |
def save_bytes(self, job_id: str, filename: str, data: bytes) -> Path:
|
|
@@ -83,14 +91,14 @@ class FileStore:
|
|
| 83 |
|
| 84 |
def load_json(self, job_id: str, filename: str) -> Any:
|
| 85 |
"""Load a JSON file from the job directory. Returns None if not found."""
|
| 86 |
-
path = self._jobs_dir / job_id / filename
|
| 87 |
if not path.exists():
|
| 88 |
return None
|
| 89 |
return json.loads(path.read_text(encoding="utf-8"))
|
| 90 |
|
| 91 |
def load_bytes(self, job_id: str, filename: str) -> bytes | None:
|
| 92 |
"""Load raw bytes. Returns None if not found."""
|
| 93 |
-
path = self._jobs_dir / job_id / filename
|
| 94 |
if not path.exists():
|
| 95 |
return None
|
| 96 |
return path.read_bytes()
|
|
@@ -121,19 +129,21 @@ class FileStore:
|
|
| 121 |
if not d.exists():
|
| 122 |
return None
|
| 123 |
for f in d.iterdir():
|
| 124 |
-
|
|
|
|
| 125 |
return f
|
| 126 |
return None
|
| 127 |
|
| 128 |
# -- Provider profiles ----------------------------------------------------
|
| 129 |
|
| 130 |
def save_provider(self, provider_id: str, data: dict) -> Path:
|
| 131 |
-
dest = self._providers_dir / f"{provider_id}.json"
|
| 132 |
-
|
|
|
|
| 133 |
return dest
|
| 134 |
|
| 135 |
def load_provider(self, provider_id: str) -> dict | None:
|
| 136 |
-
path = self._providers_dir / f"{provider_id}.json"
|
| 137 |
if not path.exists():
|
| 138 |
return None
|
| 139 |
return json.loads(path.read_text(encoding="utf-8"))
|
|
@@ -144,7 +154,7 @@ class FileStore:
|
|
| 144 |
return [f.stem for f in self._providers_dir.glob("*.json")]
|
| 145 |
|
| 146 |
def delete_provider(self, provider_id: str) -> bool:
|
| 147 |
-
path = self._providers_dir / f"{provider_id}.json"
|
| 148 |
if path.exists():
|
| 149 |
path.unlink()
|
| 150 |
return True
|
|
|
|
| 36 |
|
| 37 |
# -- Job directory --------------------------------------------------------
|
| 38 |
|
| 39 |
+
@staticmethod
|
| 40 |
+
def _sanitize_id(value: str) -> str:
|
| 41 |
+
"""Reject path-traversal attempts in identifiers."""
|
| 42 |
+
if not value or "/" in value or "\\" in value or ".." in value:
|
| 43 |
+
raise ValueError(f"Invalid identifier (path traversal rejected): {value!r}")
|
| 44 |
+
return value
|
| 45 |
+
|
| 46 |
def job_dir(self, job_id: str) -> Path:
|
| 47 |
+
d = self._jobs_dir / self._sanitize_id(job_id)
|
| 48 |
d.mkdir(parents=True, exist_ok=True)
|
| 49 |
return d
|
| 50 |
|
|
|
|
| 59 |
def save_json(self, job_id: str, filename: str, data: Any) -> Path:
|
| 60 |
"""Save a JSON-serializable object."""
|
| 61 |
dest = self.job_dir(job_id) / filename
|
| 62 |
+
content = json.dumps(data, ensure_ascii=False, indent=2, default=str)
|
| 63 |
+
dest.write_text(content, encoding="utf-8")
|
| 64 |
return dest
|
| 65 |
|
| 66 |
def save_bytes(self, job_id: str, filename: str, data: bytes) -> Path:
|
|
|
|
| 91 |
|
| 92 |
def load_json(self, job_id: str, filename: str) -> Any:
|
| 93 |
"""Load a JSON file from the job directory. Returns None if not found."""
|
| 94 |
+
path = self._jobs_dir / self._sanitize_id(job_id) / filename
|
| 95 |
if not path.exists():
|
| 96 |
return None
|
| 97 |
return json.loads(path.read_text(encoding="utf-8"))
|
| 98 |
|
| 99 |
def load_bytes(self, job_id: str, filename: str) -> bytes | None:
|
| 100 |
"""Load raw bytes. Returns None if not found."""
|
| 101 |
+
path = self._jobs_dir / self._sanitize_id(job_id) / filename
|
| 102 |
if not path.exists():
|
| 103 |
return None
|
| 104 |
return path.read_bytes()
|
|
|
|
| 129 |
if not d.exists():
|
| 130 |
return None
|
| 131 |
for f in d.iterdir():
|
| 132 |
+
valid = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".webp")
|
| 133 |
+
if f.stem == "input" and f.suffix in valid:
|
| 134 |
return f
|
| 135 |
return None
|
| 136 |
|
| 137 |
# -- Provider profiles ----------------------------------------------------
|
| 138 |
|
| 139 |
def save_provider(self, provider_id: str, data: dict) -> Path:
|
| 140 |
+
dest = self._providers_dir / f"{self._sanitize_id(provider_id)}.json"
|
| 141 |
+
content = json.dumps(data, ensure_ascii=False, indent=2, default=str)
|
| 142 |
+
dest.write_text(content, encoding="utf-8")
|
| 143 |
return dest
|
| 144 |
|
| 145 |
def load_provider(self, provider_id: str) -> dict | None:
|
| 146 |
+
path = self._providers_dir / f"{self._sanitize_id(provider_id)}.json"
|
| 147 |
if not path.exists():
|
| 148 |
return None
|
| 149 |
return json.loads(path.read_text(encoding="utf-8"))
|
|
|
|
| 154 |
return [f.stem for f in self._providers_dir.glob("*.json")]
|
| 155 |
|
| 156 |
def delete_provider(self, provider_id: str) -> bool:
|
| 157 |
+
path = self._providers_dir / f"{self._sanitize_id(provider_id)}.json"
|
| 158 |
if path.exists():
|
| 159 |
path.unlink()
|
| 160 |
return True
|
src/app/policies/document_policy.py
CHANGED
|
@@ -7,12 +7,12 @@ that controls what the system may or may not do.
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
-
from enum import
|
| 11 |
|
| 12 |
from pydantic import BaseModel, ConfigDict
|
| 13 |
|
| 14 |
|
| 15 |
-
class PolicyMode(
|
| 16 |
"""Named policy presets."""
|
| 17 |
|
| 18 |
STRICT = "strict"
|
|
|
|
| 7 |
|
| 8 |
from __future__ import annotations
|
| 9 |
|
| 10 |
+
from enum import StrEnum
|
| 11 |
|
| 12 |
from pydantic import BaseModel, ConfigDict
|
| 13 |
|
| 14 |
|
| 15 |
+
class PolicyMode(StrEnum):
|
| 16 |
"""Named policy presets."""
|
| 17 |
|
| 18 |
STRICT = "strict"
|
src/app/policies/export_policy.py
CHANGED
|
@@ -7,11 +7,14 @@ decision for each export format.
|
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
from dataclasses import dataclass
|
|
|
|
| 10 |
|
| 11 |
-
from src.app.domain.models.readiness import ExportEligibility
|
| 12 |
from src.app.domain.models.status import ReadinessLevel
|
| 13 |
from src.app.policies.document_policy import DocumentPolicy
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
@dataclass(frozen=True)
|
| 17 |
class ExportDecision:
|
|
@@ -36,7 +39,10 @@ def check_alto_export(
|
|
| 36 |
return ExportDecision(
|
| 37 |
allowed=False,
|
| 38 |
level=level,
|
| 39 |
-
reason=
|
|
|
|
|
|
|
|
|
|
| 40 |
)
|
| 41 |
|
| 42 |
if level == ReadinessLevel.PARTIAL and not policy.allow_partial_alto:
|
|
|
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
from dataclasses import dataclass
|
| 10 |
+
from typing import TYPE_CHECKING
|
| 11 |
|
|
|
|
| 12 |
from src.app.domain.models.status import ReadinessLevel
|
| 13 |
from src.app.policies.document_policy import DocumentPolicy
|
| 14 |
|
| 15 |
+
if TYPE_CHECKING:
|
| 16 |
+
from src.app.domain.models.readiness import ExportEligibility
|
| 17 |
+
|
| 18 |
|
| 19 |
@dataclass(frozen=True)
|
| 20 |
class ExportDecision:
|
|
|
|
| 39 |
return ExportDecision(
|
| 40 |
allowed=False,
|
| 41 |
level=level,
|
| 42 |
+
reason=(
|
| 43 |
+
"ALTO export not possible: missing required data"
|
| 44 |
+
" (word text/geometry or line geometry)"
|
| 45 |
+
),
|
| 46 |
)
|
| 47 |
|
| 48 |
if level == ReadinessLevel.PARTIAL and not policy.allow_partial_alto:
|
src/app/providers/adapters/base.py
CHANGED
|
@@ -3,9 +3,11 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
from abc import ABC, abstractmethod
|
|
|
|
| 6 |
|
| 7 |
-
|
| 8 |
-
from src.app.domain.models
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
class BaseAdapter(ABC):
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
from abc import ABC, abstractmethod
|
| 6 |
+
from typing import TYPE_CHECKING
|
| 7 |
|
| 8 |
+
if TYPE_CHECKING:
|
| 9 |
+
from src.app.domain.models import CanonicalDocument, RawProviderPayload
|
| 10 |
+
from src.app.domain.models.geometry import GeometryContext
|
| 11 |
|
| 12 |
|
| 13 |
class BaseAdapter(ABC):
|
src/app/providers/adapters/line_box_json.py
CHANGED
|
@@ -17,19 +17,23 @@ since the provider doesn't segment words. The block is inferred.
|
|
| 17 |
|
| 18 |
from __future__ import annotations
|
| 19 |
|
|
|
|
|
|
|
| 20 |
from src.app.domain.models import (
|
| 21 |
CanonicalDocument,
|
| 22 |
Geometry,
|
| 23 |
Provenance,
|
| 24 |
RawProviderPayload,
|
| 25 |
)
|
| 26 |
-
from src.app.domain.models.geometry import GeometryContext
|
| 27 |
from src.app.domain.models.status import EvidenceType, GeometryStatus, InputType
|
| 28 |
from src.app.geometry.bbox import union_all
|
| 29 |
from src.app.geometry.normalization import xyxy_to_xywh
|
| 30 |
from src.app.normalization.canonical_builder import CanonicalBuilder
|
| 31 |
from src.app.providers.adapters.base import BaseAdapter
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
class LineBoxJsonAdapter(BaseAdapter):
|
| 35 |
"""Adapter for the line_box_json family."""
|
|
|
|
| 17 |
|
| 18 |
from __future__ import annotations
|
| 19 |
|
| 20 |
+
from typing import TYPE_CHECKING
|
| 21 |
+
|
| 22 |
from src.app.domain.models import (
|
| 23 |
CanonicalDocument,
|
| 24 |
Geometry,
|
| 25 |
Provenance,
|
| 26 |
RawProviderPayload,
|
| 27 |
)
|
|
|
|
| 28 |
from src.app.domain.models.status import EvidenceType, GeometryStatus, InputType
|
| 29 |
from src.app.geometry.bbox import union_all
|
| 30 |
from src.app.geometry.normalization import xyxy_to_xywh
|
| 31 |
from src.app.normalization.canonical_builder import CanonicalBuilder
|
| 32 |
from src.app.providers.adapters.base import BaseAdapter
|
| 33 |
|
| 34 |
+
if TYPE_CHECKING:
|
| 35 |
+
from src.app.domain.models.geometry import GeometryContext
|
| 36 |
+
|
| 37 |
|
| 38 |
class LineBoxJsonAdapter(BaseAdapter):
|
| 39 |
"""Adapter for the line_box_json family."""
|
src/app/providers/adapters/text_only.py
CHANGED
|
@@ -19,17 +19,21 @@ will show text without positioned overlays.
|
|
| 19 |
|
| 20 |
from __future__ import annotations
|
| 21 |
|
|
|
|
|
|
|
| 22 |
from src.app.domain.models import (
|
| 23 |
CanonicalDocument,
|
| 24 |
Geometry,
|
| 25 |
Provenance,
|
| 26 |
RawProviderPayload,
|
| 27 |
)
|
| 28 |
-
from src.app.domain.models.geometry import GeometryContext
|
| 29 |
from src.app.domain.models.status import EvidenceType, GeometryStatus, InputType
|
| 30 |
from src.app.normalization.canonical_builder import CanonicalBuilder
|
| 31 |
from src.app.providers.adapters.base import BaseAdapter
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
class TextOnlyAdapter(BaseAdapter):
|
| 35 |
"""Adapter for the text_only family (mLLM without geometry)."""
|
|
@@ -76,7 +80,11 @@ class TextOnlyAdapter(BaseAdapter):
|
|
| 76 |
# Extract text blocks
|
| 77 |
blocks = payload.get("blocks")
|
| 78 |
if blocks and isinstance(blocks, list):
|
| 79 |
-
texts = [
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
else:
|
| 81 |
# Single text blob — split into paragraphs
|
| 82 |
full_text = str(payload.get("text", ""))
|
|
@@ -102,7 +110,7 @@ class TextOnlyAdapter(BaseAdapter):
|
|
| 102 |
)
|
| 103 |
|
| 104 |
# Split block into lines
|
| 105 |
-
lines = [
|
| 106 |
if not lines:
|
| 107 |
lines = [block_text]
|
| 108 |
|
|
|
|
| 19 |
|
| 20 |
from __future__ import annotations
|
| 21 |
|
| 22 |
+
from typing import TYPE_CHECKING
|
| 23 |
+
|
| 24 |
from src.app.domain.models import (
|
| 25 |
CanonicalDocument,
|
| 26 |
Geometry,
|
| 27 |
Provenance,
|
| 28 |
RawProviderPayload,
|
| 29 |
)
|
|
|
|
| 30 |
from src.app.domain.models.status import EvidenceType, GeometryStatus, InputType
|
| 31 |
from src.app.normalization.canonical_builder import CanonicalBuilder
|
| 32 |
from src.app.providers.adapters.base import BaseAdapter
|
| 33 |
|
| 34 |
+
if TYPE_CHECKING:
|
| 35 |
+
from src.app.domain.models.geometry import GeometryContext
|
| 36 |
+
|
| 37 |
|
| 38 |
class TextOnlyAdapter(BaseAdapter):
|
| 39 |
"""Adapter for the text_only family (mLLM without geometry)."""
|
|
|
|
| 80 |
# Extract text blocks
|
| 81 |
blocks = payload.get("blocks")
|
| 82 |
if blocks and isinstance(blocks, list):
|
| 83 |
+
texts = [
|
| 84 |
+
str(b.get("text", ""))
|
| 85 |
+
for b in blocks
|
| 86 |
+
if isinstance(b, dict) and b.get("text")
|
| 87 |
+
]
|
| 88 |
else:
|
| 89 |
# Single text blob — split into paragraphs
|
| 90 |
full_text = str(payload.get("text", ""))
|
|
|
|
| 110 |
)
|
| 111 |
|
| 112 |
# Split block into lines
|
| 113 |
+
lines = [ln.strip() for ln in block_text.split("\n") if ln.strip()]
|
| 114 |
if not lines:
|
| 115 |
lines = [block_text]
|
| 116 |
|
src/app/providers/adapters/word_box_json.py
CHANGED
|
@@ -13,13 +13,14 @@ within a single inferred text block.
|
|
| 13 |
|
| 14 |
from __future__ import annotations
|
| 15 |
|
|
|
|
|
|
|
| 16 |
from src.app.domain.models import (
|
| 17 |
CanonicalDocument,
|
| 18 |
Geometry,
|
| 19 |
Provenance,
|
| 20 |
RawProviderPayload,
|
| 21 |
)
|
| 22 |
-
from src.app.domain.models.geometry import GeometryContext
|
| 23 |
from src.app.domain.models.status import (
|
| 24 |
EvidenceType,
|
| 25 |
GeometryStatus,
|
|
@@ -30,6 +31,9 @@ from src.app.geometry.normalization import four_point_to_polygon, four_point_to_
|
|
| 30 |
from src.app.normalization.canonical_builder import CanonicalBuilder
|
| 31 |
from src.app.providers.adapters.base import BaseAdapter
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
class WordBoxJsonAdapter(BaseAdapter):
|
| 35 |
"""Adapter for the word_box_json family (PaddleOCR, etc.)."""
|
|
@@ -163,8 +167,9 @@ class WordBoxJsonAdapter(BaseAdapter):
|
|
| 163 |
points = item[0]
|
| 164 |
text_conf = item[1]
|
| 165 |
if not isinstance(points, list) or len(points) != 4:
|
|
|
|
| 166 |
raise ValueError(
|
| 167 |
-
f"Item {idx}: expected 4 polygon points, got {
|
| 168 |
)
|
| 169 |
return points, text_conf
|
| 170 |
|
|
|
|
| 13 |
|
| 14 |
from __future__ import annotations
|
| 15 |
|
| 16 |
+
from typing import TYPE_CHECKING
|
| 17 |
+
|
| 18 |
from src.app.domain.models import (
|
| 19 |
CanonicalDocument,
|
| 20 |
Geometry,
|
| 21 |
Provenance,
|
| 22 |
RawProviderPayload,
|
| 23 |
)
|
|
|
|
| 24 |
from src.app.domain.models.status import (
|
| 25 |
EvidenceType,
|
| 26 |
GeometryStatus,
|
|
|
|
| 31 |
from src.app.normalization.canonical_builder import CanonicalBuilder
|
| 32 |
from src.app.providers.adapters.base import BaseAdapter
|
| 33 |
|
| 34 |
+
if TYPE_CHECKING:
|
| 35 |
+
from src.app.domain.models.geometry import GeometryContext
|
| 36 |
+
|
| 37 |
|
| 38 |
class WordBoxJsonAdapter(BaseAdapter):
|
| 39 |
"""Adapter for the word_box_json family (PaddleOCR, etc.)."""
|
|
|
|
| 167 |
points = item[0]
|
| 168 |
text_conf = item[1]
|
| 169 |
if not isinstance(points, list) or len(points) != 4:
|
| 170 |
+
got = len(points) if isinstance(points, list) else type(points).__name__
|
| 171 |
raise ValueError(
|
| 172 |
+
f"Item {idx}: expected 4 polygon points, got {got}"
|
| 173 |
)
|
| 174 |
return points, text_conf
|
| 175 |
|
src/app/providers/profiles.py
CHANGED
|
@@ -2,28 +2,28 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from datetime import
|
| 6 |
-
from enum import
|
| 7 |
|
| 8 |
from pydantic import BaseModel, ConfigDict, Field
|
| 9 |
|
| 10 |
from src.app.providers.capabilities import CapabilityMatrix
|
| 11 |
|
| 12 |
|
| 13 |
-
class RuntimeType(
|
| 14 |
LOCAL = "local"
|
| 15 |
HUB = "hub"
|
| 16 |
API = "api"
|
| 17 |
|
| 18 |
|
| 19 |
-
class ProviderFamily(
|
| 20 |
WORD_BOX_JSON = "word_box_json"
|
| 21 |
LINE_BOX_JSON = "line_box_json"
|
| 22 |
REGION_LINE_WORD_POLYGON = "region_line_word_polygon"
|
| 23 |
TEXT_ONLY = "text_only"
|
| 24 |
|
| 25 |
|
| 26 |
-
class AuthMode(
|
| 27 |
NONE = "none"
|
| 28 |
API_KEY = "api_key"
|
| 29 |
BEARER = "bearer"
|
|
@@ -51,5 +51,5 @@ class ProviderProfile(BaseModel):
|
|
| 51 |
prompt_template: str | None = None
|
| 52 |
|
| 53 |
last_test_status: str | None = None
|
| 54 |
-
created_at: datetime = Field(default_factory=lambda: datetime.now(
|
| 55 |
-
updated_at: datetime = Field(default_factory=lambda: datetime.now(
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from datetime import UTC, datetime
|
| 6 |
+
from enum import StrEnum
|
| 7 |
|
| 8 |
from pydantic import BaseModel, ConfigDict, Field
|
| 9 |
|
| 10 |
from src.app.providers.capabilities import CapabilityMatrix
|
| 11 |
|
| 12 |
|
| 13 |
+
class RuntimeType(StrEnum):
|
| 14 |
LOCAL = "local"
|
| 15 |
HUB = "hub"
|
| 16 |
API = "api"
|
| 17 |
|
| 18 |
|
| 19 |
+
class ProviderFamily(StrEnum):
|
| 20 |
WORD_BOX_JSON = "word_box_json"
|
| 21 |
LINE_BOX_JSON = "line_box_json"
|
| 22 |
REGION_LINE_WORD_POLYGON = "region_line_word_polygon"
|
| 23 |
TEXT_ONLY = "text_only"
|
| 24 |
|
| 25 |
|
| 26 |
+
class AuthMode(StrEnum):
|
| 27 |
NONE = "none"
|
| 28 |
API_KEY = "api_key"
|
| 29 |
BEARER = "bearer"
|
|
|
|
| 51 |
prompt_template: str | None = None
|
| 52 |
|
| 53 |
last_test_status: str | None = None
|
| 54 |
+
created_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
| 55 |
+
updated_at: datetime = Field(default_factory=lambda: datetime.now(UTC))
|
src/app/providers/registry.py
CHANGED
|
@@ -2,16 +2,20 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from
|
|
|
|
| 6 |
from src.app.providers.adapters.line_box_json import LineBoxJsonAdapter
|
| 7 |
from src.app.providers.adapters.text_only import TextOnlyAdapter
|
| 8 |
from src.app.providers.adapters.word_box_json import WordBoxJsonAdapter
|
| 9 |
from src.app.providers.profiles import ProviderFamily, RuntimeType
|
| 10 |
from src.app.providers.runtimes.api_runtime import ApiRuntime
|
| 11 |
-
from src.app.providers.runtimes.base import BaseRuntime
|
| 12 |
from src.app.providers.runtimes.hub_runtime import HubRuntime
|
| 13 |
from src.app.providers.runtimes.local_runtime import LocalRuntime
|
| 14 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
# -- Adapter registry ---------------------------------------------------------
|
| 16 |
|
| 17 |
_ADAPTER_REGISTRY: dict[str, type[BaseAdapter]] = {
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from typing import TYPE_CHECKING
|
| 6 |
+
|
| 7 |
from src.app.providers.adapters.line_box_json import LineBoxJsonAdapter
|
| 8 |
from src.app.providers.adapters.text_only import TextOnlyAdapter
|
| 9 |
from src.app.providers.adapters.word_box_json import WordBoxJsonAdapter
|
| 10 |
from src.app.providers.profiles import ProviderFamily, RuntimeType
|
| 11 |
from src.app.providers.runtimes.api_runtime import ApiRuntime
|
|
|
|
| 12 |
from src.app.providers.runtimes.hub_runtime import HubRuntime
|
| 13 |
from src.app.providers.runtimes.local_runtime import LocalRuntime
|
| 14 |
|
| 15 |
+
if TYPE_CHECKING:
|
| 16 |
+
from src.app.providers.adapters.base import BaseAdapter
|
| 17 |
+
from src.app.providers.runtimes.base import BaseRuntime
|
| 18 |
+
|
| 19 |
# -- Adapter registry ---------------------------------------------------------
|
| 20 |
|
| 21 |
_ADAPTER_REGISTRY: dict[str, type[BaseAdapter]] = {
|
src/app/providers/resolver.py
CHANGED
|
@@ -2,16 +2,25 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from
|
| 6 |
-
|
| 7 |
from src.app.providers.registry import get_adapter, get_runtime
|
| 8 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
class ResolvedProvider:
|
| 12 |
"""A fully resolved provider: runtime + adapter, ready to execute."""
|
| 13 |
|
| 14 |
-
def __init__(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
self.profile = profile
|
| 16 |
self.runtime = runtime
|
| 17 |
self.adapter = adapter
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from typing import TYPE_CHECKING
|
| 6 |
+
|
| 7 |
from src.app.providers.registry import get_adapter, get_runtime
|
| 8 |
+
|
| 9 |
+
if TYPE_CHECKING:
|
| 10 |
+
from src.app.providers.adapters.base import BaseAdapter
|
| 11 |
+
from src.app.providers.profiles import ProviderProfile
|
| 12 |
+
from src.app.providers.runtimes.base import BaseRuntime
|
| 13 |
|
| 14 |
|
| 15 |
class ResolvedProvider:
|
| 16 |
"""A fully resolved provider: runtime + adapter, ready to execute."""
|
| 17 |
|
| 18 |
+
def __init__(
|
| 19 |
+
self,
|
| 20 |
+
profile: ProviderProfile,
|
| 21 |
+
runtime: BaseRuntime,
|
| 22 |
+
adapter: BaseAdapter,
|
| 23 |
+
) -> None:
|
| 24 |
self.profile = profile
|
| 25 |
self.runtime = runtime
|
| 26 |
self.adapter = adapter
|
src/app/providers/runtimes/api_runtime.py
CHANGED
|
@@ -2,12 +2,15 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from
|
| 6 |
-
from typing import Any
|
| 7 |
|
| 8 |
-
from src.app.domain.models import RawProviderPayload
|
| 9 |
from src.app.providers.runtimes.base import BaseRuntime
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
class ApiRuntime(BaseRuntime):
|
| 13 |
"""Runtime that calls an external API endpoint.
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from typing import TYPE_CHECKING, Any
|
|
|
|
| 6 |
|
|
|
|
| 7 |
from src.app.providers.runtimes.base import BaseRuntime
|
| 8 |
|
| 9 |
+
if TYPE_CHECKING:
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from src.app.domain.models import RawProviderPayload
|
| 13 |
+
|
| 14 |
|
| 15 |
class ApiRuntime(BaseRuntime):
|
| 16 |
"""Runtime that calls an external API endpoint.
|
src/app/providers/runtimes/base.py
CHANGED
|
@@ -3,10 +3,12 @@
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
from abc import ABC, abstractmethod
|
| 6 |
-
from
|
| 7 |
-
from typing import Any
|
| 8 |
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class BaseRuntime(ABC):
|
|
|
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
from abc import ABC, abstractmethod
|
| 6 |
+
from typing import TYPE_CHECKING, Any
|
|
|
|
| 7 |
|
| 8 |
+
if TYPE_CHECKING:
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
from src.app.domain.models import RawProviderPayload
|
| 12 |
|
| 13 |
|
| 14 |
class BaseRuntime(ABC):
|
src/app/providers/runtimes/hub_runtime.py
CHANGED
|
@@ -2,12 +2,15 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from
|
| 6 |
-
from typing import Any
|
| 7 |
|
| 8 |
-
from src.app.domain.models import RawProviderPayload
|
| 9 |
from src.app.providers.runtimes.base import BaseRuntime
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
class HubRuntime(BaseRuntime):
|
| 13 |
"""Runtime for models loaded from the Hugging Face Hub.
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from typing import TYPE_CHECKING, Any
|
|
|
|
| 6 |
|
|
|
|
| 7 |
from src.app.providers.runtimes.base import BaseRuntime
|
| 8 |
|
| 9 |
+
if TYPE_CHECKING:
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from src.app.domain.models import RawProviderPayload
|
| 13 |
+
|
| 14 |
|
| 15 |
class HubRuntime(BaseRuntime):
|
| 16 |
"""Runtime for models loaded from the Hugging Face Hub.
|
src/app/providers/runtimes/local_runtime.py
CHANGED
|
@@ -2,12 +2,15 @@
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
-
from
|
| 6 |
-
from typing import Any
|
| 7 |
|
| 8 |
-
from src.app.domain.models import RawProviderPayload
|
| 9 |
from src.app.providers.runtimes.base import BaseRuntime
|
| 10 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
class LocalRuntime(BaseRuntime):
|
| 13 |
"""Runtime for locally installed models.
|
|
|
|
| 2 |
|
| 3 |
from __future__ import annotations
|
| 4 |
|
| 5 |
+
from typing import TYPE_CHECKING, Any
|
|
|
|
| 6 |
|
|
|
|
| 7 |
from src.app.providers.runtimes.base import BaseRuntime
|
| 8 |
|
| 9 |
+
if TYPE_CHECKING:
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
from src.app.domain.models import RawProviderPayload
|
| 13 |
+
|
| 14 |
|
| 15 |
class LocalRuntime(BaseRuntime):
|
| 16 |
"""Runtime for locally installed models.
|
src/app/serializers/alto_xml.py
CHANGED
|
@@ -29,11 +29,15 @@ Coordinate mapping:
|
|
| 29 |
|
| 30 |
from __future__ import annotations
|
| 31 |
|
|
|
|
|
|
|
| 32 |
from lxml import etree
|
| 33 |
|
| 34 |
-
from src.app.domain.models import CanonicalDocument, Page, TextLine, TextRegion, Word
|
| 35 |
from src.app.geometry.quantization import RoundingStrategy, quantize_bbox
|
| 36 |
|
|
|
|
|
|
|
|
|
|
| 37 |
ALTO_NS = "http://www.loc.gov/standards/alto/ns-v4#"
|
| 38 |
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
|
| 39 |
SCHEMA_LOCATION = (
|
|
|
|
| 29 |
|
| 30 |
from __future__ import annotations
|
| 31 |
|
| 32 |
+
from typing import TYPE_CHECKING
|
| 33 |
+
|
| 34 |
from lxml import etree
|
| 35 |
|
|
|
|
| 36 |
from src.app.geometry.quantization import RoundingStrategy, quantize_bbox
|
| 37 |
|
| 38 |
+
if TYPE_CHECKING:
|
| 39 |
+
from src.app.domain.models import CanonicalDocument, Page, TextLine, TextRegion, Word
|
| 40 |
+
|
| 41 |
ALTO_NS = "http://www.loc.gov/standards/alto/ns-v4#"
|
| 42 |
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
|
| 43 |
SCHEMA_LOCATION = (
|
src/app/serializers/page_xml.py
CHANGED
|
@@ -26,21 +26,24 @@ Namespace: http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15
|
|
| 26 |
|
| 27 |
from __future__ import annotations
|
| 28 |
|
| 29 |
-
from datetime import
|
|
|
|
| 30 |
|
| 31 |
from lxml import etree
|
| 32 |
|
| 33 |
-
from src.app.domain.models import (
|
| 34 |
-
CanonicalDocument,
|
| 35 |
-
Page,
|
| 36 |
-
TextLine,
|
| 37 |
-
TextRegion,
|
| 38 |
-
Word,
|
| 39 |
-
)
|
| 40 |
from src.app.domain.models.status import BlockRole
|
| 41 |
from src.app.geometry.polygon import bbox_to_polygon
|
| 42 |
from src.app.geometry.quantization import RoundingStrategy, quantize_value
|
| 43 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 44 |
PAGE_NS = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
|
| 45 |
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
|
| 46 |
SCHEMA_LOCATION = (
|
|
@@ -119,9 +122,9 @@ def _build_page_tree(
|
|
| 119 |
creator = etree.SubElement(metadata, f"{{{PAGE_NS}}}Creator")
|
| 120 |
creator.text = "XmLLM"
|
| 121 |
created = etree.SubElement(metadata, f"{{{PAGE_NS}}}Created")
|
| 122 |
-
created.text = datetime.now(
|
| 123 |
last_change = etree.SubElement(metadata, f"{{{PAGE_NS}}}LastChange")
|
| 124 |
-
last_change.text = datetime.now(
|
| 125 |
|
| 126 |
# One <Page> per canonical page (PAGE XML is per-page, but we handle multi-page)
|
| 127 |
for page in doc.pages:
|
|
@@ -226,10 +229,7 @@ def _add_coords(
|
|
| 226 |
"""
|
| 227 |
coords = etree.SubElement(parent, f"{{{PAGE_NS}}}Coords")
|
| 228 |
|
| 229 |
-
if polygon and len(polygon) >= 3
|
| 230 |
-
points = polygon
|
| 231 |
-
else:
|
| 232 |
-
points = bbox_to_polygon(bbox)
|
| 233 |
|
| 234 |
points_str = " ".join(
|
| 235 |
f"{quantize_value(x, rounding)},{quantize_value(y, rounding)}"
|
|
|
|
| 26 |
|
| 27 |
from __future__ import annotations
|
| 28 |
|
| 29 |
+
from datetime import UTC, datetime
|
| 30 |
+
from typing import TYPE_CHECKING
|
| 31 |
|
| 32 |
from lxml import etree
|
| 33 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
from src.app.domain.models.status import BlockRole
|
| 35 |
from src.app.geometry.polygon import bbox_to_polygon
|
| 36 |
from src.app.geometry.quantization import RoundingStrategy, quantize_value
|
| 37 |
|
| 38 |
+
if TYPE_CHECKING:
|
| 39 |
+
from src.app.domain.models import (
|
| 40 |
+
CanonicalDocument,
|
| 41 |
+
Page,
|
| 42 |
+
TextLine,
|
| 43 |
+
TextRegion,
|
| 44 |
+
Word,
|
| 45 |
+
)
|
| 46 |
+
|
| 47 |
PAGE_NS = "http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15"
|
| 48 |
XSI_NS = "http://www.w3.org/2001/XMLSchema-instance"
|
| 49 |
SCHEMA_LOCATION = (
|
|
|
|
| 122 |
creator = etree.SubElement(metadata, f"{{{PAGE_NS}}}Creator")
|
| 123 |
creator.text = "XmLLM"
|
| 124 |
created = etree.SubElement(metadata, f"{{{PAGE_NS}}}Created")
|
| 125 |
+
created.text = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%S")
|
| 126 |
last_change = etree.SubElement(metadata, f"{{{PAGE_NS}}}LastChange")
|
| 127 |
+
last_change.text = datetime.now(UTC).strftime("%Y-%m-%dT%H:%M:%S")
|
| 128 |
|
| 129 |
# One <Page> per canonical page (PAGE XML is per-page, but we handle multi-page)
|
| 130 |
for page in doc.pages:
|
|
|
|
| 229 |
"""
|
| 230 |
coords = etree.SubElement(parent, f"{{{PAGE_NS}}}Coords")
|
| 231 |
|
| 232 |
+
points = polygon if polygon and len(polygon) >= 3 else bbox_to_polygon(bbox)
|
|
|
|
|
|
|
|
|
|
| 233 |
|
| 234 |
points_str = " ".join(
|
| 235 |
f"{quantize_value(x, rounding)},{quantize_value(y, rounding)}"
|
src/app/settings.py
CHANGED
|
@@ -7,14 +7,15 @@ via the SPACE_ID environment variable.
|
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
import os
|
| 10 |
-
from enum import
|
|
|
|
| 11 |
from pathlib import Path
|
| 12 |
|
| 13 |
from pydantic import Field
|
| 14 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 15 |
|
| 16 |
|
| 17 |
-
class AppMode(
|
| 18 |
LOCAL = "local"
|
| 19 |
SPACE = "space"
|
| 20 |
|
|
@@ -113,6 +114,7 @@ class Settings(BaseSettings):
|
|
| 113 |
os.environ.setdefault("HF_HOME", str(self.hf_home))
|
| 114 |
|
| 115 |
|
|
|
|
| 116 |
def get_settings() -> Settings:
|
| 117 |
-
"""Factory for dependency injection (FastAPI Depends)."""
|
| 118 |
return Settings()
|
|
|
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
import os
|
| 10 |
+
from enum import StrEnum
|
| 11 |
+
from functools import lru_cache
|
| 12 |
from pathlib import Path
|
| 13 |
|
| 14 |
from pydantic import Field
|
| 15 |
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 16 |
|
| 17 |
|
| 18 |
+
class AppMode(StrEnum):
|
| 19 |
LOCAL = "local"
|
| 20 |
SPACE = "space"
|
| 21 |
|
|
|
|
| 114 |
os.environ.setdefault("HF_HOME", str(self.hf_home))
|
| 115 |
|
| 116 |
|
| 117 |
+
@lru_cache(maxsize=1)
|
| 118 |
def get_settings() -> Settings:
|
| 119 |
+
"""Factory for dependency injection (FastAPI Depends). Cached singleton."""
|
| 120 |
return Settings()
|
src/app/validators/export_eligibility_validator.py
CHANGED
|
@@ -6,7 +6,8 @@ an ExportEligibility decision for the whole document.
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
-
from
|
|
|
|
| 10 |
from src.app.domain.models.readiness import ExportEligibility
|
| 11 |
from src.app.domain.models.status import ReadinessLevel
|
| 12 |
from src.app.policies.document_policy import DocumentPolicy
|
|
@@ -15,6 +16,9 @@ from src.app.validators.readiness_validator import (
|
|
| 15 |
compute_page_pagexml_readiness,
|
| 16 |
)
|
| 17 |
|
|
|
|
|
|
|
|
|
|
| 18 |
|
| 19 |
def compute_export_eligibility(
|
| 20 |
doc: CanonicalDocument,
|
|
@@ -70,10 +74,10 @@ def _aggregate_levels(levels: list[ReadinessLevel]) -> ReadinessLevel:
|
|
| 70 |
if not levels:
|
| 71 |
return ReadinessLevel.NONE
|
| 72 |
|
| 73 |
-
if all(
|
| 74 |
return ReadinessLevel.FULL
|
| 75 |
-
if all(
|
| 76 |
return ReadinessLevel.NONE
|
| 77 |
-
if any(
|
| 78 |
return ReadinessLevel.PARTIAL
|
| 79 |
return ReadinessLevel.DEGRADED
|
|
|
|
| 6 |
|
| 7 |
from __future__ import annotations
|
| 8 |
|
| 9 |
+
from typing import TYPE_CHECKING
|
| 10 |
+
|
| 11 |
from src.app.domain.models.readiness import ExportEligibility
|
| 12 |
from src.app.domain.models.status import ReadinessLevel
|
| 13 |
from src.app.policies.document_policy import DocumentPolicy
|
|
|
|
| 16 |
compute_page_pagexml_readiness,
|
| 17 |
)
|
| 18 |
|
| 19 |
+
if TYPE_CHECKING:
|
| 20 |
+
from src.app.domain.models import CanonicalDocument
|
| 21 |
+
|
| 22 |
|
| 23 |
def compute_export_eligibility(
|
| 24 |
doc: CanonicalDocument,
|
|
|
|
| 74 |
if not levels:
|
| 75 |
return ReadinessLevel.NONE
|
| 76 |
|
| 77 |
+
if all(lv == ReadinessLevel.FULL for lv in levels):
|
| 78 |
return ReadinessLevel.FULL
|
| 79 |
+
if all(lv == ReadinessLevel.NONE for lv in levels):
|
| 80 |
return ReadinessLevel.NONE
|
| 81 |
+
if any(lv in (ReadinessLevel.FULL, ReadinessLevel.PARTIAL) for lv in levels):
|
| 82 |
return ReadinessLevel.PARTIAL
|
| 83 |
return ReadinessLevel.DEGRADED
|