Spaces:
Running
Running
Claude committed on
Sprint 0: project foundation — structure, settings, FastAPI skeleton
Browse files
Set up the complete project scaffolding for the XmLLM document structure
engine (canonical-first architecture for image → ALTO XML / PAGE XML):
- pyproject.toml with core deps (FastAPI, Pydantic v2, lxml, Pillow) and dev deps
- Full directory structure matching the 4-anneau architecture (domain, execution, API, presentation)
- SettingsService with auto-detection of local vs HF Space mode
- FastAPI app skeleton with /health endpoint
- AGENTS.md with all non-negotiable architecture rules
- .env.example with documented configuration variables
- Test suite foundation with 6 passing tests (settings + health)
https://claude.ai/code/session_01Cuzvc9Pjfo5u46eT3ta2Cg
This view is limited to 50 files because it contains too many changes. See raw diff
- .env.example +68 -0
- .gitignore +53 -0
- AGENTS.md +111 -0
- README.md +3 -0
- pyproject.toml +68 -0
- src/__init__.py +0 -0
- src/app/__init__.py +0 -0
- src/app/api/__init__.py +0 -0
- src/app/api/routes_exports.py +0 -0
- src/app/api/routes_health.py +0 -0
- src/app/api/routes_jobs.py +0 -0
- src/app/api/routes_providers.py +0 -0
- src/app/api/routes_viewer.py +0 -0
- src/app/domain/__init__.py +0 -0
- src/app/domain/errors/__init__.py +0 -0
- src/app/domain/models/__init__.py +0 -0
- src/app/domain/models/canonical_document.py +0 -0
- src/app/domain/models/geometry.py +0 -0
- src/app/domain/models/provenance.py +0 -0
- src/app/domain/models/raw_payload.py +0 -0
- src/app/domain/models/readiness.py +0 -0
- src/app/domain/models/status.py +0 -0
- src/app/domain/models/viewer_projection.py +0 -0
- src/app/domain/services/__init__.py +0 -0
- src/app/enrichers/__init__.py +0 -0
- src/app/enrichers/bbox_repair_light.py +0 -0
- src/app/enrichers/hyphenation_basic.py +0 -0
- src/app/enrichers/lang_propagation.py +0 -0
- src/app/enrichers/polygon_to_bbox.py +0 -0
- src/app/enrichers/reading_order_simple.py +0 -0
- src/app/enrichers/text_consistency.py +0 -0
- src/app/geometry/__init__.py +0 -0
- src/app/geometry/baseline.py +0 -0
- src/app/geometry/bbox.py +0 -0
- src/app/geometry/normalization.py +0 -0
- src/app/geometry/polygon.py +0 -0
- src/app/geometry/quantization.py +0 -0
- src/app/geometry/transforms.py +0 -0
- src/app/jobs/__init__.py +0 -0
- src/app/jobs/events.py +0 -0
- src/app/jobs/models.py +0 -0
- src/app/jobs/service.py +0 -0
- src/app/main.py +51 -0
- src/app/normalization/__init__.py +0 -0
- src/app/normalization/canonical_builder.py +0 -0
- src/app/normalization/pipeline.py +0 -0
- src/app/persistence/__init__.py +0 -0
- src/app/persistence/db.py +0 -0
- src/app/persistence/file_store.py +0 -0
- src/app/persistence/repositories/__init__.py +0 -0
.env.example
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# =============================================================================
|
| 2 |
+
# XmLLM — Environment Configuration
|
| 3 |
+
# =============================================================================
|
| 4 |
+
# Copy this file to .env and adjust values for your environment.
|
| 5 |
+
|
| 6 |
+
# -----------------------------------------------------------------------------
|
| 7 |
+
# Execution mode
|
| 8 |
+
# -----------------------------------------------------------------------------
|
| 9 |
+
# "local" or "space" — auto-detected from SPACE_ID if not set
|
| 10 |
+
APP_MODE=local
|
| 11 |
+
|
| 12 |
+
# -----------------------------------------------------------------------------
|
| 13 |
+
# Storage
|
| 14 |
+
# -----------------------------------------------------------------------------
|
| 15 |
+
# Root directory for all persistent data (jobs, providers, exports, db).
|
| 16 |
+
# On HF Spaces with persistent storage, use /data
|
| 17 |
+
STORAGE_ROOT=./data
|
| 18 |
+
|
| 19 |
+
# SQLite database file name (resolved relative to STORAGE_ROOT)
|
| 20 |
+
DB_NAME=app.db
|
| 21 |
+
|
| 22 |
+
# -----------------------------------------------------------------------------
|
| 23 |
+
# HuggingFace (only relevant in Space mode)
|
| 24 |
+
# -----------------------------------------------------------------------------
|
| 25 |
+
# Set by HF automatically in Spaces — do not set manually in local mode
|
| 26 |
+
# SPACE_ID=
|
| 27 |
+
# HF_HOME=/data/.huggingface
|
| 28 |
+
|
| 29 |
+
# -----------------------------------------------------------------------------
|
| 30 |
+
# Server
|
| 31 |
+
# -----------------------------------------------------------------------------
|
| 32 |
+
HOST=0.0.0.0
|
| 33 |
+
PORT=7860
|
| 34 |
+
LOG_LEVEL=info
|
| 35 |
+
|
| 36 |
+
# -----------------------------------------------------------------------------
|
| 37 |
+
# Upload limits
|
| 38 |
+
# -----------------------------------------------------------------------------
|
| 39 |
+
# Maximum upload file size in bytes (default: 50 MB)
|
| 40 |
+
MAX_UPLOAD_SIZE=52428800
|
| 41 |
+
|
| 42 |
+
# Allowed MIME types for upload (comma-separated)
|
| 43 |
+
ALLOWED_MIME_TYPES=image/png,image/jpeg,image/tiff,image/webp
|
| 44 |
+
|
| 45 |
+
# -----------------------------------------------------------------------------
|
| 46 |
+
# Provider defaults
|
| 47 |
+
# -----------------------------------------------------------------------------
|
| 48 |
+
# Default timeout for provider execution in seconds
|
| 49 |
+
PROVIDER_TIMEOUT=120
|
| 50 |
+
|
| 51 |
+
# Default timeout for API-based providers in seconds
|
| 52 |
+
API_PROVIDER_TIMEOUT=60
|
| 53 |
+
|
| 54 |
+
# Maximum retries for API-based providers
|
| 55 |
+
API_PROVIDER_MAX_RETRIES=2
|
| 56 |
+
|
| 57 |
+
# -----------------------------------------------------------------------------
|
| 58 |
+
# Geometry
|
| 59 |
+
# -----------------------------------------------------------------------------
|
| 60 |
+
# Tolerance in pixels for bbox containment checks (child bbox may exceed parent
|
| 61 |
+
# by this many pixels without triggering a validation error)
|
| 62 |
+
BBOX_CONTAINMENT_TOLERANCE=5
|
| 63 |
+
|
| 64 |
+
# -----------------------------------------------------------------------------
|
| 65 |
+
# Secrets (referenced by name in provider profiles, never serialized)
|
| 66 |
+
# -----------------------------------------------------------------------------
|
| 67 |
+
# SECRET_OPENAI_API_KEY=sk-...
|
| 68 |
+
# SECRET_HF_TOKEN=hf_...
|
.gitignore
ADDED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
*.egg-info/
|
| 7 |
+
*.egg
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
.eggs/
|
| 11 |
+
|
| 12 |
+
# Virtual environments
|
| 13 |
+
.venv/
|
| 14 |
+
venv/
|
| 15 |
+
ENV/
|
| 16 |
+
|
| 17 |
+
# IDE
|
| 18 |
+
.vscode/
|
| 19 |
+
.idea/
|
| 20 |
+
*.swp
|
| 21 |
+
*.swo
|
| 22 |
+
*~
|
| 23 |
+
|
| 24 |
+
# Testing
|
| 25 |
+
.pytest_cache/
|
| 26 |
+
.coverage
|
| 27 |
+
htmlcov/
|
| 28 |
+
.mypy_cache/
|
| 29 |
+
.ruff_cache/
|
| 30 |
+
|
| 31 |
+
# Environment
|
| 32 |
+
.env
|
| 33 |
+
.env.local
|
| 34 |
+
.env.production
|
| 35 |
+
|
| 36 |
+
# Storage (local dev data)
|
| 37 |
+
data/
|
| 38 |
+
*.db
|
| 39 |
+
|
| 40 |
+
# OS
|
| 41 |
+
.DS_Store
|
| 42 |
+
Thumbs.db
|
| 43 |
+
|
| 44 |
+
# Uploads and job artifacts (local)
|
| 45 |
+
uploads/
|
| 46 |
+
exports/
|
| 47 |
+
|
| 48 |
+
# HuggingFace cache (local)
|
| 49 |
+
.huggingface/
|
| 50 |
+
|
| 51 |
+
# Node (frontend)
|
| 52 |
+
frontend/node_modules/
|
| 53 |
+
frontend/dist/
|
AGENTS.md
ADDED
|
@@ -0,0 +1,111 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# AGENTS.md — XmLLM Project Rules
|
| 2 |
+
|
| 3 |
+
## What this project is
|
| 4 |
+
|
| 5 |
+
XmLLM is a **canonical-first document structure engine** that converts images (and
|
| 6 |
+
later PDFs) into structured ALTO XML and PAGE XML, via an internal canonical
|
| 7 |
+
representation (`CanonicalDocument`).
|
| 8 |
+
|
| 9 |
+
## Architecture invariants
|
| 10 |
+
|
| 11 |
+
These rules are **non-negotiable**. Any code change that violates them must be
|
| 12 |
+
rejected.
|
| 13 |
+
|
| 14 |
+
### 1. Canonical-first
|
| 15 |
+
|
| 16 |
+
- The system is **never** provider-first, ALTO-first, PAGE-first, or viewer-first.
|
| 17 |
+
- All processing flows through `CanonicalDocument`.
|
| 18 |
+
- ALTO XML and PAGE XML are **output serializations**, not internal representations.
|
| 19 |
+
|
| 20 |
+
### 2. Three internal objects
|
| 21 |
+
|
| 22 |
+
| Object | Role | Never used for |
|
| 23 |
+
|---------------------|--------------------|------------------------------|
|
| 24 |
+
| `RawProviderPayload`| Source truth | Export, rendering |
|
| 25 |
+
| `CanonicalDocument` | Business truth | Direct UI rendering |
|
| 26 |
+
| `ViewerProjection` | Rendering truth | Validation, export decisions |
|
| 27 |
+
|
| 28 |
+
### 3. Domain independence
|
| 29 |
+
|
| 30 |
+
- Anneau A (domain) **must not** depend on FastAPI, frontend, or OpenSeadragon.
|
| 31 |
+
- Anneau B (execution) may depend on domain.
|
| 32 |
+
- Anneau C (API) may depend on domain and execution.
|
| 33 |
+
- Anneau D (presentation) may depend on the API layer only.
|
| 34 |
+
|
| 35 |
+
### 4. Provenance on every node
|
| 36 |
+
|
| 37 |
+
Every node in `CanonicalDocument` **must** carry a `provenance` with:
|
| 38 |
+
- `provider`: source engine name
|
| 39 |
+
- `adapter`: adapter version
|
| 40 |
+
- `source_ref`: path in the raw output
|
| 41 |
+
- `evidence_type`: `provider_native | derived | inferred | repaired | manual`
|
| 42 |
+
- `derived_from`: list of canonical IDs (empty if native)
|
| 43 |
+
|
| 44 |
+
### 5. Geometry conventions
|
| 45 |
+
|
| 46 |
+
- **bbox format**: `[x, y, width, height]` — always. `x` = left edge, `y` = top edge.
|
| 47 |
+
- **coordinate origin**: `top_left` — always in internal representation.
|
| 48 |
+
- **unit**: `px` — always in the canonical model.
|
| 49 |
+
- **polygon**: `list[tuple[float, float]] | None` — optional, preserved when available.
|
| 50 |
+
- **Providers returning `[x1, y1, x2, y2]`** must be converted in their adapter.
|
| 51 |
+
- **No serializer** may perform implicit heavy geometry conversion. All geometry
|
| 52 |
+
must be normalized before it reaches a serializer.
|
| 53 |
+
|
| 54 |
+
### 6. Serializer rules
|
| 55 |
+
|
| 56 |
+
Serializers (ALTO, PAGE) **must not**:
|
| 57 |
+
- Call any model or provider
|
| 58 |
+
- Reconstruct segmentation
|
| 59 |
+
- Correct text
|
| 60 |
+
- Invent coordinates
|
| 61 |
+
- Make export eligibility decisions
|
| 62 |
+
|
| 63 |
+
They receive a validated `CanonicalDocument` and produce deterministic XML.
|
| 64 |
+
|
| 65 |
+
### 7. Export eligibility
|
| 66 |
+
|
| 67 |
+
- Every export decision must pass through validators + document policy.
|
| 68 |
+
- A serializer is **never** called when export eligibility is `none`.
|
| 69 |
+
- ALTO and PAGE eligibility are computed **independently**.
|
| 70 |
+
|
| 71 |
+
### 8. Enricher rules
|
| 72 |
+
|
| 73 |
+
Every enricher **must**:
|
| 74 |
+
- Set `provenance.evidence_type` to `derived` or `inferred`
|
| 75 |
+
- Update `geometry.status` if geometry was modified
|
| 76 |
+
- Add warnings to the document when appropriate
|
| 77 |
+
- Respect the active `DocumentPolicy`
|
| 78 |
+
|
| 79 |
+
Enrichers **must not**:
|
| 80 |
+
- Hallucinate text
|
| 81 |
+
- Invent fine-grained coordinates without geometric basis
|
| 82 |
+
- Claim `provider_native` evidence
|
| 83 |
+
|
| 84 |
+
### 9. Viewer rules
|
| 85 |
+
|
| 86 |
+
- The viewer **never** parses ALTO or PAGE XML.
|
| 87 |
+
- It works exclusively from `ViewerProjection`.
|
| 88 |
+
- No business logic in the presentation layer.
|
| 89 |
+
|
| 90 |
+
### 10. Single codebase
|
| 91 |
+
|
| 92 |
+
The same code runs:
|
| 93 |
+
- Locally (bare Python)
|
| 94 |
+
- In Docker
|
| 95 |
+
- On Hugging Face Spaces
|
| 96 |
+
|
| 97 |
+
The only differences are `STORAGE_ROOT` and environment detection.
|
| 98 |
+
|
| 99 |
+
## Bbox containment tolerance
|
| 100 |
+
|
| 101 |
+
Bbox containment checks (child within parent) use a configurable tolerance
|
| 102 |
+
(default: 5px). This is set via `BBOX_CONTAINMENT_TOLERANCE` in the environment.
|
| 103 |
+
|
| 104 |
+
## Coding conventions
|
| 105 |
+
|
| 106 |
+
- Python 3.11+
|
| 107 |
+
- Pydantic v2 for all models
|
| 108 |
+
- `lxml` for XML serialization
|
| 109 |
+
- Type hints everywhere
|
| 110 |
+
- No `Any` types in domain models
|
| 111 |
+
- Tests alongside implementation — no sprint ships without tests
|
README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# XmLLM
|
| 2 |
+
|
| 3 |
+
Document structure engine: image → canonical model → ALTO XML / PAGE XML.
|
pyproject.toml
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[build-system]
|
| 2 |
+
requires = ["hatchling"]
|
| 3 |
+
build-backend = "hatchling.build"
|
| 4 |
+
|
| 5 |
+
[project]
|
| 6 |
+
name = "xmllm"
|
| 7 |
+
version = "0.1.0"
|
| 8 |
+
description = "Document structure engine: image → canonical model → ALTO XML / PAGE XML"
|
| 9 |
+
readme = "README.md"
|
| 10 |
+
license = "Apache-2.0"
|
| 11 |
+
requires-python = ">=3.11"
|
| 12 |
+
authors = [
|
| 13 |
+
{ name = "XmLLM contributors" },
|
| 14 |
+
]
|
| 15 |
+
|
| 16 |
+
dependencies = [
|
| 17 |
+
"fastapi>=0.115,<1",
|
| 18 |
+
"uvicorn[standard]>=0.30,<1",
|
| 19 |
+
"pydantic>=2.7,<3",
|
| 20 |
+
"pydantic-settings>=2.3,<3",
|
| 21 |
+
"lxml>=5.2,<6",
|
| 22 |
+
"Pillow>=10.3,<11",
|
| 23 |
+
"python-multipart>=0.0.9",
|
| 24 |
+
"aiosqlite>=0.20,<1",
|
| 25 |
+
"httpx>=0.27,<1",
|
| 26 |
+
]
|
| 27 |
+
|
| 28 |
+
[project.optional-dependencies]
|
| 29 |
+
dev = [
|
| 30 |
+
"pytest>=8.2,<9",
|
| 31 |
+
"pytest-asyncio>=0.23,<1",
|
| 32 |
+
"pytest-cov>=5,<6",
|
| 33 |
+
"httpx>=0.27,<1",
|
| 34 |
+
"ruff>=0.4,<1",
|
| 35 |
+
"mypy>=1.10,<2",
|
| 36 |
+
"lxml-stubs>=0.5",
|
| 37 |
+
]
|
| 38 |
+
|
| 39 |
+
[tool.hatch.build.targets.wheel]
|
| 40 |
+
packages = ["src"]
|
| 41 |
+
|
| 42 |
+
[tool.pytest.ini_options]
|
| 43 |
+
testpaths = ["tests"]
|
| 44 |
+
asyncio_mode = "auto"
|
| 45 |
+
markers = [
|
| 46 |
+
"unit: unit tests",
|
| 47 |
+
"integration: integration tests",
|
| 48 |
+
"e2e: end-to-end tests",
|
| 49 |
+
]
|
| 50 |
+
|
| 51 |
+
[tool.ruff]
|
| 52 |
+
target-version = "py311"
|
| 53 |
+
line-length = 100
|
| 54 |
+
src = ["src", "tests"]
|
| 55 |
+
|
| 56 |
+
[tool.ruff.lint]
|
| 57 |
+
select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "TCH"]
|
| 58 |
+
|
| 59 |
+
[tool.mypy]
|
| 60 |
+
python_version = "3.11"
|
| 61 |
+
strict = true
|
| 62 |
+
warn_return_any = true
|
| 63 |
+
warn_unused_configs = true
|
| 64 |
+
plugins = ["pydantic.mypy"]
|
| 65 |
+
|
| 66 |
+
[[tool.mypy.overrides]]
|
| 67 |
+
module = ["lxml.*"]
|
| 68 |
+
ignore_missing_imports = true
|
src/__init__.py
ADDED
|
File without changes
|
src/app/__init__.py
ADDED
|
File without changes
|
src/app/api/__init__.py
ADDED
|
File without changes
|
src/app/api/routes_exports.py
ADDED
|
File without changes
|
src/app/api/routes_health.py
ADDED
|
File without changes
|
src/app/api/routes_jobs.py
ADDED
|
File without changes
|
src/app/api/routes_providers.py
ADDED
|
File without changes
|
src/app/api/routes_viewer.py
ADDED
|
File without changes
|
src/app/domain/__init__.py
ADDED
|
File without changes
|
src/app/domain/errors/__init__.py
ADDED
|
File without changes
|
src/app/domain/models/__init__.py
ADDED
|
File without changes
|
src/app/domain/models/canonical_document.py
ADDED
|
File without changes
|
src/app/domain/models/geometry.py
ADDED
|
File without changes
|
src/app/domain/models/provenance.py
ADDED
|
File without changes
|
src/app/domain/models/raw_payload.py
ADDED
|
File without changes
|
src/app/domain/models/readiness.py
ADDED
|
File without changes
|
src/app/domain/models/status.py
ADDED
|
File without changes
|
src/app/domain/models/viewer_projection.py
ADDED
|
File without changes
|
src/app/domain/services/__init__.py
ADDED
|
File without changes
|
src/app/enrichers/__init__.py
ADDED
|
File without changes
|
src/app/enrichers/bbox_repair_light.py
ADDED
|
File without changes
|
src/app/enrichers/hyphenation_basic.py
ADDED
|
File without changes
|
src/app/enrichers/lang_propagation.py
ADDED
|
File without changes
|
src/app/enrichers/polygon_to_bbox.py
ADDED
|
File without changes
|
src/app/enrichers/reading_order_simple.py
ADDED
|
File without changes
|
src/app/enrichers/text_consistency.py
ADDED
|
File without changes
|
src/app/geometry/__init__.py
ADDED
|
File without changes
|
src/app/geometry/baseline.py
ADDED
|
File without changes
|
src/app/geometry/bbox.py
ADDED
|
File without changes
|
src/app/geometry/normalization.py
ADDED
|
File without changes
|
src/app/geometry/polygon.py
ADDED
|
File without changes
|
src/app/geometry/quantization.py
ADDED
|
File without changes
|
src/app/geometry/transforms.py
ADDED
|
File without changes
|
src/app/jobs/__init__.py
ADDED
|
File without changes
|
src/app/jobs/events.py
ADDED
|
File without changes
|
src/app/jobs/models.py
ADDED
|
File without changes
|
src/app/jobs/service.py
ADDED
|
File without changes
|
src/app/main.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""XmLLM — Document Structure Engine.
|
| 2 |
+
|
| 3 |
+
FastAPI application entry point.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from __future__ import annotations
|
| 7 |
+
|
| 8 |
+
from contextlib import asynccontextmanager
|
| 9 |
+
from typing import AsyncIterator
|
| 10 |
+
|
| 11 |
+
from fastapi import FastAPI
|
| 12 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 13 |
+
|
| 14 |
+
from src.app.settings import Settings, get_settings
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Run startup work for the application; no shutdown work is performed.

    Loads the settings through ``get_settings()`` and calls
    ``ensure_directories()`` on them before handing control back to FastAPI.
    """
    # Startup phase: everything before the ``yield`` runs when the app boots.
    # NOTE(review): ``ensure_directories()`` presumably creates the storage
    # layout under STORAGE_ROOT — confirm in ``Settings``.
    runtime_settings: Settings = get_settings()
    runtime_settings.ensure_directories()

    # Nothing follows the yield, so there is no teardown step.
    yield
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Application instance. ``lifespan`` is the startup/shutdown context manager
# defined earlier in this module.
app = FastAPI(
    title="XmLLM",
    description="Document structure engine: image → canonical model → ALTO XML / PAGE XML",
    version="0.1.0",
    lifespan=lifespan,
)

# CORS policy: the API is open to every origin, method and header.
# Credentials stay disabled because a wildcard origin must not be combined
# with ``allow_credentials=True``: that would let any website issue
# cookie-authenticated requests on behalf of a user. If credentialed
# cross-origin calls are ever needed, list the trusted origins explicitly
# and only then re-enable credentials.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# -- Health route (always available) ------------------------------------------
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@app.get("/health")
async def health() -> dict[str, str]:
    """Report liveness together with the app version and execution mode.

    Returns:
        A mapping with ``status`` (always ``"ok"``), ``version`` (the version
        the FastAPI app was created with) and ``mode`` (the value of the
        configured ``app_mode``).
    """
    settings = get_settings()
    return {
        "status": "ok",
        # Read the version from the app object so it cannot drift from the
        # value passed to ``FastAPI(version=...)``.
        "version": app.version,
        "mode": settings.app_mode.value,
    }
|
src/app/normalization/__init__.py
ADDED
|
File without changes
|
src/app/normalization/canonical_builder.py
ADDED
|
File without changes
|
src/app/normalization/pipeline.py
ADDED
|
File without changes
|
src/app/persistence/__init__.py
ADDED
|
File without changes
|
src/app/persistence/db.py
ADDED
|
File without changes
|
src/app/persistence/file_store.py
ADDED
|
File without changes
|
src/app/persistence/repositories/__init__.py
ADDED
|
File without changes
|