Claude committed on
Commit
96f8c0a
·
unverified ·
1 Parent(s): 686a885

Sprint 0: project foundation — structure, settings, FastAPI skeleton

Browse files

Set up the complete project scaffolding for the XmLLM document structure
engine (canonical-first architecture for image → ALTO XML / PAGE XML):

- pyproject.toml with core deps (FastAPI, Pydantic v2, lxml, Pillow) and dev deps
- Full directory structure matching the 4-anneau architecture (domain, execution, API, presentation)
- SettingsService with auto-detection of local vs HF Space mode
- FastAPI app skeleton with /health endpoint
- AGENTS.md with all non-negotiable architecture rules
- .env.example with documented configuration variables
- Test suite foundation with 6 passing tests (settings + health)

https://claude.ai/code/session_01Cuzvc9Pjfo5u46eT3ta2Cg

This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .env.example +68 -0
  2. .gitignore +53 -0
  3. AGENTS.md +111 -0
  4. README.md +3 -0
  5. pyproject.toml +68 -0
  6. src/__init__.py +0 -0
  7. src/app/__init__.py +0 -0
  8. src/app/api/__init__.py +0 -0
  9. src/app/api/routes_exports.py +0 -0
  10. src/app/api/routes_health.py +0 -0
  11. src/app/api/routes_jobs.py +0 -0
  12. src/app/api/routes_providers.py +0 -0
  13. src/app/api/routes_viewer.py +0 -0
  14. src/app/domain/__init__.py +0 -0
  15. src/app/domain/errors/__init__.py +0 -0
  16. src/app/domain/models/__init__.py +0 -0
  17. src/app/domain/models/canonical_document.py +0 -0
  18. src/app/domain/models/geometry.py +0 -0
  19. src/app/domain/models/provenance.py +0 -0
  20. src/app/domain/models/raw_payload.py +0 -0
  21. src/app/domain/models/readiness.py +0 -0
  22. src/app/domain/models/status.py +0 -0
  23. src/app/domain/models/viewer_projection.py +0 -0
  24. src/app/domain/services/__init__.py +0 -0
  25. src/app/enrichers/__init__.py +0 -0
  26. src/app/enrichers/bbox_repair_light.py +0 -0
  27. src/app/enrichers/hyphenation_basic.py +0 -0
  28. src/app/enrichers/lang_propagation.py +0 -0
  29. src/app/enrichers/polygon_to_bbox.py +0 -0
  30. src/app/enrichers/reading_order_simple.py +0 -0
  31. src/app/enrichers/text_consistency.py +0 -0
  32. src/app/geometry/__init__.py +0 -0
  33. src/app/geometry/baseline.py +0 -0
  34. src/app/geometry/bbox.py +0 -0
  35. src/app/geometry/normalization.py +0 -0
  36. src/app/geometry/polygon.py +0 -0
  37. src/app/geometry/quantization.py +0 -0
  38. src/app/geometry/transforms.py +0 -0
  39. src/app/jobs/__init__.py +0 -0
  40. src/app/jobs/events.py +0 -0
  41. src/app/jobs/models.py +0 -0
  42. src/app/jobs/service.py +0 -0
  43. src/app/main.py +51 -0
  44. src/app/normalization/__init__.py +0 -0
  45. src/app/normalization/canonical_builder.py +0 -0
  46. src/app/normalization/pipeline.py +0 -0
  47. src/app/persistence/__init__.py +0 -0
  48. src/app/persistence/db.py +0 -0
  49. src/app/persistence/file_store.py +0 -0
  50. src/app/persistence/repositories/__init__.py +0 -0
.env.example ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # =============================================================================
2
+ # XmLLM — Environment Configuration
3
+ # =============================================================================
4
+ # Copy this file to .env and adjust values for your environment.
5
+
6
+ # -----------------------------------------------------------------------------
7
+ # Execution mode
8
+ # -----------------------------------------------------------------------------
9
+ # "local" or "space" — auto-detected from SPACE_ID if not set
10
+ APP_MODE=local
11
+
12
+ # -----------------------------------------------------------------------------
13
+ # Storage
14
+ # -----------------------------------------------------------------------------
15
+ # Root directory for all persistent data (jobs, providers, exports, db).
16
+ # On HF Spaces with persistent storage, use /data
17
+ STORAGE_ROOT=./data
18
+
19
+ # SQLite database file name (relative to STORAGE_ROOT)
20
+ DB_NAME=app.db
21
+
22
+ # -----------------------------------------------------------------------------
23
+ # HuggingFace (only relevant in Space mode)
24
+ # -----------------------------------------------------------------------------
25
+ # Set by HF automatically in Spaces — do not set manually in local mode
26
+ # SPACE_ID=
27
+ # HF_HOME=/data/.huggingface
28
+
29
+ # -----------------------------------------------------------------------------
30
+ # Server
31
+ # -----------------------------------------------------------------------------
32
+ HOST=0.0.0.0
33
+ PORT=7860
34
+ LOG_LEVEL=info
35
+
36
+ # -----------------------------------------------------------------------------
37
+ # Upload limits
38
+ # -----------------------------------------------------------------------------
39
+ # Maximum upload file size in bytes (default: 50 MB)
40
+ MAX_UPLOAD_SIZE=52428800
41
+
42
+ # Allowed MIME types for upload (comma-separated)
43
+ ALLOWED_MIME_TYPES=image/png,image/jpeg,image/tiff,image/webp
44
+
45
+ # -----------------------------------------------------------------------------
46
+ # Provider defaults
47
+ # -----------------------------------------------------------------------------
48
+ # Default timeout for provider execution in seconds
49
+ PROVIDER_TIMEOUT=120
50
+
51
+ # Default timeout for API-based providers in seconds
52
+ API_PROVIDER_TIMEOUT=60
53
+
54
+ # Maximum retries for API-based providers
55
+ API_PROVIDER_MAX_RETRIES=2
56
+
57
+ # -----------------------------------------------------------------------------
58
+ # Geometry
59
+ # -----------------------------------------------------------------------------
60
+ # Tolerance in pixels for bbox containment checks (child bbox may exceed parent
61
+ # by this many pixels without triggering a validation error)
62
+ BBOX_CONTAINMENT_TOLERANCE=5
63
+
64
+ # -----------------------------------------------------------------------------
65
+ # Secrets (referenced by name in provider profiles, never serialized)
66
+ # -----------------------------------------------------------------------------
67
+ # SECRET_OPENAI_API_KEY=sk-...
68
+ # SECRET_HF_TOKEN=hf_...
.gitignore ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ *.egg-info/
7
+ *.egg
8
+ dist/
9
+ build/
10
+ .eggs/
11
+
12
+ # Virtual environments
13
+ .venv/
14
+ venv/
15
+ ENV/
16
+
17
+ # IDE
18
+ .vscode/
19
+ .idea/
20
+ *.swp
21
+ *.swo
22
+ *~
23
+
24
+ # Testing
25
+ .pytest_cache/
26
+ .coverage
27
+ htmlcov/
28
+ .mypy_cache/
29
+ .ruff_cache/
30
+
31
+ # Environment
32
+ .env
33
+ .env.local
34
+ .env.production
35
+
36
+ # Storage (local dev data)
37
+ data/
38
+ *.db
39
+
40
+ # OS
41
+ .DS_Store
42
+ Thumbs.db
43
+
44
+ # Uploads and job artifacts (local)
45
+ uploads/
46
+ exports/
47
+
48
+ # HuggingFace cache (local)
49
+ .huggingface/
50
+
51
+ # Node (frontend)
52
+ frontend/node_modules/
53
+ frontend/dist/
AGENTS.md ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AGENTS.md — XmLLM Project Rules
2
+
3
+ ## What this project is
4
+
5
+ XmLLM is a **canonical-first document structure engine** that converts images (and
6
+ later PDFs) into structured ALTO XML and PAGE XML, via an internal canonical
7
+ representation (`CanonicalDocument`).
8
+
9
+ ## Architecture invariants
10
+
11
+ These rules are **non-negotiable**. Any code change that violates them must be
12
+ rejected.
13
+
14
+ ### 1. Canonical-first
15
+
16
+ - The system is **never** provider-first, ALTO-first, PAGE-first, or viewer-first.
17
+ - All processing flows through `CanonicalDocument`.
18
+ - ALTO XML and PAGE XML are **output serializations**, not internal representations.
19
+
20
+ ### 2. Three internal objects
21
+
22
+ | Object | Role | Never used for |
23
+ |---------------------|--------------------|------------------------------|
24
+ | `RawProviderPayload`| Source truth | Export, rendering |
25
+ | `CanonicalDocument` | Business truth | Direct UI rendering |
26
+ | `ViewerProjection` | Rendering truth | Validation, export decisions |
27
+
28
+ ### 3. Domain independence
29
+
30
+ - Anneau A (domain) **must not** depend on FastAPI, frontend, or OpenSeadragon.
31
+ - Anneau B (execution) may depend on domain.
32
+ - Anneau C (API) may depend on domain and execution.
33
+ - Anneau D (presentation) may depend on the API layer only.
34
+
35
+ ### 4. Provenance on every node
36
+
37
+ Every node in `CanonicalDocument` **must** carry a `provenance` with:
38
+ - `provider`: source engine name
39
+ - `adapter`: adapter version
40
+ - `source_ref`: path in the raw output
41
+ - `evidence_type`: `provider_native | derived | inferred | repaired | manual`
42
+ - `derived_from`: list of canonical IDs (empty if native)
43
+
44
+ ### 5. Geometry conventions
45
+
46
+ - **bbox format**: `[x, y, width, height]` — always. `x` = left edge, `y` = top edge.
47
+ - **coordinate origin**: `top_left` — always in internal representation.
48
+ - **unit**: `px` — always in the canonical model.
49
+ - **polygon**: `list[tuple[float, float]] | None` — optional, preserved when available.
50
+ - **Providers returning `[x1, y1, x2, y2]`** must be converted in their adapter.
51
+ - **No serializer** may perform implicit heavy geometry conversion. All geometry
52
+ must be normalized before it reaches a serializer.
53
+
54
+ ### 6. Serializer rules
55
+
56
+ Serializers (ALTO, PAGE) **must not**:
57
+ - Call any model or provider
58
+ - Reconstruct segmentation
59
+ - Correct text
60
+ - Invent coordinates
61
+ - Make export eligibility decisions
62
+
63
+ They receive a validated `CanonicalDocument` and produce deterministic XML.
64
+
65
+ ### 7. Export eligibility
66
+
67
+ - Every export decision must pass through validators + document policy.
68
+ - A serializer is **never** called when export eligibility is `none`.
69
+ - ALTO and PAGE eligibility are computed **independently**.
70
+
71
+ ### 8. Enricher rules
72
+
73
+ Every enricher **must**:
74
+ - Set `provenance.evidence_type` to `derived` or `inferred`
75
+ - Update `geometry.status` if geometry was modified
76
+ - Add warnings to the document when appropriate
77
+ - Respect the active `DocumentPolicy`
78
+
79
+ Enrichers **must not**:
80
+ - Hallucinate text
81
+ - Invent fine-grained coordinates without geometric basis
82
+ - Claim `provider_native` evidence
83
+
84
+ ### 9. Viewer rules
85
+
86
+ - The viewer **never** parses ALTO or PAGE XML.
87
+ - It works exclusively from `ViewerProjection`.
88
+ - No business logic in the presentation layer.
89
+
90
+ ### 10. Single codebase
91
+
92
+ The same code runs:
93
+ - Locally (bare Python)
94
+ - In Docker
95
+ - On Hugging Face Spaces
96
+
97
+ The only difference is `STORAGE_ROOT` and environment detection.
98
+
99
+ ## Bbox containment tolerance
100
+
101
+ Bbox containment checks (child within parent) use a configurable tolerance
102
+ (default: 5px). This is set via `BBOX_CONTAINMENT_TOLERANCE` in the environment.
103
+
104
+ ## Coding conventions
105
+
106
+ - Python 3.11+
107
+ - Pydantic v2 for all models
108
+ - `lxml` for XML serialization
109
+ - Type hints everywhere
110
+ - No `Any` types in domain models
111
+ - Tests alongside implementation — no sprint ships without tests
README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ # XmLLM
2
+
3
+ Document structure engine: image → canonical model → ALTO XML / PAGE XML.
pyproject.toml ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [build-system]
2
+ requires = ["hatchling"]
3
+ build-backend = "hatchling.build"
4
+
5
+ [project]
6
+ name = "xmllm"
7
+ version = "0.1.0"
8
+ description = "Document structure engine: image → canonical model → ALTO XML / PAGE XML"
9
+ readme = "README.md"
10
+ license = "Apache-2.0"
11
+ requires-python = ">=3.11"
12
+ authors = [
13
+ { name = "XmLLM contributors" },
14
+ ]
15
+
16
+ dependencies = [
17
+ "fastapi>=0.115,<1",
18
+ "uvicorn[standard]>=0.30,<1",
19
+ "pydantic>=2.7,<3",
20
+ "pydantic-settings>=2.3,<3",
21
+ "lxml>=5.2,<6",
22
+ "Pillow>=10.3,<11",
23
+ "python-multipart>=0.0.9",
24
+ "aiosqlite>=0.20,<1",
25
+ "httpx>=0.27,<1",
26
+ ]
27
+
28
+ [project.optional-dependencies]
29
+ dev = [
30
+ "pytest>=8.2,<9",
31
+ "pytest-asyncio>=0.23,<1",
32
+ "pytest-cov>=5,<6",
33
+ "httpx>=0.27,<1",
34
+ "ruff>=0.4,<1",
35
+ "mypy>=1.10,<2",
36
+ "lxml-stubs>=0.5",
37
+ ]
38
+
39
+ [tool.hatch.build.targets.wheel]
40
+ packages = ["src"]
41
+
42
+ [tool.pytest.ini_options]
43
+ testpaths = ["tests"]
44
+ asyncio_mode = "auto"
45
+ markers = [
46
+ "unit: unit tests",
47
+ "integration: integration tests",
48
+ "e2e: end-to-end tests",
49
+ ]
50
+
51
+ [tool.ruff]
52
+ target-version = "py311"
53
+ line-length = 100
54
+ src = ["src", "tests"]
55
+
56
+ [tool.ruff.lint]
57
+ select = ["E", "F", "I", "N", "W", "UP", "B", "SIM", "TCH"]
58
+
59
+ [tool.mypy]
60
+ python_version = "3.11"
61
+ strict = true
62
+ warn_return_any = true
63
+ warn_unused_configs = true
64
+ plugins = ["pydantic.mypy"]
65
+
66
+ [[tool.mypy.overrides]]
67
+ module = ["lxml.*"]
68
+ ignore_missing_imports = true
src/__init__.py ADDED
File without changes
src/app/__init__.py ADDED
File without changes
src/app/api/__init__.py ADDED
File without changes
src/app/api/routes_exports.py ADDED
File without changes
src/app/api/routes_health.py ADDED
File without changes
src/app/api/routes_jobs.py ADDED
File without changes
src/app/api/routes_providers.py ADDED
File without changes
src/app/api/routes_viewer.py ADDED
File without changes
src/app/domain/__init__.py ADDED
File without changes
src/app/domain/errors/__init__.py ADDED
File without changes
src/app/domain/models/__init__.py ADDED
File without changes
src/app/domain/models/canonical_document.py ADDED
File without changes
src/app/domain/models/geometry.py ADDED
File without changes
src/app/domain/models/provenance.py ADDED
File without changes
src/app/domain/models/raw_payload.py ADDED
File without changes
src/app/domain/models/readiness.py ADDED
File without changes
src/app/domain/models/status.py ADDED
File without changes
src/app/domain/models/viewer_projection.py ADDED
File without changes
src/app/domain/services/__init__.py ADDED
File without changes
src/app/enrichers/__init__.py ADDED
File without changes
src/app/enrichers/bbox_repair_light.py ADDED
File without changes
src/app/enrichers/hyphenation_basic.py ADDED
File without changes
src/app/enrichers/lang_propagation.py ADDED
File without changes
src/app/enrichers/polygon_to_bbox.py ADDED
File without changes
src/app/enrichers/reading_order_simple.py ADDED
File without changes
src/app/enrichers/text_consistency.py ADDED
File without changes
src/app/geometry/__init__.py ADDED
File without changes
src/app/geometry/baseline.py ADDED
File without changes
src/app/geometry/bbox.py ADDED
File without changes
src/app/geometry/normalization.py ADDED
File without changes
src/app/geometry/polygon.py ADDED
File without changes
src/app/geometry/quantization.py ADDED
File without changes
src/app/geometry/transforms.py ADDED
File without changes
src/app/jobs/__init__.py ADDED
File without changes
src/app/jobs/events.py ADDED
File without changes
src/app/jobs/models.py ADDED
File without changes
src/app/jobs/service.py ADDED
File without changes
src/app/main.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """XmLLM — Document Structure Engine.
2
+
3
+ FastAPI application entry point.
4
+ """
5
+
6
+ from __future__ import annotations
7
+
8
+ from contextlib import asynccontextmanager
9
+ from typing import AsyncIterator
10
+
11
+ from fastapi import FastAPI
12
+ from fastapi.middleware.cors import CORSMiddleware
13
+
14
+ from src.app.settings import Settings, get_settings
15
+
16
+
17
@asynccontextmanager
async def lifespan(app: FastAPI) -> AsyncIterator[None]:
    """Application lifespan hook: startup work, then hand control to FastAPI.

    Everything before the ``yield`` runs once before the app starts serving;
    nothing follows the ``yield``, so there is currently no shutdown step.
    """
    # Resolve the settings and let them prepare their directories
    # (presumably the storage layout under STORAGE_ROOT — confirm in
    # Settings.ensure_directories).
    config: Settings = get_settings()
    config.ensure_directories()

    yield
23
+
24
+
25
# The ASGI application object; `lifespan` runs the startup hook defined above.
app = FastAPI(
    title="XmLLM",
    description="Document structure engine: image → canonical model → ALTO XML / PAGE XML",
    version="0.1.0",
    lifespan=lifespan,
)

# CORS: any origin may call the API, but credentialed requests (cookies,
# Authorization headers) are NOT allowed. Wildcard origins combined with
# allow_credentials=True is prohibited by the FastAPI documentation; Starlette
# would otherwise mirror the caller's Origin header, letting any website issue
# credentialed requests. If credentials are ever needed, replace the wildcard
# with an explicit list of trusted origins before enabling them.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)
39
+
40
+
41
+ # -- Health route (always available) ------------------------------------------
42
+
43
+
44
@app.get("/health")
async def health() -> dict[str, str]:
    """Return a small status payload for liveness checks.

    Returns:
        A dict with three string fields: ``status`` (always ``"ok"``),
        ``version`` (the version the FastAPI app was constructed with) and
        ``mode`` (the value of the ``app_mode`` setting).
    """
    settings = get_settings()
    return {
        "status": "ok",
        # Read the version from the app object rather than repeating the
        # literal, so this endpoint cannot drift from the FastAPI metadata.
        "version": app.version,
        "mode": settings.app_mode.value,
    }
src/app/normalization/__init__.py ADDED
File without changes
src/app/normalization/canonical_builder.py ADDED
File without changes
src/app/normalization/pipeline.py ADDED
File without changes
src/app/persistence/__init__.py ADDED
File without changes
src/app/persistence/db.py ADDED
File without changes
src/app/persistence/file_store.py ADDED
File without changes
src/app/persistence/repositories/__init__.py ADDED
File without changes