Spaces:
Running
Running
maribakulj commited on
Commit ·
b64f8e3
1
Parent(s): 179bb0f
feat(lot5): add Gallica connector with normalized mapping and fixture fallback
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .gitignore +6 -0
- README.md +70 -3
- app/backend/app/__init__.py +0 -0
- app/backend/app/api/__init__.py +0 -0
- app/backend/app/api/dependencies.py +54 -0
- app/backend/app/api/health.py +12 -0
- app/backend/app/api/import_.py +19 -0
- app/backend/app/api/items.py +19 -0
- app/backend/app/api/manifest.py +19 -0
- app/backend/app/api/router.py +13 -0
- app/backend/app/api/search.py +19 -0
- app/backend/app/api/sources.py +16 -0
- app/backend/app/config/__init__.py +0 -0
- app/backend/app/config/settings.py +21 -0
- app/backend/app/connectors/__init__.py +0 -0
- app/backend/app/connectors/base.py +47 -0
- app/backend/app/connectors/gallica/__init__.py +5 -0
- app/backend/app/connectors/gallica/connector.py +264 -0
- app/backend/app/connectors/gallica/fixtures.py +24 -0
- app/backend/app/connectors/manifest_by_url_connector.py +105 -0
- app/backend/app/connectors/mock_connector.py +109 -0
- app/backend/app/connectors/registry.py +35 -0
- app/backend/app/main.py +44 -0
- app/backend/app/models/__init__.py +0 -0
- app/backend/app/models/error_models.py +10 -0
- app/backend/app/models/import_models.py +22 -0
- app/backend/app/models/manifest_models.py +21 -0
- app/backend/app/models/normalized_item.py +37 -0
- app/backend/app/models/search_models.py +38 -0
- app/backend/app/models/source_models.py +28 -0
- app/backend/app/services/__init__.py +0 -0
- app/backend/app/services/import_service.py +61 -0
- app/backend/app/services/item_service.py +27 -0
- app/backend/app/services/manifest_resolver.py +30 -0
- app/backend/app/services/search_orchestrator.py +63 -0
- app/backend/app/services/source_service.py +29 -0
- app/backend/app/utils/__init__.py +0 -0
- app/backend/app/utils/errors.py +13 -0
- app/backend/app/utils/http_client.py +12 -0
- app/backend/app/utils/ids.py +18 -0
- app/backend/app/utils/url_validation.py +58 -0
- app/frontend/index.html +12 -0
- app/frontend/package.json +30 -0
- app/frontend/postcss.config.js +6 -0
- app/frontend/src/app/providers.tsx +17 -0
- app/frontend/src/app/router.tsx +45 -0
- app/frontend/src/components/results/ResultCard.tsx +34 -0
- app/frontend/src/components/search/ResultsGrid.tsx +34 -0
- app/frontend/src/components/search/SearchBar.tsx +29 -0
- app/frontend/src/components/search/SearchFilters.tsx +50 -0
.gitignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__pycache__/
|
| 2 |
+
*.pyc
|
| 3 |
+
.venv/
|
| 4 |
+
|
| 5 |
+
app/frontend/node_modules/
|
| 6 |
+
app/frontend/dist/
|
README.md
CHANGED
|
@@ -145,6 +145,34 @@ Le moteur fédéré ne doit jamais échouer globalement à cause d’un seul con
|
|
| 145 |
- `POST /api/resolve-manifest`
|
| 146 |
- `POST /api/import`
|
| 147 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 148 |
## Outils MCP prévus
|
| 149 |
|
| 150 |
- `search_items`
|
|
@@ -165,11 +193,10 @@ Le moteur fédéré ne doit jamais échouer globalement à cause d’un seul con
|
|
| 165 |
### Backend
|
| 166 |
|
| 167 |
```bash
|
| 168 |
-
cd app/backend
|
| 169 |
python -m venv .venv
|
| 170 |
source .venv/bin/activate
|
| 171 |
-
pip install -
|
| 172 |
-
uvicorn app.main:app --reload
|
| 173 |
```
|
| 174 |
|
| 175 |
### Frontend
|
|
@@ -180,6 +207,14 @@ npm install
|
|
| 180 |
npm run dev
|
| 181 |
```
|
| 182 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
## Variables d’environnement
|
| 184 |
|
| 185 |
Créer un fichier `.env` à partir de `.env.example`.
|
|
@@ -210,6 +245,38 @@ docker run -p 8000:8000 universal-iiif-portal
|
|
| 210 |
- Europeana
|
| 211 |
- connecteur générique `manifest-by-url`
|
| 212 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
## Principes de développement
|
| 214 |
|
| 215 |
- code modulaire ;
|
|
|
|
| 145 |
- `POST /api/resolve-manifest`
|
| 146 |
- `POST /api/import`
|
| 147 |
|
| 148 |
+
### Heuristiques MVP de `/api/import`
|
| 149 |
+
|
| 150 |
+
Le connecteur générique `manifest_by_url` applique des heuristiques minimales et explicites :
|
| 151 |
+
|
| 152 |
+
1. **Manifest direct** : l’URL est considérée comme manifest si son chemin contient `manifest`
|
| 153 |
+
(ou se termine par `manifest.json`).
|
| 154 |
+
2. **Notice -> manifest** : si l’URL ne ressemble pas à un manifest, le backend tente des suffixes
|
| 155 |
+
courants, dans cet ordre :
|
| 156 |
+
- `/manifest`
|
| 157 |
+
- `/manifest.json`
|
| 158 |
+
- `/iiif/manifest`
|
| 159 |
+
- `/iiif/manifest.json`
|
| 160 |
+
|
| 161 |
+
Ces heuristiques sont volontairement simples au MVP et seront enrichies par source aux lots
|
| 162 |
+
connecteurs réels.
|
| 163 |
+
|
| 164 |
+
### Sécurité MVP import URL (validation + SSRF basique)
|
| 165 |
+
|
| 166 |
+
`/api/import` applique une validation stricte avant résolution :
|
| 167 |
+
|
| 168 |
+
- schémas autorisés : `http`, `https` uniquement ;
|
| 169 |
+
- rejet explicite de `localhost`/hôtes locaux ;
|
| 170 |
+
- rejet des IP privées/loopback/link-local/réservées/unspecified ;
|
| 171 |
+
- rejet des hôtes DNS qui résolvent vers ces plages privées/locales.
|
| 172 |
+
|
| 173 |
+
Limite connue MVP : cette protection SSRF reste basique et devra être durcie (allowlist,
|
| 174 |
+
résolution DNS contrôlée, protections réseau infra) avant production.
|
| 175 |
+
|
| 176 |
## Outils MCP prévus
|
| 177 |
|
| 178 |
- `search_items`
|
|
|
|
| 193 |
### Backend
|
| 194 |
|
| 195 |
```bash
|
|
|
|
| 196 |
python -m venv .venv
|
| 197 |
source .venv/bin/activate
|
| 198 |
+
pip install -e '.[dev]'
|
| 199 |
+
uvicorn app.main:app --app-dir app/backend --reload
|
| 200 |
```
|
| 201 |
|
| 202 |
### Frontend
|
|
|
|
| 207 |
npm run dev
|
| 208 |
```
|
| 209 |
|
| 210 |
+
Par défaut, le frontend appelle `http://localhost:8000`.
|
| 211 |
+
|
| 212 |
+
Optionnel :
|
| 213 |
+
|
| 214 |
+
```bash
|
| 215 |
+
VITE_API_BASE_URL=http://localhost:8000 npm run dev
|
| 216 |
+
```
|
| 217 |
+
|
| 218 |
## Variables d’environnement
|
| 219 |
|
| 220 |
Créer un fichier `.env` à partir de `.env.example`.
|
|
|
|
| 245 |
- Europeana
|
| 246 |
- connecteur générique `manifest-by-url`
|
| 247 |
|
| 248 |
+
## Connecteur Gallica (lot 5)
|
| 249 |
+
|
| 250 |
+
### Hypothèses de mapping `NormalizedItem`
|
| 251 |
+
|
| 252 |
+
- `source_item_id` : ARK extrait des identifiants Gallica (`ark:/...`) ;
|
| 253 |
+
- `id` global : `gallica:{source_item_id}` ;
|
| 254 |
+
- `title` : premier champ `dc:title` disponible ;
|
| 255 |
+
- `creators` : liste des `dc:creator` ;
|
| 256 |
+
- `date_display` : premier `dc:date` ;
|
| 257 |
+
- `object_type` : dérivé de `dc:type` via mapping simple (`manuscript`, `book`, `map`, `image`, `newspaper`, `other`) ;
|
| 258 |
+
- `record_url` : premier `dc:identifier` ;
|
| 259 |
+
- `manifest_url` : construit depuis l’ARK (`https://gallica.bnf.fr/iiif/{ark}/manifest.json`) ;
|
| 260 |
+
- `institution` : `Bibliothèque nationale de France`.
|
| 261 |
+
|
| 262 |
+
### Stratégie de résolution de manifest
|
| 263 |
+
|
| 264 |
+
1. si `item.manifest_url` est déjà présent, il est renvoyé ;
|
| 265 |
+
2. sinon, extraction d’un ARK depuis l’URL fournie (`record_url`) ou, à défaut, depuis `item.record_url` ;
|
| 266 |
+
3. construction déterministe de l’URL IIIF manifest Gallica.
|
| 267 |
+
|
| 268 |
+
### Robustesse / mode fallback
|
| 269 |
+
|
| 270 |
+
- Le connecteur tente un mode live SRU Gallica ;
|
| 271 |
+
- pour éviter de casser la suite en environnement instable, un mode fixtures est disponible (`CLAFOUTIS_GALLICA_USE_FIXTURES=true`, valeur par défaut au MVP) ;
|
| 272 |
+
- en cas d’échec live, le connecteur renvoie un succès dégradé avec données fixtures et `partial_failures` explicite.
|
| 273 |
+
|
| 274 |
+
### Limites connues (MVP)
|
| 275 |
+
|
| 276 |
+
- le parsing SRU est volontairement minimal et basé sur un sous-ensemble Dublin Core ;
|
| 277 |
+
- certains champs Gallica restent absents/incertains selon les notices ;
|
| 278 |
+
- la détection fine des types documentaires sera améliorée aux lots suivants.
|
| 279 |
+
|
| 280 |
## Principes de développement
|
| 281 |
|
| 282 |
- code modulaire ;
|
app/backend/app/__init__.py
ADDED
|
File without changes
|
app/backend/app/api/__init__.py
ADDED
|
File without changes
|
app/backend/app/api/dependencies.py
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Dependency providers for API routes."""
|
| 2 |
+
|
| 3 |
+
from functools import lru_cache
|
| 4 |
+
|
| 5 |
+
from app.connectors.gallica import GallicaConnector
|
| 6 |
+
from app.connectors.manifest_by_url_connector import ManifestByUrlConnector
|
| 7 |
+
from app.connectors.mock_connector import MockConnector
|
| 8 |
+
from app.connectors.registry import ConnectorRegistry
|
| 9 |
+
from app.services.import_service import ImportService
|
| 10 |
+
from app.services.item_service import ItemService
|
| 11 |
+
from app.services.manifest_resolver import ManifestResolver
|
| 12 |
+
from app.services.search_orchestrator import SearchOrchestrator
|
| 13 |
+
from app.services.source_service import SourceService
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
@lru_cache(maxsize=1)
def get_registry() -> ConnectorRegistry:
    """Build the connector registry and memoize it.

    ``lru_cache`` stores the single result, so the connectors below are
    instantiated and registered only on the first call.
    """

    connector_registry = ConnectorRegistry()
    # Registration order: mock, Gallica, then the generic manifest-by-URL connector.
    for connector_cls in (MockConnector, GallicaConnector, ManifestByUrlConnector):
        connector_registry.register(connector_cls())
    return connector_registry
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def get_search_orchestrator() -> SearchOrchestrator:
    """Build a search orchestrator bound to the cached connector registry."""

    shared_registry = get_registry()
    return SearchOrchestrator(shared_registry)
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def get_source_service() -> SourceService:
    """Build a source service bound to the cached connector registry."""

    shared_registry = get_registry()
    return SourceService(shared_registry)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def get_item_service() -> ItemService:
    """Build an item service bound to the cached connector registry."""

    shared_registry = get_registry()
    return ItemService(shared_registry)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def get_manifest_resolver() -> ManifestResolver:
    """Build a manifest resolver bound to the cached connector registry."""

    shared_registry = get_registry()
    return ManifestResolver(shared_registry)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def get_import_service() -> ImportService:
    """Build an import service bound to the cached connector registry."""

    shared_registry = get_registry()
    return ImportService(shared_registry)
|
app/backend/app/api/health.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Health endpoint."""
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter
|
| 4 |
+
|
| 5 |
+
router = APIRouter(tags=["health"])


@router.get("/health")
async def health() -> dict[str, str]:
    """Liveness probe: always answers with a constant ``status: ok`` payload."""

    status_report: dict[str, str] = {}
    status_report["status"] = "ok"
    return status_report
|
app/backend/app/api/import_.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Import endpoint for notice or manifest URLs."""
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, Depends
|
| 4 |
+
|
| 5 |
+
from app.api.dependencies import get_import_service
|
| 6 |
+
from app.models.import_models import ImportRequest, ImportResponse
|
| 7 |
+
from app.services.import_service import ImportService
|
| 8 |
+
|
| 9 |
+
router = APIRouter(tags=["import"])


@router.post("/import", response_model=ImportResponse)
async def import_item(
    payload: ImportRequest,
    service: ImportService = Depends(get_import_service),
) -> ImportResponse:
    """Hand the submitted URL to the import service and return its response.

    Args:
        payload: Request body; only its ``url`` field is used here.
        service: Import service injected by FastAPI.
    """

    submitted_url = payload.url
    outcome = await service.import_url(submitted_url)
    return outcome
|
app/backend/app/api/items.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Item detail endpoint."""
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, Depends
|
| 4 |
+
|
| 5 |
+
from app.api.dependencies import get_item_service
|
| 6 |
+
from app.models.normalized_item import NormalizedItem
|
| 7 |
+
from app.services.item_service import ItemService
|
| 8 |
+
|
| 9 |
+
router = APIRouter(tags=["items"])


@router.get("/item/{global_id}", response_model=NormalizedItem)
async def get_item(
    global_id: str,
    service: ItemService = Depends(get_item_service),
) -> NormalizedItem:
    """Look up one normalized item by the global identifier taken from the path.

    Args:
        global_id: Identifier captured from the ``/item/{global_id}`` route.
        service: Item service injected by FastAPI.
    """

    found_item = await service.get_item(global_id)
    return found_item
|
app/backend/app/api/manifest.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Manifest resolution endpoint."""
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, Depends
|
| 4 |
+
|
| 5 |
+
from app.api.dependencies import get_manifest_resolver
|
| 6 |
+
from app.models.manifest_models import ResolveManifestRequest, ResolveManifestResponse
|
| 7 |
+
from app.services.manifest_resolver import ManifestResolver
|
| 8 |
+
|
| 9 |
+
router = APIRouter(tags=["manifest"])


@router.post("/resolve-manifest", response_model=ResolveManifestResponse)
async def resolve_manifest(
    payload: ResolveManifestRequest,
    resolver: ManifestResolver = Depends(get_manifest_resolver),
) -> ResolveManifestResponse:
    """Delegate manifest resolution for the posted request to the resolver service.

    Args:
        payload: Request body, forwarded unchanged to the resolver.
        resolver: Manifest resolver injected by FastAPI.
    """

    resolution = await resolver.resolve(payload)
    return resolution
|
app/backend/app/api/router.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Top-level API router."""
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter
|
| 4 |
+
|
| 5 |
+
from app.api import health, import_, items, manifest, search, sources
|
| 6 |
+
|
| 7 |
+
api_router = APIRouter(prefix="/api")

# Mount every endpoint module under /api, in this fixed order:
# health, sources, search, items, manifest, import.
for _endpoint_module in (health, sources, search, items, manifest, import_):
    api_router.include_router(_endpoint_module.router)
del _endpoint_module  # keep the module namespace free of the loop variable
|
app/backend/app/api/search.py
ADDED
|
@@ -0,0 +1,19 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Search endpoint."""
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, Depends
|
| 4 |
+
|
| 5 |
+
from app.api.dependencies import get_search_orchestrator
|
| 6 |
+
from app.models.search_models import SearchRequest, SearchResponse
|
| 7 |
+
from app.services.search_orchestrator import SearchOrchestrator
|
| 8 |
+
|
| 9 |
+
router = APIRouter(tags=["search"])


@router.post("/search", response_model=SearchResponse)
async def search_items(
    payload: SearchRequest,
    orchestrator: SearchOrchestrator = Depends(get_search_orchestrator),
) -> SearchResponse:
    """Forward the search request to the orchestrator and return its response.

    Args:
        payload: Request body, forwarded unchanged to the orchestrator.
        orchestrator: Search orchestrator injected by FastAPI.
    """

    search_response = await orchestrator.search(payload)
    return search_response
|
app/backend/app/api/sources.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Source listing endpoint."""
|
| 2 |
+
|
| 3 |
+
from fastapi import APIRouter, Depends
|
| 4 |
+
|
| 5 |
+
from app.api.dependencies import get_source_service
|
| 6 |
+
from app.models.source_models import SourcesResponse
|
| 7 |
+
from app.services.source_service import SourceService
|
| 8 |
+
|
| 9 |
+
router = APIRouter(tags=["sources"])


@router.get("/sources", response_model=SourcesResponse)
async def list_sources(
    service: SourceService = Depends(get_source_service),
) -> SourcesResponse:
    """Return whatever the source service reports for the registered sources.

    Args:
        service: Source service injected by FastAPI.
    """

    listing = await service.list_sources()
    return listing
|
app/backend/app/config/__init__.py
ADDED
|
File without changes
|
app/backend/app/config/settings.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Application settings loaded from environment variables."""
|
| 2 |
+
|
| 3 |
+
from pydantic import Field
|
| 4 |
+
from pydantic_settings import BaseSettings, SettingsConfigDict
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class Settings(BaseSettings):
    """Backend configuration, overridable through ``CLAFOUTIS_``-prefixed environment variables."""

    # Environment lookup uses the CLAFOUTIS_ prefix; extra inputs are ignored.
    model_config = SettingsConfigDict(env_prefix="CLAFOUTIS_", extra="ignore")

    # Application identity.
    app_name: str = "Clafoutis Backend"
    app_version: str = "0.1.0"
    debug: bool = False

    # Must be strictly positive (gt=0).
    request_timeout_seconds: float = Field(default=8.0, gt=0)

    # Origins allowed by CORS; the factory gives each instance its own list.
    cors_allow_origins: list[str] = Field(default_factory=lambda: ["http://localhost:5173"])

    # Gallica connector: SRU endpoint, and whether fixtures replace live calls.
    gallica_sru_base_url: str = "https://gallica.bnf.fr/SRU"
    gallica_use_fixtures: bool = True


# Module-level settings instance, created at import time.
settings = Settings()
|
app/backend/app/connectors/__init__.py
ADDED
|
File without changes
|
app/backend/app/connectors/base.py
ADDED
|
@@ -0,0 +1,47 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Abstract connector interface for all external sources."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from abc import ABC, abstractmethod
|
| 6 |
+
|
| 7 |
+
from app.models.normalized_item import NormalizedItem
|
| 8 |
+
from app.models.search_models import SearchResponse
|
| 9 |
+
from app.models.source_models import SourceCapabilities
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class BaseConnector(ABC):
    """Interface that every source connector has to implement.

    Subclasses provide the three descriptive attributes below and implement all
    abstract coroutines; the class cannot be instantiated otherwise.
    """

    # Descriptive attributes set by concrete connectors.
    name: str
    label: str
    source_type: str

    @abstractmethod
    async def search(
        self,
        query: str,
        filters: dict[str, object],
        page: int,
        page_size: int,
    ) -> SearchResponse:
        """Run a search against the source and return a ``SearchResponse``."""

    @abstractmethod
    async def get_item(self, source_item_id: str) -> NormalizedItem | None:
        """Fetch one item by its source-specific identifier, or ``None``."""

    @abstractmethod
    async def resolve_manifest(
        self,
        item: NormalizedItem | None = None,
        record_url: str | None = None,
    ) -> str | None:
        """Derive a IIIF manifest URL from an item or a record URL, or ``None``."""

    @abstractmethod
    async def healthcheck(self) -> dict[str, str]:
        """Return a small string-to-string status report for the connector."""

    @abstractmethod
    async def capabilities(self) -> SourceCapabilities:
        """Return the ``SourceCapabilities`` declared by the connector."""
|
app/backend/app/connectors/gallica/__init__.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gallica connector package."""
|
| 2 |
+
|
| 3 |
+
from .connector import GallicaConnector
|
| 4 |
+
|
| 5 |
+
__all__ = ["GallicaConnector"]
|
app/backend/app/connectors/gallica/connector.py
ADDED
|
@@ -0,0 +1,264 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Gallica connector implementation."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import time
|
| 6 |
+
import xml.etree.ElementTree as ET
|
| 7 |
+
from urllib.parse import quote_plus
|
| 8 |
+
|
| 9 |
+
from app.config.settings import settings
|
| 10 |
+
from app.connectors.base import BaseConnector
|
| 11 |
+
from app.connectors.gallica.fixtures import FIXTURE_GALLICA_RECORDS
|
| 12 |
+
from app.models.normalized_item import NormalizedItem
|
| 13 |
+
from app.models.search_models import PartialFailure, SearchResponse
|
| 14 |
+
from app.models.source_models import SourceCapabilities
|
| 15 |
+
from app.utils.http_client import build_async_client
|
| 16 |
+
from app.utils.ids import make_global_id
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
class GallicaConnector(BaseConnector):
|
| 20 |
+
"""Gallica/BnF connector with live SRU mode and deterministic fixture fallback."""
|
| 21 |
+
|
| 22 |
+
name = "gallica"
|
| 23 |
+
label = "Gallica / BnF"
|
| 24 |
+
source_type = "institution"
|
| 25 |
+
|
| 26 |
+
async def search(
    self,
    query: str,
    filters: dict[str, object],
    page: int,
    page_size: int,
) -> SearchResponse:
    """Search Gallica through SRU and map records to NormalizedItem.

    Live mode is tried first. On any exception (including the ``RuntimeError``
    raised by ``_fetch_search_records`` when fixtures mode is enabled) the
    method falls back to the static fixtures and reports a ``degraded`` status.

    Args:
        query: Free-text search string.
        filters: Accepted for interface compatibility; not applied here.
        page: 1-based page number.
        page_size: Maximum number of results per page.

    Returns:
        A ``SearchResponse`` holding at most one page of results.
    """

    start = time.perf_counter()
    try:
        records = await self._fetch_search_records(query=query, page=page, page_size=page_size)
        items = [self._map_record(record, index) for index, record in enumerate(records)]
        # The SRU request already carries startRecord/maximumRecords, so the
        # records ARE the requested page. Slicing them again (as the fixture
        # path must) would return an empty list for every page after the first.
        page_items = items
        partial = [PartialFailure(source=self.name, status="ok")]
    except Exception as exc:
        fixture_records = self._search_fixtures(query)
        items = [
            self._map_fixture_record(record, index)
            for index, record in enumerate(fixture_records)
        ]
        # Fixtures hold the full match list, so pagination happens locally.
        start_index = (page - 1) * page_size
        page_items = items[start_index : start_index + page_size]
        partial = [
            PartialFailure(
                source=self.name,
                status="degraded",
                error=f"live_gallica_unavailable: {exc}",
            )
        ]

    return SearchResponse(
        query=query,
        page=page,
        page_size=page_size,
        # NOTE(review): in live mode this is only the size of the fetched page;
        # the SRU total is not exposed by _fetch_search_records.
        total_estimated=len(items),
        results=page_items,
        sources_used=[self.name],
        partial_failures=partial,
        duration_ms=int((time.perf_counter() - start) * 1000),
    )
|
| 64 |
+
|
| 65 |
+
async def get_item(self, source_item_id: str) -> NormalizedItem | None:
    """Fetch a single Gallica item by ARK: live lookup first, fixtures second.

    Returns ``None`` when neither the live lookup nor the fixtures yield a
    record whose ``source_item_id`` matches.
    """

    try:
        live_records = await self._fetch_search_records(
            query=f'ark all "{source_item_id}"',
            page=1,
            page_size=1,
            raw_query=True,
        )
        if live_records:
            return self._map_record(live_records[0], 0)
    except Exception:
        # Deliberate best-effort: any live failure falls through to fixtures.
        pass

    matching_fixture = next(
        (
            candidate
            for candidate in FIXTURE_GALLICA_RECORDS
            if candidate["source_item_id"] == source_item_id
        ),
        None,
    )
    if matching_fixture is None:
        return None
    return self._map_fixture_record(matching_fixture, 0)
|
| 84 |
+
|
| 85 |
+
async def resolve_manifest(
    self,
    item: NormalizedItem | None = None,
    record_url: str | None = None,
) -> str | None:
    """Work out the Gallica IIIF manifest URL for an item or a record URL.

    Precedence: a manifest URL already present on ``item``; otherwise an ARK
    extracted from ``record_url``; otherwise an ARK extracted from
    ``item.record_url``. Returns ``None`` when no URL or no ARK is found.
    """

    known_manifest = item.manifest_url if item else None
    if known_manifest:
        return known_manifest

    # The explicit argument wins over the URL stored on the item.
    candidate_url = record_url
    if not candidate_url and item:
        candidate_url = item.record_url
    if not candidate_url:
        return None

    found_ark = self._extract_ark(candidate_url)
    return self._manifest_from_ark(found_ark) if found_ark else None
|
| 104 |
+
|
| 105 |
+
async def healthcheck(self) -> dict[str, str]:
    """Run a lightweight Gallica availability check.

    In fixtures mode no network call is made. In live mode a one-record SRU
    query is issued; a transport failure or an HTTP status >= 400 is reported
    as ``{"status": "error", "mode": "live"}`` rather than raised, so a health
    probe never fails with an exception.
    """

    if settings.gallica_use_fixtures:
        return {"status": "ok", "mode": "fixtures"}

    params_query = quote_plus('dc.title all "dante"')
    url = (
        f"{settings.gallica_sru_base_url}?version=1.2&operation=searchRetrieve"
        f"&query={params_query}&maximumRecords=1"
    )

    try:
        async with await build_async_client() as client:
            response = await client.get(url)
    except Exception:
        # Connection errors, timeouts, DNS failures: the source is unhealthy,
        # which is precisely what this method is meant to report.
        return {"status": "error", "mode": "live"}

    if response.status_code >= 400:
        return {"status": "error", "mode": "live"}
    return {"status": "ok", "mode": "live"}
|
| 122 |
+
|
| 123 |
+
async def capabilities(self) -> SourceCapabilities:
    """Declare that this connector supports search, item lookup and manifest resolution."""

    supported = {"search": True, "get_item": True, "resolve_manifest": True}
    return SourceCapabilities(**supported)
|
| 127 |
+
|
| 128 |
+
async def _fetch_search_records(
    self,
    query: str,
    page: int,
    page_size: int,
    raw_query: bool = False,
) -> list[ET.Element]:
    """Query the Gallica SRU endpoint and return the ``record`` elements.

    Args:
        query: Free-text term, or a complete CQL query when ``raw_query`` is set.
        page: 1-based page number, translated to the SRU ``startRecord``.
        page_size: Sent as the SRU ``maximumRecords``.
        raw_query: When true, ``query`` is sent as-is instead of being wrapped
            in ``dc.title all "..."``.

    Raises:
        RuntimeError: When fixtures mode is enabled in the settings.
        Exception: Whatever the HTTP client raises, including from
            ``raise_for_status()``; XML parse errors from ``ET.fromstring``.
    """

    if settings.gallica_use_fixtures:
        raise RuntimeError("fixtures mode enabled")

    if raw_query:
        sru_query = query
    else:
        # Inside a quoted CQL term, backslash and double quote must be escaped;
        # otherwise user input containing '"' ends the term early and produces
        # an invalid (or different) query.
        escaped_term = query.replace("\\", "\\\\").replace('"', '\\"')
        sru_query = f'dc.title all "{escaped_term}"'
    start_record = ((page - 1) * page_size) + 1
    encoded_query = quote_plus(sru_query)

    url = (
        f"{settings.gallica_sru_base_url}?version=1.2&operation=searchRetrieve"
        f"&query={encoded_query}&startRecord={start_record}&maximumRecords={page_size}"
    )

    async with await build_async_client() as client:
        response = await client.get(url)
        response.raise_for_status()

    root = ET.fromstring(response.text)
    # Compare the namespace-stripped local name exactly, so only <record>
    # elements match and not any tag that merely ends with "record".
    return [
        element
        for element in root.iter()
        if isinstance(element.tag, str) and element.tag.rsplit("}", 1)[-1] == "record"
    ]
|
| 153 |
+
|
| 154 |
+
def _map_record(self, record: ET.Element, index: int) -> NormalizedItem:
    """Convert one SRU record element into a ``NormalizedItem``.

    ``index`` is the record's position in the result list. It is used for the
    placeholder id when no ARK is found and for the descending relevance score.
    """

    dc = self._extract_dc_values(record)
    identifiers = dc.get("identifier", [])

    # Prefer the ARK as identifier; otherwise fall back to a positional placeholder.
    item_id = self._extract_ark(identifiers) or f"gallica-record-{index}"

    creators = dc.get("creator", [])
    raw_type = self._first(dc.get("type", []), default="other")
    record_url = self._first(identifiers, default=None)

    # A manifest URL can only be built from a real ARK, not from the placeholder id.
    if item_id.startswith("ark:/"):
        manifest_url = self._manifest_from_ark(item_id)
    else:
        manifest_url = None

    checks = (
        ("missing_creators", not creators),
        ("missing_record_url", record_url is None),
    )
    warnings: list[str] = [code for code, triggered in checks if triggered]

    return NormalizedItem(
        id=make_global_id(self.name, item_id),
        source=self.name,
        source_label=self.label,
        source_item_id=item_id,
        title=self._first(dc.get("title", []), default="Document Gallica"),
        creators=creators,
        date_display=self._first(dc.get("date", []), default=None),
        object_type=self._map_object_type(raw_type),
        institution="Bibliothèque nationale de France",
        thumbnail_url=None,
        record_url=record_url,
        manifest_url=manifest_url,
        has_iiif_manifest=manifest_url is not None,
        has_images=True,
        has_ocr=False,
        availability="public",
        relevance_score=max(0.0, 1.0 - (index * 0.01)),
        normalization_warnings=warnings,
    )
|
| 194 |
+
|
| 195 |
+
def _map_fixture_record(self, fixture: dict[str, object], index: int) -> NormalizedItem:
    """Convert one fixture dictionary into a ``NormalizedItem`` tagged ``fixture_mode``.

    ``index`` is the fixture's position in the result list and drives the
    descending relevance score.
    """

    def optional_text(key: str) -> str | None:
        # Missing or falsy values become None; anything else is stringified.
        raw = fixture.get(key)
        return str(raw) if raw else None

    item_id = str(fixture["source_item_id"])
    creator_names = [str(value) for value in fixture.get("creators", [])]

    return NormalizedItem(
        id=make_global_id(self.name, item_id),
        source=self.name,
        source_label=self.label,
        source_item_id=item_id,
        title=str(fixture["title"]),
        creators=creator_names,
        date_display=optional_text("date_display"),
        object_type=str(fixture.get("object_type", "other")),
        institution=optional_text("institution"),
        thumbnail_url=optional_text("thumbnail_url"),
        record_url=optional_text("record_url"),
        manifest_url=self._manifest_from_ark(item_id),
        has_iiif_manifest=True,
        has_images=True,
        has_ocr=False,
        availability="public",
        relevance_score=max(0.0, 1.0 - (index * 0.01)),
        normalization_warnings=["fixture_mode"],
    )
|
| 218 |
+
|
| 219 |
+
def _extract_dc_values(self, record: ET.Element) -> dict[str, list[str]]:
|
| 220 |
+
values: dict[str, list[str]] = {}
|
| 221 |
+
for node in record.iter():
|
| 222 |
+
if not node.tag.startswith("{"):
|
| 223 |
+
continue
|
| 224 |
+
local_name = node.tag.split("}", maxsplit=1)[1]
|
| 225 |
+
if local_name in {"title", "creator", "date", "identifier", "type"} and node.text:
|
| 226 |
+
values.setdefault(local_name, []).append(node.text.strip())
|
| 227 |
+
return values
|
| 228 |
+
|
| 229 |
+
def _extract_ark(self, identifiers: list[str] | str) -> str | None:
|
| 230 |
+
values = [identifiers] if isinstance(identifiers, str) else identifiers
|
| 231 |
+
for value in values:
|
| 232 |
+
if "ark:/" not in value:
|
| 233 |
+
continue
|
| 234 |
+
ark = value[value.index("ark:/") :]
|
| 235 |
+
return ark.split("?")[0].rstrip("/")
|
| 236 |
+
return None
|
| 237 |
+
|
| 238 |
+
def _manifest_from_ark(self, ark: str) -> str:
|
| 239 |
+
return f"https://gallica.bnf.fr/iiif/{ark}/manifest.json"
|
| 240 |
+
|
| 241 |
+
def _search_fixtures(self, query: str) -> list[dict[str, object]]:
    """Return fixture records whose title or creators contain the query.

    Matching is a case-insensitive substring test on the trimmed query. The
    creators are joined with single spaces before being searched.
    """
    needle = query.lower().strip()
    matches: list[dict[str, object]] = []
    for record in FIXTURE_GALLICA_RECORDS:
        if needle in str(record["title"]).lower():
            matches.append(record)
            continue
        creators_text = " ".join(record.get("creators", [])).lower()
        if needle in creators_text:
            matches.append(record)
    return matches
|
| 248 |
+
|
| 249 |
+
def _map_object_type(self, raw_type: str) -> str:
|
| 250 |
+
lowered = raw_type.lower()
|
| 251 |
+
if "manus" in lowered:
|
| 252 |
+
return "manuscript"
|
| 253 |
+
if "book" in lowered or "livre" in lowered:
|
| 254 |
+
return "book"
|
| 255 |
+
if "map" in lowered or "carte" in lowered:
|
| 256 |
+
return "map"
|
| 257 |
+
if "image" in lowered or "estampe" in lowered:
|
| 258 |
+
return "image"
|
| 259 |
+
if "journal" in lowered or "newspaper" in lowered:
|
| 260 |
+
return "newspaper"
|
| 261 |
+
return "other"
|
| 262 |
+
|
| 263 |
+
def _first(self, values: list[str], default: str | None) -> str | None:
|
| 264 |
+
return values[0] if values else default
|
app/backend/app/connectors/gallica/fixtures.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Static fixture records for Gallica connector fallback mode."""
|
| 2 |
+
|
| 3 |
+
# Two hand-written Gallica records used when the connector runs on fixtures.
# Every record carries the same keys, in the same order.
FIXTURE_GALLICA_RECORDS: list[dict[str, object]] = [
    dict(
        source_item_id="ark:/12148/btv1b55002481n",
        title="Livre d'heures à l'usage de Rome",
        creators=["Anonyme"],
        date_display="XVe siècle",
        object_type="manuscript",
        institution="Bibliothèque nationale de France",
        record_url="https://gallica.bnf.fr/ark:/12148/btv1b55002481n",
        thumbnail_url="https://gallica.bnf.fr/ark:/12148/btv1b55002481n.thumbnail",
    ),
    dict(
        source_item_id="ark:/12148/bpt6k1512248m",
        title="La Divine Comédie de Dante Alighieri",
        creators=["Dante Alighieri"],
        date_display="1898",
        object_type="book",
        institution="Bibliothèque nationale de France",
        record_url="https://gallica.bnf.fr/ark:/12148/bpt6k1512248m",
        thumbnail_url="https://gallica.bnf.fr/ark:/12148/bpt6k1512248m.thumbnail",
    ),
]
|
app/backend/app/connectors/manifest_by_url_connector.py
ADDED
|
@@ -0,0 +1,105 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Generic connector resolving IIIF manifests from arbitrary URLs."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
from urllib.parse import urlparse
|
| 6 |
+
|
| 7 |
+
from app.connectors.base import BaseConnector
|
| 8 |
+
from app.models.normalized_item import NormalizedItem
|
| 9 |
+
from app.models.search_models import PartialFailure, SearchResponse
|
| 10 |
+
from app.models.source_models import SourceCapabilities
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class ManifestByUrlConnector(BaseConnector):
    """Import-only connector that derives a manifest URL from an arbitrary URL.

    It performs no network access: all decisions are made by inspecting the
    URL text. Search and item lookup are declared unsupported.
    """

    name = "manifest_by_url"
    label = "Manifest by URL"
    source_type = "generic"

    async def search(
        self,
        query: str,
        filters: dict[str, object],
        page: int,
        page_size: int,
    ) -> SearchResponse:
        """Return an empty result page; this connector does not search.

        The request's query and paging values are echoed back unchanged and
        ``filters`` is ignored.
        """

        return SearchResponse(
            query=query,
            page=page,
            page_size=page_size,
            total_estimated=0,
            results=[],
            sources_used=[self.name],
            partial_failures=[PartialFailure(source=self.name, status="ok")],
            duration_ms=1,
        )

    async def get_item(self, source_item_id: str) -> NormalizedItem | None:
        """Always return ``None``; this connector has no item identifiers."""

        return None

    async def resolve_manifest(
        self,
        item: NormalizedItem | None = None,
        record_url: str | None = None,
    ) -> str | None:
        """Derive a manifest URL from ``record_url`` or, failing that, the item.

        Args:
            item: Optional item whose ``record_url`` is used when no explicit
                ``record_url`` is given.
            record_url: URL to analyse; takes precedence over ``item``.

        Returns:
            The input URL when it already looks like a manifest URL, otherwise
            the first generated candidate that does, or ``None`` when no URL
            is available.
        """

        candidate_url = record_url or (item.record_url if item else None)
        if not candidate_url:
            return None

        # Heuristic 1: the URL itself already points at a manifest.
        if self._looks_like_manifest_url(candidate_url):
            return candidate_url

        # Heuristic 2: append common manifest suffixes to the notice URL.
        # NOTE(review): every generated candidate contains "manifest" in its
        # path, so the first one is always accepted; nothing is fetched to
        # confirm that the manifest actually exists.
        for candidate in self._notice_to_manifest_candidates(candidate_url):
            if self._looks_like_manifest_url(candidate):
                return candidate

        return None

    async def healthcheck(self) -> dict[str, str]:
        """Report a static ``ok`` status; there is no remote dependency to probe."""

        return {"status": "ok"}

    async def capabilities(self) -> SourceCapabilities:
        """Declare that only manifest resolution is supported."""

        return SourceCapabilities(search=False, get_item=False, resolve_manifest=True)

    def _looks_like_manifest_url(self, url: str) -> bool:
        """Tell whether ``url`` textually resembles a manifest URL.

        True when the path contains ``manifest`` (case-insensitive), when the
        query contains ``iiif_manifest``, or when any query parameter is named
        ``manifest``.
        """
        parsed = urlparse(url)
        if "manifest" in parsed.path.lower():
            return True
        lowered_query = parsed.query.lower()
        if "iiif_manifest" in lowered_query:
            return True
        # Check every parameter, not only the first one in the query string.
        return any(param.startswith("manifest=") for param in lowered_query.split("&"))

    def _notice_to_manifest_candidates(self, url: str) -> list[str]:
        """Generate candidate manifest URLs by appending common suffixes.

        The query string and fragment of ``url`` are dropped and trailing
        slashes are removed from the path before each suffix is appended.
        """
        parsed = urlparse(url)
        clean_path = parsed.path.rstrip("/")

        suffixes = [
            "/manifest",
            "/manifest.json",
            "/iiif/manifest",
            "/iiif/manifest.json",
        ]

        # A fragment belongs to the notice page and must not be carried over.
        return [
            parsed._replace(path=f"{clean_path}{suffix}", query="", fragment="").geturl()
            for suffix in suffixes
        ]
|
app/backend/app/connectors/mock_connector.py
ADDED
|
@@ -0,0 +1,109 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Mock connector used by backend lot 1 to provide stable demo data."""
|
| 2 |
+
|
| 3 |
+
from app.connectors.base import BaseConnector
|
| 4 |
+
from app.models.normalized_item import NormalizedItem
|
| 5 |
+
from app.models.search_models import PartialFailure, SearchResponse
|
| 6 |
+
from app.models.source_models import SourceCapabilities
|
| 7 |
+
from app.utils.ids import make_global_id
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class MockConnector(BaseConnector):
    """In-memory connector serving two hard-coded demo items."""

    name = "mock"
    label = "Mock Heritage Source"
    source_type = "mock"

    def __init__(self) -> None:
        # "ms-1" has a manifest URL; "ms-2" deliberately has none.
        book_of_hours = NormalizedItem(
            id=make_global_id("mock", "ms-1"),
            source="mock",
            source_label=self.label,
            source_item_id="ms-1",
            title="Book of Hours (Mock)",
            creators=["Unknown"],
            institution="Mock Institution",
            object_type="manuscript",
            record_url="https://mock.example.org/items/ms-1",
            manifest_url="https://mock.example.org/iiif/ms-1/manifest",
            has_iiif_manifest=True,
            has_images=True,
            has_ocr=False,
            availability="public",
            relevance_score=0.9,
        )
        dante = NormalizedItem(
            id=make_global_id("mock", "ms-2"),
            source="mock",
            source_label=self.label,
            source_item_id="ms-2",
            title="Dante Manuscript (Mock)",
            creators=["Anonymous"],
            institution="Mock Institution",
            object_type="manuscript",
            record_url="https://mock.example.org/items/ms-2",
            manifest_url=None,
            has_iiif_manifest=False,
            has_images=True,
            has_ocr=True,
            availability="public",
            relevance_score=0.8,
        )
        self._items = {"ms-1": book_of_hours, "ms-2": dante}

    async def search(
        self,
        query: str,
        filters: dict[str, object],
        page: int,
        page_size: int,
    ) -> SearchResponse:
        """Return the requested page of items whose title contains the query.

        Matching is a case-insensitive substring test on the trimmed query;
        ``filters`` is ignored. ``total_estimated`` counts all matches, not
        only those on the returned page.
        """

        needle = query.lower().strip()
        matches = []
        for item in self._items.values():
            if needle in item.title.lower():
                matches.append(item)

        offset = (page - 1) * page_size
        return SearchResponse(
            query=query,
            page=page,
            page_size=page_size,
            total_estimated=len(matches),
            results=matches[offset : offset + page_size],
            sources_used=[self.name],
            partial_failures=[PartialFailure(source=self.name, status="ok")],
            duration_ms=1,
        )

    async def get_item(self, source_item_id: str) -> NormalizedItem | None:
        """Look up an item by its source identifier; ``None`` when unknown."""

        return self._items.get(source_item_id)

    async def resolve_manifest(
        self,
        item: NormalizedItem | None = None,
        record_url: str | None = None,
    ) -> str | None:
        """Return a manifest URL from the item, else from a matching record URL.

        The item's own ``manifest_url`` wins when present. Otherwise the first
        stored item whose ``record_url`` equals ``record_url`` supplies its
        ``manifest_url`` (which may itself be ``None``).
        """

        if item is not None and item.manifest_url:
            return item.manifest_url

        if not record_url:
            return None

        match = next(
            (known for known in self._items.values() if known.record_url == record_url),
            None,
        )
        return None if match is None else match.manifest_url

    async def healthcheck(self) -> dict[str, str]:
        """Report a static ``ok`` status."""

        return {"status": "ok"}

    async def capabilities(self) -> SourceCapabilities:
        """Declare search, item lookup and manifest resolution as supported."""

        return SourceCapabilities(search=True, get_item=True, resolve_manifest=True)
|
app/backend/app/connectors/registry.py
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Registry storing available connectors."""
|
| 2 |
+
|
| 3 |
+
from app.connectors.base import BaseConnector
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ConnectorRegistry:
    """Name-indexed, in-memory collection of connector instances."""

    def __init__(self) -> None:
        self._connectors: dict[str, BaseConnector] = {}

    def register(self, connector: BaseConnector) -> None:
        """Store ``connector`` under its ``name``, replacing any previous entry."""

        key = connector.name
        self._connectors[key] = connector

    def list_names(self) -> list[str]:
        """Return the registered connector names in sorted order."""

        return sorted(self._connectors)

    def get(self, name: str) -> BaseConnector:
        """Return the connector registered as ``name``.

        Raises:
            KeyError: If no connector is registered under ``name``.
        """

        return self._connectors[name]

    def has(self, name: str) -> bool:
        """Tell whether a connector is registered under ``name``."""

        return name in self._connectors

    def list_connectors(self) -> list[BaseConnector]:
        """Return the registered connectors, ordered by connector name."""

        return list(map(self._connectors.__getitem__, self.list_names()))
|
app/backend/app/main.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""FastAPI application entrypoint for backend lot 1."""
|
| 2 |
+
|
| 3 |
+
from fastapi import FastAPI, Request
|
| 4 |
+
from fastapi.middleware.cors import CORSMiddleware
|
| 5 |
+
from fastapi.responses import JSONResponse
|
| 6 |
+
|
| 7 |
+
from app.api.router import api_router
|
| 8 |
+
from app.config.settings import settings
|
| 9 |
+
from app.models.error_models import ErrorResponse
|
| 10 |
+
from app.utils.errors import AppError, BadRequestError, NotFoundError
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def create_app() -> FastAPI:
    """Build the FastAPI application with CORS, routes and error handlers.

    Application errors are rendered as ``ErrorResponse`` JSON payloads:
    ``BadRequestError`` as 400, ``NotFoundError`` as 404 and any other
    ``AppError`` as 500.
    """

    def _error_json(status_code: int, error: str, exc: Exception) -> JSONResponse:
        # Shared rendering for all three handlers below.
        body = ErrorResponse(error=error, details=str(exc)).model_dump()
        return JSONResponse(status_code=status_code, content=body)

    application = FastAPI(
        title=settings.app_name,
        version=settings.app_version,
        debug=settings.debug,
    )
    application.add_middleware(
        CORSMiddleware,
        allow_origins=settings.cors_allow_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    application.include_router(api_router)

    @application.exception_handler(BadRequestError)
    async def handle_bad_request(_: Request, exc: BadRequestError) -> JSONResponse:
        return _error_json(400, "bad_request", exc)

    @application.exception_handler(NotFoundError)
    async def handle_not_found(_: Request, exc: NotFoundError) -> JSONResponse:
        return _error_json(404, "not_found", exc)

    @application.exception_handler(AppError)
    async def handle_app_error(_: Request, exc: AppError) -> JSONResponse:
        return _error_json(500, "application_error", exc)

    return application


# Module-level application instance, built once at import time.
app = create_app()
|
app/backend/app/models/__init__.py
ADDED
|
File without changes
|
app/backend/app/models/error_models.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Error payload model for consistent API error responses."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ErrorResponse(BaseModel):
    """JSON body produced by the application's exception handlers."""

    # Short error code.
    error: str
    # Optional human-readable detail text.
    details: str | None = None
|
app/backend/app/models/import_models.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Models for import-by-URL endpoint."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 4 |
+
|
| 5 |
+
from app.models.normalized_item import NormalizedItem
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ImportRequest(BaseModel):
    """Request body for importing an item from a URL; unknown fields are rejected."""

    model_config = ConfigDict(extra="forbid")

    # Required, non-empty URL to import.
    url: str = Field(..., min_length=1)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ImportResponse(BaseModel):
    """Outcome of analysing an imported URL; every field is optional."""

    # Name of the connector that produced the manifest, if any.
    detected_source: str | None = None
    # URL that was analysed.
    record_url: str | None = None
    # Resolved manifest URL, if any.
    manifest_url: str | None = None
    # Normalized item matched during import, if any.
    item: NormalizedItem | None = None
|
app/backend/app/models/manifest_models.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Models for manifest resolution operations."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class ResolveManifestRequest(BaseModel):
    """Request body for manifest resolution; unknown fields are rejected."""

    model_config = ConfigDict(extra="forbid")

    # Required, non-empty connector name.
    source: str = Field(..., min_length=1)
    # Required, non-empty identifier of the item within that source.
    source_item_id: str = Field(..., min_length=1)
    # Optional record URL handed to the connector.
    record_url: str | None = None
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class ResolveManifestResponse(BaseModel):
    """Response body describing the result of a manifest resolution."""

    # Resolved manifest URL, or None when nothing was found.
    manifest_url: str | None = None
    # Required resolution status string.
    status: str
    # Optional label for how the manifest was obtained.
    method: str | None = None
|
app/backend/app/models/normalized_item.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Normalized item model shared by all connectors and APIs."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, ConfigDict, Field, model_validator
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class NormalizedItem(BaseModel):
    """Source-independent description of one item; unknown fields are rejected."""

    model_config = ConfigDict(extra="forbid")

    # Identity: `id` must equal "<source>:<source_item_id>" (validated below).
    id: str = Field(description="Global identifier in format source:source_item_id.")
    source: str
    source_label: str
    source_item_id: str

    # Descriptive metadata.
    title: str
    creators: list[str] = Field(default_factory=list)
    date_display: str | None = None
    object_type: str = "other"
    institution: str | None = None

    # Links.
    thumbnail_url: str | None = None
    record_url: str | None = None
    manifest_url: str | None = None

    # Capability flags and access state.
    has_iiif_manifest: bool = False
    has_images: bool = False
    has_ocr: bool = False
    availability: str = "unknown"

    # Ranking and normalization diagnostics.
    relevance_score: float = 0.0
    normalization_warnings: list[str] = Field(default_factory=list)

    @model_validator(mode="after")
    def validate_global_id(self) -> "NormalizedItem":
        """Reject instances whose `id` is not `source:source_item_id`."""

        if self.id == f"{self.source}:{self.source_item_id}":
            return self
        raise ValueError("id must match source:source_item_id")
|
app/backend/app/models/search_models.py
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Models for search requests and responses."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, ConfigDict, Field
|
| 4 |
+
|
| 5 |
+
from app.models.normalized_item import NormalizedItem
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class SearchRequest(BaseModel):
    """Request body for a federated search; unknown fields are rejected."""

    model_config = ConfigDict(extra="forbid")

    # Required, non-empty query text.
    query: str = Field(..., min_length=1)
    # Connector names to query; None leaves the choice to the caller's default.
    sources: list[str] | None = None
    # Free-form filters passed through to connectors.
    filters: dict[str, object] = Field(default_factory=dict)
    # 1-based page number.
    page: int = Field(1, ge=1)
    # Page size, between 1 and 100 inclusive.
    page_size: int = Field(24, ge=1, le=100)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
class PartialFailure(BaseModel):
    """Status entry for a single source within a search response."""

    # Connector name the entry refers to.
    source: str
    # Status string for that source.
    status: str
    # Optional error text.
    error: str | None = None
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class SearchResponse(BaseModel):
    """Search result envelope shared by connectors and the API."""

    # Echo of the request.
    query: str
    page: int
    page_size: int

    # Result payload.
    total_estimated: int
    results: list[NormalizedItem]

    # Per-source bookkeeping.
    sources_used: list[str]
    partial_failures: list[PartialFailure] = Field(default_factory=list)

    # Duration of the search in milliseconds.
    duration_ms: int
|
app/backend/app/models/source_models.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Models describing source capabilities and source listing responses."""
|
| 2 |
+
|
| 3 |
+
from pydantic import BaseModel, Field
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
class SourceCapabilities(BaseModel):
    """Operations a connector declares it supports; all default to True."""

    # Supports search.
    search: bool = True
    # Supports item lookup by source identifier.
    get_item: bool = True
    # Supports manifest resolution.
    resolve_manifest: bool = True
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class SourceDescriptor(BaseModel):
    """Description of one registered source as listed by the sources API."""

    # Connector identity.
    name: str
    label: str
    source_type: str

    # Declared capabilities and current health flag.
    capabilities: SourceCapabilities
    healthy: bool

    # Optional free-text notes.
    notes: str | None = None
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
class SourcesResponse(BaseModel):
    """Response body listing the registered sources."""

    # Source descriptors; empty list by default.
    sources: list[SourceDescriptor] = Field(default_factory=list)
|
app/backend/app/services/__init__.py
ADDED
|
File without changes
|
app/backend/app/services/import_service.py
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Service for URL import and initial source detection."""
|
| 2 |
+
|
| 3 |
+
from app.connectors.registry import ConnectorRegistry
|
| 4 |
+
from app.models.import_models import ImportResponse
|
| 5 |
+
from app.utils.url_validation import validate_http_url
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ImportService:
    """Turn a user-supplied URL into a manifest URL and, when known, an item."""

    def __init__(self, registry: ConnectorRegistry) -> None:
        self._registry = registry

    async def import_url(self, url: str) -> ImportResponse:
        """Validate ``url`` and ask each connector in turn for a manifest.

        Connectors are tried in registry order, except that the generic
        ``manifest_by_url`` connector is always tried last. The first
        connector that returns a manifest wins. When none does, the response
        carries only the validated record URL.
        """

        safe_url = validate_http_url(url)

        # Split in one pass so the generic connector can be appended at the end.
        specific = []
        generic = []
        for connector in self._registry.list_connectors():
            bucket = generic if connector.name == "manifest_by_url" else specific
            bucket.append(connector)

        for connector in specific + generic:
            matched_item = await self._match_mock_item(connector, safe_url)
            manifest = await connector.resolve_manifest(item=matched_item, record_url=safe_url)
            if not manifest:
                continue
            return ImportResponse(
                detected_source=connector.name,
                record_url=safe_url,
                manifest_url=manifest,
                item=matched_item,
            )

        return ImportResponse(
            detected_source=None,
            record_url=safe_url,
            manifest_url=None,
            item=None,
        )

    async def _match_mock_item(self, connector, safe_url):
        """Return the mock item whose record URL equals ``safe_url``, if any.

        Only the connector named ``mock`` is probed, and only for its two
        demo identifiers; every other connector yields ``None``.
        """
        if connector.name != "mock":
            return None
        # Lot 4 keeps source-specific matching minimal for mock demo data.
        for candidate_id in ("ms-1", "ms-2"):
            candidate = await connector.get_item(candidate_id)
            if candidate is not None and candidate.record_url == safe_url:
                return candidate
        return None
|
app/backend/app/services/item_service.py
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Service for item retrieval by global identifier."""
|
| 2 |
+
|
| 3 |
+
from app.connectors.registry import ConnectorRegistry
|
| 4 |
+
from app.models.normalized_item import NormalizedItem
|
| 5 |
+
from app.utils.errors import BadRequestError, NotFoundError
|
| 6 |
+
from app.utils.ids import split_global_id
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ItemService:
    """Look up items by their global id of the form ``source:source_item_id``."""

    def __init__(self, registry: ConnectorRegistry) -> None:
        self._registry = registry

    async def get_item(self, global_id: str) -> NormalizedItem:
        """Return the item designated by ``global_id``.

        Raises:
            BadRequestError: If ``global_id`` cannot be split into its parts.
            NotFoundError: If the source is not registered or the connector
                has no such item.
        """

        try:
            source, source_item_id = split_global_id(global_id)
        except ValueError:
            raise BadRequestError("Invalid id format, expected source:source_item_id")

        if not self._registry.has(source):
            raise NotFoundError(f"Unknown source '{source}'")

        connector = self._registry.get(source)
        found = await connector.get_item(source_item_id)
        if found is None:
            raise NotFoundError(f"Item '{global_id}' not found")
        return found
|
app/backend/app/services/manifest_resolver.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Service responsible for manifest resolution through connectors."""
|
| 2 |
+
|
| 3 |
+
from app.connectors.registry import ConnectorRegistry
|
| 4 |
+
from app.models.manifest_models import ResolveManifestRequest, ResolveManifestResponse
|
| 5 |
+
from app.utils.errors import NotFoundError
|
| 6 |
+
from app.utils.ids import make_global_id
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
class ManifestResolver:
    """Resolve manifest URLs by delegating to the requested source's connector."""

    def __init__(self, registry: ConnectorRegistry) -> None:
        self._registry = registry

    async def resolve(self, request: ResolveManifestRequest) -> ResolveManifestResponse:
        """Resolve the manifest for the item named in ``request``.

        The connector first looks the item up, then resolves a manifest from
        that item and the optional record URL.

        Raises:
            NotFoundError: If ``request.source`` is not a registered connector.
        """

        source = request.source
        if not self._registry.has(source):
            raise NotFoundError(f"Unknown source '{source}'")

        connector = self._registry.get(source)
        item = await connector.get_item(request.source_item_id)
        manifest_url = await connector.resolve_manifest(item=item, record_url=request.record_url)

        if manifest_url:
            status, method = "resolved", "metadata"
        else:
            status, method = "not_found", None
        return ResolveManifestResponse(manifest_url=manifest_url, status=status, method=method)

    async def openable_global_id(self, source: str, source_item_id: str) -> str:
        """Return the global id built from ``source`` and ``source_item_id``."""

        return make_global_id(source, source_item_id)
|
app/backend/app/services/search_orchestrator.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Federated search orchestration over registered connectors."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import asyncio
|
| 6 |
+
import time
|
| 7 |
+
|
| 8 |
+
from app.connectors.registry import ConnectorRegistry
|
| 9 |
+
from app.models.search_models import PartialFailure, SearchRequest, SearchResponse
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class SearchOrchestrator:
    """Run one search across several connectors and merge the outcomes."""

    def __init__(self, registry: ConnectorRegistry) -> None:
        self._registry = registry

    async def search(self, request: SearchRequest) -> SearchResponse:
        """Query the selected connectors concurrently and merge their results.

        When ``request.sources`` is empty or ``None`` every registered
        connector is queried. A connector that fails does not fail the whole
        search: it is reported in ``partial_failures`` with status ``error``.
        Merged results are sorted by descending ``relevance_score``.

        Returns:
            A response whose ``total_estimated`` is the sum of the totals
            reported by the sources that answered, and whose ``duration_ms``
            covers the concurrent fan-out.
        """

        selected = request.sources or self._registry.list_names()
        tasks = [self._search_one(source, request) for source in selected]
        start = time.perf_counter()
        gathered = await asyncio.gather(*tasks, return_exceptions=True)
        duration_ms = int((time.perf_counter() - start) * 1000)

        merged_results = []
        partial_failures: list[PartialFailure] = []
        sources_used: list[str] = []
        total_estimated = 0

        for source_name, outcome in zip(selected, gathered, strict=True):
            # gather(return_exceptions=True) may hand back BaseException
            # instances (e.g. CancelledError), not only Exception subclasses.
            if isinstance(outcome, BaseException):
                # Some exceptions stringify to ""; fall back to the class name.
                message = str(outcome) or type(outcome).__name__
                partial_failures.append(
                    PartialFailure(source=source_name, status="error", error=message)
                )
                continue
            sources_used.append(source_name)
            merged_results.extend(outcome.results)
            partial_failures.extend(outcome.partial_failures)
            # Each source returns one page only; its own total counts all hits.
            total_estimated += outcome.total_estimated

        merged_results.sort(key=lambda item: item.relevance_score, reverse=True)

        return SearchResponse(
            query=request.query,
            page=request.page,
            page_size=request.page_size,
            total_estimated=total_estimated,
            results=merged_results,
            sources_used=sources_used,
            partial_failures=partial_failures,
            duration_ms=duration_ms,
        )

    async def _search_one(self, source: str, request: SearchRequest) -> SearchResponse:
        """Run the search on a single connector.

        Raises:
            ValueError: If ``source`` is not a registered connector name.
        """
        if not self._registry.has(source):
            raise ValueError(f"Unknown source '{source}'")
        connector = self._registry.get(source)
        return await connector.search(
            query=request.query,
            filters=request.filters,
            page=request.page,
            page_size=request.page_size,
        )
|
app/backend/app/services/source_service.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Service exposing source capabilities and health state."""
|
| 2 |
+
|
| 3 |
+
from app.connectors.registry import ConnectorRegistry
|
| 4 |
+
from app.models.source_models import SourceDescriptor, SourcesResponse
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class SourceService:
    """Provide data for `/api/sources` endpoint."""

    def __init__(self, registry: ConnectorRegistry) -> None:
        """Store the connector registry whose connectors are described."""
        self._registry = registry

    async def list_sources(self) -> SourcesResponse:
        """Return registered sources with capabilities and health flags.

        A connector whose health check raises is listed with
        ``healthy=False`` rather than failing the whole listing.
        """
        import logging

        sources: list[SourceDescriptor] = []
        for connector in self._registry.list_connectors():
            capabilities = await connector.capabilities()
            try:
                health = await connector.healthcheck()
                healthy = health.get("status") == "ok"
            except Exception:  # a failing probe means "unhealthy", not "endpoint down"
                logging.getLogger(__name__).warning(
                    "Health check failed for source %s", connector.name, exc_info=True
                )
                healthy = False
            sources.append(
                SourceDescriptor(
                    name=connector.name,
                    label=connector.label,
                    source_type=connector.source_type,
                    capabilities=capabilities,
                    healthy=healthy,
                )
            )
        return SourcesResponse(sources=sources)
|
app/backend/app/utils/__init__.py
ADDED
|
File without changes
|
app/backend/app/utils/errors.py
ADDED
|
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Domain exceptions used across backend services and API layers."""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class AppError(Exception):
    """Root of the application's exception hierarchy.

    Catch this type to handle any domain error raised by the backend.
    """
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class NotFoundError(AppError):
    """Signals that a requested entity does not exist."""
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class BadRequestError(AppError):
    """Signals that user-supplied input is rejected by business rules."""
|
app/backend/app/utils/http_client.py
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Async HTTP client utilities for connector network calls."""
|
| 2 |
+
|
| 3 |
+
import httpx
|
| 4 |
+
|
| 5 |
+
from app.config.settings import settings
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
async def build_async_client() -> httpx.AsyncClient:
    """Build an ``httpx.AsyncClient`` with the project's default settings.

    The timeout comes from ``settings.request_timeout_seconds`` and redirects
    are followed. The client is returned open; closing it is left to the
    caller.
    """
    # NOTE(review): with follow_redirects=True a validated URL may redirect to
    # a host that was never validated — confirm callers account for this.
    return httpx.AsyncClient(
        timeout=httpx.Timeout(settings.request_timeout_seconds),
        follow_redirects=True,
    )
|
app/backend/app/utils/ids.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helpers to create and parse stable global identifiers."""
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
def make_global_id(source: str, source_item_id: str) -> str:
    """Join a source name and its item id into ``source:source_item_id``."""
    return "{}:{}".format(source, source_item_id)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def split_global_id(global_id: str) -> tuple[str, str]:
    """Break a global identifier into ``(source, source_item_id)``.

    Only the first ``:`` separates the two parts, so the item id may itself
    contain colons.

    Raises:
        ValueError: If there is no ``:`` separator, or either side is empty.
    """
    head, separator, tail = global_id.partition(":")
    if not separator:
        raise ValueError("global id must include ':' separator")
    if not (head and tail):
        raise ValueError("global id must contain source and source_item_id")
    return head, tail
|
app/backend/app/utils/url_validation.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""URL validation helpers for import and manifest resolution endpoints."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import ipaddress
|
| 6 |
+
import socket
|
| 7 |
+
from urllib.parse import urlparse
|
| 8 |
+
|
| 9 |
+
from app.utils.errors import BadRequestError
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def validate_http_url(url: str) -> str:
    """Validate URL scheme/host and apply basic SSRF protections for MVP.

    Args:
        url: The user-supplied URL.

    Returns:
        The same ``url`` string, unchanged, once every check has passed.

    Raises:
        BadRequestError: If the URL is malformed, is not http(s), has no host,
            or points (literally or after DNS resolution) to a local or
            private address.
    """
    try:
        parsed = urlparse(url)
        hostname = parsed.hostname
    except ValueError as exc:
        # urlparse raises a bare ValueError for malformed bracketed hosts such
        # as "http://[::1"; report it as a client error like the other checks.
        raise BadRequestError("URL must contain a valid host") from exc

    if parsed.scheme not in {"http", "https"}:
        raise BadRequestError("URL scheme must be http or https")

    if not parsed.netloc or hostname is None:
        raise BadRequestError("URL must contain a valid host")

    _reject_local_hostnames(hostname)
    _reject_private_or_local_ip_literals(hostname)
    _reject_hostnames_resolving_to_private_or_local_ips(hostname)

    return url
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _reject_local_hostnames(hostname: str) -> None:
|
| 30 |
+
lowered = hostname.lower()
|
| 31 |
+
blocked = {"localhost", "localhost.localdomain"}
|
| 32 |
+
if lowered in blocked:
|
| 33 |
+
raise BadRequestError("Local hosts are not allowed")
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def _reject_private_or_local_ip_literals(hostname: str) -> None:
|
| 37 |
+
try:
|
| 38 |
+
ip = ipaddress.ip_address(hostname)
|
| 39 |
+
except ValueError:
|
| 40 |
+
return
|
| 41 |
+
|
| 42 |
+
if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_unspecified:
|
| 43 |
+
raise BadRequestError("Private or local IPs are not allowed")
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
def _reject_hostnames_resolving_to_private_or_local_ips(hostname: str) -> None:
|
| 47 |
+
try:
|
| 48 |
+
addrinfos = socket.getaddrinfo(hostname, None)
|
| 49 |
+
except socket.gaierror:
|
| 50 |
+
# If hostname cannot be resolved, keep request valid and let downstream
|
| 51 |
+
# connector/network errors explain the failure.
|
| 52 |
+
return
|
| 53 |
+
|
| 54 |
+
for info in addrinfos:
|
| 55 |
+
raw_ip = info[4][0]
|
| 56 |
+
ip = ipaddress.ip_address(raw_ip)
|
| 57 |
+
if ip.is_private or ip.is_loopback or ip.is_link_local or ip.is_reserved or ip.is_unspecified:
|
| 58 |
+
raise BadRequestError("Resolved host points to a private or local IP")
|
app/frontend/index.html
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!doctype html>
<!-- The interface copy is French, so the document language is declared as
     "fr" for screen readers, hyphenation and translation tools. -->
<html lang="fr">
  <head>
    <meta charset="UTF-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" />
    <title>Clafoutis</title>
  </head>
  <body class="bg-slate-50 text-slate-900">
    <!-- Mount point for the React application. -->
    <div id="root"></div>
    <script type="module" src="/src/main.tsx"></script>
  </body>
</html>
|
app/frontend/package.json
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "clafoutis-frontend",
|
| 3 |
+
"version": "0.1.0",
|
| 4 |
+
"private": true,
|
| 5 |
+
"type": "module",
|
| 6 |
+
"scripts": {
|
| 7 |
+
"dev": "vite",
|
| 8 |
+
"build": "tsc -b && vite build",
|
| 9 |
+
"preview": "vite preview",
|
| 10 |
+
"typecheck": "tsc --noEmit"
|
| 11 |
+
},
|
| 12 |
+
"dependencies": {
|
| 13 |
+
"@tanstack/react-query": "^5.59.16",
|
| 14 |
+
"mirador": "^3.3.0",
|
| 15 |
+
"react": "^18.3.1",
|
| 16 |
+
"react-dom": "^18.3.1",
|
| 17 |
+
"react-router-dom": "^6.28.0",
|
| 18 |
+
"zustand": "^5.0.1"
|
| 19 |
+
},
|
| 20 |
+
"devDependencies": {
|
| 21 |
+
"@types/react": "^18.3.12",
|
| 22 |
+
"@types/react-dom": "^18.3.1",
|
| 23 |
+
"@vitejs/plugin-react": "^4.3.3",
|
| 24 |
+
"autoprefixer": "^10.4.20",
|
| 25 |
+
"postcss": "^8.4.49",
|
| 26 |
+
"tailwindcss": "^3.4.15",
|
| 27 |
+
"typescript": "^5.6.3",
|
| 28 |
+
"vite": "^5.4.10"
|
| 29 |
+
}
|
| 30 |
+
}
|
app/frontend/postcss.config.js
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// PostCSS pipeline: Tailwind runs first, then Autoprefixer adds vendor prefixes.
const postcssConfig = {
  plugins: {
    tailwindcss: {},
    autoprefixer: {},
  },
}

export default postcssConfig
|
app/frontend/src/app/providers.tsx
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { QueryClientProvider } from '@tanstack/react-query'
|
| 2 |
+
import { ReactNode } from 'react'
|
| 3 |
+
import { BrowserRouter } from 'react-router-dom'
|
| 4 |
+
|
| 5 |
+
import { queryClient } from '../lib/queryClient'
|
| 6 |
+
|
| 7 |
+
/** Props accepted by {@link Providers}. */
type ProvidersProps = {
  /** Application tree that needs query and routing context. */
  children: ReactNode
}

/**
 * Wraps the application in the shared React Query client and a browser router.
 */
export function Providers(props: ProvidersProps) {
  const routedTree = <BrowserRouter>{props.children}</BrowserRouter>

  return <QueryClientProvider client={queryClient}>{routedTree}</QueryClientProvider>
}
|
app/frontend/src/app/router.tsx
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { Link, Navigate, Route, Routes } from 'react-router-dom'
|
| 2 |
+
|
| 3 |
+
import { AboutPage } from '../pages/AboutPage'
|
| 4 |
+
import { ImportPage } from '../pages/ImportPage'
|
| 5 |
+
import { ReaderPage } from '../pages/ReaderPage'
|
| 6 |
+
import { SearchPage } from '../pages/SearchPage'
|
| 7 |
+
import { SourcesPage } from '../pages/SourcesPage'
|
| 8 |
+
|
| 9 |
+
/** Primary navigation entries, in display order. */
const NAV_LINKS = [
  { to: '/search', label: 'Recherche' },
  { to: '/reader', label: 'Lecture' },
  { to: '/import', label: 'Import' },
  { to: '/sources', label: 'Sources' },
  { to: '/about', label: 'À propos' },
] as const

/**
 * Page shell: top navigation bar plus the route table.
 *
 * `/` and any unknown path redirect to `/search`, so a mistyped or stale URL
 * never renders an empty page.
 */
function AppLayout() {
  return (
    <div className="mx-auto min-h-screen max-w-7xl p-4">
      <header className="mb-6 rounded-md border border-slate-200 bg-white p-3">
        <nav className="flex flex-wrap gap-3 text-sm">
          {NAV_LINKS.map(({ to, label }) => (
            <Link key={to} className="rounded bg-slate-100 px-2 py-1 hover:bg-slate-200" to={to}>
              {label}
            </Link>
          ))}
        </nav>
      </header>
      <Routes>
        <Route element={<Navigate replace to="/search" />} path="/" />
        <Route element={<SearchPage />} path="/search" />
        <Route element={<ReaderPage />} path="/reader" />
        <Route element={<ImportPage />} path="/import" />
        <Route element={<SourcesPage />} path="/sources" />
        <Route element={<AboutPage />} path="/about" />
        {/* Catch-all: send unmatched locations back to the search page. */}
        <Route element={<Navigate replace to="/search" />} path="*" />
      </Routes>
    </div>
  )
}
|
| 42 |
+
|
| 43 |
+
/** Public entry point of the routing layer; renders the application layout. */
export function AppRouter() {
  const layout = <AppLayout />
  return layout
}
|
app/frontend/src/components/results/ResultCard.tsx
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import type { NormalizedItem } from '../../types/normalized'
|
| 2 |
+
|
| 3 |
+
interface ResultCardProps {
  /** Item rendered by the card. */
  item: NormalizedItem
  /** Whether the item is currently part of the comparison selection. */
  selected: boolean
  /** Called with the item id when the compare button is clicked. */
  onToggleCompare: (itemId: string) => void
  /** Called with the item when the read button is clicked. */
  onPrepareRead: (item: NormalizedItem) => void
}

/**
 * Card summarising one search result, with "read" and "compare" actions.
 *
 * The compare button is a toggle, so its state is exposed through
 * `aria-pressed` in addition to the label change.
 */
export function ResultCard({ item, selected, onToggleCompare, onPrepareRead }: ResultCardProps) {
  return (
    <article className="rounded-md border border-slate-200 bg-white p-4 shadow-sm">
      <h3 className="mb-1 line-clamp-2 text-sm font-semibold">{item.title}</h3>
      <p className="text-xs text-slate-600">{item.source_label}</p>
      <p className="text-xs text-slate-600">{item.institution ?? 'Institution inconnue'}</p>
      <div className="mt-3 flex flex-wrap gap-2">
        <button
          className="rounded bg-slate-900 px-2 py-1 text-xs text-white"
          type="button"
          onClick={() => onPrepareRead(item)}
        >
          Préparer lecture
        </button>
        <button
          aria-pressed={selected}
          className="rounded border border-slate-300 px-2 py-1 text-xs"
          type="button"
          onClick={() => onToggleCompare(item.id)}
        >
          {selected ? 'Retirer comparaison' : 'Comparer'}
        </button>
      </div>
    </article>
  )
}
|
app/frontend/src/components/search/ResultsGrid.tsx
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { ResultCard } from '../results/ResultCard'
|
| 2 |
+
import type { NormalizedItem } from '../../types/normalized'
|
| 3 |
+
|
| 4 |
+
interface ResultsGridProps {
  /** Items to display, in display order. */
  items: NormalizedItem[]
  /** Ids of the items currently selected for comparison. */
  selectedForComparison: string[]
  /** Forwarded to every card's compare button. */
  onToggleCompare: (itemId: string) => void
  /** Forwarded to every card's read button. */
  onPrepareRead: (item: NormalizedItem) => void
}

/**
 * Responsive grid of result cards, or an empty-state message when there is
 * nothing to show.
 */
export function ResultsGrid({
  items,
  selectedForComparison,
  onToggleCompare,
  onPrepareRead,
}: ResultsGridProps) {
  if (items.length === 0) {
    return <p className="rounded-md border border-dashed border-slate-300 p-6 text-sm">Aucun résultat.</p>
  }

  // Build the lookup once per render instead of scanning the array per card.
  const selectedIds = new Set(selectedForComparison)

  return (
    <section className="grid gap-3 sm:grid-cols-2 lg:grid-cols-3">
      {items.map((item) => (
        <ResultCard
          key={item.id}
          item={item}
          selected={selectedIds.has(item.id)}
          onPrepareRead={onPrepareRead}
          onToggleCompare={onToggleCompare}
        />
      ))}
    </section>
  )
}
|
app/frontend/src/components/search/SearchBar.tsx
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { FormEvent, useState } from 'react'
|
| 2 |
+
|
| 3 |
+
interface SearchBarProps {
  /** Text shown in the input on first render. */
  initialQuery: string
  /** Called with the trimmed query when the form is submitted. */
  onSubmit: (query: string) => void
}

/**
 * Search form made of a single text input and a submit button.
 *
 * The input keeps its own state, seeded once from `initialQuery`.
 */
export function SearchBar({ initialQuery, onSubmit }: SearchBarProps) {
  const [query, setQuery] = useState(initialQuery)

  const handleSubmit = (event: FormEvent<HTMLFormElement>) => {
    // Keep the browser from reloading the page on submit.
    event.preventDefault()
    onSubmit(query.trim())
  }

  return (
    <form className="flex gap-2" onSubmit={handleSubmit}>
      <input
        // A placeholder is not an accessible name; give the field a real one.
        aria-label="Rechercher un objet patrimonial"
        className="w-full rounded-md border border-slate-300 bg-white px-3 py-2"
        placeholder="Rechercher un objet patrimonial"
        value={query}
        onChange={(event) => setQuery(event.target.value)}
      />
      <button className="rounded-md bg-slate-900 px-4 py-2 text-white" type="submit">
        Rechercher
      </button>
    </form>
  )
}
|
app/frontend/src/components/search/SearchFilters.tsx
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import { useSources } from '../../hooks/useSources'
|
| 2 |
+
import type { SearchFilters as SearchFiltersType } from '../../types/filters'
|
| 3 |
+
|
| 4 |
+
interface SearchFiltersProps {
  /** Current filter selection (controlled). */
  value: SearchFiltersType
  /** Receives a new filters object each time a checkbox changes. */
  onChange: (filters: SearchFiltersType) => void
}

/**
 * Sidebar with one checkbox per source returned by `useSources`, plus an
 * "IIIF only" toggle. Renders no source checkbox while the list is unavailable.
 */
export function SearchFilters({ value, onChange }: SearchFiltersProps) {
  const sourcesQuery = useSources()
  const availableSources = sourcesQuery.data?.sources ?? []

  const sourceCheckboxes = availableSources.map((source) => {
    const isSelected = value.sources.includes(source.name)

    // Add the source when ticked, remove it when unticked.
    const setSelected = (enabled: boolean) => {
      const nextSources = enabled
        ? [...value.sources, source.name]
        : value.sources.filter((candidate) => candidate !== source.name)
      onChange({ ...value, sources: nextSources })
    }

    return (
      <label key={source.name} className="flex items-center gap-2 text-sm">
        <input
          checked={isSelected}
          type="checkbox"
          onChange={(event) => setSelected(event.target.checked)}
        />
        {source.label}
      </label>
    )
  })

  return (
    <aside className="space-y-4 rounded-md border border-slate-200 bg-white p-4">
      <h2 className="text-sm font-semibold uppercase tracking-wide text-slate-600">Filtres</h2>

      <div>
        <p className="mb-2 text-sm font-medium">Sources</p>
        <div className="space-y-2">{sourceCheckboxes}</div>
      </div>

      <label className="flex items-center gap-2 text-sm">
        <input
          checked={value.hasIiifOnly}
          type="checkbox"
          onChange={(event) => onChange({ ...value, hasIiifOnly: event.target.checked })}
        />
        IIIF uniquement
      </label>
    </aside>
  )
}
|