{item.title}
+{item.source_label}
+{item.institution ?? 'Institution inconnue'}
+diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..d05b0f0a9e02d3688e9610caa73d2a29b4f88ae9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +__pycache__/ +*.pyc +.venv/ + +app/frontend/node_modules/ +app/frontend/dist/ diff --git a/README.md b/README.md index 5c6f48ab46db306045b9e81d006949683a96c5a5..eefd0f62cedbe0861887f94591ebb1ea82592188 100644 --- a/README.md +++ b/README.md @@ -145,6 +145,34 @@ Le moteur fédéré ne doit jamais échouer globalement à cause d’un seul con - `POST /api/resolve-manifest` - `POST /api/import` +### Heuristiques MVP de `/api/import` + +Le connecteur générique `manifest_by_url` applique des heuristiques minimales et explicites : + +1. **Manifest direct** : l’URL est considérée comme manifest si son chemin contient `manifest` + (ou se termine par `manifest.json`). +2. **Notice -> manifest** : si l’URL ne ressemble pas à un manifest, le backend tente des suffixes + courants, dans cet ordre : + - `/manifest` + - `/manifest.json` + - `/iiif/manifest` + - `/iiif/manifest.json` + +Ces heuristiques sont volontairement simples au MVP et seront enrichies par source aux lots +connecteurs réels. + +### Sécurité MVP import URL (validation + SSRF basique) + +`/api/import` applique une validation stricte avant résolution : + +- schémas autorisés : `http`, `https` uniquement ; +- rejet explicite de `localhost`/hôtes locaux ; +- rejet des IP privées/loopback/link-local/réservées/unspecified ; +- rejet des hôtes DNS qui résolvent vers ces plages privées/locales. + +Limite connue MVP : cette protection SSRF reste basique et devra être durcie (allowlist, +résolution DNS contrôlée, protections réseau infra) avant production. 
+ ## Outils MCP prévus - `search_items` @@ -165,11 +193,10 @@ Le moteur fédéré ne doit jamais échouer globalement à cause d’un seul con ### Backend ```bash -cd app/backend python -m venv .venv source .venv/bin/activate -pip install -r requirements.txt -uvicorn app.main:app --reload +pip install -e '.[dev]' +uvicorn app.main:app --app-dir app/backend --reload ``` ### Frontend @@ -180,6 +207,14 @@ npm install npm run dev ``` +Par défaut, le frontend appelle `http://localhost:8000`. + +Optionnel : + +```bash +VITE_API_BASE_URL=http://localhost:8000 npm run dev +``` + ## Variables d’environnement Créer un fichier `.env` à partir de `.env.example`. @@ -210,6 +245,38 @@ docker run -p 8000:8000 universal-iiif-portal - Europeana - connecteur générique `manifest-by-url` +## Connecteur Gallica (lot 5) + +### Hypothèses de mapping `NormalizedItem` + +- `source_item_id` : ARK extrait des identifiants Gallica (`ark:/...`) ; +- `id` global : `gallica:{source_item_id}` ; +- `title` : premier champ `dc:title` disponible ; +- `creators` : liste des `dc:creator` ; +- `date_display` : premier `dc:date` ; +- `object_type` : dérivé de `dc:type` via mapping simple (`manuscript`, `book`, `map`, `image`, `newspaper`, `other`) ; +- `record_url` : premier `dc:identifier` ; +- `manifest_url` : construit depuis l’ARK (`https://gallica.bnf.fr/iiif/{ark}/manifest.json`) ; +- `institution` : `Bibliothèque nationale de France`. + +### Stratégie de résolution de manifest + +1. si `item.manifest_url` est déjà présent, il est renvoyé ; +2. sinon, extraction d’un ARK depuis `record_url` (ou URL fournie) ; +3. construction déterministe de l’URL IIIF manifest Gallica. 
+ +### Robustesse / mode fallback + +- Le connecteur tente un mode live SRU Gallica ; +- pour éviter de casser la suite en environnement instable, un mode fixtures est disponible (`CLAFOUTIS_GALLICA_USE_FIXTURES=true` au MVP, valeur par défaut) ; +- en cas d’échec live, le connecteur renvoie un succès dégradé avec données fixtures et `partial_failures` explicite. + +### Limites connues (MVP) + +- le parsing SRU est volontairement minimal et basé sur un sous-ensemble Dublin Core ; +- certains champs Gallica restent absents/incertains selon les notices ; +- la détection fine des types documentaires sera améliorée aux lots suivants. + ## Principes de développement - code modulaire ; diff --git a/app/backend/app/__init__.py b/app/backend/app/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app/backend/app/api/__init__.py b/app/backend/app/api/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app/backend/app/api/dependencies.py b/app/backend/app/api/dependencies.py new file mode 100644 index 0000000000000000000000000000000000000000..94f6ec778cf8379c6941663dcd6439a587288925 --- /dev/null +++ b/app/backend/app/api/dependencies.py @@ -0,0 +1,54 @@ +"""Dependency providers for API routes.""" + +from functools import lru_cache + +from app.connectors.gallica import GallicaConnector +from app.connectors.manifest_by_url_connector import ManifestByUrlConnector +from app.connectors.mock_connector import MockConnector +from app.connectors.registry import ConnectorRegistry +from app.services.import_service import ImportService +from app.services.item_service import ItemService +from app.services.manifest_resolver import ManifestResolver +from app.services.search_orchestrator import SearchOrchestrator +from app.services.source_service import SourceService + + +@lru_cache(maxsize=1) +def get_registry() -> 
ConnectorRegistry: + """Create and cache connector registry with MVP connectors.""" + + registry = ConnectorRegistry() + registry.register(MockConnector()) + registry.register(GallicaConnector()) + registry.register(ManifestByUrlConnector()) + return registry + + +def get_search_orchestrator() -> SearchOrchestrator: + """Return orchestrator instance wired with connector registry.""" + + return SearchOrchestrator(get_registry()) + + +def get_source_service() -> SourceService: + """Return source service instance.""" + + return SourceService(get_registry()) + + +def get_item_service() -> ItemService: + """Return item service instance.""" + + return ItemService(get_registry()) + + +def get_manifest_resolver() -> ManifestResolver: + """Return manifest resolver instance.""" + + return ManifestResolver(get_registry()) + + +def get_import_service() -> ImportService: + """Return import service instance.""" + + return ImportService(get_registry()) diff --git a/app/backend/app/api/health.py b/app/backend/app/api/health.py new file mode 100644 index 0000000000000000000000000000000000000000..c96aa5ae77ba5880ff70d5e99f7d98a14d59c172 --- /dev/null +++ b/app/backend/app/api/health.py @@ -0,0 +1,12 @@ +"""Health endpoint.""" + +from fastapi import APIRouter + +router = APIRouter(tags=["health"]) + + +@router.get("/health") +async def health() -> dict[str, str]: + """Return backend health status.""" + + return {"status": "ok"} diff --git a/app/backend/app/api/import_.py b/app/backend/app/api/import_.py new file mode 100644 index 0000000000000000000000000000000000000000..27eeece0b6e757c4bfdcf6fb2445be5d076685de --- /dev/null +++ b/app/backend/app/api/import_.py @@ -0,0 +1,19 @@ +"""Import endpoint for notice or manifest URLs.""" + +from fastapi import APIRouter, Depends + +from app.api.dependencies import get_import_service +from app.models.import_models import ImportRequest, ImportResponse +from app.services.import_service import ImportService + +router = APIRouter(tags=["import"]) 
+ + +@router.post("/import", response_model=ImportResponse) +async def import_item( + payload: ImportRequest, + service: ImportService = Depends(get_import_service), +) -> ImportResponse: + """Import an external URL and attempt to resolve source and manifest.""" + + return await service.import_url(payload.url) diff --git a/app/backend/app/api/items.py b/app/backend/app/api/items.py new file mode 100644 index 0000000000000000000000000000000000000000..4e3ac1f77d6b400b0665992b07ebeb3a0486cb9d --- /dev/null +++ b/app/backend/app/api/items.py @@ -0,0 +1,19 @@ +"""Item detail endpoint.""" + +from fastapi import APIRouter, Depends + +from app.api.dependencies import get_item_service +from app.models.normalized_item import NormalizedItem +from app.services.item_service import ItemService + +router = APIRouter(tags=["items"]) + + +@router.get("/item/{global_id}", response_model=NormalizedItem) +async def get_item( + global_id: str, + service: ItemService = Depends(get_item_service), +) -> NormalizedItem: + """Return a normalized item by global identifier.""" + + return await service.get_item(global_id) diff --git a/app/backend/app/api/manifest.py b/app/backend/app/api/manifest.py new file mode 100644 index 0000000000000000000000000000000000000000..5206fd9489c34c959052eef86f1c277129d707f3 --- /dev/null +++ b/app/backend/app/api/manifest.py @@ -0,0 +1,19 @@ +"""Manifest resolution endpoint.""" + +from fastapi import APIRouter, Depends + +from app.api.dependencies import get_manifest_resolver +from app.models.manifest_models import ResolveManifestRequest, ResolveManifestResponse +from app.services.manifest_resolver import ManifestResolver + +router = APIRouter(tags=["manifest"]) + + +@router.post("/resolve-manifest", response_model=ResolveManifestResponse) +async def resolve_manifest( + payload: ResolveManifestRequest, + resolver: ManifestResolver = Depends(get_manifest_resolver), +) -> ResolveManifestResponse: + """Resolve manifest URL for a source item.""" + + return await 
resolver.resolve(payload) diff --git a/app/backend/app/api/router.py b/app/backend/app/api/router.py new file mode 100644 index 0000000000000000000000000000000000000000..f02e57843a28ac12ec707e8b74a3173c68e5f686 --- /dev/null +++ b/app/backend/app/api/router.py @@ -0,0 +1,13 @@ +"""Top-level API router.""" + +from fastapi import APIRouter + +from app.api import health, import_, items, manifest, search, sources + +api_router = APIRouter(prefix="/api") +api_router.include_router(health.router) +api_router.include_router(sources.router) +api_router.include_router(search.router) +api_router.include_router(items.router) +api_router.include_router(manifest.router) +api_router.include_router(import_.router) diff --git a/app/backend/app/api/search.py b/app/backend/app/api/search.py new file mode 100644 index 0000000000000000000000000000000000000000..aa3e0ea8c95bd1499957658b929290354347ae0f --- /dev/null +++ b/app/backend/app/api/search.py @@ -0,0 +1,19 @@ +"""Search endpoint.""" + +from fastapi import APIRouter, Depends + +from app.api.dependencies import get_search_orchestrator +from app.models.search_models import SearchRequest, SearchResponse +from app.services.search_orchestrator import SearchOrchestrator + +router = APIRouter(tags=["search"]) + + +@router.post("/search", response_model=SearchResponse) +async def search_items( + payload: SearchRequest, + orchestrator: SearchOrchestrator = Depends(get_search_orchestrator), +) -> SearchResponse: + """Run federated search and return normalized results.""" + + return await orchestrator.search(payload) diff --git a/app/backend/app/api/sources.py b/app/backend/app/api/sources.py new file mode 100644 index 0000000000000000000000000000000000000000..5b29678d591bee1867d1068129092b747d8233e5 --- /dev/null +++ b/app/backend/app/api/sources.py @@ -0,0 +1,16 @@ +"""Source listing endpoint.""" + +from fastapi import APIRouter, Depends + +from app.api.dependencies import get_source_service +from app.models.source_models import 
SourcesResponse +from app.services.source_service import SourceService + +router = APIRouter(tags=["sources"]) + + +@router.get("/sources", response_model=SourcesResponse) +async def list_sources(service: SourceService = Depends(get_source_service)) -> SourcesResponse: + """List registered sources and capabilities.""" + + return await service.list_sources() diff --git a/app/backend/app/config/__init__.py b/app/backend/app/config/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app/backend/app/config/settings.py b/app/backend/app/config/settings.py new file mode 100644 index 0000000000000000000000000000000000000000..e4568be2e3b80bd867b11a8ded135062a9cdc009 --- /dev/null +++ b/app/backend/app/config/settings.py @@ -0,0 +1,21 @@ +"""Application settings loaded from environment variables.""" + +from pydantic import Field +from pydantic_settings import BaseSettings, SettingsConfigDict + + +class Settings(BaseSettings): + """Runtime settings for the backend application.""" + + app_name: str = "Clafoutis Backend" + app_version: str = "0.1.0" + debug: bool = False + request_timeout_seconds: float = Field(default=8.0, gt=0) + cors_allow_origins: list[str] = Field(default_factory=lambda: ["http://localhost:5173"]) + gallica_sru_base_url: str = "https://gallica.bnf.fr/SRU" + gallica_use_fixtures: bool = True + + model_config = SettingsConfigDict(env_prefix="CLAFOUTIS_", extra="ignore") + + +settings = Settings() diff --git a/app/backend/app/connectors/__init__.py b/app/backend/app/connectors/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/app/backend/app/connectors/base.py b/app/backend/app/connectors/base.py new file mode 100644 index 0000000000000000000000000000000000000000..48f1525009265bee4ab0f1031a7c86033574a8ed --- /dev/null +++ b/app/backend/app/connectors/base.py @@ -0,0 +1,47 @@ +"""Abstract 
connector interface for all external sources.""" + +from __future__ import annotations + +from abc import ABC, abstractmethod + +from app.models.normalized_item import NormalizedItem +from app.models.search_models import SearchResponse +from app.models.source_models import SourceCapabilities + + +class BaseConnector(ABC): + """Common contract implemented by every source connector.""" + + name: str + label: str + source_type: str + + @abstractmethod + async def search( + self, + query: str, + filters: dict[str, object], + page: int, + page_size: int, + ) -> SearchResponse: + """Execute source search and return normalized results.""" + + @abstractmethod + async def get_item(self, source_item_id: str) -> NormalizedItem | None: + """Get a single normalized item by source-specific identifier.""" + + @abstractmethod + async def resolve_manifest( + self, + item: NormalizedItem | None = None, + record_url: str | None = None, + ) -> str | None: + """Resolve a IIIF manifest URL from item metadata or record URL.""" + + @abstractmethod + async def healthcheck(self) -> dict[str, str]: + """Check connector health and return a compact status report.""" + + @abstractmethod + async def capabilities(self) -> SourceCapabilities: + """Declare static connector capabilities.""" diff --git a/app/backend/app/connectors/gallica/__init__.py b/app/backend/app/connectors/gallica/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..369cf45d800c807626b8b1e8511fa56e514471fc --- /dev/null +++ b/app/backend/app/connectors/gallica/__init__.py @@ -0,0 +1,5 @@ +"""Gallica connector package.""" + +from .connector import GallicaConnector + +__all__ = ["GallicaConnector"] diff --git a/app/backend/app/connectors/gallica/connector.py b/app/backend/app/connectors/gallica/connector.py new file mode 100644 index 0000000000000000000000000000000000000000..47cf89caac86707b7cae9d8e58f82ff1d360a22a --- /dev/null +++ b/app/backend/app/connectors/gallica/connector.py @@ -0,0 +1,264 @@ 
+"""Gallica connector implementation.""" + +from __future__ import annotations + +import time +import xml.etree.ElementTree as ET +from urllib.parse import quote_plus + +from app.config.settings import settings +from app.connectors.base import BaseConnector +from app.connectors.gallica.fixtures import FIXTURE_GALLICA_RECORDS +from app.models.normalized_item import NormalizedItem +from app.models.search_models import PartialFailure, SearchResponse +from app.models.source_models import SourceCapabilities +from app.utils.http_client import build_async_client +from app.utils.ids import make_global_id + + +class GallicaConnector(BaseConnector): + """Gallica/BnF connector with live SRU mode and deterministic fixture fallback.""" + + name = "gallica" + label = "Gallica / BnF" + source_type = "institution" + + async def search( + self, + query: str, + filters: dict[str, object], + page: int, + page_size: int, + ) -> SearchResponse: + """Search Gallica through SRU and map records to NormalizedItem.""" + + start = time.perf_counter() + try: + records = await self._fetch_search_records(query=query, page=page, page_size=page_size) + items = [self._map_record(record, index) for index, record in enumerate(records)] + partial = [PartialFailure(source=self.name, status="ok")] + except Exception as exc: + records = self._search_fixtures(query) + items = [self._map_fixture_record(record, index) for index, record in enumerate(records)] + partial = [ + PartialFailure( + source=self.name, + status="degraded", + error=f"live_gallica_unavailable: {exc}", + ) + ] + + start_index = (page - 1) * page_size + page_items = items[start_index : start_index + page_size] + + return SearchResponse( + query=query, + page=page, + page_size=page_size, + total_estimated=len(items), + results=page_items, + sources_used=[self.name], + partial_failures=partial, + duration_ms=int((time.perf_counter() - start) * 1000), + ) + + async def get_item(self, source_item_id: str) -> NormalizedItem | None: + 
"""Retrieve one Gallica item by ARK, using live mode then fixtures fallback.""" + + try: + records = await self._fetch_search_records( + query=f'ark all "{source_item_id}"', + page=1, + page_size=1, + raw_query=True, + ) + if records: + return self._map_record(records[0], 0) + except Exception: + pass + + for fixture in FIXTURE_GALLICA_RECORDS: + if fixture["source_item_id"] == source_item_id: + return self._map_fixture_record(fixture, 0) + return None + + async def resolve_manifest( + self, + item: NormalizedItem | None = None, + record_url: str | None = None, + ) -> str | None: + """Resolve Gallica IIIF manifest from normalized item or record URL.""" + + if item and item.manifest_url: + return item.manifest_url + + source = record_url or (item.record_url if item else None) + if not source: + return None + + ark = self._extract_ark(source) + if not ark: + return None + + return self._manifest_from_ark(ark) + + async def healthcheck(self) -> dict[str, str]: + """Run lightweight Gallica availability check.""" + + if settings.gallica_use_fixtures: + return {"status": "ok", "mode": "fixtures"} + + params_query = quote_plus('dc.title all "dante"') + url = ( + f"{settings.gallica_sru_base_url}?version=1.2&operation=searchRetrieve" + f"&query={params_query}&maximumRecords=1" + ) + + async with await build_async_client() as client: + response = await client.get(url) + if response.status_code >= 400: + return {"status": "error", "mode": "live"} + return {"status": "ok", "mode": "live"} + + async def capabilities(self) -> SourceCapabilities: + """Return Gallica connector capabilities.""" + + return SourceCapabilities(search=True, get_item=True, resolve_manifest=True) + + async def _fetch_search_records( + self, + query: str, + page: int, + page_size: int, + raw_query: bool = False, + ) -> list[ET.Element]: + if settings.gallica_use_fixtures: + raise RuntimeError("fixtures mode enabled") + + sru_query = query if raw_query else f'dc.title all "{query}"' + start_record = 
((page - 1) * page_size) + 1 + encoded_query = quote_plus(sru_query) + + url = ( + f"{settings.gallica_sru_base_url}?version=1.2&operation=searchRetrieve" + f"&query={encoded_query}&startRecord={start_record}&maximumRecords={page_size}" + ) + + async with await build_async_client() as client: + response = await client.get(url) + response.raise_for_status() + + root = ET.fromstring(response.text) + return [record for record in root.iter() if record.tag.endswith("record")] + + def _map_record(self, record: ET.Element, index: int) -> NormalizedItem: + dc_values = self._extract_dc_values(record) + + source_item_id = self._extract_ark(dc_values.get("identifier", [])) + if not source_item_id: + source_item_id = f"gallica-record-{index}" + + title = self._first(dc_values.get("title", []), default="Document Gallica") + creators = dc_values.get("creator", []) + date_display = self._first(dc_values.get("date", []), default=None) + object_type = self._map_object_type(self._first(dc_values.get("type", []), default="other")) + record_url = self._first(dc_values.get("identifier", []), default=None) + manifest_url = self._manifest_from_ark(source_item_id) if source_item_id.startswith("ark:/") else None + + warnings: list[str] = [] + if not creators: + warnings.append("missing_creators") + if record_url is None: + warnings.append("missing_record_url") + + return NormalizedItem( + id=make_global_id(self.name, source_item_id), + source=self.name, + source_label=self.label, + source_item_id=source_item_id, + title=title, + creators=creators, + date_display=date_display, + object_type=object_type, + institution="Bibliothèque nationale de France", + thumbnail_url=None, + record_url=record_url, + manifest_url=manifest_url, + has_iiif_manifest=manifest_url is not None, + has_images=True, + has_ocr=False, + availability="public", + relevance_score=max(0.0, 1.0 - (index * 0.01)), + normalization_warnings=warnings, + ) + + def _map_fixture_record(self, fixture: dict[str, object], index: 
int) -> NormalizedItem: + source_item_id = str(fixture["source_item_id"]) + manifest_url = self._manifest_from_ark(source_item_id) + return NormalizedItem( + id=make_global_id(self.name, source_item_id), + source=self.name, + source_label=self.label, + source_item_id=source_item_id, + title=str(fixture["title"]), + creators=[str(value) for value in fixture.get("creators", [])], + date_display=str(fixture.get("date_display")) if fixture.get("date_display") else None, + object_type=str(fixture.get("object_type", "other")), + institution=str(fixture.get("institution")) if fixture.get("institution") else None, + thumbnail_url=str(fixture.get("thumbnail_url")) if fixture.get("thumbnail_url") else None, + record_url=str(fixture.get("record_url")) if fixture.get("record_url") else None, + manifest_url=manifest_url, + has_iiif_manifest=True, + has_images=True, + has_ocr=False, + availability="public", + relevance_score=max(0.0, 1.0 - (index * 0.01)), + normalization_warnings=["fixture_mode"], + ) + + def _extract_dc_values(self, record: ET.Element) -> dict[str, list[str]]: + values: dict[str, list[str]] = {} + for node in record.iter(): + if not node.tag.startswith("{"): + continue + local_name = node.tag.split("}", maxsplit=1)[1] + if local_name in {"title", "creator", "date", "identifier", "type"} and node.text: + values.setdefault(local_name, []).append(node.text.strip()) + return values + + def _extract_ark(self, identifiers: list[str] | str) -> str | None: + values = [identifiers] if isinstance(identifiers, str) else identifiers + for value in values: + if "ark:/" not in value: + continue + ark = value[value.index("ark:/") :] + return ark.split("?")[0].rstrip("/") + return None + + def _manifest_from_ark(self, ark: str) -> str: + return f"https://gallica.bnf.fr/iiif/{ark}/manifest.json" + + def _search_fixtures(self, query: str) -> list[dict[str, object]]: + lowered = query.lower().strip() + return [ + record + for record in FIXTURE_GALLICA_RECORDS + if lowered in 
str(record["title"]).lower() or lowered in " ".join(record.get("creators", [])).lower() + ] + + def _map_object_type(self, raw_type: str) -> str: + lowered = raw_type.lower() + if "manus" in lowered: + return "manuscript" + if "book" in lowered or "livre" in lowered: + return "book" + if "map" in lowered or "carte" in lowered: + return "map" + if "image" in lowered or "estampe" in lowered: + return "image" + if "journal" in lowered or "newspaper" in lowered: + return "newspaper" + return "other" + + def _first(self, values: list[str], default: str | None) -> str | None: + return values[0] if values else default diff --git a/app/backend/app/connectors/gallica/fixtures.py b/app/backend/app/connectors/gallica/fixtures.py new file mode 100644 index 0000000000000000000000000000000000000000..9b565e1d65e7dd668ee005625a67b9c76473f24a --- /dev/null +++ b/app/backend/app/connectors/gallica/fixtures.py @@ -0,0 +1,24 @@ +"""Static fixture records for Gallica connector fallback mode.""" + +FIXTURE_GALLICA_RECORDS: list[dict[str, object]] = [ + { + "source_item_id": "ark:/12148/btv1b55002481n", + "title": "Livre d'heures à l'usage de Rome", + "creators": ["Anonyme"], + "date_display": "XVe siècle", + "object_type": "manuscript", + "institution": "Bibliothèque nationale de France", + "record_url": "https://gallica.bnf.fr/ark:/12148/btv1b55002481n", + "thumbnail_url": "https://gallica.bnf.fr/ark:/12148/btv1b55002481n.thumbnail", + }, + { + "source_item_id": "ark:/12148/bpt6k1512248m", + "title": "La Divine Comédie de Dante Alighieri", + "creators": ["Dante Alighieri"], + "date_display": "1898", + "object_type": "book", + "institution": "Bibliothèque nationale de France", + "record_url": "https://gallica.bnf.fr/ark:/12148/bpt6k1512248m", + "thumbnail_url": "https://gallica.bnf.fr/ark:/12148/bpt6k1512248m.thumbnail", + }, +] diff --git a/app/backend/app/connectors/manifest_by_url_connector.py b/app/backend/app/connectors/manifest_by_url_connector.py new file mode 100644 index 
0000000000000000000000000000000000000000..8229b8d1986534c639e71d83a05f3c037525a90a --- /dev/null +++ b/app/backend/app/connectors/manifest_by_url_connector.py @@ -0,0 +1,105 @@ +"""Generic connector resolving IIIF manifests from arbitrary URLs.""" + +from __future__ import annotations + +from urllib.parse import urlparse + +from app.connectors.base import BaseConnector +from app.models.normalized_item import NormalizedItem +from app.models.search_models import PartialFailure, SearchResponse +from app.models.source_models import SourceCapabilities + + +class ManifestByUrlConnector(BaseConnector): + """Connector dedicated to URL import and generic manifest detection heuristics.""" + + name = "manifest_by_url" + label = "Manifest by URL" + source_type = "generic" + + async def search( + self, + query: str, + filters: dict[str, object], + page: int, + page_size: int, + ) -> SearchResponse: + """Return empty search results because this connector is import-only.""" + + return SearchResponse( + query=query, + page=page, + page_size=page_size, + total_estimated=0, + results=[], + sources_used=[self.name], + partial_failures=[PartialFailure(source=self.name, status="ok")], + duration_ms=1, + ) + + async def get_item(self, source_item_id: str) -> NormalizedItem | None: + """Return no item because this connector does not expose source IDs.""" + + return None + + async def resolve_manifest( + self, + item: NormalizedItem | None = None, + record_url: str | None = None, + ) -> str | None: + """Resolve manifest URL by direct detection or lightweight notice heuristics.""" + + candidate_url = record_url or (item.record_url if item else None) + if not candidate_url: + return None + + # Heuristic 1: direct manifest URL patterns + if self._looks_like_manifest_url(candidate_url): + return candidate_url + + # Heuristic 2: common notice -> manifest patterns + generated_candidates = self._notice_to_manifest_candidates(candidate_url) + for candidate in generated_candidates: + if 
self._looks_like_manifest_url(candidate): + return candidate + + return None + + async def healthcheck(self) -> dict[str, str]: + """Return healthy status for the local heuristic connector.""" + + return {"status": "ok"} + + async def capabilities(self) -> SourceCapabilities: + """Declare capabilities for this connector.""" + + return SourceCapabilities(search=False, get_item=False, resolve_manifest=True) + + def _looks_like_manifest_url(self, url: str) -> bool: + parsed = urlparse(url) + lowered_path = parsed.path.lower() + lowered_query = parsed.query.lower() + return ( + "manifest" in lowered_path + or lowered_path.endswith("manifest.json") + or lowered_query.startswith("manifest=") + or "iiif_manifest" in lowered_query + ) + + def _notice_to_manifest_candidates(self, url: str) -> list[str]: + parsed = urlparse(url) + clean_path = parsed.path.rstrip("/") + + suffixes = [ + "/manifest", + "/manifest.json", + "/iiif/manifest", + "/iiif/manifest.json", + ] + + candidates: list[str] = [] + for suffix in suffixes: + candidate = parsed._replace(path=f"{clean_path}{suffix}", query="").geturl() + candidates.append(candidate) + + return candidates diff --git a/app/backend/app/connectors/mock_connector.py b/app/backend/app/connectors/mock_connector.py new file mode 100644 index 0000000000000000000000000000000000000000..239804ce3d420d87a9bfed2b0daeba4c8628be64 --- /dev/null +++ b/app/backend/app/connectors/mock_connector.py @@ -0,0 +1,109 @@ +"""Mock connector used by backend lot 1 to provide stable demo data.""" + +from app.connectors.base import BaseConnector +from app.models.normalized_item import NormalizedItem +from app.models.search_models import PartialFailure, SearchResponse +from app.models.source_models import SourceCapabilities +from app.utils.ids import make_global_id + + +class MockConnector(BaseConnector): + """Simple in-memory connector implementing BaseConnector contract.""" + + name = "mock" + label = "Mock Heritage Source" + source_type = "mock" + + def 
__init__(self) -> None: + self._items = { + "ms-1": NormalizedItem( + id=make_global_id("mock", "ms-1"), + source="mock", + source_label=self.label, + source_item_id="ms-1", + title="Book of Hours (Mock)", + creators=["Unknown"], + institution="Mock Institution", + object_type="manuscript", + record_url="https://mock.example.org/items/ms-1", + manifest_url="https://mock.example.org/iiif/ms-1/manifest", + has_iiif_manifest=True, + has_images=True, + has_ocr=False, + availability="public", + relevance_score=0.9, + ), + "ms-2": NormalizedItem( + id=make_global_id("mock", "ms-2"), + source="mock", + source_label=self.label, + source_item_id="ms-2", + title="Dante Manuscript (Mock)", + creators=["Anonymous"], + institution="Mock Institution", + object_type="manuscript", + record_url="https://mock.example.org/items/ms-2", + manifest_url=None, + has_iiif_manifest=False, + has_images=True, + has_ocr=True, + availability="public", + relevance_score=0.8, + ), + } + + async def search( + self, + query: str, + filters: dict[str, object], + page: int, + page_size: int, + ) -> SearchResponse: + """Return normalized in-memory results filtered by query substring.""" + + lowered = query.lower().strip() + filtered = [item for item in self._items.values() if lowered in item.title.lower()] + start = (page - 1) * page_size + end = start + page_size + page_items = filtered[start:end] + return SearchResponse( + query=query, + page=page, + page_size=page_size, + total_estimated=len(filtered), + results=page_items, + sources_used=[self.name], + partial_failures=[PartialFailure(source=self.name, status="ok")], + duration_ms=1, + ) + + async def get_item(self, source_item_id: str) -> NormalizedItem | None: + """Return normalized item when available.""" + + return self._items.get(source_item_id) + + async def resolve_manifest( + self, + item: NormalizedItem | None = None, + record_url: str | None = None, + ) -> str | None: + """Resolve manifest URL from provided item or known record URL.""" + 
+ if item is not None and item.manifest_url: + return item.manifest_url + + if record_url: + for candidate in self._items.values(): + if candidate.record_url == record_url: + return candidate.manifest_url + return None + + async def healthcheck(self) -> dict[str, str]: + """Return static healthy status for demonstration connector.""" + + return {"status": "ok"} + + async def capabilities(self) -> SourceCapabilities: + """Return static capabilities for the mock connector.""" + + return SourceCapabilities(search=True, get_item=True, resolve_manifest=True) diff --git a/app/backend/app/connectors/registry.py b/app/backend/app/connectors/registry.py new file mode 100644 index 0000000000000000000000000000000000000000..2760307212461f28ce8906061cda27ac3e710c5e --- /dev/null +++ b/app/backend/app/connectors/registry.py @@ -0,0 +1,35 @@ +"""Registry storing available connectors.""" + +from app.connectors.base import BaseConnector + + +class ConnectorRegistry: + """In-memory registry for connector instances.""" + + def __init__(self) -> None: + self._connectors: dict[str, BaseConnector] = {} + + def register(self, connector: BaseConnector) -> None: + """Register a connector instance by unique connector name.""" + + self._connectors[connector.name] = connector + + def list_names(self) -> list[str]: + """Return sorted connector names.""" + + return sorted(self._connectors.keys()) + + def get(self, name: str) -> BaseConnector: + """Return connector instance for the provided name.""" + + return self._connectors[name] + + def has(self, name: str) -> bool: + """Return whether a connector with the given name is registered.""" + + return name in self._connectors + + def list_connectors(self) -> list[BaseConnector]: + """Return registered connector instances.""" + + return [self._connectors[name] for name in self.list_names()] diff --git a/app/backend/app/main.py b/app/backend/app/main.py new file mode 100644 index 
def create_app() -> FastAPI:
    """Assemble the FastAPI application: CORS, API routes, error handlers.

    Error mapping: BadRequestError -> 400, NotFoundError -> 404, any other
    AppError -> 500, each serialized through the shared ErrorResponse shape.
    """

    application = FastAPI(
        title=settings.app_name,
        version=settings.app_version,
        debug=settings.debug,
    )
    application.add_middleware(
        CORSMiddleware,
        allow_origins=settings.cors_allow_origins,
        allow_credentials=True,
        allow_methods=["*"],
        allow_headers=["*"],
    )
    application.include_router(api_router)

    def _error_body(code: str, exc: Exception) -> dict:
        # Single place that shapes every error payload.
        return ErrorResponse(error=code, details=str(exc)).model_dump()

    @application.exception_handler(BadRequestError)
    async def handle_bad_request(_: Request, exc: BadRequestError) -> JSONResponse:
        return JSONResponse(status_code=400, content=_error_body("bad_request", exc))

    @application.exception_handler(NotFoundError)
    async def handle_not_found(_: Request, exc: NotFoundError) -> JSONResponse:
        return JSONResponse(status_code=404, content=_error_body("not_found", exc))

    @application.exception_handler(AppError)
    async def handle_app_error(_: Request, exc: AppError) -> JSONResponse:
        return JSONResponse(status_code=500, content=_error_body("application_error", exc))

    return application


app = create_app()
class ErrorResponse(BaseModel):
    """Structured error payload returned by all exception handlers."""

    error: str
    details: str | None = None


class ImportRequest(BaseModel):
    """Input payload for item import using a URL."""

    model_config = ConfigDict(extra="forbid")

    # Non-empty URL to a IIIF manifest or a record/notice page.
    url: str = Field(min_length=1)


class ImportResponse(BaseModel):
    """Result of import URL analysis and manifest resolution."""

    detected_source: str | None = None
    record_url: str | None = None
    manifest_url: str | None = None
    item: NormalizedItem | None = None


class ResolveManifestRequest(BaseModel):
    """Input payload for manifest resolution."""

    model_config = ConfigDict(extra="forbid")

    source: str = Field(min_length=1)
    source_item_id: str = Field(min_length=1)
    record_url: str | None = None


class ResolveManifestResponse(BaseModel):
    """Output payload for manifest resolution."""

    manifest_url: str | None = None
    # "resolved" when a manifest URL was found, "not_found" otherwise.
    status: str
    method: str | None = None


class NormalizedItem(BaseModel):
    """Normalized representation of an item from any source."""

    model_config = ConfigDict(extra="forbid")

    id: str = Field(description="Global identifier in format source:source_item_id.")
    source: str
    source_label: str
    source_item_id: str
    title: str
    creators: list[str] = Field(default_factory=list)
    date_display: str | None = None
    object_type: str = "other"
    institution: str | None = None
    thumbnail_url: str | None = None
    record_url: str | None = None
    manifest_url: str | None = None
    has_iiif_manifest: bool = False
    has_images: bool = False
    has_ocr: bool = False
    availability: str = "unknown"
    relevance_score: float = 0.0
    normalization_warnings: list[str] = Field(default_factory=list)

    @model_validator(mode="after")
    def _check_global_id(self) -> "NormalizedItem":
        """Enforce the stable MVP id rule `source:source_item_id`."""

        expected_id = f"{self.source}:{self.source_item_id}"
        if self.id != expected_id:
            raise ValueError("id must match source:source_item_id")
        return self
class SearchRequest(BaseModel):
    """Input payload for federated search."""

    model_config = ConfigDict(extra="forbid")

    query: str = Field(min_length=1)
    # None means "query every registered source".
    sources: list[str] | None = None
    filters: dict[str, object] = Field(default_factory=dict)
    page: int = Field(default=1, ge=1)
    page_size: int = Field(default=24, ge=1, le=100)


class PartialFailure(BaseModel):
    """Per-source status report allowing partial-success responses."""

    source: str
    status: str
    error: str | None = None


class SearchResponse(BaseModel):
    """Unified search response returned by backend APIs."""

    query: str
    page: int
    page_size: int
    total_estimated: int
    results: list[NormalizedItem]
    sources_used: list[str]
    partial_failures: list[PartialFailure] = Field(default_factory=list)
    duration_ms: int


class SourceCapabilities(BaseModel):
    """Capabilities declared by a connector (all enabled by default)."""

    search: bool = True
    get_item: bool = True
    resolve_manifest: bool = True


class SourceDescriptor(BaseModel):
    """Source metadata exposed through /api/sources."""

    name: str
    label: str
    source_type: str
    capabilities: SourceCapabilities
    healthy: bool
    notes: str | None = None


class SourcesResponse(BaseModel):
    """Response payload for the source listing endpoint."""

    sources: list[SourceDescriptor] = Field(default_factory=list)
class ImportService:
    """Handle import flow from URL to normalized item and manifest."""

    def __init__(self, registry: ConnectorRegistry) -> None:
        self._registry = registry

    async def import_url(self, url: str) -> ImportResponse:
        """Validate URL then resolve manifest through source and generic connectors.

        Resolution order:
        1. Source-specific connectors (e.g. mock) try exact record URL mapping.
        2. `manifest_by_url` connector applies generic URL heuristics:
           - direct manifest pattern detection;
           - notice -> manifest candidate generation.
        """

        safe_url = validate_http_url(url)

        # Partition the registry in a single scan: specific connectors first,
        # the generic URL-heuristics connector last (it is the fallback).
        specific = []
        generic = []
        for connector in self._registry.list_connectors():
            if connector.name == "manifest_by_url":
                generic.append(connector)
            else:
                specific.append(connector)

        for connector in [*specific, *generic]:
            matched_item = await self._match_known_item(connector, safe_url)
            manifest = await connector.resolve_manifest(item=matched_item, record_url=safe_url)
            if manifest:
                # First connector producing a manifest wins.
                return ImportResponse(
                    detected_source=connector.name,
                    record_url=safe_url,
                    manifest_url=manifest,
                    item=matched_item,
                )

        # No connector could resolve a manifest: report the validated URL only.
        return ImportResponse(
            detected_source=None,
            record_url=safe_url,
            manifest_url=None,
            item=None,
        )

    @staticmethod
    async def _match_known_item(connector, safe_url: str):
        """Return the connector's item whose record URL equals `safe_url`.

        Lot 4 keeps source-specific matching minimal: only the mock demo
        connector with its two fixed items participates.
        """

        if connector.name != "mock":
            return None
        for candidate_id in ("ms-1", "ms-2"):
            candidate = await connector.get_item(candidate_id)
            if candidate is not None and candidate.record_url == safe_url:
                return candidate
        return None


class ItemService:
    """Resolve item details using global id policy source:source_item_id."""

    def __init__(self, registry: ConnectorRegistry) -> None:
        self._registry = registry

    async def get_item(self, global_id: str) -> NormalizedItem:
        """Fetch normalized item from connector using global id.

        Raises:
            BadRequestError: when `global_id` is not `source:source_item_id`.
            NotFoundError: when the source is unknown or the item is missing.
        """

        try:
            source, source_item_id = split_global_id(global_id)
        except ValueError as exc:
            # Chain the parsing failure (raise ... from) so the original
            # cause is preserved for debugging instead of being discarded.
            raise BadRequestError("Invalid id format, expected source:source_item_id") from exc
        if not self._registry.has(source):
            raise NotFoundError(f"Unknown source '{source}'")
        item = await self._registry.get(source).get_item(source_item_id)
        if item is None:
            raise NotFoundError(f"Item '{global_id}' not found")
        return item
class ManifestResolver:
    """Resolve manifests by delegating to source connectors."""

    def __init__(self, registry: ConnectorRegistry) -> None:
        self._registry = registry

    async def resolve(self, request: ResolveManifestRequest) -> ResolveManifestResponse:
        """Resolve the manifest URL for a source item identifier.

        Raises NotFoundError when the requested source is not registered.
        """

        if not self._registry.has(request.source):
            raise NotFoundError(f"Unknown source '{request.source}'")
        connector = self._registry.get(request.source)
        item = await connector.get_item(request.source_item_id)
        manifest_url = await connector.resolve_manifest(
            item=item,
            record_url=request.record_url,
        )
        if manifest_url:
            return ResolveManifestResponse(
                manifest_url=manifest_url,
                status="resolved",
                method="metadata",
            )
        return ResolveManifestResponse(manifest_url=None, status="not_found", method=None)

    async def openable_global_id(self, source: str, source_item_id: str) -> str:
        """Return the deterministic global id associated to a manifest operation."""

        return make_global_id(source, source_item_id)
class SearchOrchestrator:
    """Coordinate multi-source search with partial failure tolerance."""

    def __init__(self, registry: ConnectorRegistry) -> None:
        self._registry = registry

    async def search(self, request: SearchRequest) -> SearchResponse:
        """Fan out the query to selected connectors and merge their pages.

        A failing source is reported in `partial_failures` and never fails
        the whole federated call.
        """

        targets = request.sources or self._registry.list_names()
        started = time.perf_counter()
        outcomes = await asyncio.gather(
            *(self._search_one(name, request) for name in targets),
            return_exceptions=True,
        )
        elapsed_ms = int((time.perf_counter() - started) * 1000)

        merged = []
        failures: list[PartialFailure] = []
        used: list[str] = []
        for name, outcome in zip(targets, outcomes, strict=True):
            if isinstance(outcome, Exception):
                failures.append(PartialFailure(source=name, status="error", error=str(outcome)))
            else:
                used.append(name)
                merged.extend(outcome.results)
                failures.extend(outcome.partial_failures)

        # Present the highest-relevance items first across all sources.
        merged.sort(key=lambda entry: entry.relevance_score, reverse=True)

        return SearchResponse(
            query=request.query,
            page=request.page,
            page_size=request.page_size,
            total_estimated=len(merged),
            results=merged,
            sources_used=used,
            partial_failures=failures,
            duration_ms=elapsed_ms,
        )

    async def _search_one(self, source: str, request: SearchRequest) -> SearchResponse:
        # An unknown source raises here and surfaces as a per-source failure
        # through gather(return_exceptions=True) above.
        if not self._registry.has(source):
            raise ValueError(f"Unknown source '{source}'")
        return await self._registry.get(source).search(
            query=request.query,
            filters=request.filters,
            page=request.page,
            page_size=request.page_size,
        )
class SourceService:
    """Provide data for `/api/sources` endpoint."""

    def __init__(self, registry: ConnectorRegistry) -> None:
        self._registry = registry

    async def list_sources(self) -> SourcesResponse:
        """Describe each registered connector: capabilities plus health flag."""

        descriptors: list[SourceDescriptor] = []
        for connector in self._registry.list_connectors():
            caps = await connector.capabilities()
            health = await connector.healthcheck()
            descriptors.append(
                SourceDescriptor(
                    name=connector.name,
                    label=connector.label,
                    source_type=connector.source_type,
                    capabilities=caps,
                    # Healthy only on an explicit "ok" status.
                    healthy=health.get("status") == "ok",
                )
            )
        return SourcesResponse(sources=descriptors)


class AppError(Exception):
    """Base class for application-level errors."""


class NotFoundError(AppError):
    """Raised when an entity cannot be found."""


class BadRequestError(AppError):
    """Raised when user input is invalid for business logic."""
async def build_async_client() -> httpx.AsyncClient:
    """Create an AsyncClient configured with MVP-safe defaults."""

    # Single timeout knob from settings; redirects are followed because
    # IIIF providers commonly redirect record URLs.
    return httpx.AsyncClient(
        timeout=httpx.Timeout(settings.request_timeout_seconds),
        follow_redirects=True,
    )


def make_global_id(source: str, source_item_id: str) -> str:
    """Build the stable global id using the `source:source_item_id` format."""

    return f"{source}:{source_item_id}"


def split_global_id(global_id: str) -> tuple[str, str]:
    """Split a global identifier into (source, source_item_id).

    Raises ValueError when the separator or either part is missing.
    """

    source, sep, source_item_id = global_id.partition(":")
    if not sep:
        raise ValueError("global id must include ':' separator")
    if not source or not source_item_id:
        raise ValueError("global id must contain source and source_item_id")
    return source, source_item_id


def validate_http_url(url: str) -> str:
    """Validate URL scheme/host and apply basic SSRF protections for MVP."""

    parsed = urlparse(url)
    if parsed.scheme not in {"http", "https"}:
        raise BadRequestError("URL scheme must be http or https")

    host = parsed.hostname
    if not parsed.netloc or host is None:
        raise BadRequestError("URL must contain a valid host")

    _reject_local_hostnames(host)
    _reject_private_or_local_ip_literals(host)
    _reject_hostnames_resolving_to_private_or_local_ips(host)

    return url


def _is_forbidden_ip(ip: ipaddress.IPv4Address | ipaddress.IPv6Address) -> bool:
    """Return True for address ranges that must never be fetched (SSRF guard)."""

    return (
        ip.is_private
        or ip.is_loopback
        or ip.is_link_local
        or ip.is_reserved
        or ip.is_unspecified
    )


def _reject_local_hostnames(hostname: str) -> None:
    # Explicit denylist of obvious loopback hostname aliases.
    if hostname.lower() in {"localhost", "localhost.localdomain"}:
        raise BadRequestError("Local hosts are not allowed")


def _reject_private_or_local_ip_literals(hostname: str) -> None:
    try:
        literal = ipaddress.ip_address(hostname)
    except ValueError:
        # Not an IP literal; the DNS-based check below covers hostnames.
        return
    if _is_forbidden_ip(literal):
        raise BadRequestError("Private or local IPs are not allowed")


def _reject_hostnames_resolving_to_private_or_local_ips(hostname: str) -> None:
    try:
        infos = socket.getaddrinfo(hostname, None)
    except socket.gaierror:
        # Unresolvable hosts stay "valid" here; the downstream connector or
        # network error will explain the failure more clearly.
        return
    for info in infos:
        if _is_forbidden_ip(ipaddress.ip_address(info[4][0])):
            raise BadRequestError("Resolved host points to a private or local IP")
+ + +{item.source_label}
+{item.institution ?? 'Institution inconnue'}
+Aucun résultat.
+ } + + return ( ++ Clafoutis est structuré en 3 couches: découverte, lecture et interopérabilité. +
+Mirador reste strictement une couche de lecture.
++ Collez une URL de manifest IIIF ou une URL de notice, puis laissez le backend tenter la + résolution vers un manifest. +
+ + + + {error && ( +{error}
+ )} + + {result && ( ++ Source détectée : {result.detectedSource ?? 'inconnue'} +
++ URL notice : {result.recordUrl ?? 'n/a'} +
++ Manifest résolu : {result.manifestUrl ?? 'non trouvé'} +
+ + {result.manifestUrl && ( + + )} +Mirador est utilisé ici uniquement comme workspace de lecture.
+ +Chargement des résultats…
} + {search.isError && ( ++ Erreur: {search.error.message} +
+ )} + + {!search.isPending && !search.isError && ( +Chargement des sources…
+ } + + if (sources.isError) { + return ( ++ Erreur: {sources.error.message} +
+ ) + } + + return ( +{source.label}
+Nom: {source.name}
+Type: {source.source_type}
+Statut: {source.healthy ? 'ok' : 'error'}
++ Capacités: search={String(source.capabilities.search)}, get_item= + {String(source.capabilities.get_item)}, resolve_manifest= + {String(source.capabilities.resolve_manifest)} +
+