Spaces:
Running
feat(web): S2 — importeurs IIIF + Gallica BnF dans la vue Import
Browse files
Sprint S2 — parité corpus web : exposition des 2 importeurs distants
qui existaient déjà côté CLI/backend mais étaient invisibles dans
l'UI web (limitée jusqu'ici à HTR-United et HuggingFace).
Nouvelles routes
----------------
- ``GET /api/iiif/preview?manifest_url=...`` — pré-fetch d'un
manifeste IIIF Presentation API v2/v3 (label, version, n canvas,
n canvas avec transcription, échantillon de labels). Sans
télécharger les images.
- ``POST /api/iiif/import`` — import des images + GT dans
``output_dir`` (confiné au workspace web). Réutilise ``IIIFImporter``.
- ``GET /api/gallica/search`` — recherche SRU BnF (query / ARK /
auteur / titre / dates / langue). Mock-able pour tests.
- ``POST /api/gallica/import`` — import par ARK via manifeste IIIF
Gallica auto-construit. ``include_gallica_ocr=true`` (défaut)
récupère l'OCR Gallica comme transcription de référence —
argument scientifique fort pour benchmarker un nouveau moteur
contre la référence institutionnelle BnF.
Sécurité
--------
- ``IIIFPreviewRequest`` / ``IIIFImportRequest`` exigent ``https://``
(sauf localhost pour tests).
- Le helper ``adapters.corpus._http.validate_http_url`` (existant)
ajoute une couche anti-SSRF : refus loopback, RFC 1918,
métadonnées cloud (testé).
- ``GallicaImportRequest._validate_ark`` : format ``<naan>/<id>``
strict, refuse URLs, ``..``, chemins absolus. Normalise le
préfixe ``ark:/`` optionnel.
- ``output_dir`` validé via ``validated_user_output_dir``
(confinement workspace).
- Erreurs amont (réseau, JSON invalide) → 502 lisible, pas de
trace serveur.
UI
--
- Source-switch enrichi : 2 nouvelles chips ``IIIF`` et ``Gallica``
à côté de HTR-United et HuggingFace.
- Panneau IIIF : input URL → bouton « Aperçu » → carte de
validation (label + n canvas) → bouton « Importer le corpus »
avec sélecteurs pages + résolution max.
- Panneau Gallica : recherche SRU multi-critères + import direct
par ARK avec checkbox « Récupérer l'OCR Gallica ».
- Handlers JS : ``previewIIIF()``, ``importIIIF()``,
``searchGallica()``, ``importGallica()``. Étendu
``switchLibrarySource()`` pour gérer 4 panneaux.
- 24 nouvelles clés i18n (FR + EN) dans la table T inline.
Tests
-----
- ``tests/web/routers/test_iiif_router.py`` : 14 tests
(validation HTTPS, anti-SSRF, preview heureux, JSON invalide,
network failure, validation import).
- ``tests/web/routers/test_gallica_router.py`` : 13 tests
(validation ARK, normalisation préfixe, recherche SRU mockée,
import par ARK, payload validation).
- ``tests/web/test_import_view_iiif_gallica.py`` : 22 tests
(présence des IDs DOM dans le template, couverture i18n FR↔EN).
Verification :
- 5234 tests passed (+52 vs S1), 0 failed, 20 skipped
- make lint : All checks passed
- ``scripts/gen_readme_tables.py`` regénéré pour inclure les 4
nouvelles routes dans la matrice produit du README ; seuil LOC
du README relevé 515 → 525 (+marge pour S3 eScriptorium).
DoD :
- Un manifeste IIIF public (preview + import) est accepté par
``POST /api/iiif/import`` → corpus visible dans
``GET /api/corpus/uploads`` après refresh.
- Un ARK Gallica (import par ARK direct + récupération OCR
Gallica) est accepté par ``POST /api/gallica/import``.
- Les chips IIIF et Gallica sont navigables dans la vue Import.
https://claude.ai/code/session_01WYDbfkhKPeBZ15BTP4e9Ye
- README.md +4 -0
- picarones/interfaces/web/app.py +4 -0
- picarones/interfaces/web/models.py +115 -0
- picarones/interfaces/web/routers/gallica.py +165 -0
- picarones/interfaces/web/routers/iiif.py +141 -0
- picarones/interfaces/web/static/web-app.js +232 -4
- picarones/interfaces/web/templates/_view_import.html +131 -0
- tests/docs/test_readme_dual_lang.py +5 -2
- tests/web/routers/test_gallica_router.py +219 -0
- tests/web/routers/test_iiif_router.py +218 -0
- tests/web/test_import_view_iiif_gallica.py +146 -0
|
@@ -274,11 +274,15 @@ when running. Summary:
|
|
| 274 |
| `DELETE` | `/api/corpus/uploads/{corpus_id}` | Api Corpus Delete |
|
| 275 |
| `GET` | `/api/csrf/token` | Api Csrf Token |
|
| 276 |
| `GET` | `/api/engines` | Api Engines |
|
|
|
|
|
|
|
| 277 |
| `GET` | `/api/history/regressions` | Api History Regressions |
|
| 278 |
| `GET` | `/api/htr-united/catalogue` | Api Htr United Catalogue |
|
| 279 |
| `POST` | `/api/htr-united/import` | Api Htr United Import |
|
| 280 |
| `POST` | `/api/huggingface/import` | Api Huggingface Import |
|
| 281 |
| `GET` | `/api/huggingface/search` | Api Huggingface Search |
|
|
|
|
|
|
|
| 282 |
| `GET` | `/api/lang` | Api Get Lang |
|
| 283 |
| `POST` | `/api/lang/{lang_code}` | Api Set Lang |
|
| 284 |
| `GET` | `/api/models/{provider}` | Api Models |
|
|
|
|
| 274 |
| `DELETE` | `/api/corpus/uploads/{corpus_id}` | Api Corpus Delete |
|
| 275 |
| `GET` | `/api/csrf/token` | Api Csrf Token |
|
| 276 |
| `GET` | `/api/engines` | Api Engines |
|
| 277 |
+
| `POST` | `/api/gallica/import` | Api Gallica Import |
|
| 278 |
+
| `GET` | `/api/gallica/search` | Api Gallica Search |
|
| 279 |
| `GET` | `/api/history/regressions` | Api History Regressions |
|
| 280 |
| `GET` | `/api/htr-united/catalogue` | Api Htr United Catalogue |
|
| 281 |
| `POST` | `/api/htr-united/import` | Api Htr United Import |
|
| 282 |
| `POST` | `/api/huggingface/import` | Api Huggingface Import |
|
| 283 |
| `GET` | `/api/huggingface/search` | Api Huggingface Search |
|
| 284 |
+
| `POST` | `/api/iiif/import` | Api Iiif Import |
|
| 285 |
+
| `GET` | `/api/iiif/preview` | Api Iiif Preview |
|
| 286 |
| `GET` | `/api/lang` | Api Get Lang |
|
| 287 |
| `POST` | `/api/lang/{lang_code}` | Api Set Lang |
|
| 288 |
| `GET` | `/api/models/{provider}` | Api Models |
|
|
@@ -41,8 +41,10 @@ from picarones.interfaces.web.routers import (
|
|
| 41 |
config as _config_router,
|
| 42 |
corpus as _corpus_router,
|
| 43 |
engines as _engines_router,
|
|
|
|
| 44 |
history as _history_router,
|
| 45 |
home as _home_router,
|
|
|
|
| 46 |
importers as _importers_router,
|
| 47 |
normalization as _normalization_router,
|
| 48 |
reports as _reports_router,
|
|
@@ -251,5 +253,7 @@ app.include_router(_synthesis_router.router)
|
|
| 251 |
app.include_router(_history_router.router)
|
| 252 |
app.include_router(_reports_router.router)
|
| 253 |
app.include_router(_importers_router.router)
|
|
|
|
|
|
|
| 254 |
app.include_router(_benchmark_router.router)
|
| 255 |
app.include_router(_home_router.router)
|
|
|
|
| 41 |
config as _config_router,
|
| 42 |
corpus as _corpus_router,
|
| 43 |
engines as _engines_router,
|
| 44 |
+
gallica as _gallica_router,
|
| 45 |
history as _history_router,
|
| 46 |
home as _home_router,
|
| 47 |
+
iiif as _iiif_router,
|
| 48 |
importers as _importers_router,
|
| 49 |
normalization as _normalization_router,
|
| 50 |
reports as _reports_router,
|
|
|
|
| 253 |
app.include_router(_history_router.router)
|
| 254 |
app.include_router(_reports_router.router)
|
| 255 |
app.include_router(_importers_router.router)
|
| 256 |
+
app.include_router(_iiif_router.router)
|
| 257 |
+
app.include_router(_gallica_router.router)
|
| 258 |
app.include_router(_benchmark_router.router)
|
| 259 |
app.include_router(_home_router.router)
|
|
@@ -95,6 +95,117 @@ class HuggingFaceImportRequest(BaseModel):
|
|
| 95 |
max_samples: int = Field(default=100, ge=1, le=10_000)
|
| 96 |
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
class PipelineConfig(BaseModel):
|
| 99 |
name: str = Field(default="", max_length=_MAX_NAME)
|
| 100 |
engine_name: str = Field(default="", max_length=_MAX_NAME)
|
|
@@ -261,6 +372,10 @@ __all__ = [
|
|
| 261 |
"PipelineMode",
|
| 262 |
"HTRUnitedImportRequest",
|
| 263 |
"HuggingFaceImportRequest",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 264 |
"PipelineConfig",
|
| 265 |
"BenchmarkRunRequest",
|
| 266 |
]
|
|
|
|
| 95 |
max_samples: int = Field(default=100, ge=1, le=10_000)
|
| 96 |
|
| 97 |
|
| 98 |
+
# Bounds specific to the HTTP-driven importers (IIIF, Gallica).

# Maximum length of an IIIF URL (manifest URLs can be long).
_MAX_URL = 2048

# Gallica ARK identifier, e.g. ``12148/btv1b8453561w`` (~30 chars typically).
_MAX_ARK = 256

# Page selector: ``"all"``, ``"1-10"``, ``"1,3,5-10"``...
_MAX_PAGE_SELECTOR = 256
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
class IIIFPreviewRequest(BaseModel):
    """Pre-fetch request for an IIIF manifest: label, version, canvas count."""

    manifest_url: str = Field(min_length=8, max_length=_MAX_URL)

    @field_validator("manifest_url")
    @classmethod
    def _validate_manifest_url(cls, v: str) -> str:
        """Require an ``https://`` URL, or plain HTTP on ``localhost`` only.

        HTTPS is required as defense in depth: a manifest fetched in clear
        text can be altered in transit (image swap, poisoned ground truth).
        Plain ``http://`` is tolerated only when the *parsed* hostname is
        exactly ``localhost`` (test servers).

        The host is taken from the parsed URL rather than from a string
        prefix, so look-alikes such as ``http://localhost.evil.com/`` or
        ``http://localhost@evil.com/`` are rejected.

        Args:
            v: Raw manifest URL supplied by the client.

        Returns:
            The URL, unchanged, when it is acceptable.

        Raises:
            ValueError: If the scheme is not HTTPS (or HTTP on localhost),
                or if the URL has no parseable host.
        """
        # Function-scope import keeps this block self-contained.
        from urllib.parse import urlsplit

        try:
            host = urlsplit(v).hostname
        except ValueError:
            # Malformed authority (e.g. unbalanced IPv6 brackets).
            host = None

        is_https = v.startswith("https://") and bool(host)
        is_local_http = v.startswith("http://") and host == "localhost"
        if not (is_https or is_local_http):
            raise ValueError(
                f"manifest_url : schéma HTTPS requis "
                f"(http:// refusé sauf localhost pour tests). Reçu : {v!r}",
            )
        return v
|
| 129 |
+
|
| 130 |
+
|
| 131 |
+
class IIIFImportRequest(BaseModel):
    """Payload for importing a corpus from an IIIF manifest.

    The ``manifest_url`` field goes through the same validator function as
    ``IIIFPreviewRequest`` (re-registered below).
    """

    manifest_url: str = Field(min_length=8, max_length=_MAX_URL)
    output_dir: str = Field(default="./corpus/", max_length=_MAX_PATH)
    # Page selector: ``"all"``, ``"1-10"``, ``"1,3,5-10"``.
    pages: str = Field(default="all", max_length=_MAX_PAGE_SELECTOR)
    # Maximum width of the downloaded images (0 = full resolution).
    max_resolution: int = Field(default=0, ge=0, le=8192)

    # Re-register the preview model's raw validator function on this model.
    _validate_url = field_validator("manifest_url")(
        IIIFPreviewRequest._validate_manifest_url.__func__,
    )
|
| 147 |
+
|
| 148 |
+
|
| 149 |
+
class GallicaSearchRequest(BaseModel):
    """SRU search criteria for the Gallica (BnF) catalogue.

    At least one of ``query``, ``ark``, ``author`` or ``title`` is expected
    to be filled in. That rule is enforced by the router rather than by
    Pydantic, so that open-ended GET requests remain expressible.
    """

    # Generic free-text search (title + author + subject).
    query: str = Field(default="", max_length=_MAX_NAME)
    # Specific ARK identifier (e.g. ``"12148/btv1b8453561w"``).
    ark: str = Field(default="", max_length=_MAX_ARK)
    author: str = Field(default="", max_length=_MAX_NAME)
    title: str = Field(default="", max_length=_MAX_NAME)
    date_from: int = Field(default=0, ge=0, le=2100)
    date_to: int = Field(default=0, ge=0, le=2100)
    # ISO 639-2 language code (e.g. ``"fre"``, ``"lat"``).
    language: str = Field(default="", max_length=8)
    max_results: int = Field(default=20, ge=1, le=50)
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
class GallicaImportRequest(BaseModel):
    """Import of a Gallica document identified by ARK, with a page range.

    When ``include_gallica_ocr`` is true (the default), the Gallica OCR is
    fetched as the reference transcription, which is useful to benchmark a
    new OCR engine against the BnF institutional reference.
    """

    ark: str = Field(min_length=1, max_length=_MAX_ARK)
    output_dir: str = Field(default="./corpus/", max_length=_MAX_PATH)
    pages: str = Field(default="all", max_length=_MAX_PAGE_SELECTOR)
    include_gallica_ocr: bool = True
    max_resolution: int = Field(default=0, ge=0, le=8192)

    @field_validator("ark")
    @classmethod
    def _validate_ark(cls, v: str) -> str:
        """Normalize and validate an ARK of the form ``<naan>/<id>``.

        Surrounding whitespace and an optional ``ark:/`` prefix are removed.
        The remainder must be exactly two segments separated by one ``/``,
        each made of ASCII letters, digits, ``_``, ``.`` or ``-``. This
        rules out URL schemes, absolute paths and ``..`` segments.

        Args:
            v: Raw ARK supplied by the client, e.g. ``12148/btv1b8453561w``
                or ``ark:/12148/btv1b8453561w``.

        Returns:
            The normalized ARK without the ``ark:/`` prefix.

        Raises:
            ValueError: If the value does not match the expected format or
                contains ``..``.
        """
        import re

        cleaned = v.strip()
        if cleaned.startswith("ark:/"):
            cleaned = cleaned[len("ark:/"):]
        # ``fullmatch`` anchors both ends; ``re.ASCII`` keeps ``\w`` from
        # accepting non-ASCII letters and digits.
        if not re.fullmatch(r"[\w.\-]+/[\w.\-]+", cleaned, flags=re.ASCII):
            raise ValueError(
                f"ark : format invalide. Attendu : ``<naan>/<id>`` "
                f"(ex : ``12148/btv1b8453561w``). Reçu : {v!r}",
            )
        # The pattern already allows a single ``/``; the ``//`` test is kept
        # as a defensive guard alongside the ``..`` traversal check.
        if ".." in cleaned or "//" in cleaned:
            raise ValueError(
                f"ark : segments ``..`` ou ``//`` interdits. Reçu : {v!r}",
            )
        return cleaned
|
| 207 |
+
|
| 208 |
+
|
| 209 |
class PipelineConfig(BaseModel):
|
| 210 |
name: str = Field(default="", max_length=_MAX_NAME)
|
| 211 |
engine_name: str = Field(default="", max_length=_MAX_NAME)
|
|
|
|
| 372 |
"PipelineMode",
|
| 373 |
"HTRUnitedImportRequest",
|
| 374 |
"HuggingFaceImportRequest",
|
| 375 |
+
"IIIFPreviewRequest",
|
| 376 |
+
"IIIFImportRequest",
|
| 377 |
+
"GallicaSearchRequest",
|
| 378 |
+
"GallicaImportRequest",
|
| 379 |
"PipelineConfig",
|
| 380 |
"BenchmarkRunRequest",
|
| 381 |
]
|
|
@@ -0,0 +1,165 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Router des importeurs Gallica BnF.
|
| 2 |
+
|
| 3 |
+
Expose le pattern ``search → import`` du backend
|
| 4 |
+
``picarones.adapters.corpus.gallica.GallicaClient`` côté web :
|
| 5 |
+
|
| 6 |
+
- ``GET /api/gallica/search`` : recherche SRU (titre/auteur/date/
|
| 7 |
+
langue/ARK). Wrapper de :meth:`GallicaClient.search`.
|
| 8 |
+
- ``POST /api/gallica/import`` : importe un document par ARK via
|
| 9 |
+
son manifeste IIIF Gallica ; si ``include_gallica_ocr=True``,
|
| 10 |
+
récupère aussi l'OCR Gallica comme transcription de référence.
|
| 11 |
+
|
| 12 |
+
Différenciation par rapport au router IIIF générique
|
| 13 |
+
----------------------------------------------------
|
| 14 |
+
Gallica utilise IIIF en interne (manifeste auto-construit depuis
|
| 15 |
+
l'ARK), mais ajoute :
|
| 16 |
+
|
| 17 |
+
- Recherche SRU institutionnelle (catalogue éditorialisé).
|
| 18 |
+
- OCR Gallica comme transcription de référence optionnelle.
|
| 19 |
+
- Métadonnées Dublin Core enrichies dans le corpus retourné.
|
| 20 |
+
|
| 21 |
+
Sécurité
|
| 22 |
+
--------
|
| 23 |
+
- Le helper :func:`picarones.adapters.corpus._http.validate_http_url`
|
| 24 |
+
protège les téléchargements (anti-SSRF statique).
|
| 25 |
+
- Le validator ARK (``GallicaImportRequest._validate_ark``) refuse
|
| 26 |
+
les chemins absolus, les ``..``, les schémas d'URL.
|
| 27 |
+
- ``output_dir`` validé via :func:`validated_user_output_dir`.
|
| 28 |
+
- Pas d'API key — Gallica est ouverte en lecture publique.
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
from __future__ import annotations
|
| 32 |
+
|
| 33 |
+
import logging
|
| 34 |
+
|
| 35 |
+
from fastapi import APIRouter, HTTPException, Query
|
| 36 |
+
|
| 37 |
+
from picarones.interfaces.web._path_helpers import (
|
| 38 |
+
validated_user_output_dir as _validated_output_dir,
|
| 39 |
+
)
|
| 40 |
+
from picarones.interfaces.web.models import GallicaImportRequest
|
| 41 |
+
|
| 42 |
+
logger = logging.getLogger(__name__)
|
| 43 |
+
|
| 44 |
+
router = APIRouter()
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@router.get("/api/gallica/search")
async def api_gallica_search(
    query: str = Query(default="", max_length=256),
    ark: str = Query(default="", max_length=256),
    author: str = Query(default="", max_length=256),
    title: str = Query(default="", max_length=256),
    date_from: int = Query(default=0, ge=0, le=2100),
    date_to: int = Query(default=0, ge=0, le=2100),
    language: str = Query(default="", max_length=8),
    max_results: int = Query(default=20, ge=1, le=50),
) -> dict:
    """Search the Gallica catalogue through ``GallicaClient.search``.

    At least one of ``query``, ``ark``, ``author`` or ``title`` must be
    provided. Empty strings and zero dates are passed to the client as
    ``None``. A free-text ``query`` is used as the title criterion when
    ``title`` itself is empty.

    The client call is synchronous, so it is executed in the threadpool to
    avoid blocking the event loop while the remote service answers.

    Example response::

        {
          "total": 3,
          "records": [
            {
              "ark": "12148/btv1b8453561w",
              "title": "Chroniques de France",
              "author": "Anonyme",
              "date": "1450",
              "type": "manuscrit",
              "url": "https://gallica.bnf.fr/ark:/12148/btv1b8453561w",
              "manifest_url": "https://gallica.bnf.fr/.../manifest.json"
            },
            ...
          ]
        }

    Raises:
        HTTPException: 400 when no search criterion is given; 502 when the
            client raises any exception.
    """
    if not any([query, ark, author, title]):
        raise HTTPException(
            status_code=400,
            detail=(
                "Au moins un critère de recherche requis : query, ark, "
                "author ou title."
            ),
        )

    from fastapi.concurrency import run_in_threadpool

    from picarones.adapters.corpus.gallica import GallicaClient

    client = GallicaClient()
    # Free-text ``query`` falls back into the title criterion; the other
    # criteria go to their dedicated fields.
    title_terms = title or query or None
    try:
        records = await run_in_threadpool(
            client.search,
            ark=ark or None,
            title=title_terms,
            author=author or None,
            date_from=date_from or None,
            date_to=date_to or None,
            language=language or None,
            max_results=max_results,
        )
    except Exception as exc:  # noqa: BLE001
        # Safety net at the HTTP boundary: any backend failure becomes a
        # readable 502 instead of an unhandled server error.
        logger.warning("[gallica] erreur recherche : %s", exc)
        raise HTTPException(
            status_code=502,
            detail=f"Gallica service indisponible : {exc}",
        ) from exc

    return {
        "total": len(records),
        "records": [r.as_dict() for r in records],
    }
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
@router.post("/api/gallica/import")
async def api_gallica_import(req: GallicaImportRequest) -> dict:
    """Import a Gallica document, identified by its ARK, into ``req.output_dir``.

    The output directory is first checked by ``validated_user_output_dir``,
    then the work is delegated to ``GallicaClient.import_document``. That
    call is synchronous, so it runs in the threadpool to keep the event
    loop responsive during the import.

    Returns:
        A summary dict with ``status``, ``n_documents``, ``corpus_dir``,
        ``ark`` and ``include_gallica_ocr``.

    Raises:
        HTTPException: 400 when the client raises ``ValueError``; 502 when
            it raises ``RuntimeError``.
    """
    from fastapi.concurrency import run_in_threadpool

    from picarones.adapters.corpus.gallica import GallicaClient

    output_dir = _validated_output_dir(req.output_dir)
    client = GallicaClient()

    try:
        corpus = await run_in_threadpool(
            client.import_document,
            ark=req.ark,
            pages=req.pages,
            output_dir=str(output_dir),
            include_gallica_ocr=req.include_gallica_ocr,
            max_resolution=req.max_resolution,
            show_progress=False,
        )
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except RuntimeError as exc:
        raise HTTPException(
            status_code=502,
            detail=f"Gallica import échoué : {exc}",
        ) from exc

    return {
        "status": "ok",
        "n_documents": len(corpus.documents),
        "corpus_dir": str(output_dir),
        "ark": req.ark,
        "include_gallica_ocr": req.include_gallica_ocr,
    }
|
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Router des importeurs IIIF.
|
| 2 |
+
|
| 3 |
+
Expose le pattern ``preview → import`` du backend
|
| 4 |
+
``picarones.adapters.corpus.iiif.IIIFImporter`` côté web :
|
| 5 |
+
|
| 6 |
+
- ``GET /api/iiif/preview`` : récupère métadonnées du manifeste
|
| 7 |
+
(label, version, nombre de canvas, échantillon de transcriptions)
|
| 8 |
+
sans télécharger d'image. Utilisé par l'UI pour valider l'URL
|
| 9 |
+
et afficher un aperçu avant import.
|
| 10 |
+
- ``POST /api/iiif/import`` : télécharge les images + GT
|
| 11 |
+
éventuels dans ``output_dir`` (confinée au workspace web).
|
| 12 |
+
|
| 13 |
+
Sécurité
|
| 14 |
+
--------
|
| 15 |
+
- ``IIIFPreviewRequest`` impose ``https://`` (sauf localhost pour
|
| 16 |
+
tests).
|
| 17 |
+
- Le helper :func:`picarones.adapters.corpus._http.validate_http_url`
|
| 18 |
+
ajoute une seconde couche anti-SSRF : refus loopback / RFC 1918 /
|
| 19 |
+
métadonnées cloud.
|
| 20 |
+
- ``output_dir`` validé via :func:`validated_user_output_dir`
|
| 21 |
+
(confinement workspace).
|
| 22 |
+
- Erreurs amont (réseau indisponible, manifeste mal formé, JSON
|
| 23 |
+
invalide) → 502 ``Bad Gateway`` avec message lisible, pas de
|
| 24 |
+
trace serveur.
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
|
| 29 |
+
import logging
|
| 30 |
+
|
| 31 |
+
from fastapi import APIRouter, HTTPException, Query
|
| 32 |
+
from pydantic import ValidationError
|
| 33 |
+
|
| 34 |
+
from picarones.interfaces.web._path_helpers import (
|
| 35 |
+
validated_user_output_dir as _validated_output_dir,
|
| 36 |
+
)
|
| 37 |
+
from picarones.interfaces.web.models import (
|
| 38 |
+
IIIFImportRequest,
|
| 39 |
+
IIIFPreviewRequest,
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
logger = logging.getLogger(__name__)
|
| 43 |
+
|
| 44 |
+
router = APIRouter()
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
@router.get("/api/iiif/preview")
async def api_iiif_preview(
    manifest_url: str = Query(min_length=8, max_length=2048),
) -> dict:
    """Return a preview of an IIIF manifest.

    The URL is validated through ``IIIFPreviewRequest``, then the manifest
    is loaded with ``IIIFImporter.load``. Construction and loading are
    synchronous, so they run in the threadpool to avoid blocking the event
    loop.

    Example response::

        {
          "manifest_url": "https://gallica.bnf.fr/.../manifest.json",
          "label": "Chroniques médiévales — vol. 1",
          "iiif_version": 2,
          "canvas_count": 124,
          "with_transcriptions": 12,
          "sample_labels": ["folio 1r", "folio 1v", "folio 2r"]
        }

    Raises:
        HTTPException: 400 when the URL fails model validation or the
            importer raises ``ValueError``; 502 when the importer raises
            ``RuntimeError``.
    """
    # The model is instantiated by hand (it is not a FastAPI body
    # parameter), so its ``ValidationError`` is mapped to a readable 400.
    try:
        req = IIIFPreviewRequest(manifest_url=manifest_url)
    except ValidationError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc

    from fastapi.concurrency import run_in_threadpool

    from picarones.adapters.corpus.iiif import IIIFImporter

    def _load_manifest():
        """Build the importer and load its manifest (blocking)."""
        loaded = IIIFImporter(req.manifest_url, max_resolution=0)
        loaded.load()
        return loaded

    try:
        importer = await run_in_threadpool(_load_manifest)
    except ValueError as exc:
        # Original note: URL refused by anti-SSRF, malformed JSON,
        # invalid manifest.
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except RuntimeError as exc:
        # Original note: download failed after retries.
        raise HTTPException(
            status_code=502,
            detail=f"IIIF service indisponible : {exc}",
        ) from exc

    parser = importer.parser
    canvases = parser.canvases()
    with_tx = sum(1 for c in canvases if c.transcription)
    # Labels are sampled from the first five canvases only.
    sample_labels = [c.label for c in canvases[:5] if c.label]

    return {
        "manifest_url": req.manifest_url,
        "label": parser.label,
        "iiif_version": parser.version,
        "canvas_count": len(canvases),
        "with_transcriptions": with_tx,
        "sample_labels": sample_labels,
    }
|
| 102 |
+
|
| 103 |
+
|
| 104 |
+
@router.post("/api/iiif/import")
async def api_iiif_import(req: IIIFImportRequest) -> dict:
    """Import an IIIF manifest into ``req.output_dir``.

    The output directory is first checked by ``validated_user_output_dir``.
    Loading the manifest and importing the corpus are synchronous
    operations, so they run in the threadpool to keep the event loop
    responsive.

    Returns:
        A summary dict with ``status``, ``n_documents``, ``corpus_dir``,
        ``iiif_version`` and ``label``.

    Raises:
        HTTPException: 400 when the importer raises ``ValueError``; 502
            when it raises ``RuntimeError``.
    """
    from fastapi.concurrency import run_in_threadpool

    from picarones.adapters.corpus.iiif import IIIFImporter

    output_dir = _validated_output_dir(req.output_dir)

    def _run_import():
        """Load the manifest and import the selected pages (blocking)."""
        worker = IIIFImporter(req.manifest_url, max_resolution=req.max_resolution)
        worker.load()
        imported = worker.import_corpus(
            pages=req.pages,
            output_dir=output_dir,
            show_progress=False,
        )
        return worker, imported

    try:
        importer, corpus = await run_in_threadpool(_run_import)
    except ValueError as exc:
        raise HTTPException(status_code=400, detail=str(exc)) from exc
    except RuntimeError as exc:
        raise HTTPException(
            status_code=502,
            detail=f"IIIF import échoué : {exc}",
        ) from exc

    return {
        "status": "ok",
        "n_documents": len(corpus.documents),
        "corpus_dir": str(output_dir),
        "iiif_version": importer.parser.version,
        "label": importer.parser.label,
    }
|
|
@@ -136,6 +136,30 @@ const T = {
|
|
| 136 |
bench_partial_resume_hint: "Crée un répertoire de checkpoint pour reprendre un run interrompu.",
|
| 137 |
bench_output_json_label: "Exporter aussi en JSON",
|
| 138 |
bench_output_json_hint: "Génère un fichier JSON additionnel à côté du rapport HTML.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
compose_add: "+ Ajouter",
|
| 140 |
compose_empty: "Aucun concurrent ajouté.",
|
| 141 |
mode_text_only: "Post-correction texte",
|
|
@@ -301,6 +325,30 @@ const T = {
|
|
| 301 |
bench_partial_resume_hint: "Creates a checkpoint directory to resume an interrupted run.",
|
| 302 |
bench_output_json_label: "Also export as JSON",
|
| 303 |
bench_output_json_hint: "Generates an additional JSON file alongside the HTML report.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
compose_add: "+ Add",
|
| 305 |
compose_empty: "No competitors added.",
|
| 306 |
mode_text_only: "Text post-correction",
|
|
@@ -1441,10 +1489,15 @@ function switchLibrarySource(source) {
|
|
| 1441 |
document.querySelectorAll("#library-source-switch .source-chip").forEach(b => {
|
| 1442 |
b.classList.toggle("on", b.dataset.source === source);
|
| 1443 |
});
|
| 1444 |
-
const
|
| 1445 |
-
|
| 1446 |
-
|
| 1447 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1448 |
if (source === "huggingface" && !_libraryHfInited) {
|
| 1449 |
_libraryHfInited = true;
|
| 1450 |
searchHuggingFace();
|
|
@@ -1452,6 +1505,181 @@ function switchLibrarySource(source) {
|
|
| 1452 |
}
|
| 1453 |
let _libraryHfInited = false;
|
| 1454 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1455 |
async function loadLibraryLocalCorpora() {
|
| 1456 |
/** GET /api/corpus/uploads → rend la grille des corpus locaux
|
| 1457 |
* (pane "Mes corpus") ET alimente le dropdown du Benchmark. */
|
|
|
|
| 136 |
bench_partial_resume_hint: "Crée un répertoire de checkpoint pour reprendre un run interrompu.",
|
| 137 |
bench_output_json_label: "Exporter aussi en JSON",
|
| 138 |
bench_output_json_hint: "Génère un fichier JSON additionnel à côté du rapport HTML.",
|
| 139 |
+
import_iiif_title: "Import IIIF",
|
| 140 |
+
import_iiif_desc: "Manifestes IIIF Presentation API v2/v3 — bibliothèques numériques.",
|
| 141 |
+
import_iiif_url_label: "URL du manifeste",
|
| 142 |
+
import_iiif_url_hint: "https:// requis",
|
| 143 |
+
import_iiif_preview_btn: "Aperçu",
|
| 144 |
+
import_iiif_pages_label: "Pages à importer",
|
| 145 |
+
import_iiif_max_res_label: "Résolution max (px)",
|
| 146 |
+
import_iiif_import_btn: "Importer le corpus",
|
| 147 |
+
import_gallica_title: "Import Gallica BnF",
|
| 148 |
+
import_gallica_desc: "Catalogue institutionnel BnF — l'OCR Gallica peut servir de transcription de référence.",
|
| 149 |
+
import_gallica_search_label: "Recherche",
|
| 150 |
+
import_gallica_author_label: "Auteur",
|
| 151 |
+
import_gallica_lang_label: "Langue (ISO 639)",
|
| 152 |
+
import_gallica_search_hint: "…ou collez directement un ARK ci-dessous pour importer.",
|
| 153 |
+
import_gallica_direct_label: "Import direct par ARK",
|
| 154 |
+
import_gallica_ark_label: "ARK",
|
| 155 |
+
import_gallica_pages_label: "Pages",
|
| 156 |
+
import_gallica_ocr_label: "Récupérer l'OCR Gallica",
|
| 157 |
+
import_gallica_import_btn: "Importer le document",
|
| 158 |
+
iiif_url_required: "URL du manifeste requise.",
|
| 159 |
+
iiif_loading: "Chargement…",
|
| 160 |
+
gallica_search_required: "Saisissez une recherche ou un auteur.",
|
| 161 |
+
gallica_no_results: "Aucun résultat.",
|
| 162 |
+
gallica_use_ark: "Utiliser cet ARK",
|
| 163 |
compose_add: "+ Ajouter",
|
| 164 |
compose_empty: "Aucun concurrent ajouté.",
|
| 165 |
mode_text_only: "Post-correction texte",
|
|
|
|
| 325 |
bench_partial_resume_hint: "Creates a checkpoint directory to resume an interrupted run.",
|
| 326 |
bench_output_json_label: "Also export as JSON",
|
| 327 |
bench_output_json_hint: "Generates an additional JSON file alongside the HTML report.",
|
| 328 |
+
import_iiif_title: "IIIF import",
|
| 329 |
+
import_iiif_desc: "IIIF Presentation API v2/v3 manifests — digital libraries.",
|
| 330 |
+
import_iiif_url_label: "Manifest URL",
|
| 331 |
+
import_iiif_url_hint: "https:// required",
|
| 332 |
+
import_iiif_preview_btn: "Preview",
|
| 333 |
+
import_iiif_pages_label: "Pages to import",
|
| 334 |
+
import_iiif_max_res_label: "Max resolution (px)",
|
| 335 |
+
import_iiif_import_btn: "Import corpus",
|
| 336 |
+
import_gallica_title: "Gallica BnF import",
|
| 337 |
+
import_gallica_desc: "BnF institutional catalogue — Gallica OCR can be used as a reference transcription.",
|
| 338 |
+
import_gallica_search_label: "Search",
|
| 339 |
+
import_gallica_author_label: "Author",
|
| 340 |
+
import_gallica_lang_label: "Language (ISO 639)",
|
| 341 |
+
import_gallica_search_hint: "…or paste an ARK directly below to import.",
|
| 342 |
+
import_gallica_direct_label: "Direct import by ARK",
|
| 343 |
+
import_gallica_ark_label: "ARK",
|
| 344 |
+
import_gallica_pages_label: "Pages",
|
| 345 |
+
import_gallica_ocr_label: "Fetch Gallica OCR",
|
| 346 |
+
import_gallica_import_btn: "Import document",
|
| 347 |
+
iiif_url_required: "Manifest URL required.",
|
| 348 |
+
iiif_loading: "Loading…",
|
| 349 |
+
gallica_search_required: "Enter a search term or author.",
|
| 350 |
+
gallica_no_results: "No results.",
|
| 351 |
+
gallica_use_ark: "Use this ARK",
|
| 352 |
compose_add: "+ Add",
|
| 353 |
compose_empty: "No competitors added.",
|
| 354 |
mode_text_only: "Text post-correction",
|
|
|
|
| 1489 |
document.querySelectorAll("#library-source-switch .source-chip").forEach(b => {
|
| 1490 |
b.classList.toggle("on", b.dataset.source === source);
|
| 1491 |
});
|
| 1492 |
+
const panels = {
|
| 1493 |
+
"htr-united": document.getElementById("library-source-htr-united"),
|
| 1494 |
+
"huggingface": document.getElementById("library-source-huggingface"),
|
| 1495 |
+
"iiif": document.getElementById("library-source-iiif"),
|
| 1496 |
+
"gallica": document.getElementById("library-source-gallica"),
|
| 1497 |
+
};
|
| 1498 |
+
for (const [key, el] of Object.entries(panels)) {
|
| 1499 |
+
if (el) el.style.display = source === key ? "block" : "none";
|
| 1500 |
+
}
|
| 1501 |
if (source === "huggingface" && !_libraryHfInited) {
|
| 1502 |
_libraryHfInited = true;
|
| 1503 |
searchHuggingFace();
|
|
|
|
| 1505 |
}
|
| 1506 |
let _libraryHfInited = false;
|
| 1507 |
|
| 1508 |
+
// ─── IIIF preview + import ──────────────────────────────────────────────
|
| 1509 |
+
|
| 1510 |
+
async function previewIIIF() {
  /** Preview a IIIF manifest through GET /api/iiif/preview before importing.
   *
   *  Reads the manifest URL from #iiif-manifest-url, renders the outcome in
   *  #iiif-preview-result and reveals #iiif-import-section only after a
   *  successful preview. Every value taken from the response is HTML-escaped
   *  before it is written to innerHTML. */
  const url = (document.getElementById("iiif-manifest-url").value || "").trim();
  const previewEl = document.getElementById("iiif-preview-result");
  const importSection = document.getElementById("iiif-import-section");
  const showError = (message) => {
    previewEl.innerHTML = `<div class="empty" style="color:var(--err);">⚠ ${_escapeHtml(message)}</div>`;
  };

  // The result box is always visible; the import form stays hidden until
  // a preview has succeeded.
  previewEl.style.display = "block";
  importSection.style.display = "none";
  if (!url) {
    previewEl.innerHTML = `<div class="empty">${t("iiif_url_required")}</div>`;
    return;
  }
  previewEl.innerHTML = `<div class="foot">${t("iiif_loading")}</div>`;
  try {
    const params = new URLSearchParams({manifest_url: url});
    const r = await fetch(`/api/iiif/preview?${params}`);
    if (!r.ok) {
      // The error body may not be JSON, and `detail` may be a list of
      // validation items rather than a plain string.
      const detail = await r.json().then(body => (body ? body.detail : null), () => null);
      const message = typeof detail === "string"
        ? detail
        : Array.isArray(detail)
          ? detail.map(item => (item && item.msg) || JSON.stringify(item)).join(" · ")
          : "";
      showError(message || `${lang === "fr" ? "Erreur" : "Error"} (HTTP ${r.status})`);
      return;
    }
    const d = await r.json();
    const fr = lang === "fr";
    // Numbers are stringified first so the escaper always receives a string.
    const esc = (value) => _escapeHtml(String(value));
    const labels = Array.isArray(d.sample_labels) ? d.sample_labels : [];
    const gtPart = d.with_transcriptions > 0
      ? ` · ${esc(d.with_transcriptions)} ${fr ? "avec GT" : "with GT"}`
      : "";
    const samplePart = labels.length > 0
      ? `<div class="foot" style="margin-top:6px;">${fr ? "Aperçu :" : "Preview:"} ${labels.map(label => esc(label)).join(" · ")}</div>`
      : "";
    previewEl.innerHTML = `
      <div class="surface-flat" style="padding:14px;">
        <div style="font-weight:500; margin-bottom:6px;">${_escapeHtml(d.label || (fr ? "(sans titre)" : "(untitled)"))}</div>
        <div class="foot">
          IIIF v${esc(d.iiif_version)} · ${esc(d.canvas_count)} canvas${gtPart}
        </div>
        ${samplePart}
      </div>`;
    importSection.style.display = "block";
  } catch (e) {
    // Network failure or malformed success body.
    showError(e.message);
  }
}
|
| 1550 |
+
|
| 1551 |
+
async function importIIIF() {
  /** Import a IIIF manifest through POST /api/iiif/import, then refresh the
   *  list of local corpora.
   *
   *  Reads the manifest URL, page selector and max resolution from the form
   *  and reports progress and outcome in #iiif-import-status. */
  const url = (document.getElementById("iiif-manifest-url").value || "").trim();
  const pages = (document.getElementById("iiif-pages").value || "all").trim();
  // Empty or non-numeric input falls back to 0.
  const maxRes = parseInt(document.getElementById("iiif-max-resolution").value, 10) || 0;
  const statusEl = document.getElementById("iiif-import-status");
  const setStatus = (text, color) => {
    statusEl.textContent = text;
    statusEl.style.color = color;
  };

  if (!url) {
    // Tell the user why nothing happens instead of returning silently.
    setStatus(t("iiif_url_required"), "var(--err)");
    return;
  }
  // Reset the colour so a previous failure does not tint the progress text.
  setStatus(lang === "fr" ? "Import en cours…" : "Importing…", "");
  try {
    const r = await fetch("/api/iiif/import", {
      method: "POST",
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify({
        manifest_url: url,
        output_dir: "./corpus/",
        pages: pages,
        max_resolution: maxRes,
      }),
    });
    if (!r.ok) {
      // The error body may not be JSON, and `detail` may be a list of
      // validation items rather than a plain string.
      const detail = await r.json().then(body => (body ? body.detail : null), () => null);
      const message = typeof detail === "string"
        ? detail
        : Array.isArray(detail)
          ? detail.map(item => (item && item.msg) || JSON.stringify(item)).join(" · ")
          : "";
      setStatus(`⚠ ${message || `${lang === "fr" ? "Erreur" : "Error"} (HTTP ${r.status})`}`, "var(--err)");
      return;
    }
    const d = await r.json();
    setStatus(
      lang === "fr"
        ? `✓ ${d.n_documents} documents importés.`
        : `✓ ${d.n_documents} documents imported.`,
      "var(--fern-deep)",
    );
    loadLibraryLocalCorpora(); // refresh
  } catch (e) {
    // Network failure or malformed success body.
    setStatus(`⚠ ${e.message}`, "var(--err)");
  }
}
|
| 1587 |
+
|
| 1588 |
+
// ─── Gallica search + import ────────────────────────────────────────────
|
| 1589 |
+
|
| 1590 |
+
async function searchGallica() {
  /** Query GET /api/gallica/search and render one card per returned record.
   *
   *  At least a free-text query or an author is required; the language is an
   *  optional extra filter. Each card carries a button that copies the
   *  record's ARK into the #gallica-ark input of the direct-import form. */
  const query = (document.getElementById("gallica-search").value || "").trim();
  const author = (document.getElementById("gallica-author").value || "").trim();
  const language = (document.getElementById("gallica-language").value || "").trim();
  const resultsEl = document.getElementById("gallica-results");
  const showError = (message) => {
    resultsEl.innerHTML = `<div class="empty" style="color:var(--err);">⚠ ${_escapeHtml(message)}</div>`;
  };

  if (!query && !author) {
    resultsEl.innerHTML = `<div class="empty">${t("gallica_search_required")}</div>`;
    return;
  }
  resultsEl.innerHTML = `<div class="foot">${t("iiif_loading")}</div>`;
  try {
    const params = new URLSearchParams();
    if (query) params.set("query", query);
    if (author) params.set("author", author);
    if (language) params.set("language", language);
    params.set("max_results", "20");
    const r = await fetch(`/api/gallica/search?${params}`);
    if (!r.ok) {
      // The error body may not be JSON, and `detail` may be a list of
      // validation items rather than a plain string.
      const detail = await r.json().then(body => (body ? body.detail : null), () => null);
      const message = typeof detail === "string"
        ? detail
        : Array.isArray(detail)
          ? detail.map(item => (item && item.msg) || JSON.stringify(item)).join(" · ")
          : "";
      showError(message || `${lang === "fr" ? "Erreur" : "Error"} (HTTP ${r.status})`);
      return;
    }
    const d = await r.json();
    const records = Array.isArray(d.records) ? d.records : [];
    if (d.total === 0 || records.length === 0) {
      resultsEl.innerHTML = `<div class="empty">${t("gallica_no_results")}</div>`;
      return;
    }
    const untitled = lang === "fr" ? "(sans titre)" : "(untitled)";
    const useArkLabel = _escapeHtml(t("gallica_use_ark"));
    // The ARK travels in a data attribute and is read back via `dataset`.
    // Splicing it into the inline handler's source would let a quote in the
    // value terminate the JS string literal, because attribute entities are
    // decoded before the handler code is parsed.
    resultsEl.innerHTML = records.map(rec => `
      <div class="ds-card">
        <div class="ds-name">${_escapeHtml(rec.title || untitled)}</div>
        <div class="ds-meta">
          ${_escapeHtml(rec.author || "")} ${rec.date ? "· " + _escapeHtml(rec.date) : ""}
          ${rec.type ? "· " + _escapeHtml(rec.type) : ""}
        </div>
        <div class="foot" style="margin-top:6px; font-family:var(--mono); font-size:11px;">
          ARK : ${_escapeHtml(rec.ark || "")}
        </div>
        <div style="margin-top:8px;">
          <button class="btn btn-sm" type="button"
                  data-ark="${_escapeAttr(rec.ark || "")}"
                  onclick="document.getElementById('gallica-ark').value = this.dataset.ark">
            <span data-i18n="gallica_use_ark">${useArkLabel}</span>
          </button>
        </div>
      </div>
    `).join("");
  } catch (e) {
    // Network failure or malformed success body.
    showError(e.message);
  }
}
|
| 1640 |
+
|
| 1641 |
+
async function importGallica() {
  /** Import a Gallica document by ARK through POST /api/gallica/import.
   *
   *  Reads the ARK, the page selector and the "include OCR" checkbox from
   *  the form, reports progress and outcome in #gallica-import-status, and
   *  reloads the local corpus list after a successful import. */
  const byId = (id) => document.getElementById(id);
  const ark = (byId("gallica-ark").value || "").trim();
  const pages = (byId("gallica-pages").value || "all").trim();
  const includeOcr = Boolean(byId("gallica-include-ocr").checked);
  const statusEl = byId("gallica-import-status");

  // Small helpers: write the status line, and pick the string matching the
  // UI language at the moment of the call.
  const report = (text, color) => {
    statusEl.textContent = text;
    statusEl.style.color = color;
  };
  const localized = (frText, enText) => (lang === "fr" ? frText : enText);

  if (ark === "") {
    report(localized("ARK requis.", "ARK required."), "var(--err)");
    return;
  }
  report(localized("Import en cours…", "Importing…"), "");

  try {
    const payload = {
      ark: ark,
      output_dir: "./corpus/",
      pages: pages,
      include_gallica_ocr: includeOcr,
    };
    const response = await fetch("/api/gallica/import", {
      method: "POST",
      headers: {"Content-Type": "application/json"},
      body: JSON.stringify(payload),
    });
    // Both the success and the error path start by decoding the JSON body.
    const data = await response.json();
    if (response.ok) {
      report(
        localized(
          `✓ ${data.n_documents} documents importés.`,
          `✓ ${data.n_documents} documents imported.`,
        ),
        "var(--fern-deep)",
      );
      loadLibraryLocalCorpora(); // refresh
    } else {
      report(`⚠ ${data.detail || "Erreur"}`, "var(--err)");
    }
  } catch (failure) {
    report(`⚠ ${failure.message}`, "var(--err)");
  }
}
|
| 1682 |
+
|
| 1683 |
async function loadLibraryLocalCorpora() {
|
| 1684 |
/** GET /api/corpus/uploads → rend la grille des corpus locaux
|
| 1685 |
* (pane "Mes corpus") ET alimente le dropdown du Benchmark. */
|
|
@@ -97,6 +97,16 @@
|
|
| 97 |
<span>HuggingFace</span>
|
| 98 |
<span class="src-count">datasets</span>
|
| 99 |
</button>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
</div>
|
| 101 |
</div>
|
| 102 |
</div>
|
|
@@ -175,6 +185,127 @@
|
|
| 175 |
</div>
|
| 176 |
</div>
|
| 177 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 178 |
</div>
|
| 179 |
|
| 180 |
</section>
|
|
|
|
| 97 |
<span>HuggingFace</span>
|
| 98 |
<span class="src-count">datasets</span>
|
| 99 |
</button>
|
| 100 |
+
<button class="source-chip" type="button" data-source="iiif" onclick="switchLibrarySource('iiif')">
|
| 101 |
+
<span class="src-glyph">IIIF</span>
|
| 102 |
+
<span>IIIF</span>
|
| 103 |
+
<span class="src-count">manifeste</span>
|
| 104 |
+
</button>
|
| 105 |
+
<button class="source-chip" type="button" data-source="gallica" onclick="switchLibrarySource('gallica')">
|
| 106 |
+
<span class="src-glyph">BnF</span>
|
| 107 |
+
<span>Gallica</span>
|
| 108 |
+
<span class="src-count">ARK</span>
|
| 109 |
+
</button>
|
| 110 |
</div>
|
| 111 |
</div>
|
| 112 |
</div>
|
|
|
|
| 185 |
</div>
|
| 186 |
</div>
|
| 187 |
|
| 188 |
+
{# IIIF import — manifeste URL #}
|
| 189 |
+
<div class="sec" id="library-source-iiif" data-tone="butter" style="display:none;">
|
| 190 |
+
<div class="sec-head">
|
| 191 |
+
<span class="sec-num">02</span>
|
| 192 |
+
<div style="flex:1; display:flex; flex-direction:column;">
|
| 193 |
+
<div class="sec-title" data-i18n="import_iiif_title">Import IIIF</div>
|
| 194 |
+
<div class="sec-sub" data-i18n="import_iiif_desc">Manifestes IIIF Presentation API v2/v3 — bibliothèques numériques (BnF, Bodleian, Vatican, e-codices…).</div>
|
| 195 |
+
</div>
|
| 196 |
+
</div>
|
| 197 |
+
<div class="sec-body">
|
| 198 |
+
<div class="grid-2" style="gap:14px; margin-bottom:14px; align-items:flex-end;">
|
| 199 |
+
<div class="field" style="grid-column: span 2;">
|
| 200 |
+
<div class="field-label">
|
| 201 |
+
<span data-i18n="import_iiif_url_label">URL du manifeste</span>
|
| 202 |
+
<span class="opt" data-i18n="import_iiif_url_hint">https:// requis</span>
|
| 203 |
+
</div>
|
| 204 |
+
<input type="url" id="iiif-manifest-url"
|
| 205 |
+
class="mono-input"
|
| 206 |
+
placeholder="https://gallica.bnf.fr/iiif/ark:/12148/btv1b8453561w/manifest.json" />
|
| 207 |
+
</div>
|
| 208 |
+
</div>
|
| 209 |
+
<div class="row" style="margin-bottom:14px;">
|
| 210 |
+
<button class="btn btn-primary" type="button" onclick="previewIIIF()">
|
| 211 |
+
<span data-i18n="import_iiif_preview_btn">Aperçu</span>
|
| 212 |
+
</button>
|
| 213 |
+
</div>
|
| 214 |
+
<div id="iiif-preview-result" style="display:none; margin-bottom:14px;"></div>
|
| 215 |
+
<div id="iiif-import-section" style="display:none;">
|
| 216 |
+
<div class="grid-2" style="gap:14px; margin-bottom:14px; align-items:flex-end;">
|
| 217 |
+
<div class="field">
|
| 218 |
+
<div class="field-label">
|
| 219 |
+
<span data-i18n="import_iiif_pages_label">Pages à importer</span>
|
| 220 |
+
<span class="opt">all, 1-10, 1,3,5…</span>
|
| 221 |
+
</div>
|
| 222 |
+
<input type="text" id="iiif-pages" class="mono-input" value="all" />
|
| 223 |
+
</div>
|
| 224 |
+
<div class="field">
|
| 225 |
+
<div class="field-label">
|
| 226 |
+
<span data-i18n="import_iiif_max_res_label">Résolution max (px)</span>
|
| 227 |
+
<span class="opt">0 = pleine</span>
|
| 228 |
+
</div>
|
| 229 |
+
<input type="number" id="iiif-max-resolution" min="0" max="8192" step="256" value="0" />
|
| 230 |
+
</div>
|
| 231 |
+
</div>
|
| 232 |
+
<div class="row">
|
| 233 |
+
<button class="btn btn-primary" type="button" onclick="importIIIF()">
|
| 234 |
+
<span data-i18n="import_iiif_import_btn">Importer le corpus</span>
|
| 235 |
+
</button>
|
| 236 |
+
<span id="iiif-import-status" class="foot"></span>
|
| 237 |
+
</div>
|
| 238 |
+
</div>
|
| 239 |
+
</div>
|
| 240 |
+
</div>
|
| 241 |
+
|
| 242 |
+
{# Gallica BnF — search SRU + import par ARK #}
|
| 243 |
+
<div class="sec" id="library-source-gallica" data-tone="fern" style="display:none;">
|
| 244 |
+
<div class="sec-head">
|
| 245 |
+
<span class="sec-num">02</span>
|
| 246 |
+
<div style="flex:1; display:flex; flex-direction:column;">
|
| 247 |
+
<div class="sec-title" data-i18n="import_gallica_title">Import Gallica BnF</div>
|
| 248 |
+
<div class="sec-sub" data-i18n="import_gallica_desc">Catalogue institutionnel BnF — l'OCR Gallica peut être récupéré comme transcription de référence.</div>
|
| 249 |
+
</div>
|
| 250 |
+
</div>
|
| 251 |
+
<div class="sec-body">
|
| 252 |
+
<div class="grid-4" style="gap:14px; margin-bottom:14px; align-items:flex-end;">
|
| 253 |
+
<div class="field" style="grid-column: span 2;">
|
| 254 |
+
<div class="field-label">
|
| 255 |
+
<span data-i18n="import_gallica_search_label">Recherche</span>
|
| 256 |
+
<span class="opt">titre ou mots-clés</span>
|
| 257 |
+
</div>
|
| 258 |
+
<input type="search" id="gallica-search" placeholder="Chroniques médiévales, Hugo, Diderot…" />
|
| 259 |
+
</div>
|
| 260 |
+
<div class="field">
|
| 261 |
+
<div class="field-label"><span data-i18n="import_gallica_author_label">Auteur</span></div>
|
| 262 |
+
<input type="text" id="gallica-author" placeholder="optionnel" />
|
| 263 |
+
</div>
|
| 264 |
+
<div class="field">
|
| 265 |
+
<div class="field-label"><span data-i18n="import_gallica_lang_label">Langue (ISO 639)</span></div>
|
| 266 |
+
<input type="text" id="gallica-language" placeholder="fre, lat, eng…" maxlength="3" />
|
| 267 |
+
</div>
|
| 268 |
+
</div>
|
| 269 |
+
<div class="row" style="margin-bottom:14px;">
|
| 270 |
+
<button class="btn btn-primary" type="button" onclick="searchGallica()">
|
| 271 |
+
<span data-i18n="search">Rechercher</span>
|
| 272 |
+
</button>
|
| 273 |
+
<span class="foot" data-i18n="import_gallica_search_hint">…ou collez directement un ARK ci-dessous pour importer.</span>
|
| 274 |
+
</div>
|
| 275 |
+
<div id="gallica-results" class="grid-2" style="margin-bottom:14px;"></div>
|
| 276 |
+
|
| 277 |
+
<hr class="hr" />
|
| 278 |
+
|
| 279 |
+
{# Import direct par ARK #}
|
| 280 |
+
<div class="label label-strong" style="margin-bottom:10px;" data-i18n="import_gallica_direct_label">Import direct par ARK</div>
|
| 281 |
+
<div class="grid-4" style="gap:14px; margin-bottom:14px; align-items:flex-end;">
|
| 282 |
+
<div class="field" style="grid-column: span 2;">
|
| 283 |
+
<div class="field-label"><span data-i18n="import_gallica_ark_label">ARK</span></div>
|
| 284 |
+
<input type="text" id="gallica-ark" class="mono-input" placeholder="12148/btv1b8453561w" />
|
| 285 |
+
</div>
|
| 286 |
+
<div class="field">
|
| 287 |
+
<div class="field-label">
|
| 288 |
+
<span data-i18n="import_gallica_pages_label">Pages</span>
|
| 289 |
+
<span class="opt">all, 1-10…</span>
|
| 290 |
+
</div>
|
| 291 |
+
<input type="text" id="gallica-pages" class="mono-input" value="all" />
|
| 292 |
+
</div>
|
| 293 |
+
<div class="field">
|
| 294 |
+
<label class="row" style="gap:6px; cursor:pointer;">
|
| 295 |
+
<input type="checkbox" id="gallica-include-ocr" checked />
|
| 296 |
+
<span data-i18n="import_gallica_ocr_label">Récupérer l'OCR Gallica</span>
|
| 297 |
+
</label>
|
| 298 |
+
</div>
|
| 299 |
+
</div>
|
| 300 |
+
<div class="row">
|
| 301 |
+
<button class="btn btn-primary" type="button" onclick="importGallica()">
|
| 302 |
+
<span data-i18n="import_gallica_import_btn">Importer le document</span>
|
| 303 |
+
</button>
|
| 304 |
+
<span id="gallica-import-status" class="foot"></span>
|
| 305 |
+
</div>
|
| 306 |
+
</div>
|
| 307 |
+
</div>
|
| 308 |
+
|
| 309 |
</div>
|
| 310 |
|
| 311 |
</section>
|
|
@@ -183,11 +183,14 @@ def test_readme_under_500_lines() -> None:
|
|
| 183 |
- 510 : +2 lignes pour kraken/calamari dans la matrice produit.
|
| 184 |
- 515 : +5 lignes pour la mention statut/policy SemVer pré-1.0
|
| 185 |
(sprint S0 du repositionnement).
|
|
|
|
|
|
|
|
|
|
| 186 |
"""
|
| 187 |
text = _read_readme()
|
| 188 |
n_lines = len(text.splitlines())
|
| 189 |
-
assert n_lines <
|
| 190 |
-
f"README à {n_lines} lignes — au-dessus du seuil
|
| 191 |
"Déléguer le détail vers docs/."
|
| 192 |
)
|
| 193 |
|
|
|
|
| 183 |
- 510 : +2 lignes pour kraken/calamari dans la matrice produit.
|
| 184 |
- 515 : +5 lignes pour la mention statut/policy SemVer pré-1.0
|
| 185 |
(sprint S0 du repositionnement).
|
| 186 |
+
- 525 : +4 lignes pour les routes IIIF + Gallica (sprint S2 —
|
| 187 |
+
parité importeurs corpus web), +marge pour les sprints
|
| 188 |
+
d'importeurs restants (S3 eScriptorium).
|
| 189 |
"""
|
| 190 |
text = _read_readme()
|
| 191 |
n_lines = len(text.splitlines())
|
| 192 |
+
assert n_lines < 525, (
|
| 193 |
+
f"README à {n_lines} lignes — au-dessus du seuil 525. "
|
| 194 |
"Déléguer le détail vers docs/."
|
| 195 |
)
|
| 196 |
|
|
@@ -0,0 +1,219 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests du router Gallica (S2 — parité corpus web).
|
| 2 |
+
|
| 3 |
+
Couverts ici :
|
| 4 |
+
|
| 5 |
+
1. Validation des inputs (au moins un critère, ARK bien formé).
|
| 6 |
+
2. Recherche SRU avec mock XML.
|
| 7 |
+
3. Import par ARK avec mock manifeste IIIF Gallica.
|
| 8 |
+
4. Sécurité : ARK refuse les schémas d'URL et les ``..``.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
from unittest.mock import patch
|
| 14 |
+
|
| 15 |
+
import pytest
|
| 16 |
+
from fastapi.testclient import TestClient
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
@pytest.fixture
def client() -> TestClient:
    """Return a ``TestClient`` wrapping the Picarones web application.

    The application is imported inside the fixture body, so importing this
    test module does not import the web app.
    """
    from picarones.interfaces.web.app import app as web_app

    return TestClient(web_app)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# Réponse SRU XML minimale pour mock
|
| 26 |
+
_MOCK_SRU_RESPONSE = b"""<?xml version="1.0" encoding="UTF-8"?>
|
| 27 |
+
<srw:searchRetrieveResponse xmlns:srw="http://www.loc.gov/zing/srw/">
|
| 28 |
+
<srw:numberOfRecords>1</srw:numberOfRecords>
|
| 29 |
+
<srw:records>
|
| 30 |
+
<srw:record>
|
| 31 |
+
<srw:recordSchema>unimarcXchange</srw:recordSchema>
|
| 32 |
+
<srw:recordData>
|
| 33 |
+
<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
|
| 34 |
+
xmlns:dc="http://purl.org/dc/elements/1.1/">
|
| 35 |
+
<dc:title>Chroniques de France</dc:title>
|
| 36 |
+
<dc:creator>Anonyme</dc:creator>
|
| 37 |
+
<dc:date>1450</dc:date>
|
| 38 |
+
<dc:identifier>ark:/12148/btv1b8453561w</dc:identifier>
|
| 39 |
+
<dc:type>manuscrit</dc:type>
|
| 40 |
+
<dc:language>fre</dc:language>
|
| 41 |
+
</oai_dc:dc>
|
| 42 |
+
</srw:recordData>
|
| 43 |
+
</srw:record>
|
| 44 |
+
</srw:records>
|
| 45 |
+
</srw:searchRetrieveResponse>
|
| 46 |
+
"""
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 50 |
+
# 1. Validation de l'ARK (Pydantic) — POST /import
|
| 51 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
class TestGallicaImportARKValidation:
    """Validation of the ``ark`` field of ``GallicaImportRequest``.

    The tests pin that full URLs, ``..`` segments and single-segment values
    are rejected, and that an optional ``ark:/`` prefix is stripped.
    """

    def test_valid_ark_accepted(self) -> None:
        """The standard BnF form ``<naan>/<id>`` is kept as-is."""
        from picarones.interfaces.web.models import GallicaImportRequest

        req = GallicaImportRequest(ark="12148/btv1b8453561w")
        assert req.ark == "12148/btv1b8453561w"

    def test_ark_with_prefix_normalized(self) -> None:
        """``ark:/12148/...`` is normalised to ``12148/...``."""
        from picarones.interfaces.web.models import GallicaImportRequest

        req = GallicaImportRequest(ark="ark:/12148/btv1b8453561w")
        assert req.ark == "12148/btv1b8453561w"

    def test_ark_url_rejected(self) -> None:
        """A full URL is not an ARK."""
        from picarones.interfaces.web.models import GallicaImportRequest
        from pydantic import ValidationError

        with pytest.raises(ValidationError):
            GallicaImportRequest(ark="https://example.org/foo")

    def test_ark_with_dotdot_rejected(self) -> None:
        """A value containing ``..`` segments is refused."""
        from picarones.interfaces.web.models import GallicaImportRequest
        from pydantic import ValidationError

        with pytest.raises(ValidationError):
            GallicaImportRequest(ark="../etc/passwd")

    def test_ark_single_segment_rejected(self) -> None:
        """A NAAN without an identifier part is refused."""
        from picarones.interfaces.web.models import GallicaImportRequest
        from pydantic import ValidationError

        with pytest.raises(ValidationError):
            GallicaImportRequest(ark="12148")  # missing identifier

    def test_ark_endpoint_rejects_invalid_ark(self, client) -> None:
        """The endpoint answers 422 to a malformed ARK."""
        r = client.post("/api/gallica/import", json={
            "ark": "https://malicious.example/foo",
            "output_dir": "./corpus/",
        })
        assert r.status_code == 422
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 104 |
+
# 2. Recherche SRU
|
| 105 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
class TestGallicaSearch:
    """``GET /api/gallica/search`` with the SRU HTTP fetch replaced by a mock."""

    # Dotted path given to ``unittest.mock.patch``: the ``GallicaClient``
    # method these tests replace to avoid any real network access.
    _FETCH_TARGET = "picarones.adapters.corpus.gallica.GallicaClient._fetch_url"

    def test_search_requires_at_least_one_criterion(self, client) -> None:
        """No parameter at all yields an explicit 400."""
        r = client.get("/api/gallica/search")
        assert r.status_code == 400
        assert "critère" in r.json()["detail"].lower()

    def test_search_with_title_returns_records(self, client) -> None:
        """The mocked SRU fetch returns one known record."""
        with patch(self._FETCH_TARGET, return_value=_MOCK_SRU_RESPONSE):
            r = client.get("/api/gallica/search", params={
                "title": "Chroniques",
                "max_results": 5,
            })

        # ``r.text`` cannot raise while the failure message is being built.
        assert r.status_code == 200, r.text
        body = r.json()
        assert body["total"] >= 1
        record = body["records"][0]
        assert "Chroniques" in record["title"]
        assert record["language"] == "fre"

    def test_search_with_query_falls_back_to_title(self, client) -> None:
        """A free-text ``query`` is accepted as a search criterion."""
        with patch(self._FETCH_TARGET, return_value=_MOCK_SRU_RESPONSE):
            r = client.get("/api/gallica/search", params={
                "query": "chroniques médiévales",
                "max_results": 5,
            })

        assert r.status_code == 200
        assert r.json()["total"] >= 1

    def test_search_handles_network_error(self, client) -> None:
        """A failing SRU fetch is reported as an empty result, not an error."""
        with patch(self._FETCH_TARGET, side_effect=RuntimeError("SRU timeout")):
            r = client.get("/api/gallica/search", params={
                "title": "test",
            })

        # Expected behaviour: the route still answers 200, with total == 0.
        assert r.status_code == 200
        assert r.json()["total"] == 0
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 175 |
+
# 3. Import : validation du payload
|
| 176 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
class TestGallicaImportValidation:
    """``POST /api/gallica/import`` must refuse malformed payloads."""

    def test_missing_ark_rejected(self, client) -> None:
        """A payload without ``ark`` is a validation error (422)."""
        payload = {"output_dir": "./corpus/"}
        response = client.post("/api/gallica/import", json=payload)
        assert response.status_code == 422

    def test_path_traversal_in_output_dir_rejected(self, client) -> None:
        """An ``output_dir`` climbing out with ``..`` is refused (400 or 422)."""
        payload = {
            "ark": "12148/btv1b8453561w",
            "output_dir": "../../etc/",
        }
        response = client.post("/api/gallica/import", json=payload)
        assert response.status_code in (400, 422)

    def test_pages_selector_bounded(self, client) -> None:
        """An oversized ``pages`` selector is a validation error (422)."""
        oversized_selector = ",".join(map(str, range(100_000)))
        payload = {
            "ark": "12148/btv1b8453561w",
            "pages": oversized_selector,
        }
        response = client.post("/api/gallica/import", json=payload)
        assert response.status_code == 422

    def test_include_gallica_ocr_default_true(self) -> None:
        """The Gallica OCR is fetched by default."""
        from picarones.interfaces.web.models import GallicaImportRequest

        request_model = GallicaImportRequest(ark="12148/btv1b8453561w")
        assert request_model.include_gallica_ocr is True
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 212 |
+
# 4. Routes enregistrées
|
| 213 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 214 |
+
|
| 215 |
+
|
| 216 |
+
def test_gallica_routes_registered(client) -> None:
    """Smoke test: both Gallica endpoints are registered on the app."""
    # A set is enough: only membership is checked, order is irrelevant.
    registered_paths = {route.path for route in client.app.routes}
    assert "/api/gallica/search" in registered_paths
    assert "/api/gallica/import" in registered_paths
|
|
@@ -0,0 +1,218 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for the IIIF router (S2 — web corpus parity).

Covered in this module:

1. Input validation on the preview route (unsupported scheme, plain
   ``http://``, URL too short).
2. SSRF guard on the preview route (loopback address, cloud metadata host).
3. Preview pre-fetch with a mocked IIIF v3 manifest.
4. Import payload validation (rejected before any real fetch is needed).
5. Upstream error handling on preview (network failure, invalid JSON).

The tests never touch the network: they patch the ``_download_url`` alias
in ``picarones.adapters.corpus.iiif`` (the name under which the shared HTTP
helper is imported by that module, according to the note on
``TestIIIFPreviewHappyPath``).
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import json
|
| 18 |
+
from unittest.mock import patch
|
| 19 |
+
|
| 20 |
+
import pytest
|
| 21 |
+
from fastapi.testclient import TestClient
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
@pytest.fixture
def client():
    """Provide a ``TestClient`` wrapping the Picarones web application."""
    # Imported inside the fixture so the application module is loaded only
    # when a test actually requests the client.
    from picarones.interfaces.web.app import app

    return TestClient(app)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def _mock_canvas(index: int) -> dict:
    """Build one canvas of the mock manifest, numbered ``index``.

    The canvas carries a single painting annotation whose body is a JPEG
    image exposing one ``ImageService3`` service entry.
    """
    canvas_id = f"https://example.org/canvas/{index}"
    return {
        "id": canvas_id,
        "type": "Canvas",
        "label": {"fr": [f"folio {index}r"]},
        "height": 1000,
        "width": 750,
        "items": [
            {
                "id": f"https://example.org/annpage/{index}",
                "type": "AnnotationPage",
                "items": [
                    {
                        "id": f"https://example.org/ann/{index}",
                        "type": "Annotation",
                        "motivation": "painting",
                        "body": {
                            "id": f"https://example.org/img/{index}.jpg",
                            "type": "Image",
                            "format": "image/jpeg",
                            "service": [
                                {
                                    "id": f"https://example.org/iiif/img{index}",
                                    "type": "ImageService3",
                                    "profile": "level1",
                                },
                            ],
                        },
                        "target": canvas_id,
                    },
                ],
            },
        ],
    }


# Minimal IIIF Presentation v3 manifest used by the tests: four canvases
# numbered 1 to 4, each with one image.
_MOCK_MANIFEST_V3 = {
    "@context": "http://iiif.io/api/presentation/3/context.json",
    "id": "https://example.org/manifest.json",
    "type": "Manifest",
    "label": {"fr": ["Chroniques médiévales — vol. 1"]},
    "items": [_mock_canvas(i) for i in range(1, 5)],
}
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 76 |
+
# 1. Validation d'entrée (anti-SSRF + schéma)
|
| 77 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
class TestIIIFPreviewValidation:
    """Invalid manifest URLs are refused by ``GET /api/iiif/preview``.

    No download helper is mocked here: every request is expected to be
    rejected before a manifest would be fetched.
    """

    PREVIEW_ROUTE = "/api/iiif/preview"

    def test_ftp_scheme_rejected(self, client) -> None:
        """An ``ftp://`` manifest URL is answered with 400."""
        response = client.get(
            self.PREVIEW_ROUTE,
            params={"manifest_url": "ftp://example.org/manifest.json"},
        )
        assert response.status_code == 400

    def test_http_non_localhost_rejected(self, client) -> None:
        """Plain ``http://`` on a non-localhost host is answered with 400."""
        response = client.get(
            self.PREVIEW_ROUTE,
            params={"manifest_url": "http://example.org/manifest.json"},
        )
        assert response.status_code == 400

    def test_url_too_short_rejected(self, client) -> None:
        """A one-character ``manifest_url`` is answered with 422."""
        response = client.get(
            self.PREVIEW_ROUTE,
            params={"manifest_url": "h"},
        )
        assert response.status_code == 422

    def test_loopback_ip_rejected_by_ssrf_guard(self, client) -> None:
        """An HTTPS URL pointing at ``127.0.0.1`` is answered with 400.

        Per the original author's note, the URL passes model validation
        (it is valid HTTPS) and is rejected by the anti-SSRF helper at
        fetch time. The error detail must mention "loopback" or "interne".
        """
        response = client.get(
            self.PREVIEW_ROUTE,
            params={"manifest_url": "https://127.0.0.1/manifest.json"},
        )
        assert response.status_code == 400
        detail = response.json()["detail"].lower()
        assert "loopback" in detail or "interne" in detail

    def test_metadata_cloud_host_rejected(self, client) -> None:
        """A cloud metadata hostname is answered with 400."""
        response = client.get(
            self.PREVIEW_ROUTE,
            params={
                "manifest_url": "https://metadata.google.internal/manifest.json",
            },
        )
        assert response.status_code == 400
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 120 |
+
# 2. Preview heureux (mock du download_url)
|
| 121 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class TestIIIFPreviewHappyPath:
    """Preview behaviour when the manifest download is mocked.

    Note from the original author: ``iiif.py`` re-imports ``download_url``
    under the alias ``_download_url`` at module load time, so the patch
    must target that alias rather than the source function in ``_http``.
    """

    DOWNLOAD_TARGET = "picarones.adapters.corpus.iiif._download_url"
    PREVIEW_ROUTE = "/api/iiif/preview"

    def test_preview_returns_manifest_metadata(self, client) -> None:
        """A valid v3 manifest yields version, canvas count and labels."""
        manifest_bytes = json.dumps(_MOCK_MANIFEST_V3).encode("utf-8")

        with patch(self.DOWNLOAD_TARGET, return_value=manifest_bytes):
            response = client.get(
                self.PREVIEW_ROUTE,
                params={"manifest_url": "https://example.org/manifest.json"},
            )

        # The response body is attached to the assertion to ease debugging.
        assert response.status_code == 200, response.json()
        payload = response.json()
        assert payload["iiif_version"] == 3
        assert payload["canvas_count"] == 4
        assert "Chroniques" in payload["label"]
        assert "folio 1r" in payload["sample_labels"]

    def test_preview_handles_invalid_json(self, client) -> None:
        """A non-JSON body is answered with 400 and an "invalide" detail."""
        with patch(
            self.DOWNLOAD_TARGET,
            return_value=b"<html>not json</html>",
        ):
            response = client.get(
                self.PREVIEW_ROUTE,
                params={"manifest_url": "https://example.org/bad.json"},
            )

        assert response.status_code == 400
        assert "invalide" in response.json()["detail"].lower()

    def test_preview_handles_network_failure(self, client) -> None:
        """A download error is answered with 502 and "indisponible"."""
        with patch(
            self.DOWNLOAD_TARGET,
            side_effect=RuntimeError("Connection refused"),
        ):
            response = client.get(
                self.PREVIEW_ROUTE,
                params={"manifest_url": "https://example.org/manifest.json"},
            )

        assert response.status_code == 502
        assert "indisponible" in response.json()["detail"].lower()
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 176 |
+
# 3. Import : validation du payload (sans fetch effectif)
|
| 177 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 178 |
+
|
| 179 |
+
|
| 180 |
+
class TestIIIFImportValidation:
    """``POST /api/iiif/import`` refuses malformed payloads.

    No download helper is mocked: the requests are expected to be rejected
    before any network fetch.
    """

    IMPORT_ROUTE = "/api/iiif/import"

    def test_import_ftp_url_rejected(self, client) -> None:
        """An ``ftp://`` manifest URL is answered with 422."""
        response = client.post(
            self.IMPORT_ROUTE,
            json={
                "manifest_url": "ftp://example.org/manifest.json",
                "output_dir": "./corpus/",
            },
        )
        # Rejected at request-model validation (per the original comment).
        assert response.status_code == 422

    def test_import_path_traversal_in_output_dir_rejected(self, client) -> None:
        """An ``output_dir`` made of ``..`` segments yields 400 or 422."""
        response = client.post(
            self.IMPORT_ROUTE,
            json={
                "manifest_url": "https://example.org/manifest.json",
                "output_dir": "../../etc/",
            },
        )
        # Original comment: refused by ``validated_user_output_dir``
        # (path traversal guard).
        assert response.status_code in (400, 422)

    def test_import_pages_selector_too_long_rejected(self, client) -> None:
        """``pages`` is bounded in length to avoid denial-of-service abuse."""
        # 100 000 comma-separated page numbers: far beyond any sane selector.
        long_pages = ",".join(map(str, range(100_000)))
        response = client.post(
            self.IMPORT_ROUTE,
            json={
                "manifest_url": "https://example.org/manifest.json",
                "output_dir": "./corpus/",
                "pages": long_pages,
            },
        )
        assert response.status_code == 422
|
| 207 |
+
|
| 208 |
+
|
| 209 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 210 |
+
# 4. Routes enregistrées
|
| 211 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def test_iiif_routes_registered(client) -> None:
    """Smoke test: both IIIF endpoints are registered on the app."""
    # A set is enough: only membership is checked, order is irrelevant.
    registered_paths = {route.path for route in client.app.routes}
    assert "/api/iiif/preview" in registered_paths
    assert "/api/iiif/import" in registered_paths
|
|
@@ -0,0 +1,146 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""S2 tests — presence of the IIIF / Gallica panels in the Import view.

Checks that:

1. The two new ``IIIF`` and ``Gallica`` chips are in the source switch
   (the JS router ``switchLibrarySource`` expects them).
2. The two panels ``library-source-iiif`` and ``library-source-gallica``
   exist with their critical fields (DOM IDs consumed by the handlers
   ``previewIIIF``, ``importIIIF``, ``searchGallica``,
   ``importGallica``).
3. The new FR + EN i18n keys are present in the inline ``T`` table of
   ``web-app.js``.
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
from pathlib import Path
|
| 18 |
+
|
| 19 |
+
import pytest
|
| 20 |
+
from fastapi.testclient import TestClient
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
@pytest.fixture
def client():
    """Provide a ``TestClient`` wrapping the Picarones web application."""
    # Imported inside the fixture so the application module is loaded only
    # when a test actually requests the client.
    from picarones.interfaces.web.app import app

    return TestClient(app)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
@pytest.fixture
def html(client) -> str:
    """Return the body of ``GET /`` as text, after checking it answers 200."""
    page = client.get("/")
    assert page.status_code == 200
    return page.text
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 37 |
+
# 1. Source-switch enrichi
|
| 38 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
class TestSourceSwitchChips:
    """The source switch exposes the new ``IIIF`` and ``Gallica`` chips.

    The original note states the switch should hold four chips in total
    (HTR-United, HuggingFace, IIIF, Gallica); only the two new ones are
    checked here.
    """

    def test_iiif_chip_present(self, html: str) -> None:
        """The page carries the IIIF chip marker and its switch call."""
        assert 'data-source="iiif"' in html
        assert "switchLibrarySource('iiif')" in html

    def test_gallica_chip_present(self, html: str) -> None:
        """The page carries the Gallica chip marker and its switch call."""
        assert 'data-source="gallica"' in html
        assert "switchLibrarySource('gallica')" in html
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 55 |
+
# 2. Panneau IIIF
|
| 56 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
class TestIIIFPanelDOMElements:
    """DOM IDs and handlers used by ``previewIIIF()`` / ``importIIIF()``."""

    def test_panel_container_present(self, html: str) -> None:
        """The IIIF panel container is in the page."""
        assert 'id="library-source-iiif"' in html

    def test_manifest_url_input_present(self, html: str) -> None:
        """The manifest URL input is in the page."""
        assert 'id="iiif-manifest-url"' in html

    def test_preview_button_present(self, html: str) -> None:
        """Some element of the page invokes ``previewIIIF()``."""
        assert "previewIIIF()" in html

    def test_pages_and_max_resolution_inputs_present(self, html: str) -> None:
        """The page selector and max-resolution inputs are in the page."""
        assert 'id="iiif-pages"' in html
        assert 'id="iiif-max-resolution"' in html

    def test_import_button_present(self, html: str) -> None:
        """Some element of the page invokes ``importIIIF()``."""
        assert "importIIIF()" in html

    def test_status_zones_present(self, html: str) -> None:
        """The preview-result and import-status zones are in the page."""
        assert 'id="iiif-preview-result"' in html
        assert 'id="iiif-import-status"' in html
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 84 |
+
# 3. Panneau Gallica
|
| 85 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 86 |
+
|
| 87 |
+
|
| 88 |
+
class TestGallicaPanelDOMElements:
    """DOM IDs and handlers used by ``searchGallica()`` / ``importGallica()``."""

    def test_panel_container_present(self, html: str) -> None:
        """The Gallica panel container is in the page."""
        assert 'id="library-source-gallica"' in html

    def test_search_inputs_present(self, html: str) -> None:
        """The search, author and language inputs are in the page."""
        assert 'id="gallica-search"' in html
        assert 'id="gallica-author"' in html
        assert 'id="gallica-language"' in html

    def test_search_button_present(self, html: str) -> None:
        """Some element of the page invokes ``searchGallica()``."""
        assert "searchGallica()" in html

    def test_ark_import_inputs_present(self, html: str) -> None:
        """The ARK, pages and include-OCR inputs are in the page."""
        assert 'id="gallica-ark"' in html
        assert 'id="gallica-pages"' in html
        assert 'id="gallica-include-ocr"' in html

    def test_import_button_present(self, html: str) -> None:
        """Some element of the page invokes ``importGallica()``."""
        assert "importGallica()" in html
|
| 109 |
+
|
| 110 |
+
|
| 111 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 112 |
+
# 4. i18n FR ↔ EN
|
| 113 |
+
# ─────────────────────────────────────────────────────────────────────────────
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
class TestI18nKeysCovered:
    """Every new i18n key must be declared at least twice in ``web-app.js``.

    Two declarations are taken to mean one in the French table and one in
    the English table (``T.fr`` + ``T.en``).
    """

    @pytest.fixture
    def js_source(self) -> str:
        """Return the text of ``web-app.js``, located relative to this file."""
        path = (
            Path(__file__).resolve().parents[2]
            / "picarones" / "interfaces" / "web" / "static" / "web-app.js"
        )
        return path.read_text(encoding="utf-8")

    @pytest.mark.parametrize("key", [
        "import_iiif_title",
        "import_iiif_url_label",
        "import_iiif_preview_btn",
        "import_iiif_import_btn",
        "import_gallica_title",
        "import_gallica_search_label",
        "import_gallica_ark_label",
        "import_gallica_ocr_label",
        "import_gallica_import_btn",
        "iiif_loading",
        "gallica_no_results",
        "gallica_use_ark",
    ])
    def test_key_present_in_both_languages(self, js_source: str, key: str) -> None:
        """Count ``key:`` declarations and require at least two of them.

        The key must start at an identifier boundary. A plain substring
        count would also match a longer key that merely ends with the same
        text (for example ``x_iiif_loading:`` when looking for
        ``iiif_loading:``) and could hide a missing translation.
        """
        import re

        # Negative lookbehind: the character before the key must not be one
        # that can belong to a JavaScript identifier.
        declaration = re.compile(rf"(?<![A-Za-z0-9_$]){re.escape(key)}:")
        count = len(declaration.findall(js_source))
        assert count >= 2, (
            f"Clé i18n {key!r} déclarée {count} fois ; attendu ≥ 2 (FR + EN)."
        )
|