Spaces:
Sleeping
Sprint 4 — Adaptateurs API OCR, import IIIF, CER diplomatique
## Nouveautés
### Adaptateurs OCR distants
- `MistralOCREngine` : OCR via l'API Mistral AI (Pixtral)
- `GoogleVisionEngine` : Google Cloud Vision (SDK + REST avec GOOGLE_API_KEY)
- `AzureDocIntelEngine` : Azure Document Intelligence (SDK + REST polling async)
### Import IIIF
- `IIIFImporter` : import de corpus depuis manifestes IIIF v2/v3
- Parsing auto-détecté (v2/v3), sélecteur de pages flexible (1-10, 1,3,5, all)
- Téléchargement avec retry exponentiel, extraction GT si annotée dans le manifeste
- CLI : `picarones import iiif <url> --pages 1-10 --output ./corpus/`
### Normalisation diplomatique & CER diplomatique
- `NormalizationProfile` : NFC + caseless + table de correspondances historiques
- Profils préconfigurés : medieval_french, early_modern_french, medieval_latin, minimal
- `compute_metrics()` calcule désormais `cer_diplomatic` par défaut (profil médiéval)
- `aggregate_metrics()` agrège le CER diplomatique avec indication du profil utilisé
- Rapport HTML : colonne "CER diplo." dans le tableau de classement + badge dans les
panneaux de diff document avec indicateur delta (−X.X% avec normalisation)
- Textes GT de démo mis à jour avec graphies médiévales (ſ, &, u/v, i/j)
### Tests
- 103 nouveaux tests Sprint 4 (normalization, IIIF, moteurs API, CLI, rapport)
- Suite complète : 257 tests, 0 échec
https://claude.ai/code/session_017gXea9mxBQqDTAsSQd7aAq
- picarones/cli.py +105 -0
- picarones/core/metrics.py +58 -6
- picarones/core/normalization.py +286 -0
- picarones/engines/__init__.py +11 -1
- picarones/engines/azure_doc_intel.py +153 -0
- picarones/engines/google_vision.py +133 -0
- picarones/engines/mistral_ocr.py +91 -0
- picarones/fixtures.py +13 -12
- picarones/importers/__init__.py +5 -0
- picarones/importers/iiif.py +583 -0
- picarones/report/generator.py +36 -1
- rapport_demo.html +0 -0
- tests/test_sprint4_normalization_iiif.py +834 -0
|
@@ -381,5 +381,110 @@ def demo_cmd(output: str, docs: int, json_output: str | None) -> None:
|
|
| 381 |
click.echo(f"Ouvrez-le dans un navigateur : file://{path}")
|
| 382 |
|
| 383 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 384 |
if __name__ == "__main__":
|
| 385 |
cli()
|
|
|
|
| 381 |
click.echo(f"Ouvrez-le dans un navigateur : file://{path}")
|
| 382 |
|
| 383 |
|
| 384 |
+
# ---------------------------------------------------------------------------
|
| 385 |
+
# picarones import (groupe de sous-commandes)
|
| 386 |
+
# ---------------------------------------------------------------------------
|
| 387 |
+
|
| 388 |
+
@cli.group("import")
|
| 389 |
+
def import_group() -> None:
|
| 390 |
+
"""Importe un corpus depuis une source distante (IIIF, HuggingFace…)."""
|
| 391 |
+
|
| 392 |
+
|
| 393 |
+
@import_group.command("iiif")
|
| 394 |
+
@click.argument("manifest_url")
|
| 395 |
+
@click.option(
|
| 396 |
+
"--pages", "-p",
|
| 397 |
+
default="all",
|
| 398 |
+
show_default=True,
|
| 399 |
+
help=(
|
| 400 |
+
"Pages à importer. Formats : '1-10', '1,3,5', '1-5,10,15-20', 'all'. "
|
| 401 |
+
"Les numéros sont 1-based (1 = première page du manifeste)."
|
| 402 |
+
),
|
| 403 |
+
)
|
| 404 |
+
@click.option(
|
| 405 |
+
"--output", "-o",
|
| 406 |
+
default="./corpus_iiif/",
|
| 407 |
+
show_default=True,
|
| 408 |
+
type=click.Path(resolve_path=True),
|
| 409 |
+
help="Dossier de destination pour les images et les fichiers .gt.txt",
|
| 410 |
+
)
|
| 411 |
+
@click.option(
|
| 412 |
+
"--max-resolution",
|
| 413 |
+
default=0,
|
| 414 |
+
type=int,
|
| 415 |
+
show_default=True,
|
| 416 |
+
help="Résolution maximale des images téléchargées (largeur en pixels). 0 = max disponible.",
|
| 417 |
+
)
|
| 418 |
+
@click.option("--no-progress", is_flag=True, default=False, help="Désactive la barre de progression")
|
| 419 |
+
@click.option("--verbose", "-v", is_flag=True, default=False, help="Mode verbeux")
|
| 420 |
+
def import_iiif_cmd(
|
| 421 |
+
manifest_url: str,
|
| 422 |
+
pages: str,
|
| 423 |
+
output: str,
|
| 424 |
+
max_resolution: int,
|
| 425 |
+
no_progress: bool,
|
| 426 |
+
verbose: bool,
|
| 427 |
+
) -> None:
|
| 428 |
+
"""Importe un corpus depuis un manifeste IIIF (v2 ou v3).
|
| 429 |
+
|
| 430 |
+
MANIFEST_URL : URL du manifeste IIIF (Gallica, Bodleian, BL, BSB…)
|
| 431 |
+
|
| 432 |
+
Exemples :
|
| 433 |
+
|
| 434 |
+
\b
|
| 435 |
+
picarones import iiif https://gallica.bnf.fr/ark:/12148/xxx/manifest.json
|
| 436 |
+
picarones import iiif https://gallica.bnf.fr/ark:/12148/xxx/manifest.json --pages 1-10
|
| 437 |
+
picarones import iiif https://gallica.bnf.fr/ark:/12148/xxx/manifest.json --pages 1,3,5-8 --output ./mon_corpus/
|
| 438 |
+
|
| 439 |
+
Les images sont téléchargées dans le dossier de sortie.
|
| 440 |
+
Des fichiers .gt.txt vides (ou remplis si le manifeste contient des annotations
|
| 441 |
+
de transcription) sont créés à côté de chaque image.
|
| 442 |
+
"""
|
| 443 |
+
_setup_logging(verbose)
|
| 444 |
+
|
| 445 |
+
from picarones.importers.iiif import IIIFImporter
|
| 446 |
+
|
| 447 |
+
click.echo(f"Manifeste IIIF : {manifest_url}")
|
| 448 |
+
|
| 449 |
+
try:
|
| 450 |
+
importer = IIIFImporter(manifest_url, max_resolution=max_resolution)
|
| 451 |
+
importer.load()
|
| 452 |
+
|
| 453 |
+
all_canvases = importer.parser.canvases()
|
| 454 |
+
click.echo(
|
| 455 |
+
f"Manifeste IIIF v{importer.parser.version} — "
|
| 456 |
+
f"titre : {importer.parser.label} — "
|
| 457 |
+
f"{len(all_canvases)} canvas disponibles"
|
| 458 |
+
)
|
| 459 |
+
|
| 460 |
+
selected = importer.list_canvases(pages)
|
| 461 |
+
click.echo(f"Pages sélectionnées : {len(selected)} sur {len(all_canvases)}")
|
| 462 |
+
|
| 463 |
+
corpus = importer.import_corpus(
|
| 464 |
+
pages=pages,
|
| 465 |
+
output_dir=output,
|
| 466 |
+
show_progress=not no_progress,
|
| 467 |
+
)
|
| 468 |
+
|
| 469 |
+
except (ValueError, RuntimeError) as exc:
|
| 470 |
+
click.echo(f"Erreur import IIIF : {exc}", err=True)
|
| 471 |
+
sys.exit(1)
|
| 472 |
+
|
| 473 |
+
click.echo(f"\n{len(corpus)} documents importés dans : {output}")
|
| 474 |
+
|
| 475 |
+
# Résumé
|
| 476 |
+
gt_filled = sum(1 for d in corpus.documents if d.ground_truth.strip())
|
| 477 |
+
if gt_filled:
|
| 478 |
+
click.echo(f"Transcriptions trouvées dans le manifeste : {gt_filled}/{len(corpus)}")
|
| 479 |
+
else:
|
| 480 |
+
click.echo(
|
| 481 |
+
"Aucune transcription dans le manifeste — "
|
| 482 |
+
"les fichiers .gt.txt sont vides (à remplir manuellement ou via OCR)."
|
| 483 |
+
)
|
| 484 |
+
|
| 485 |
+
click.echo(f"\nPour lancer un benchmark sur ce corpus :")
|
| 486 |
+
click.echo(f" picarones run --corpus {output} --engines tesseract")
|
| 487 |
+
|
| 488 |
+
|
| 489 |
if __name__ == "__main__":
|
| 490 |
cli()
|
|
@@ -5,6 +5,8 @@ Métriques implémentées
|
|
| 5 |
- CER brut : distance d'édition caractère / longueur GT
|
| 6 |
- CER normalisé NFC : après normalisation Unicode NFC
|
| 7 |
- CER sans casse : insensible aux majuscules/minuscules
|
|
|
|
|
|
|
| 8 |
- WER brut : word error rate standard
|
| 9 |
- WER normalisé : après normalisation des espaces
|
| 10 |
- MER : Match Error Rate (jiwer)
|
|
@@ -41,9 +43,6 @@ def _normalize_whitespace(text: str) -> str:
|
|
| 41 |
return " ".join(text.split())
|
| 42 |
|
| 43 |
|
| 44 |
-
# Transformations jiwer pour le CER (chaque char devient un "mot")
|
| 45 |
-
_CHAR_TRANSFORM = jiwer.transforms.Compose([]) if _JIWER_AVAILABLE else None
|
| 46 |
-
|
| 47 |
# Transformations jiwer pour le WER (normalisation légère des espaces)
|
| 48 |
_WER_TRANSFORM = (
|
| 49 |
jiwer.transforms.Compose(
|
|
@@ -62,7 +61,6 @@ def _cer_from_strings(reference: str, hypothesis: str) -> float:
|
|
| 62 |
"""CER brut : distance d'édition sur les caractères."""
|
| 63 |
if not reference:
|
| 64 |
return 0.0 if not hypothesis else 1.0
|
| 65 |
-
# jiwer.cer traite chaque caractère comme un token
|
| 66 |
return jiwer.cer(reference, hypothesis)
|
| 67 |
|
| 68 |
|
|
@@ -84,9 +82,15 @@ class MetricsResult:
|
|
| 84 |
reference_length: int
|
| 85 |
hypothesis_length: int
|
| 86 |
error: Optional[str] = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
|
| 88 |
def as_dict(self) -> dict:
|
| 89 |
-
|
| 90 |
"cer": round(self.cer, 6),
|
| 91 |
"cer_nfc": round(self.cer_nfc, 6),
|
| 92 |
"cer_caseless": round(self.cer_caseless, 6),
|
|
@@ -98,6 +102,10 @@ class MetricsResult:
|
|
| 98 |
"hypothesis_length": self.hypothesis_length,
|
| 99 |
"error": self.error,
|
| 100 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
|
| 102 |
@property
|
| 103 |
def cer_percent(self) -> float:
|
|
@@ -108,7 +116,11 @@ class MetricsResult:
|
|
| 108 |
return round(self.wer * 100, 2)
|
| 109 |
|
| 110 |
|
| 111 |
-
def compute_metrics(
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
"""Calcule l'ensemble des métriques CER/WER pour une paire de textes.
|
| 113 |
|
| 114 |
Parameters
|
|
@@ -117,6 +129,10 @@ def compute_metrics(reference: str, hypothesis: str) -> MetricsResult:
|
|
| 117 |
Texte de vérité terrain (ground truth).
|
| 118 |
hypothesis:
|
| 119 |
Texte produit par le moteur OCR.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
|
| 121 |
Returns
|
| 122 |
-------
|
|
@@ -151,6 +167,19 @@ def compute_metrics(reference: str, hypothesis: str) -> MetricsResult:
|
|
| 151 |
mer = jiwer.mer(reference, hypothesis)
|
| 152 |
wil = jiwer.wil(reference, hypothesis)
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
return MetricsResult(
|
| 155 |
cer=cer_raw,
|
| 156 |
cer_nfc=cer_nfc,
|
|
@@ -161,6 +190,8 @@ def compute_metrics(reference: str, hypothesis: str) -> MetricsResult:
|
|
| 161 |
wil=wil,
|
| 162 |
reference_length=len(reference),
|
| 163 |
hypothesis_length=len(hypothesis),
|
|
|
|
|
|
|
| 164 |
)
|
| 165 |
|
| 166 |
except Exception as exc: # noqa: BLE001
|
|
@@ -208,7 +239,28 @@ def aggregate_metrics(results: list[MetricsResult]) -> dict:
|
|
| 208 |
values = [getattr(r, metric) for r in results if r.error is None]
|
| 209 |
aggregated[metric] = _stats(values)
|
| 210 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 211 |
aggregated["document_count"] = len(results)
|
| 212 |
aggregated["failed_count"] = sum(1 for r in results if r.error is not None)
|
| 213 |
|
| 214 |
return aggregated
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
- CER brut : distance d'édition caractère / longueur GT
|
| 6 |
- CER normalisé NFC : après normalisation Unicode NFC
|
| 7 |
- CER sans casse : insensible aux majuscules/minuscules
|
| 8 |
+
- CER diplomatique : après application d'une table de correspondances
|
| 9 |
+
historiques (ſ=s, u=v, i=j…) — configurable
|
| 10 |
- WER brut : word error rate standard
|
| 11 |
- WER normalisé : après normalisation des espaces
|
| 12 |
- MER : Match Error Rate (jiwer)
|
|
|
|
| 43 |
return " ".join(text.split())
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
| 46 |
# Transformations jiwer pour le WER (normalisation légère des espaces)
|
| 47 |
_WER_TRANSFORM = (
|
| 48 |
jiwer.transforms.Compose(
|
|
|
|
| 61 |
"""CER brut : distance d'édition sur les caractères."""
|
| 62 |
if not reference:
|
| 63 |
return 0.0 if not hypothesis else 1.0
|
|
|
|
| 64 |
return jiwer.cer(reference, hypothesis)
|
| 65 |
|
| 66 |
|
|
|
|
| 82 |
reference_length: int
|
| 83 |
hypothesis_length: int
|
| 84 |
error: Optional[str] = None
|
| 85 |
+
cer_diplomatic: Optional[float] = None
|
| 86 |
+
"""CER calculé après normalisation diplomatique (ſ=s, u=v, i=j…).
|
| 87 |
+
None si aucun profil diplomatique n'a été fourni à compute_metrics.
|
| 88 |
+
"""
|
| 89 |
+
diplomatic_profile_name: Optional[str] = None
|
| 90 |
+
"""Nom du profil de normalisation diplomatique utilisé."""
|
| 91 |
|
| 92 |
def as_dict(self) -> dict:
|
| 93 |
+
d = {
|
| 94 |
"cer": round(self.cer, 6),
|
| 95 |
"cer_nfc": round(self.cer_nfc, 6),
|
| 96 |
"cer_caseless": round(self.cer_caseless, 6),
|
|
|
|
| 102 |
"hypothesis_length": self.hypothesis_length,
|
| 103 |
"error": self.error,
|
| 104 |
}
|
| 105 |
+
if self.cer_diplomatic is not None:
|
| 106 |
+
d["cer_diplomatic"] = round(self.cer_diplomatic, 6)
|
| 107 |
+
d["diplomatic_profile_name"] = self.diplomatic_profile_name
|
| 108 |
+
return d
|
| 109 |
|
| 110 |
@property
|
| 111 |
def cer_percent(self) -> float:
|
|
|
|
| 116 |
return round(self.wer * 100, 2)
|
| 117 |
|
| 118 |
|
| 119 |
+
def compute_metrics(
|
| 120 |
+
reference: str,
|
| 121 |
+
hypothesis: str,
|
| 122 |
+
normalization_profile: "Optional[NormalizationProfile]" = None, # noqa: F821
|
| 123 |
+
) -> MetricsResult:
|
| 124 |
"""Calcule l'ensemble des métriques CER/WER pour une paire de textes.
|
| 125 |
|
| 126 |
Parameters
|
|
|
|
| 129 |
Texte de vérité terrain (ground truth).
|
| 130 |
hypothesis:
|
| 131 |
Texte produit par le moteur OCR.
|
| 132 |
+
normalization_profile:
|
| 133 |
+
Profil de normalisation diplomatique optionnel.
|
| 134 |
+
Si fourni, calcule ``cer_diplomatic`` en plus des métriques standard.
|
| 135 |
+
Si None, utilise le profil medieval_french par défaut.
|
| 136 |
|
| 137 |
Returns
|
| 138 |
-------
|
|
|
|
| 167 |
mer = jiwer.mer(reference, hypothesis)
|
| 168 |
wil = jiwer.wil(reference, hypothesis)
|
| 169 |
|
| 170 |
+
# CER diplomatique — utilise le profil fourni ou le profil médiéval par défaut
|
| 171 |
+
cer_diplomatic: Optional[float] = None
|
| 172 |
+
diplomatic_profile_name: Optional[str] = None
|
| 173 |
+
try:
|
| 174 |
+
from picarones.core.normalization import DEFAULT_DIPLOMATIC_PROFILE
|
| 175 |
+
profile = normalization_profile or DEFAULT_DIPLOMATIC_PROFILE
|
| 176 |
+
ref_diplo = profile.normalize(reference)
|
| 177 |
+
hyp_diplo = profile.normalize(hypothesis)
|
| 178 |
+
cer_diplomatic = _cer_from_strings(ref_diplo, hyp_diplo)
|
| 179 |
+
diplomatic_profile_name = profile.name
|
| 180 |
+
except Exception: # noqa: BLE001
|
| 181 |
+
pass # CER diplomatique non critique
|
| 182 |
+
|
| 183 |
return MetricsResult(
|
| 184 |
cer=cer_raw,
|
| 185 |
cer_nfc=cer_nfc,
|
|
|
|
| 190 |
wil=wil,
|
| 191 |
reference_length=len(reference),
|
| 192 |
hypothesis_length=len(hypothesis),
|
| 193 |
+
cer_diplomatic=cer_diplomatic,
|
| 194 |
+
diplomatic_profile_name=diplomatic_profile_name,
|
| 195 |
)
|
| 196 |
|
| 197 |
except Exception as exc: # noqa: BLE001
|
|
|
|
| 239 |
values = [getattr(r, metric) for r in results if r.error is None]
|
| 240 |
aggregated[metric] = _stats(values)
|
| 241 |
|
| 242 |
+
# CER diplomatique (optionnel — présent seulement si calculé)
|
| 243 |
+
diplo_values = [
|
| 244 |
+
r.cer_diplomatic for r in results
|
| 245 |
+
if r.error is None and r.cer_diplomatic is not None
|
| 246 |
+
]
|
| 247 |
+
if diplo_values:
|
| 248 |
+
aggregated["cer_diplomatic"] = _stats(diplo_values)
|
| 249 |
+
# Nom du profil (même pour tous les docs d'un corpus)
|
| 250 |
+
profile_name = next(
|
| 251 |
+
(r.diplomatic_profile_name for r in results if r.diplomatic_profile_name),
|
| 252 |
+
None,
|
| 253 |
+
)
|
| 254 |
+
if profile_name:
|
| 255 |
+
aggregated["cer_diplomatic"]["profile"] = profile_name
|
| 256 |
+
|
| 257 |
aggregated["document_count"] = len(results)
|
| 258 |
aggregated["failed_count"] = sum(1 for r in results if r.error is not None)
|
| 259 |
|
| 260 |
return aggregated
|
| 261 |
+
|
| 262 |
+
|
| 263 |
+
# Import paresseux pour éviter les imports circulaires
|
| 264 |
+
from typing import TYPE_CHECKING
|
| 265 |
+
if TYPE_CHECKING:
|
| 266 |
+
from picarones.core.normalization import NormalizationProfile
|
|
@@ -0,0 +1,286 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Profils de normalisation unicode pour le calcul du CER diplomatique.
|
| 2 |
+
|
| 3 |
+
La normalisation diplomatique permet de calculer un CER tenant compte des
|
| 4 |
+
équivalences graphiques propres aux documents historiques : ſ=s, u=v, i=j, etc.
|
| 5 |
+
|
| 6 |
+
En appliquant la même table aux deux textes (GT et OCR), on mesure les erreurs
|
| 7 |
+
"substantielles" (transcription erronée) en ignorant les variations graphiques
|
| 8 |
+
codifiées connues.
|
| 9 |
+
|
| 10 |
+
Trois niveaux de normalisation sont disponibles :
|
| 11 |
+
|
| 12 |
+
1. NFC : normalisation Unicode canonique (décomposition+recomposition)
|
| 13 |
+
2. caseless : NFC + pliage de casse (casefold)
|
| 14 |
+
3. diplomatic: NFC + table de correspondances historiques configurables
|
| 15 |
+
|
| 16 |
+
Les profils préconfigurés couvrent les cas d'usage patrimoniaux courants.
|
| 17 |
+
Ils sont également chargeables depuis un fichier YAML.
|
| 18 |
+
|
| 19 |
+
Exemple YAML
|
| 20 |
+
------------
|
| 21 |
+
name: medieval_custom
|
| 22 |
+
caseless: false
|
| 23 |
+
diplomatic:
|
| 24 |
+
ſ: s
|
| 25 |
+
u: v
|
| 26 |
+
i: j
|
| 27 |
+
y: i
|
| 28 |
+
æ: ae
|
| 29 |
+
œ: oe
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import unicodedata
|
| 35 |
+
from dataclasses import dataclass, field
|
| 36 |
+
from pathlib import Path
|
| 37 |
+
from typing import Optional
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
# ---------------------------------------------------------------------------
|
| 41 |
+
# Tables de correspondances diplomatiques préconfigurées
|
| 42 |
+
# ---------------------------------------------------------------------------
|
| 43 |
+
|
| 44 |
+
#: Français médiéval (XIIe–XVe siècle)
|
| 45 |
+
DIPLOMATIC_FR_MEDIEVAL: dict[str, str] = {
|
| 46 |
+
"ſ": "s", # s long → s
|
| 47 |
+
"u": "v", # u/v interchangeables en position initiale
|
| 48 |
+
"i": "j", # i/j interchangeables
|
| 49 |
+
"y": "i", # y vocalique → i
|
| 50 |
+
"æ": "ae", # ligature æ
|
| 51 |
+
"œ": "oe", # ligature œ
|
| 52 |
+
"ꝑ": "per", # abréviation per/par
|
| 53 |
+
"ꝓ": "pro", # abréviation pro
|
| 54 |
+
"\u0026": "et", # & → et
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
#: Français moderne / imprimés anciens (XVIe–XVIIIe siècle)
|
| 58 |
+
DIPLOMATIC_FR_EARLY_MODERN: dict[str, str] = {
|
| 59 |
+
"ſ": "s", # s long
|
| 60 |
+
"æ": "ae",
|
| 61 |
+
"œ": "oe",
|
| 62 |
+
"\u0026": "et",
|
| 63 |
+
"ỹ": "yn", # y tilde
|
| 64 |
+
}
|
| 65 |
+
|
| 66 |
+
#: Latin médiéval
|
| 67 |
+
DIPLOMATIC_LATIN_MEDIEVAL: dict[str, str] = {
|
| 68 |
+
"ſ": "s",
|
| 69 |
+
"u": "v",
|
| 70 |
+
"i": "j",
|
| 71 |
+
"y": "i",
|
| 72 |
+
"æ": "ae",
|
| 73 |
+
"œ": "oe",
|
| 74 |
+
"ꝑ": "per",
|
| 75 |
+
"ꝓ": "pro",
|
| 76 |
+
"ꝗ": "que", # q barré → que
|
| 77 |
+
"\u0026": "et",
|
| 78 |
+
}
|
| 79 |
+
|
| 80 |
+
#: Profil minimal — uniquement NFC + s long
|
| 81 |
+
DIPLOMATIC_MINIMAL: dict[str, str] = {
|
| 82 |
+
"ſ": "s",
|
| 83 |
+
}
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
# ---------------------------------------------------------------------------
|
| 87 |
+
# Profil de normalisation
|
| 88 |
+
# ---------------------------------------------------------------------------
|
| 89 |
+
|
| 90 |
+
@dataclass
|
| 91 |
+
class NormalizationProfile:
|
| 92 |
+
"""Décrit une stratégie de normalisation pour le calcul du CER diplomatique.
|
| 93 |
+
|
| 94 |
+
Parameters
|
| 95 |
+
----------
|
| 96 |
+
name:
|
| 97 |
+
Identifiant lisible du profil (ex : ``"medieval_french"``).
|
| 98 |
+
nfc:
|
| 99 |
+
Applique la normalisation Unicode NFC (recommandé, activé par défaut).
|
| 100 |
+
caseless:
|
| 101 |
+
Pliage de casse (casefold) après NFC.
|
| 102 |
+
diplomatic_table:
|
| 103 |
+
Table de correspondances graphiques historiques appliquée caractère
|
| 104 |
+
par caractère sur les deux textes avant calcul du CER.
|
| 105 |
+
description:
|
| 106 |
+
Description courte du profil (affichée dans le rapport HTML).
|
| 107 |
+
"""
|
| 108 |
+
|
| 109 |
+
name: str
|
| 110 |
+
nfc: bool = True
|
| 111 |
+
caseless: bool = False
|
| 112 |
+
diplomatic_table: dict[str, str] = field(default_factory=dict)
|
| 113 |
+
description: str = ""
|
| 114 |
+
|
| 115 |
+
def normalize(self, text: str) -> str:
|
| 116 |
+
"""Applique le profil de normalisation à un texte."""
|
| 117 |
+
if self.nfc:
|
| 118 |
+
text = unicodedata.normalize("NFC", text)
|
| 119 |
+
if self.caseless:
|
| 120 |
+
text = text.casefold()
|
| 121 |
+
if self.diplomatic_table:
|
| 122 |
+
text = _apply_diplomatic_table(text, self.diplomatic_table)
|
| 123 |
+
return text
|
| 124 |
+
|
| 125 |
+
def as_dict(self) -> dict:
|
| 126 |
+
return {
|
| 127 |
+
"name": self.name,
|
| 128 |
+
"nfc": self.nfc,
|
| 129 |
+
"caseless": self.caseless,
|
| 130 |
+
"diplomatic_table": self.diplomatic_table,
|
| 131 |
+
"description": self.description,
|
| 132 |
+
}
|
| 133 |
+
|
| 134 |
+
@classmethod
|
| 135 |
+
def from_yaml(cls, path: str | Path) -> "NormalizationProfile":
|
| 136 |
+
"""Charge un profil depuis un fichier YAML.
|
| 137 |
+
|
| 138 |
+
Le fichier YAML doit contenir les clés ``name``, optionnellement
|
| 139 |
+
``caseless``, ``description`` et ``diplomatic`` (dict str→str).
|
| 140 |
+
|
| 141 |
+
Example
|
| 142 |
+
-------
|
| 143 |
+
.. code-block:: yaml
|
| 144 |
+
|
| 145 |
+
name: medieval_custom
|
| 146 |
+
caseless: false
|
| 147 |
+
description: Français médiéval personnalisé
|
| 148 |
+
diplomatic:
|
| 149 |
+
ſ: s
|
| 150 |
+
u: v
|
| 151 |
+
"""
|
| 152 |
+
try:
|
| 153 |
+
import yaml
|
| 154 |
+
except ImportError as exc:
|
| 155 |
+
raise RuntimeError(
|
| 156 |
+
"Le package 'pyyaml' est requis pour charger les profils YAML. "
|
| 157 |
+
"Installez-le avec : pip install pyyaml"
|
| 158 |
+
) from exc
|
| 159 |
+
|
| 160 |
+
data = yaml.safe_load(Path(path).read_text(encoding="utf-8"))
|
| 161 |
+
return cls(
|
| 162 |
+
name=data.get("name", Path(path).stem),
|
| 163 |
+
nfc=bool(data.get("nfc", True)),
|
| 164 |
+
caseless=bool(data.get("caseless", False)),
|
| 165 |
+
diplomatic_table=data.get("diplomatic", {}),
|
| 166 |
+
description=data.get("description", ""),
|
| 167 |
+
)
|
| 168 |
+
|
| 169 |
+
@classmethod
|
| 170 |
+
def from_dict(cls, data: dict) -> "NormalizationProfile":
|
| 171 |
+
"""Charge un profil depuis un dictionnaire (ex : section YAML inline)."""
|
| 172 |
+
return cls(
|
| 173 |
+
name=data.get("name", "custom"),
|
| 174 |
+
nfc=bool(data.get("nfc", True)),
|
| 175 |
+
caseless=bool(data.get("caseless", False)),
|
| 176 |
+
diplomatic_table=data.get("diplomatic", {}),
|
| 177 |
+
description=data.get("description", ""),
|
| 178 |
+
)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# ---------------------------------------------------------------------------
|
| 182 |
+
# Profils préconfigurés
|
| 183 |
+
# ---------------------------------------------------------------------------
|
| 184 |
+
|
| 185 |
+
def get_builtin_profile(name: str) -> NormalizationProfile:
|
| 186 |
+
"""Retourne un profil préconfigurée par son identifiant.
|
| 187 |
+
|
| 188 |
+
Identifiants disponibles
|
| 189 |
+
------------------------
|
| 190 |
+
- ``"medieval_french"`` : français médiéval XIIe–XVe (ſ=s, u=v, i=j, æ=ae, œ=oe…)
|
| 191 |
+
- ``"early_modern_french"`` : imprimés anciens XVIe–XVIIIe (ſ=s, œ=oe, æ=ae…)
|
| 192 |
+
- ``"medieval_latin"`` : latin médiéval (ſ=s, u=v, i=j, ꝑ=per, ꝓ=pro…)
|
| 193 |
+
- ``"minimal"`` : uniquement NFC + s long
|
| 194 |
+
- ``"nfc"`` : NFC seul (sans table diplomatique)
|
| 195 |
+
- ``"caseless"`` : NFC + pliage de casse
|
| 196 |
+
|
| 197 |
+
Raises
|
| 198 |
+
------
|
| 199 |
+
KeyError
|
| 200 |
+
Si le nom n'est pas reconnu.
|
| 201 |
+
"""
|
| 202 |
+
profiles = {
|
| 203 |
+
"medieval_french": NormalizationProfile(
|
| 204 |
+
name="medieval_french",
|
| 205 |
+
nfc=True,
|
| 206 |
+
caseless=False,
|
| 207 |
+
diplomatic_table=DIPLOMATIC_FR_MEDIEVAL,
|
| 208 |
+
description="Français médiéval (XIIe–XVe) : ſ=s, u=v, i=j, æ=ae, œ=oe",
|
| 209 |
+
),
|
| 210 |
+
"early_modern_french": NormalizationProfile(
|
| 211 |
+
name="early_modern_french",
|
| 212 |
+
nfc=True,
|
| 213 |
+
caseless=False,
|
| 214 |
+
diplomatic_table=DIPLOMATIC_FR_EARLY_MODERN,
|
| 215 |
+
description="Imprimés anciens (XVIe–XVIIIe) : ſ=s, æ=ae, œ=oe",
|
| 216 |
+
),
|
| 217 |
+
"medieval_latin": NormalizationProfile(
|
| 218 |
+
name="medieval_latin",
|
| 219 |
+
nfc=True,
|
| 220 |
+
caseless=False,
|
| 221 |
+
diplomatic_table=DIPLOMATIC_LATIN_MEDIEVAL,
|
| 222 |
+
description="Latin médiéval : ſ=s, u=v, i=j, ꝑ=per, ꝓ=pro",
|
| 223 |
+
),
|
| 224 |
+
"minimal": NormalizationProfile(
|
| 225 |
+
name="minimal",
|
| 226 |
+
nfc=True,
|
| 227 |
+
caseless=False,
|
| 228 |
+
diplomatic_table=DIPLOMATIC_MINIMAL,
|
| 229 |
+
description="Minimal : NFC + s long seulement",
|
| 230 |
+
),
|
| 231 |
+
"nfc": NormalizationProfile(
|
| 232 |
+
name="nfc",
|
| 233 |
+
nfc=True,
|
| 234 |
+
caseless=False,
|
| 235 |
+
diplomatic_table={},
|
| 236 |
+
description="Normalisation NFC uniquement",
|
| 237 |
+
),
|
| 238 |
+
"caseless": NormalizationProfile(
|
| 239 |
+
name="caseless",
|
| 240 |
+
nfc=True,
|
| 241 |
+
caseless=True,
|
| 242 |
+
diplomatic_table={},
|
| 243 |
+
description="NFC + insensible à la casse",
|
| 244 |
+
),
|
| 245 |
+
}
|
| 246 |
+
if name not in profiles:
|
| 247 |
+
raise KeyError(
|
| 248 |
+
f"Profil de normalisation inconnu : '{name}'. "
|
| 249 |
+
f"Disponibles : {', '.join(profiles)}"
|
| 250 |
+
)
|
| 251 |
+
return profiles[name]
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
# ---------------------------------------------------------------------------
|
| 255 |
+
# Fonctions utilitaires
|
| 256 |
+
# ---------------------------------------------------------------------------
|
| 257 |
+
|
| 258 |
+
def _apply_diplomatic_table(text: str, table: dict[str, str]) -> str:
|
| 259 |
+
"""Applique une table de correspondances diplomatiques caractère par caractère.
|
| 260 |
+
|
| 261 |
+
Les clés multi-caractères (ex : ``"ae"`` → ``"æ"``) sont gérées en priorité
|
| 262 |
+
sur les correspondances simples.
|
| 263 |
+
"""
|
| 264 |
+
if not table:
|
| 265 |
+
return text
|
| 266 |
+
|
| 267 |
+
# Séparer les clés simples (1 char) des clés multi-chars pour traitement ordonné
|
| 268 |
+
multi_keys = sorted(
|
| 269 |
+
(k for k in table if len(k) > 1), key=len, reverse=True
|
| 270 |
+
)
|
| 271 |
+
simple_table = {k: v for k, v in table.items() if len(k) == 1}
|
| 272 |
+
|
| 273 |
+
result = text
|
| 274 |
+
# Remplacements multi-chars en premier (évite les conflits)
|
| 275 |
+
for key in multi_keys:
|
| 276 |
+
result = result.replace(key, table[key])
|
| 277 |
+
|
| 278 |
+
# Remplacements char par char
|
| 279 |
+
if simple_table:
|
| 280 |
+
result = "".join(simple_table.get(c, c) for c in result)
|
| 281 |
+
|
| 282 |
+
return result
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
# Profil par défaut utilisé pour le CER diplomatique intégré
|
| 286 |
+
DEFAULT_DIPLOMATIC_PROFILE: NormalizationProfile = get_builtin_profile("medieval_french")
|
|
@@ -2,8 +2,18 @@
|
|
| 2 |
|
| 3 |
from picarones.engines.base import BaseOCREngine, EngineResult
|
| 4 |
from picarones.engines.tesseract import TesseractEngine
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
__all__ = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
|
| 8 |
try:
|
| 9 |
from picarones.engines.pero_ocr import PeroOCREngine
|
|
|
|
| 2 |
|
| 3 |
from picarones.engines.base import BaseOCREngine, EngineResult
|
| 4 |
from picarones.engines.tesseract import TesseractEngine
|
| 5 |
+
from picarones.engines.mistral_ocr import MistralOCREngine
|
| 6 |
+
from picarones.engines.google_vision import GoogleVisionEngine
|
| 7 |
+
from picarones.engines.azure_doc_intel import AzureDocIntelEngine
|
| 8 |
|
| 9 |
+
__all__ = [
|
| 10 |
+
"BaseOCREngine",
|
| 11 |
+
"EngineResult",
|
| 12 |
+
"TesseractEngine",
|
| 13 |
+
"MistralOCREngine",
|
| 14 |
+
"GoogleVisionEngine",
|
| 15 |
+
"AzureDocIntelEngine",
|
| 16 |
+
]
|
| 17 |
|
| 18 |
try:
|
| 19 |
from picarones.engines.pero_ocr import PeroOCREngine
|
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateur OCR — Azure Document Intelligence (anciennement Form Recognizer).
|
| 2 |
+
|
| 3 |
+
Utilise l'API Azure Document Intelligence pour la reconnaissance de texte
|
| 4 |
+
dans des documents historiques.
|
| 5 |
+
|
| 6 |
+
Variables d'environnement requises :
|
| 7 |
+
- ``AZURE_DOC_INTEL_KEY`` : clé API Azure
|
| 8 |
+
- ``AZURE_DOC_INTEL_ENDPOINT`` : URL de l'endpoint (ex : https://moninstance.cognitiveservices.azure.com/)
|
| 9 |
+
|
| 10 |
+
Documentation : https://learn.microsoft.com/azure/ai-services/document-intelligence/
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
import base64
|
| 16 |
+
import json
|
| 17 |
+
import os
|
| 18 |
+
import time
|
| 19 |
+
import urllib.error
|
| 20 |
+
import urllib.request
|
| 21 |
+
from pathlib import Path
|
| 22 |
+
from typing import Optional
|
| 23 |
+
|
| 24 |
+
from picarones.engines.base import BaseOCREngine
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
class AzureDocIntelEngine(BaseOCREngine):
    """OCR engine backed by Azure Document Intelligence (formerly Form Recognizer).

    Uses the Azure SDK when installed, otherwise falls back to direct REST
    calls with asynchronous result polling.

    Configuration
    -------------
    model_id : str
        Azure model to use. Default: ``"prebuilt-read"`` (generic read).
        Alternatives: ``"prebuilt-document"``, ``"prebuilt-layout"``
        or a custom-trained model.
    locale : str
        Locale hint to improve accuracy (e.g. ``"fr-FR"``).
    api_version : str
        Azure API version (default: ``"2024-02-29-preview"``).
    """

    # Single source of truth for the default API version — the original
    # repeated this literal in both version() and __init__.
    _DEFAULT_API_VERSION = "2024-02-29-preview"

    @property
    def name(self) -> str:
        return "azure_doc_intel"

    def version(self) -> str:
        # Report the version this instance actually uses instead of
        # re-reading the raw config with a second hard-coded default.
        return self._api_version

    def __init__(self, config: Optional[dict] = None) -> None:
        super().__init__(config)
        self._api_key = os.environ.get("AZURE_DOC_INTEL_KEY")
        # The environment variable takes precedence over the config entry.
        self._endpoint = (
            os.environ.get("AZURE_DOC_INTEL_ENDPOINT", "").rstrip("/")
            or self.config.get("endpoint", "").rstrip("/")
        )
        self._model_id: str = self.config.get("model_id", "prebuilt-read")
        self._locale: str = self.config.get("locale", "fr-FR")
        self._api_version: str = self.config.get("api_version", self._DEFAULT_API_VERSION)

    def _run_ocr(self, image_path: Path) -> str:
        """Run OCR on *image_path*.

        Raises RuntimeError when credentials are missing or the remote
        analysis fails.
        """
        if not self._api_key:
            raise RuntimeError(
                "Clé API Azure manquante — définissez la variable d'environnement AZURE_DOC_INTEL_KEY"
            )
        if not self._endpoint:
            raise RuntimeError(
                "Endpoint Azure manquant — définissez la variable d'environnement AZURE_DOC_INTEL_ENDPOINT"
            )

        # Prefer the Azure SDK when installed, otherwise fall back to REST.
        try:
            return self._run_via_sdk(image_path)
        except ImportError:
            return self._run_via_rest(image_path)

    def _run_via_sdk(self, image_path: Path) -> str:
        """Analyze through the azure-ai-documentintelligence SDK."""
        from azure.ai.documentintelligence import DocumentIntelligenceClient
        from azure.core.credentials import AzureKeyCredential

        client = DocumentIntelligenceClient(
            endpoint=self._endpoint,
            credential=AzureKeyCredential(self._api_key),
        )
        with open(image_path, "rb") as f:
            poller = client.begin_analyze_document(
                model_id=self._model_id,
                body=f,
                locale=self._locale,
                content_type="application/octet-stream",
            )
            result = poller.result()
        return "\n".join(
            line.content
            for page in result.pages
            for line in (page.lines or [])
        )

    def _run_via_rest(self, image_path: Path) -> str:
        """Direct REST call (no Azure SDK): submit the image, then poll."""
        image_bytes = image_path.read_bytes()
        analyze_url = (
            f"{self._endpoint}/documentintelligence/documentModels/"
            f"{self._model_id}:analyze"
            f"?api-version={self._api_version}&locale={self._locale}"
        )

        # Submit the image.
        req = urllib.request.Request(
            analyze_url,
            data=image_bytes,
            headers={
                "Ocp-Apim-Subscription-Key": self._api_key,
                "Content-Type": "application/octet-stream",
            },
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as resp:
                operation_url = resp.headers.get("Operation-Location", "")
        except urllib.error.HTTPError as exc:
            raise RuntimeError(
                f"Azure Document Intelligence erreur {exc.code}: {exc.read().decode()}"
            ) from exc

        if not operation_url:
            raise RuntimeError("Azure : pas d'Operation-Location dans la réponse")

        # Poll for the result — the Azure analyze operation is asynchronous.
        headers = {"Ocp-Apim-Subscription-Key": self._api_key}
        for attempt in range(30):
            time.sleep(1 + attempt * 0.5)
            poll_req = urllib.request.Request(operation_url, headers=headers)
            try:
                with urllib.request.urlopen(poll_req, timeout=30) as resp:
                    result = json.loads(resp.read().decode("utf-8"))
            except urllib.error.HTTPError as exc:
                # Fix: a polling error previously escaped as a raw HTTPError;
                # surface it consistently as a RuntimeError like every other
                # failure path of this engine.
                raise RuntimeError(
                    f"Azure Document Intelligence erreur {exc.code} pendant le polling"
                ) from exc
            status = result.get("status", "")
            if status == "succeeded":
                return self._extract_text_from_result(result)
            if status in {"failed", "canceled"}:
                raise RuntimeError(f"Azure Document Intelligence : analyse {status}")
            # status == "running" → keep waiting with growing backoff.

        raise RuntimeError("Azure Document Intelligence : timeout — analyse trop longue")

    @staticmethod
    def _extract_text_from_result(result: dict) -> str:
        """Flatten the Azure JSON response into plain newline-joined text."""
        pages = result.get("analyzeResult", {}).get("pages", [])
        lines: list[str] = []
        for page in pages:
            for line in page.get("lines", []):
                content = line.get("content", "")
                if content:
                    lines.append(content)
        return "\n".join(lines)
|
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateur OCR — Google Cloud Vision API.
|
| 2 |
+
|
| 3 |
+
Utilise l'API Google Cloud Vision pour la détection de texte dans des
|
| 4 |
+
documents (méthode ``DOCUMENT_TEXT_DETECTION``, optimisée pour les textes
|
| 5 |
+
denses et multilinguistiques).
|
| 6 |
+
|
| 7 |
+
Authentification :
|
| 8 |
+
- Via service account JSON : variable d'environnement
|
| 9 |
+
``GOOGLE_APPLICATION_CREDENTIALS`` → chemin vers le fichier JSON
|
| 10 |
+
- Via clé API simple : variable d'environnement ``GOOGLE_API_KEY``
|
| 11 |
+
|
| 12 |
+
Le mode service account est recommandé pour la production.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
from __future__ import annotations
|
| 16 |
+
|
| 17 |
+
import base64
|
| 18 |
+
import json
|
| 19 |
+
import os
|
| 20 |
+
import urllib.error
|
| 21 |
+
import urllib.request
|
| 22 |
+
from pathlib import Path
|
| 23 |
+
from typing import Optional
|
| 24 |
+
|
| 25 |
+
from picarones.engines.base import BaseOCREngine
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class GoogleVisionEngine(BaseOCREngine):
    """OCR engine backed by the Google Cloud Vision API.

    Configuration
    -------------
    language_hints : list[str]
        Language hints (e.g. ``["fr"]``). Improves accuracy.
    feature_type : str
        Detection type: ``"DOCUMENT_TEXT_DETECTION"`` (default, for dense
        text) or ``"TEXT_DETECTION"`` (for short text).
    """

    @property
    def name(self) -> str:
        return "google_vision"

    def version(self) -> str:
        return "v1"

    def __init__(self, config: Optional[dict] = None) -> None:
        super().__init__(config)
        self._api_key = os.environ.get("GOOGLE_API_KEY")
        self._credentials_path = os.environ.get("GOOGLE_APPLICATION_CREDENTIALS")
        self._language_hints: list[str] = self.config.get("language_hints", ["fr"])
        self._feature_type: str = self.config.get("feature_type", "DOCUMENT_TEXT_DETECTION")

    def _run_ocr(self, image_path: Path) -> str:
        """Dispatch to the SDK (service account) or REST (API key) path.

        Fix: previously a missing google-cloud-vision package aborted the
        run even when GOOGLE_API_KEY was available; we now degrade
        gracefully to the REST path in that case.
        """
        if self._credentials_path:
            try:
                return self._run_via_sdk(image_path)
            except RuntimeError:
                # _run_via_sdk raises RuntimeError only when the SDK package
                # is missing; REST is a valid substitute if a key exists.
                if self._api_key:
                    return self._run_via_rest(image_path)
                raise
        if self._api_key:
            return self._run_via_rest(image_path)
        raise RuntimeError(
            "Authentification Google Vision manquante. Définissez "
            "GOOGLE_APPLICATION_CREDENTIALS (service account JSON) "
            "ou GOOGLE_API_KEY."
        )

    def _run_via_sdk(self, image_path: Path) -> str:
        """Run detection through the official google-cloud-vision SDK."""
        try:
            from google.cloud import vision
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'google-cloud-vision' n'est pas installé. "
                "Lancez : pip install google-cloud-vision"
            ) from exc

        client = vision.ImageAnnotatorClient()
        image = vision.Image(content=image_path.read_bytes())
        # Both detection modes take the same image context; build it once.
        context = vision.ImageContext(language_hints=self._language_hints)

        if self._feature_type == "DOCUMENT_TEXT_DETECTION":
            response = client.document_text_detection(image=image, image_context=context)
            return response.full_text_annotation.text
        response = client.text_detection(image=image, image_context=context)
        texts = response.text_annotations
        # For TEXT_DETECTION the first annotation carries the full text.
        return texts[0].description if texts else ""

    def _run_via_rest(self, image_path: Path) -> str:
        """Direct REST call (no SDK), authenticated with a simple API key."""
        image_b64 = base64.b64encode(image_path.read_bytes()).decode("ascii")
        payload = {
            "requests": [
                {
                    "image": {"content": image_b64},
                    "features": [{"type": self._feature_type, "maxResults": 1}],
                    "imageContext": {"languageHints": self._language_hints},
                }
            ]
        }
        url = f"https://vision.googleapis.com/v1/images:annotate?key={self._api_key}"
        data = json.dumps(payload).encode("utf-8")
        req = urllib.request.Request(
            url, data=data,
            headers={"Content-Type": "application/json"},
        )
        try:
            with urllib.request.urlopen(req, timeout=60) as resp:
                result = json.loads(resp.read().decode("utf-8"))
        except urllib.error.HTTPError as exc:
            raise RuntimeError(f"Google Vision API erreur {exc.code}: {exc.read().decode()}") from exc

        responses = result.get("responses", [{}])
        if not responses:
            return ""
        r = responses[0]
        # Per-request errors come back in the body, not as HTTP errors.
        if "error" in r:
            raise RuntimeError(f"Google Vision API erreur : {r['error']}")

        if self._feature_type == "DOCUMENT_TEXT_DETECTION":
            return r.get("fullTextAnnotation", {}).get("text", "")
        texts = r.get("textAnnotations", [])
        return texts[0]["description"] if texts else ""
|
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Adaptateur OCR — Mistral OCR (API vision Mistral AI).
|
| 2 |
+
|
| 3 |
+
Utilise l'API Mistral pour la reconnaissance de texte sur documents
|
| 4 |
+
patrimoniaux via le modèle multimodal Mistral.
|
| 5 |
+
|
| 6 |
+
Clé API : variable d'environnement ``MISTRAL_API_KEY``.
|
| 7 |
+
|
| 8 |
+
Documentation API : https://docs.mistral.ai/
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import base64
|
| 14 |
+
import os
|
| 15 |
+
from pathlib import Path
|
| 16 |
+
from typing import Optional
|
| 17 |
+
|
| 18 |
+
from picarones.engines.base import BaseOCREngine
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
class MistralOCREngine(BaseOCREngine):
    """OCR engine backed by the Mistral AI vision API (multimodal chat).

    Configuration
    -------------
    model : str
        Mistral model to use (default: ``"pixtral-12b-2409"``).
        Vision-capable multimodal models are ``pixtral-12b-2409`` and
        ``pixtral-large-latest``.
    prompt : str
        Prompt sent alongside the image. Defaults to a generic
        transcription instruction.
    max_tokens : int
        Output token limit (default: 4096).
    """

    @property
    def name(self) -> str:
        return "mistral_ocr"

    def version(self) -> str:
        return self.config.get("model", "pixtral-12b-2409")

    def __init__(self, config: Optional[dict] = None) -> None:
        super().__init__(config)
        self._api_key = os.environ.get("MISTRAL_API_KEY")
        self._model = self.config.get("model", "pixtral-12b-2409")
        self._prompt = self.config.get(
            "prompt",
            "Transcris fidèlement le texte visible sur cette image de document "
            "historique. Retourne uniquement le texte, sans commentaire.",
        )
        self._max_tokens = int(self.config.get("max_tokens", 4096))

    def _run_ocr(self, image_path: Path) -> str:
        """Send the image to the Mistral chat API and return the transcription."""
        if not self._api_key:
            raise RuntimeError(
                "Clé API Mistral manquante — définissez la variable d'environnement MISTRAL_API_KEY"
            )
        try:
            from mistralai import Mistral
        except ImportError as exc:
            raise RuntimeError(
                "Le package 'mistralai' n'est pas installé. Lancez : pip install mistralai"
            ) from exc

        # Build a data URI with the proper media type for the file extension.
        mime_by_ext = {
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".tif": "image/tiff",
            ".tiff": "image/tiff",
            ".webp": "image/webp",
        }
        mime = mime_by_ext.get(image_path.suffix.lower(), "image/jpeg")
        encoded = base64.b64encode(image_path.read_bytes()).decode("ascii")
        data_uri = f"data:{mime};base64,{encoded}"

        user_message = {
            "role": "user",
            "content": [
                {"type": "text", "text": self._prompt},
                {"type": "image_url", "image_url": data_uri},
            ],
        }
        client = Mistral(api_key=self._api_key)
        response = client.chat.complete(
            model=self._model,
            messages=[user_message],
            max_tokens=self._max_tokens,
        )
        return response.choices[0].message.content or ""
|
|
@@ -24,18 +24,19 @@ from picarones.pipelines.over_normalization import detect_over_normalization
|
|
| 24 |
# ---------------------------------------------------------------------------
|
| 25 |
|
| 26 |
_GT_TEXTS = [
|
| 27 |
-
|
| 28 |
-
"
|
| 29 |
-
"
|
| 30 |
-
"
|
| 31 |
-
"
|
| 32 |
-
"
|
| 33 |
-
"
|
| 34 |
-
"
|
| 35 |
-
"
|
| 36 |
-
"
|
| 37 |
-
"
|
| 38 |
-
"
|
|
|
|
| 39 |
]
|
| 40 |
|
| 41 |
# ---------------------------------------------------------------------------
|
|
|
|
| 24 |
# ---------------------------------------------------------------------------
|
| 25 |
|
| 26 |
_GT_TEXTS = [
    # Demo ground-truth texts using medieval French spellings — long s (ſ),
    # ampersand (&), u/v and i/j interchange — so the diplomatic CER
    # (normalization-aware) can be demonstrated against the plain CER.
    "Icy commence le prologue de maiſtre Jehan Froiſſart ſus les croniques de France & d'Angleterre.",
    "En l'an de grace mil trois cens ſoixante, regnoit en France le noble roy Jehan, filz du roy Phelippe de Valois.",
    "Item ledit iour furent menez en ladicte ville de Paris pluſieurs priſonniers ſaraſins & mahommetans.",
    "Le chancellier du roy manda à tous les baillifs & ſeneſchaulx que on feiſt crier & publier par tous les carrefours.",
    "Cy après ſenſuyt la copie des lettres patentes données par noſtre ſeigneur le roy à ſes très chiers & feaulx.",
    "Nous Charles, par la grace de Dieu roy de France, à tous ceulx qui ces preſentes lettres verront, ſalut.",
    "Sauoir faiſons que pour conſidéracion des bons & aggreables ſeruices que noſtre amé & feal conſeillier.",
    "Donné à Paris, le vingt & deuxième iour du mois de iuillet, l'an de grace mil quatre cens & troys.",
    "Les deſſus ditz ambaſſadeurs reſpondirent que leur ſeigneur & maiſtre eſtoit très ioyeulx de ceſte aliance.",
    "Après lesquelles choſes ainſi faictes & paſſées, le dit traictié fut ratiffié & confirmé de toutes parties.",
    "Item, en ladicte année, fut faicte grant aſſemblée de gens d'armes tant à cheual que à pied.",
    "Et pour ce que la choſe eſt notoire & manifeſte, nous auons fait mettre noſtre ſcel à ces preſentes.",
]
|
| 41 |
|
| 42 |
# ---------------------------------------------------------------------------
|
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Importeurs de corpus depuis des sources distantes (IIIF, HuggingFace, HTR-United…)."""
|
| 2 |
+
|
| 3 |
+
from picarones.importers.iiif import IIIFImporter, import_iiif_manifest
|
| 4 |
+
|
| 5 |
+
__all__ = ["IIIFImporter", "import_iiif_manifest"]
|
|
@@ -0,0 +1,583 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Import de corpus depuis des manifestes IIIF v2 et v3.
|
| 2 |
+
|
| 3 |
+
Fonctionnement
|
| 4 |
+
--------------
|
| 5 |
+
1. Téléchargement et parsing du manifeste JSON (v2 ou v3 auto-détecté)
|
| 6 |
+
2. Extraction de la liste des canvases (pages) avec leurs URL d'image
|
| 7 |
+
3. Sélection optionnelle d'un sous-ensemble de pages (ex : ``--pages 1-10``)
|
| 8 |
+
4. Téléchargement des images dans un dossier local
|
| 9 |
+
5. Création de fichiers GT vides (``.gt.txt``) à remplir manuellement,
|
| 10 |
+
OU chargement des annotations de transcription si présentes dans le manifeste
|
| 11 |
+
6. Construction et retour d'un objet ``Corpus``
|
| 12 |
+
|
| 13 |
+
Compatibilité
|
| 14 |
+
-------------
|
| 15 |
+
- IIIF Image API v2 et v3
|
| 16 |
+
- Manifestes Presentation API v2 et v3
|
| 17 |
+
- Instances : Gallica (BnF), Bodleian, British Library, BSB, e-codices,
|
| 18 |
+
Europeana, et tout entrepôt IIIF-compliant
|
| 19 |
+
|
| 20 |
+
Utilisation
|
| 21 |
+
-----------
|
| 22 |
+
>>> from picarones.importers.iiif import IIIFImporter
|
| 23 |
+
>>> importer = IIIFImporter("https://gallica.bnf.fr/ark:/12148/xxx/manifest.json")
|
| 24 |
+
>>> corpus = importer.import_corpus(pages="1-10", output_dir="./corpus/")
|
| 25 |
+
>>> print(f"{len(corpus)} documents téléchargés")
|
| 26 |
+
|
| 27 |
+
Ou via la fonction de commodité :
|
| 28 |
+
>>> from picarones.importers.iiif import import_iiif_manifest
|
| 29 |
+
>>> corpus = import_iiif_manifest("https://...", pages="1-5", output_dir="./corpus/")
|
| 30 |
+
"""
|
| 31 |
+
|
| 32 |
+
from __future__ import annotations
|
| 33 |
+
|
| 34 |
+
import json
|
| 35 |
+
import logging
|
| 36 |
+
import re
|
| 37 |
+
import time
|
| 38 |
+
import urllib.error
|
| 39 |
+
import urllib.request
|
| 40 |
+
from dataclasses import dataclass, field
|
| 41 |
+
from pathlib import Path
|
| 42 |
+
from typing import Iterator, Optional
|
| 43 |
+
|
| 44 |
+
from picarones.core.corpus import Corpus, Document
|
| 45 |
+
|
| 46 |
+
logger = logging.getLogger(__name__)
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
# ---------------------------------------------------------------------------
|
| 50 |
+
# Parsing du sélecteur de pages
|
| 51 |
+
# ---------------------------------------------------------------------------
|
| 52 |
+
|
| 53 |
+
def parse_page_selector(pages: str, total: int) -> list[int]:
    """Parse a page-selector string into a sorted list of 0-based indices.

    Accepted formats:
    - ``"1-10"`` → pages 1 through 10 (1-based)
    - ``"1,3,5"`` → pages 1, 3 and 5
    - ``"1-5,10,15-20"`` → any combination
    - ``"all"`` / ``""`` → every page

    Parameters
    ----------
    pages:
        Page selector as a string.
    total:
        Total number of pages in the manifest.

    Returns
    -------
    list[int]
        Selected 0-based page indices, sorted and de-duplicated.

    Raises
    ------
    ValueError
        If the syntax is invalid or a page number is out of bounds.
    """
    if not pages or pages.strip().lower() == "all":
        return list(range(total))

    indices: set[int] = set()
    for part in pages.split(","):
        part = part.strip()
        if "-" in part:
            m = re.fullmatch(r"(\d+)-(\d+)", part)
            if not m:
                raise ValueError(f"Sélecteur de pages invalide : '{part}'")
            start, end = int(m.group(1)), int(m.group(2))
            if start < 1 or end > total or start > end:
                raise ValueError(
                    f"Plage {start}-{end} hors bornes (1–{total})"
                )
            indices.update(range(start - 1, end))
        else:
            # Fix: a non-numeric part ("abc", or "" from a trailing comma)
            # used to leak int()'s cryptic "invalid literal" ValueError;
            # raise the same explicit message as malformed ranges instead.
            try:
                n = int(part)
            except ValueError:
                raise ValueError(f"Sélecteur de pages invalide : '{part}'") from None
            if n < 1 or n > total:
                raise ValueError(f"Page {n} hors bornes (1–{total})")
            indices.add(n - 1)
    return sorted(indices)
|
| 101 |
+
|
| 102 |
+
|
| 103 |
+
# ---------------------------------------------------------------------------
|
| 104 |
+
# Données d'un canvas IIIF
|
| 105 |
+
# ---------------------------------------------------------------------------
|
| 106 |
+
|
| 107 |
+
@dataclass
class IIIFCanvas:
    """Represents a single canvas (page) in a IIIF manifest."""

    index: int  # 0-based position within the manifest
    label: str  # human-readable label (e.g. "f. 1r", "Page 1")
    image_url: str  # URL of the full-resolution image
    width: Optional[int] = None  # canvas width, if declared in the manifest
    height: Optional[int] = None  # canvas height, if declared in the manifest
    transcription: Optional[str] = None  # GT text if annotated in the manifest
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
# ---------------------------------------------------------------------------
|
| 120 |
+
# Parseur de manifeste IIIF
|
| 121 |
+
# ---------------------------------------------------------------------------
|
| 122 |
+
|
| 123 |
+
class IIIFManifestParser:
    """Parser for IIIF Presentation API manifests, versions 2 and 3."""

    def __init__(self, manifest: dict) -> None:
        self._manifest = manifest
        self._version = self._detect_version()

    def _detect_version(self) -> int:
        """Return 3 for a Presentation API v3 manifest, 2 otherwise."""
        ctx = self._manifest.get("@context", "")
        if isinstance(ctx, list):
            ctx = " ".join(ctx)
        looks_v3 = "presentation/3" in ctx or self._manifest.get("type") == "Manifest"
        return 3 if looks_v3 else 2

    @property
    def version(self) -> int:
        return self._version

    @property
    def label(self) -> str:
        """Title of the manifest, coerced to a plain string."""
        return _extract_label(self._manifest.get("label", ""))

    @property
    def attribution(self) -> str:
        """Attribution statement (v2 ``attribution`` or v3 ``requiredStatement``)."""
        fallback = self._manifest.get("requiredStatement", "")
        return _extract_label(self._manifest.get("attribution", fallback))

    def canvases(self) -> list[IIIFCanvas]:
        """Return every canvas declared by the manifest, in document order."""
        parser = self._parse_v3_canvases if self._version == 3 else self._parse_v2_canvases
        return parser()

    def _parse_v2_canvases(self) -> list[IIIFCanvas]:
        """Parse ``sequences[0].canvases`` of a v2 manifest."""
        sequences = self._manifest.get("sequences", [])
        if not sequences:
            return []
        result: list[IIIFCanvas] = []
        for idx, raw in enumerate(sequences[0].get("canvases", [])):
            # Primary image: images[0].resource.@id, or the IIIF service.
            images = raw.get("images", [])
            url = _best_image_url_v2(images[0].get("resource", {}), raw) if images else ""
            result.append(
                IIIFCanvas(
                    index=idx,
                    label=_extract_label(raw.get("label", f"canvas_{idx + 1}")),
                    image_url=url,
                    width=raw.get("width"),
                    height=raw.get("height"),
                    # OA annotation lists may carry a transcription (GT).
                    transcription=_extract_v2_transcription(raw),
                )
            )
        return result

    def _parse_v3_canvases(self) -> list[IIIFCanvas]:
        """Parse the top-level ``items`` of a v3 manifest."""
        return [
            IIIFCanvas(
                index=idx,
                label=_extract_label(raw.get("label", f"canvas_{idx + 1}")),
                image_url=_best_image_url_v3(raw),
                width=raw.get("width"),
                height=raw.get("height"),
                transcription=_extract_v3_transcription(raw),
            )
            for idx, raw in enumerate(self._manifest.get("items", []))
        ]
|
| 204 |
+
|
| 205 |
+
|
| 206 |
+
# ---------------------------------------------------------------------------
|
| 207 |
+
# Helpers extraction URL et label
|
| 208 |
+
# ---------------------------------------------------------------------------
|
| 209 |
+
|
| 210 |
+
def _extract_label(raw: object) -> str:
|
| 211 |
+
"""Extrait une chaîne lisible depuis les différents formats de label IIIF."""
|
| 212 |
+
if isinstance(raw, str):
|
| 213 |
+
return raw
|
| 214 |
+
if isinstance(raw, list) and raw:
|
| 215 |
+
return _extract_label(raw[0])
|
| 216 |
+
if isinstance(raw, dict):
|
| 217 |
+
# IIIF v3 : {"fr": ["titre"], "en": ["title"]}
|
| 218 |
+
for lang in ("fr", "en", "none", "@value"):
|
| 219 |
+
val = raw.get(lang, "")
|
| 220 |
+
if val:
|
| 221 |
+
if isinstance(val, list):
|
| 222 |
+
return val[0] if val else ""
|
| 223 |
+
return str(val)
|
| 224 |
+
# Fallback: première valeur
|
| 225 |
+
for v in raw.values():
|
| 226 |
+
return _extract_label(v)
|
| 227 |
+
return str(raw) if raw else ""
|
| 228 |
+
|
| 229 |
+
|
| 230 |
+
def _best_image_url_v2(resource: dict, canvas: dict) -> str:
|
| 231 |
+
"""Construit l'URL d'image optimale depuis une ressource IIIF v2."""
|
| 232 |
+
# 1. URL directe de la ressource
|
| 233 |
+
direct = resource.get("@id", "")
|
| 234 |
+
if direct and not direct.endswith("/info.json"):
|
| 235 |
+
return direct
|
| 236 |
+
|
| 237 |
+
# 2. Via le service IIIF Image API
|
| 238 |
+
service = resource.get("service", {})
|
| 239 |
+
if isinstance(service, list) and service:
|
| 240 |
+
service = service[0]
|
| 241 |
+
service_id = service.get("@id", service.get("id", ""))
|
| 242 |
+
if service_id:
|
| 243 |
+
return f"{service_id.rstrip('/')}/full/max/0/default.jpg"
|
| 244 |
+
|
| 245 |
+
return direct
|
| 246 |
+
|
| 247 |
+
|
| 248 |
+
def _best_image_url_v3(canvas: dict) -> str:
|
| 249 |
+
"""Extrait l'URL d'image depuis un canvas IIIF v3."""
|
| 250 |
+
items = canvas.get("items", [])
|
| 251 |
+
for annotation_page in items:
|
| 252 |
+
for annotation in annotation_page.get("items", []):
|
| 253 |
+
body = annotation.get("body", {})
|
| 254 |
+
if isinstance(body, list):
|
| 255 |
+
body = body[0] if body else {}
|
| 256 |
+
# URL directe
|
| 257 |
+
url = body.get("id", body.get("@id", ""))
|
| 258 |
+
if url and body.get("type", "") == "Image":
|
| 259 |
+
return url
|
| 260 |
+
# Via service IIIF Image API
|
| 261 |
+
service = body.get("service", [])
|
| 262 |
+
if isinstance(service, dict):
|
| 263 |
+
service = [service]
|
| 264 |
+
for svc in service:
|
| 265 |
+
svc_id = svc.get("id", svc.get("@id", ""))
|
| 266 |
+
if svc_id:
|
| 267 |
+
return f"{svc_id.rstrip('/')}/full/max/0/default.jpg"
|
| 268 |
+
if url:
|
| 269 |
+
return url
|
| 270 |
+
return ""
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def _extract_v2_transcription(canvas: dict) -> Optional[str]:
|
| 274 |
+
"""Tente d'extraire le texte GT depuis les annotations OA d'un canvas v2."""
|
| 275 |
+
other_content = canvas.get("otherContent", [])
|
| 276 |
+
for oc in other_content:
|
| 277 |
+
if not isinstance(oc, dict):
|
| 278 |
+
continue
|
| 279 |
+
motivation = oc.get("motivation", "")
|
| 280 |
+
if "transcrib" in motivation.lower() or "supplementing" in motivation.lower():
|
| 281 |
+
resources = oc.get("resources", [])
|
| 282 |
+
texts = []
|
| 283 |
+
for res in resources:
|
| 284 |
+
body = res.get("resource", {})
|
| 285 |
+
if body.get("@type") == "cnt:ContentAsText":
|
| 286 |
+
texts.append(body.get("chars", ""))
|
| 287 |
+
if texts:
|
| 288 |
+
return "\n".join(texts)
|
| 289 |
+
return None
|
| 290 |
+
|
| 291 |
+
|
| 292 |
+
def _extract_v3_transcription(canvas: dict) -> Optional[str]:
|
| 293 |
+
"""Tente d'extraire le texte GT depuis les annotations d'un canvas v3."""
|
| 294 |
+
annotations = canvas.get("annotations", [])
|
| 295 |
+
for ann_page in annotations:
|
| 296 |
+
items = ann_page.get("items", [])
|
| 297 |
+
for ann in items:
|
| 298 |
+
motivation = ann.get("motivation", "")
|
| 299 |
+
if "transcrib" in motivation.lower() or "supplementing" in motivation.lower():
|
| 300 |
+
body = ann.get("body", {})
|
| 301 |
+
if isinstance(body, dict) and body.get("type") == "TextualBody":
|
| 302 |
+
return body.get("value", "")
|
| 303 |
+
return None
|
| 304 |
+
|
| 305 |
+
|
| 306 |
+
# ---------------------------------------------------------------------------
|
| 307 |
+
# Téléchargement avec retry
|
| 308 |
+
# ---------------------------------------------------------------------------
|
| 309 |
+
|
| 310 |
+
def _download_url(
    url: str,
    retries: int = 4,
    backoff: float = 2.0,
    timeout: int = 60,
) -> bytes:
    """Download *url* with exponential-backoff retries.

    Parameters
    ----------
    url:
        Resource to fetch.
    retries:
        Total number of attempts (the first try counts as attempt 0).
    backoff:
        Base of the exponential wait: attempt *n* sleeps ``backoff ** n`` s.
    timeout:
        Per-attempt socket timeout, in seconds.

    Returns
    -------
    bytes
        The raw response body.

    Raises
    ------
    RuntimeError
        When every attempt failed; chained to the last network error.
    """
    headers = {
        "User-Agent": "Picarones/1.0 (BnF OCR benchmark platform; https://github.com/bnf/picarones)"
    }
    last_exc: Optional[Exception] = None
    for attempt in range(retries):
        if attempt > 0:
            wait = backoff ** attempt
            logger.debug("Retry %d/%d dans %.1fs — %s", attempt, retries - 1, wait, url)
            time.sleep(wait)
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req, timeout=timeout) as resp:
                return resp.read()
        except OSError as exc:
            # Fix: URLError/HTTPError and socket timeouts are all OSError
            # subclasses; the previous (URLError, HTTPError) tuple was
            # redundant AND missed socket.timeout raised by resp.read().
            last_exc = exc
            logger.warning("Erreur téléchargement %s : %s", url, exc)
    raise RuntimeError(f"Impossible de télécharger {url} après {retries} tentatives") from last_exc
|
| 334 |
+
|
| 335 |
+
|
| 336 |
+
def _fetch_manifest(url: str) -> dict:
    """Download a IIIF manifest and decode it as JSON.

    Raises ``ValueError`` when the payload is not well-formed JSON;
    download failures propagate as ``RuntimeError`` from ``_download_url``.
    """
    payload = _download_url(url)
    try:
        manifest = json.loads(payload.decode("utf-8"))
    except json.JSONDecodeError as exc:
        raise ValueError(f"Manifeste IIIF invalide (JSON mal formé) : {url}") from exc
    return manifest
|
| 343 |
+
|
| 344 |
+
|
| 345 |
+
# ---------------------------------------------------------------------------
|
| 346 |
+
# Importeur principal
|
| 347 |
+
# ---------------------------------------------------------------------------
|
| 348 |
+
|
| 349 |
+
class IIIFImporter:
    """Import a corpus from a IIIF manifest.

    Parameters
    ----------
    manifest_url:
        URL of the IIIF manifest (Presentation API v2 or v3).
    max_resolution:
        Maximum width (in pixels) of the downloaded images.
        0 = highest available resolution.
    """

    def __init__(
        self,
        manifest_url: str,
        max_resolution: int = 0,
    ) -> None:
        self.manifest_url = manifest_url
        self.max_resolution = max_resolution
        # Both populated lazily by load(); the `parser` property triggers
        # load() on first access.
        self._manifest: Optional[dict] = None
        self._parser: Optional[IIIFManifestParser] = None

    def load(self) -> "IIIFImporter":
        """Download and parse the manifest. Returns self for chaining."""
        logger.info("Téléchargement du manifeste IIIF : %s", self.manifest_url)
        self._manifest = _fetch_manifest(self.manifest_url)
        self._parser = IIIFManifestParser(self._manifest)
        logger.info(
            "Manifeste chargé — version IIIF %d — titre : %s — %d canvas",
            self._parser.version,
            self._parser.label,
            len(self._parser.canvases()),
        )
        return self

    @property
    def parser(self) -> IIIFManifestParser:
        # Lazy accessor: fetches and parses the manifest on first use.
        if self._parser is None:
            self.load()
        return self._parser  # type: ignore[return-value]

    def list_canvases(self, pages: str = "all") -> list[IIIFCanvas]:
        """Return the canvases selected by the page-selector string."""
        all_canvases = self.parser.canvases()
        indices = parse_page_selector(pages, len(all_canvases))
        return [all_canvases[i] for i in indices]

    def import_corpus(
        self,
        pages: str = "all",
        output_dir: Optional[str | Path] = None,
        show_progress: bool = True,
    ) -> Corpus:
        """Download the images and build a Picarones corpus.

        If canvases carry transcription (GT) annotations, they are saved
        into the ``.gt.txt`` files; otherwise empty ``.gt.txt`` files are
        created.

        Parameters
        ----------
        pages:
            Page selector (e.g. ``"1-10"``, ``"1,3,5"``).
        output_dir:
            Destination folder for images and GT files.
            If None, the corpus is returned in memory without disk writes.
        show_progress:
            Display a tqdm progress bar (silently skipped if tqdm is absent).

        Returns
        -------
        Corpus
            A corpus ready to be used with ``run_benchmark``.

        Raises
        ------
        ValueError
            When no canvas matches the selector, or every selected canvas
            fails to download.
        """
        canvases = self.list_canvases(pages)
        if not canvases:
            raise ValueError("Aucun canvas sélectionné.")

        out_dir: Optional[Path] = Path(output_dir) if output_dir else None
        if out_dir:
            out_dir.mkdir(parents=True, exist_ok=True)

        # Corpus name comes from the manifest title.
        corpus_name = self.parser.label or "iiif_corpus"

        documents: list[Document] = []
        iterator: Iterator[IIIFCanvas] = iter(canvases)

        if show_progress:
            try:
                from tqdm import tqdm
                iterator = tqdm(canvases, desc="Import IIIF", unit="page")
            except ImportError:
                pass

        for canvas in iterator:
            # NOTE(review): identical canvas labels slugify to the same
            # doc_id, so later pages would overwrite earlier files on
            # disk — confirm labels are unique per manifest.
            doc_id = f"{_slugify(canvas.label) or f'canvas_{canvas.index+1:04d}'}"

            if not canvas.image_url:
                logger.warning("Canvas %s : pas d'URL d'image — ignoré.", canvas.label)
                continue

            # Cap the resolution when max_resolution is set.
            image_url = self._adjust_resolution(canvas.image_url, canvas.width)

            # Download the image; a failed canvas is skipped, not fatal.
            try:
                image_bytes = _download_url(image_url)
            except RuntimeError as exc:
                logger.error("Canvas %s : erreur téléchargement : %s", canvas.label, exc)
                continue

            # Pick the image file extension from the URL.
            ext = _guess_extension(image_url)

            if out_dir:
                # Persist image + ground-truth text side by side.
                image_path = out_dir / f"{doc_id}{ext}"
                image_path.write_bytes(image_bytes)

                gt_path = out_dir / f"{doc_id}.gt.txt"
                gt_text = canvas.transcription or ""
                gt_path.write_text(gt_text, encoding="utf-8")

                documents.append(Document(
                    image_path=image_path,
                    ground_truth=gt_text,
                    doc_id=doc_id,
                    metadata={"iiif_label": canvas.label, "canvas_index": canvas.index},
                ))
            else:
                # In-memory corpus: the image is still materialised as a
                # temp file because Document expects a filesystem path.
                # NOTE(review): delete=False temp files are never removed
                # here — presumably the caller cleans up; confirm.
                import tempfile
                tmp = tempfile.NamedTemporaryFile(suffix=ext, delete=False)
                tmp.write(image_bytes)
                tmp.close()
                documents.append(Document(
                    image_path=Path(tmp.name),
                    ground_truth=canvas.transcription or "",
                    doc_id=doc_id,
                    metadata={"iiif_label": canvas.label, "canvas_index": canvas.index},
                ))

        if not documents:
            raise ValueError("Aucun document importé depuis le manifeste IIIF.")

        logger.info("Import IIIF terminé : %d documents.", len(documents))

        return Corpus(
            name=corpus_name,
            documents=documents,
            source_path=self.manifest_url,
            metadata={
                "iiif_manifest_url": self.manifest_url,
                "iiif_version": self.parser.version,
                "iiif_attribution": self.parser.attribution,
                "pages_selected": pages,
            },
        )

    def _adjust_resolution(self, image_url: str, canvas_width: Optional[int]) -> str:
        """Rewrite a IIIF Image API URL so its width honours max_resolution.

        No-op when max_resolution is 0, the canvas width is unknown, or the
        image is already small enough. Only URLs containing a
        ``/full/max/`` or ``/full/full/`` size segment are rewritten.
        """
        if not self.max_resolution or not canvas_width:
            return image_url
        if canvas_width <= self.max_resolution:
            return image_url
        # Replace /full/max/ or /full/full/ with /full/{w},/ (IIIF size syntax).
        url = re.sub(
            r"/full/(max|full)/",
            f"/full/{self.max_resolution},/",
            image_url,
        )
        return url
|
| 522 |
+
|
| 523 |
+
|
| 524 |
+
# ---------------------------------------------------------------------------
|
| 525 |
+
# Helpers utilitaires
|
| 526 |
+
# ---------------------------------------------------------------------------
|
| 527 |
+
|
| 528 |
+
def _slugify(text: str) -> str:
|
| 529 |
+
"""Convertit un label IIIF en identifiant de fichier sûr."""
|
| 530 |
+
text = re.sub(r"[^\w\s-]", "", text.strip())
|
| 531 |
+
text = re.sub(r"[\s_-]+", "_", text)
|
| 532 |
+
return text[:60]
|
| 533 |
+
|
| 534 |
+
|
| 535 |
+
def _guess_extension(url: str) -> str:
|
| 536 |
+
"""Détermine l'extension de l'image depuis l'URL."""
|
| 537 |
+
url_lower = url.lower().split("?")[0]
|
| 538 |
+
for ext in (".jpg", ".jpeg", ".png", ".tif", ".tiff", ".webp"):
|
| 539 |
+
if url_lower.endswith(ext):
|
| 540 |
+
return ext
|
| 541 |
+
# Par défaut pour les URLs IIIF Image API
|
| 542 |
+
if "/default." in url_lower or "/native." in url_lower:
|
| 543 |
+
return ".jpg"
|
| 544 |
+
return ".jpg"
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
# ---------------------------------------------------------------------------
|
| 548 |
+
# Fonction de commodité
|
| 549 |
+
# ---------------------------------------------------------------------------
|
| 550 |
+
|
| 551 |
+
def import_iiif_manifest(
    manifest_url: str,
    pages: str = "all",
    output_dir: Optional[str | Path] = None,
    max_resolution: int = 0,
    show_progress: bool = True,
) -> Corpus:
    """One-call convenience wrapper around :class:`IIIFImporter`.

    Parameters
    ----------
    manifest_url:
        URL of the IIIF manifest (v2 or v3).
    pages:
        Page selector (e.g. ``"1-10"``, ``"1,3,5"``). ``"all"`` by default.
    output_dir:
        Destination folder. If None, the corpus stays in memory.
    max_resolution:
        Maximum resolution (px). 0 = no limit.
    show_progress:
        Display a progress bar.

    Returns
    -------
    Corpus
    """
    # load() returns the importer itself, so the calls chain safely.
    return (
        IIIFImporter(manifest_url, max_resolution=max_resolution)
        .load()
        .import_corpus(
            pages=pages,
            output_dir=output_dir,
            show_progress=show_progress,
        )
    )
|
|
@@ -69,6 +69,7 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 69 |
engines_summary = []
|
| 70 |
for report in benchmark.engine_reports:
|
| 71 |
agg = report.aggregated_metrics
|
|
|
|
| 72 |
entry: dict = {
|
| 73 |
"name": report.engine_name,
|
| 74 |
"version": report.engine_version,
|
|
@@ -81,12 +82,20 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 81 |
"cer_max": _safe(agg.get("cer", {}).get("max")),
|
| 82 |
"doc_count": agg.get("document_count", 0),
|
| 83 |
"failed": agg.get("failed_count", 0),
|
|
|
|
|
|
|
|
|
|
| 84 |
# Distribution pour l'histogramme : liste des CER individuels
|
| 85 |
"cer_values": [
|
| 86 |
_safe(dr.metrics.cer)
|
| 87 |
for dr in report.document_results
|
| 88 |
if dr.metrics.error is None
|
| 89 |
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
# Champs pipeline OCR+LLM (vides pour les moteurs OCR seuls)
|
| 91 |
"is_pipeline": report.is_pipeline,
|
| 92 |
"pipeline_info": report.pipeline_info,
|
|
@@ -121,6 +130,7 @@ def _build_report_data(benchmark: BenchmarkResult, images_b64: dict[str, str]) -
|
|
| 121 |
"engine": engine_name,
|
| 122 |
"hypothesis": dr.hypothesis,
|
| 123 |
"cer": _safe(dr.metrics.cer),
|
|
|
|
| 124 |
"wer": _safe(dr.metrics.wer),
|
| 125 |
"duration": dr.duration_seconds,
|
| 126 |
"error": dr.engine_error,
|
|
@@ -622,7 +632,8 @@ footer {{
|
|
| 622 |
<tr>
|
| 623 |
<th data-col="rank" class="sortable sorted" data-dir="asc">#<i class="sort-icon">↑</i></th>
|
| 624 |
<th data-col="name" class="sortable">Concurrent<i class="sort-icon">↕</i></th>
|
| 625 |
-
<th data-col="cer" class="sortable">CER<i class="sort-icon">↕</i></th>
|
|
|
|
| 626 |
<th data-col="wer" class="sortable">WER<i class="sort-icon">↕</i></th>
|
| 627 |
<th data-col="mer" class="sortable">MER<i class="sort-icon">↕</i></th>
|
| 628 |
<th data-col="wil" class="sortable">WIL<i class="sort-icon">↕</i></th>
|
|
@@ -906,6 +917,18 @@ function renderRanking() {{
|
|
| 906 |
overNormCell = `<td><span class="${{cls}}" title="Classe 10 — ${{on.over_normalized_count}} mots corrects dégradés sur ${{on.total_correct_ocr_words}}">${{onPct}} %</span></td>`;
|
| 907 |
}}
|
| 908 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 909 |
return `<tr>
|
| 910 |
<td><span class="${{badgeClass}}">${{rank}}</span></td>
|
| 911 |
<td>
|
|
@@ -918,6 +941,7 @@ function renderRanking() {{
|
|
| 918 |
<span class="bar" style="width:${{barW}}px;background:${{cerC}}"></span>
|
| 919 |
<span class="cer-badge" style="color:${{cerC}};background:${{cerB}}">${{pct(e.cer)}}</span>
|
| 920 |
</td>
|
|
|
|
| 921 |
<td>${{pct(e.wer)}}</td>
|
| 922 |
<td>${{pct(e.mer)}}</td>
|
| 923 |
<td>${{pct(e.wil)}}</td>
|
|
@@ -1109,12 +1133,23 @@ function loadDocument(docId) {{
|
|
| 1109 |
</div>`;
|
| 1110 |
}}
|
| 1111 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1112 |
return `<div class="diff-panel">
|
| 1113 |
<div class="diff-panel-header">
|
| 1114 |
<span class="diff-panel-title">${{esc(er.engine)}}</span>
|
| 1115 |
${{pipeTagPanel}}
|
| 1116 |
<span class="diff-panel-metrics">
|
| 1117 |
<span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
|
|
|
|
| 1118 |
<span class="badge" style="background:#f1f5f9">WER ${{pct(er.wer)}}</span>
|
| 1119 |
${{onBadge}}
|
| 1120 |
${{errBadge}}
|
|
|
|
| 69 |
engines_summary = []
|
| 70 |
for report in benchmark.engine_reports:
|
| 71 |
agg = report.aggregated_metrics
|
| 72 |
+
diplo_agg = agg.get("cer_diplomatic", {})
|
| 73 |
entry: dict = {
|
| 74 |
"name": report.engine_name,
|
| 75 |
"version": report.engine_version,
|
|
|
|
| 82 |
"cer_max": _safe(agg.get("cer", {}).get("max")),
|
| 83 |
"doc_count": agg.get("document_count", 0),
|
| 84 |
"failed": agg.get("failed_count", 0),
|
| 85 |
+
# CER diplomatique (après normalisation historique : ſ=s, u=v, i=j…)
|
| 86 |
+
"cer_diplomatic": _safe(diplo_agg.get("mean")) if diplo_agg else None,
|
| 87 |
+
"cer_diplomatic_profile": diplo_agg.get("profile"),
|
| 88 |
# Distribution pour l'histogramme : liste des CER individuels
|
| 89 |
"cer_values": [
|
| 90 |
_safe(dr.metrics.cer)
|
| 91 |
for dr in report.document_results
|
| 92 |
if dr.metrics.error is None
|
| 93 |
],
|
| 94 |
+
"cer_diplomatic_values": [
|
| 95 |
+
_safe(dr.metrics.cer_diplomatic)
|
| 96 |
+
for dr in report.document_results
|
| 97 |
+
if dr.metrics.error is None and dr.metrics.cer_diplomatic is not None
|
| 98 |
+
],
|
| 99 |
# Champs pipeline OCR+LLM (vides pour les moteurs OCR seuls)
|
| 100 |
"is_pipeline": report.is_pipeline,
|
| 101 |
"pipeline_info": report.pipeline_info,
|
|
|
|
| 130 |
"engine": engine_name,
|
| 131 |
"hypothesis": dr.hypothesis,
|
| 132 |
"cer": _safe(dr.metrics.cer),
|
| 133 |
+
"cer_diplomatic": _safe(dr.metrics.cer_diplomatic) if dr.metrics.cer_diplomatic is not None else None,
|
| 134 |
"wer": _safe(dr.metrics.wer),
|
| 135 |
"duration": dr.duration_seconds,
|
| 136 |
"error": dr.engine_error,
|
|
|
|
| 632 |
<tr>
|
| 633 |
<th data-col="rank" class="sortable sorted" data-dir="asc">#<i class="sort-icon">↑</i></th>
|
| 634 |
<th data-col="name" class="sortable">Concurrent<i class="sort-icon">↕</i></th>
|
| 635 |
+
<th data-col="cer" class="sortable">CER exact<i class="sort-icon">↕</i></th>
|
| 636 |
+
<th data-col="cer_diplomatic" class="sortable" title="CER après normalisation diplomatique (ſ=s, u=v, i=j…) — mesure les erreurs substantielles en ignorant les variantes graphiques codifiées">CER diplo.<i class="sort-icon">↕</i></th>
|
| 637 |
<th data-col="wer" class="sortable">WER<i class="sort-icon">↕</i></th>
|
| 638 |
<th data-col="mer" class="sortable">MER<i class="sort-icon">↕</i></th>
|
| 639 |
<th data-col="wil" class="sortable">WIL<i class="sort-icon">↕</i></th>
|
|
|
|
| 917 |
overNormCell = `<td><span class="${{cls}}" title="Classe 10 — ${{on.over_normalized_count}} mots corrects dégradés sur ${{on.total_correct_ocr_words}}">${{onPct}} %</span></td>`;
|
| 918 |
}}
|
| 919 |
|
| 920 |
+
// CER diplomatique
|
| 921 |
+
let diploCerCell = '<td style="color:var(--text-muted)">—</td>';
|
| 922 |
+
if (e.cer_diplomatic !== null && e.cer_diplomatic !== undefined) {{
|
| 923 |
+
const dipC = cerColor(e.cer_diplomatic); const dipB = cerBg(e.cer_diplomatic);
|
| 924 |
+
const delta = e.cer - e.cer_diplomatic;
|
| 925 |
+
const deltaStr = delta > 0.001 ? ` <span style="font-size:.65rem;color:#059669">-${{(delta*100).toFixed(1)}}%</span>` : '';
|
| 926 |
+
const profileHint = e.cer_diplomatic_profile ? ` title="Profil : ${{esc(e.cer_diplomatic_profile)}}"` : '';
|
| 927 |
+
diploCerCell = `<td${{profileHint}}>
|
| 928 |
+
<span class="cer-badge" style="color:${{dipC}};background:${{dipB}}">${{pct(e.cer_diplomatic)}}</span>${{deltaStr}}
|
| 929 |
+
</td>`;
|
| 930 |
+
}}
|
| 931 |
+
|
| 932 |
return `<tr>
|
| 933 |
<td><span class="${{badgeClass}}">${{rank}}</span></td>
|
| 934 |
<td>
|
|
|
|
| 941 |
<span class="bar" style="width:${{barW}}px;background:${{cerC}}"></span>
|
| 942 |
<span class="cer-badge" style="color:${{cerC}};background:${{cerB}}">${{pct(e.cer)}}</span>
|
| 943 |
</td>
|
| 944 |
+
${{diploCerCell}}
|
| 945 |
<td>${{pct(e.wer)}}</td>
|
| 946 |
<td>${{pct(e.mer)}}</td>
|
| 947 |
<td>${{pct(e.wil)}}</td>
|
|
|
|
| 1133 |
</div>`;
|
| 1134 |
}}
|
| 1135 |
|
| 1136 |
+
// CER diplomatique par document
|
| 1137 |
+
let diplomaBadge = '';
|
| 1138 |
+
if (er.cer_diplomatic !== null && er.cer_diplomatic !== undefined) {{
|
| 1139 |
+
const dipC = cerColor(er.cer_diplomatic); const dipB = cerBg(er.cer_diplomatic);
|
| 1140 |
+
const delta = er.cer - er.cer_diplomatic;
|
| 1141 |
+
const deltaHint = delta > 0.001 ? ` (−${{(delta*100).toFixed(1)}}% avec normalisation)` : '';
|
| 1142 |
+
diplomaBadge = `<span class="cer-badge" style="color:${{dipC}};background:${{dipB}};opacity:.85"
|
| 1143 |
+
title="CER diplomatique (ſ=s, u=v, i=j…)${{deltaHint}}">diplo. ${{pct(er.cer_diplomatic)}}</span>`;
|
| 1144 |
+
}}
|
| 1145 |
+
|
| 1146 |
return `<div class="diff-panel">
|
| 1147 |
<div class="diff-panel-header">
|
| 1148 |
<span class="diff-panel-title">${{esc(er.engine)}}</span>
|
| 1149 |
${{pipeTagPanel}}
|
| 1150 |
<span class="diff-panel-metrics">
|
| 1151 |
<span class="cer-badge" style="color:${{c}};background:${{bg}}">${{pct(er.cer)}}</span>
|
| 1152 |
+
${{diplomaBadge}}
|
| 1153 |
<span class="badge" style="background:#f1f5f9">WER ${{pct(er.wer)}}</span>
|
| 1154 |
${{onBadge}}
|
| 1155 |
${{errBadge}}
|
|
The diff for this file is too large to render.
See raw diff
|
|
|
|
@@ -0,0 +1,834 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests Sprint 4 : normalisation diplomatique, import IIIF, adaptateurs API OCR."""
|
| 2 |
+
|
| 3 |
+
from __future__ import annotations
|
| 4 |
+
|
| 5 |
+
import json
|
| 6 |
+
import os
|
| 7 |
+
import pytest
|
| 8 |
+
|
| 9 |
+
from picarones.core.normalization import (
|
| 10 |
+
NormalizationProfile,
|
| 11 |
+
DIPLOMATIC_FR_MEDIEVAL,
|
| 12 |
+
DIPLOMATIC_FR_EARLY_MODERN,
|
| 13 |
+
DIPLOMATIC_LATIN_MEDIEVAL,
|
| 14 |
+
DIPLOMATIC_MINIMAL,
|
| 15 |
+
DEFAULT_DIPLOMATIC_PROFILE,
|
| 16 |
+
_apply_diplomatic_table,
|
| 17 |
+
get_builtin_profile,
|
| 18 |
+
)
|
| 19 |
+
from picarones.core.metrics import compute_metrics, aggregate_metrics, MetricsResult
|
| 20 |
+
from picarones.importers.iiif import (
|
| 21 |
+
IIIFManifestParser,
|
| 22 |
+
IIIFCanvas,
|
| 23 |
+
parse_page_selector,
|
| 24 |
+
_extract_label,
|
| 25 |
+
_best_image_url_v2,
|
| 26 |
+
_best_image_url_v3,
|
| 27 |
+
_guess_extension,
|
| 28 |
+
_slugify,
|
| 29 |
+
)
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ===========================================================================
|
| 33 |
+
# Tests NormalizationProfile
|
| 34 |
+
# ===========================================================================
|
| 35 |
+
|
| 36 |
+
class TestNormalizationProfile:
    """Unit tests for NormalizationProfile construction and normalize()."""

    def test_default_nfc_only(self):
        # A bare profile enables NFC only: no casefolding, empty table.
        profile = NormalizationProfile(name="test")
        assert profile.nfc is True
        assert profile.caseless is False
        assert profile.diplomatic_table == {}

    def test_normalize_nfc(self):
        profile = NormalizationProfile(name="nfc_only")
        # NFD vs NFC: after NFC both representations must be identical.
        decomposed = "e\u0301"  # e + combining acute accent
        assert profile.normalize(decomposed) == "\u00e9"  # precomposed é (NFC)

    def test_normalize_caseless(self):
        profile = NormalizationProfile(name="caseless", caseless=True)
        assert profile.normalize("Bonjour MONDE") == "bonjour monde"

    def test_normalize_diplomatic_table(self):
        profile = NormalizationProfile(
            name="test",
            diplomatic_table={"ſ": "s", "u": "v"}
        )
        # "maiſon": ſ→s gives "maison"; no u present → "maison"
        assert profile.normalize("maiſon") == "maison"
        # "uers" (archaic "vers"): u→v gives "vers"
        assert profile.normalize("uers") == "vers"

    def test_normalize_order_nfc_then_caseless_then_diplomatic(self):
        """The order is: NFC → caseless → diplomatic table."""
        profile = NormalizationProfile(
            name="combined",
            caseless=True,
            diplomatic_table={"ſ": "s"}
        )
        result = profile.normalize("Maiſon")
        assert result == "maison"

    def test_as_dict(self):
        # Round-trip serialisation: as_dict must expose every field.
        profile = NormalizationProfile(
            name="medieval_french",
            nfc=True,
            caseless=False,
            diplomatic_table={"ſ": "s"},
            description="Test",
        )
        d = profile.as_dict()
        assert d["name"] == "medieval_french"
        assert d["diplomatic_table"] == {"ſ": "s"}
        assert d["caseless"] is False

    def test_from_dict(self):
        # NOTE: from_dict reads the table from the "diplomatic" key,
        # not "diplomatic_table".
        data = {
            "name": "custom",
            "caseless": True,
            "diplomatic": {"ſ": "s", "u": "v"},
            "description": "Custom profile",
        }
        profile = NormalizationProfile.from_dict(data)
        assert profile.name == "custom"
        assert profile.caseless is True
        assert profile.diplomatic_table == {"ſ": "s", "u": "v"}

    def test_from_dict_defaults(self):
        # An empty dict yields the documented defaults.
        profile = NormalizationProfile.from_dict({})
        assert profile.name == "custom"
        assert profile.nfc is True
        assert profile.caseless is False

    def test_from_yaml(self, tmp_path):
        # \u017f is the long s (ſ) used as a mapping key in the YAML file.
        yaml_content = "name: my_profile\ncaseless: false\ndiplomatic:\n \u017f: s\n u: v\n"
        yaml_file = tmp_path / "profile.yaml"
        yaml_file.write_text(yaml_content, encoding="utf-8")
        try:
            profile = NormalizationProfile.from_yaml(yaml_file)
            assert profile.name == "my_profile"
            assert profile.diplomatic_table == {"\u017f": "s", "u": "v"}
        except RuntimeError as e:
            # from_yaml signals a missing pyyaml dependency via RuntimeError.
            if "pyyaml" in str(e):
                pytest.skip("pyyaml non installé")
            raise
|
| 117 |
+
|
| 118 |
+
|
| 119 |
+
class TestApplyDiplomaticTable:
    """Unit tests for the _apply_diplomatic_table substitution helper."""

    def test_simple_substitutions(self):
        table = {"ſ": "s", "u": "v"}
        # "maiſon": ſ→s gives "maison"; no u → "maison"
        assert _apply_diplomatic_table("maiſon", table) == "maison"
        # "uers": u→v gives "vers"
        assert _apply_diplomatic_table("uers", table) == "vers"

    def test_multi_char_key_priority(self):
        """Multi-character keys are applied before single-character keys."""
        table = {"ae": "X", "a": "Y"}
        # "ae" must be replaced by "X", not "Ye"
        result = _apply_diplomatic_table("aeb", table)
        assert result == "Xb"

    def test_ampersand_to_et(self):
        # Tironian-style abbreviation expansion (& → et).
        table = {"&": "et"}
        assert _apply_diplomatic_table("noir & blanc", table) == "noir et blanc"

    def test_empty_table(self):
        # An empty table is the identity transform.
        assert _apply_diplomatic_table("hello", {}) == "hello"

    def test_empty_text(self):
        # Empty input stays empty regardless of the table.
        assert _apply_diplomatic_table("", {"a": "b"}) == ""
|
| 144 |
+
|
| 145 |
+
|
| 146 |
+
class TestGetBuiltinProfile:
    """Lookup of the preconfigured normalization profiles."""

    def test_medieval_french(self):
        prof = get_builtin_profile("medieval_french")
        assert prof.name == "medieval_french"
        # ſ must be mapped to s in the medieval table
        assert prof.diplomatic_table.get("ſ") == "s"

    def test_early_modern_french(self):
        assert "ſ" in get_builtin_profile("early_modern_french").diplomatic_table

    def test_medieval_latin(self):
        assert "ꝑ" in get_builtin_profile("medieval_latin").diplomatic_table

    def test_minimal(self):
        table = get_builtin_profile("minimal").diplomatic_table
        assert "ſ" in table
        assert "u" not in table

    def test_nfc(self):
        prof = get_builtin_profile("nfc")
        assert prof.diplomatic_table == {}
        assert prof.nfc is True

    def test_caseless(self):
        assert get_builtin_profile("caseless").caseless is True

    def test_unknown_raises_key_error(self):
        with pytest.raises(KeyError, match="inexistant"):
            get_builtin_profile("inexistant")

    def test_default_profile_is_medieval_french(self):
        assert DEFAULT_DIPLOMATIC_PROFILE.name == "medieval_french"
|
| 182 |
+
|
| 183 |
+
|
| 184 |
+
# ===========================================================================
|
| 185 |
+
# Tests CER diplomatique dans compute_metrics
|
| 186 |
+
# ===========================================================================
|
| 187 |
+
|
| 188 |
+
class TestDiplomaticCER:
    """Diplomatic CER in compute_metrics() and aggregate_metrics()."""

    def test_cer_diplomatic_computed_by_default(self):
        """The diplomatic CER is computed by default with the medieval profile."""
        result = compute_metrics("maiſon", "maison")
        assert result.cer_diplomatic is not None
        assert result.diplomatic_profile_name == "medieval_french"

    def test_cer_diplomatic_lower_than_exact_for_long_s(self):
        """
        With ſ→s, the diplomatic CER must be 0.0 for "maiſon" vs "maison":
        after normalization both strings map to the same text, so no
        character edits remain.
        """
        # "maiſon" vs "maison" — the only difference is ſ vs s
        result = compute_metrics("maiſon", "maison")
        # Raw CER > 0: ſ and s are distinct characters (one substitution)
        assert result.cer > 0.0
        # Diplomatic CER = 0: ſ and s are equivalent under the medieval profile
        assert result.cer_diplomatic == pytest.approx(0.0)

    def test_cer_diplomatic_in_as_dict(self):
        result = compute_metrics("maiſon", "maison")
        d = result.as_dict()
        assert "cer_diplomatic" in d
        assert "diplomatic_profile_name" in d

    def test_cer_diplomatic_with_custom_profile(self):
        from picarones.core.normalization import NormalizationProfile
        profile = NormalizationProfile(
            name="test_profile",
            diplomatic_table={"ſ": "s"}
        )
        result = compute_metrics("maiſon", "maison", normalization_profile=profile)
        assert result.cer_diplomatic == pytest.approx(0.0)
        assert result.diplomatic_profile_name == "test_profile"

    def test_cer_diplomatic_not_in_as_dict_when_none(self):
        """When the diplomatic CER could not be computed, as_dict omits it."""
        result = MetricsResult(
            cer=0.1, cer_nfc=0.1, cer_caseless=0.1,
            wer=0.1, wer_normalized=0.1, mer=0.1, wil=0.1,
            reference_length=10, hypothesis_length=10,
            cer_diplomatic=None, diplomatic_profile_name=None,
        )
        d = result.as_dict()
        assert "cer_diplomatic" not in d

    def test_aggregate_metrics_includes_diplomatic_cer(self):
        """aggregate_metrics must aggregate cer_diplomatic when available."""
        results = [
            MetricsResult(
                cer=0.1, cer_nfc=0.1, cer_caseless=0.1,
                wer=0.1, wer_normalized=0.1, mer=0.1, wil=0.1,
                reference_length=10, hypothesis_length=10,
                cer_diplomatic=0.05, diplomatic_profile_name="medieval_french",
            ),
            MetricsResult(
                cer=0.2, cer_nfc=0.2, cer_caseless=0.2,
                wer=0.2, wer_normalized=0.2, mer=0.2, wil=0.2,
                reference_length=10, hypothesis_length=10,
                cer_diplomatic=0.10, diplomatic_profile_name="medieval_french",
            ),
        ]
        agg = aggregate_metrics(results)
        assert "cer_diplomatic" in agg
        assert agg["cer_diplomatic"]["mean"] == pytest.approx(0.075)
        assert agg["cer_diplomatic"].get("profile") == "medieval_french"
|
| 255 |
+
|
| 256 |
+
|
| 257 |
+
# ===========================================================================
|
| 258 |
+
# Tests parse_page_selector
|
| 259 |
+
# ===========================================================================
|
| 260 |
+
|
| 261 |
+
class TestParsePageSelector:
    """1-based selector strings → sorted, deduplicated 0-based index lists."""

    def test_all(self):
        assert parse_page_selector("all", 10) == list(range(10))

    def test_empty_string(self):
        assert parse_page_selector("", 5) == list(range(5))

    def test_single_page(self):
        # Selector is 1-based, output indices are 0-based
        assert parse_page_selector("3", 10) == [2]

    def test_range(self):
        assert parse_page_selector("1-5", 10) == list(range(5))

    def test_comma_list(self):
        assert parse_page_selector("1,3,5", 10) == [0, 2, 4]

    def test_combined(self):
        assert parse_page_selector("1-3,5,8-9", 10) == [0, 1, 2, 4, 7, 8]

    def test_deduplication(self):
        assert parse_page_selector("1,1,2", 5) == [0, 1]

    def test_sorted_output(self):
        assert parse_page_selector("5,1,3", 10) == [0, 2, 4]

    def test_page_out_of_range_raises(self):
        with pytest.raises(ValueError):
            parse_page_selector("15", 10)

    def test_range_out_of_bounds_raises(self):
        with pytest.raises(ValueError):
            parse_page_selector("1-15", 10)

    def test_invalid_syntax_raises(self):
        with pytest.raises((ValueError, Exception)):
            parse_page_selector("abc", 10)

    def test_last_page(self):
        assert parse_page_selector("10", 10) == [9]

    def test_first_page(self):
        assert parse_page_selector("1", 10) == [0]
|
| 307 |
+
|
| 308 |
+
|
| 309 |
+
# ===========================================================================
|
| 310 |
+
# Tests IIIFManifestParser — IIIF v2
|
| 311 |
+
# ===========================================================================
|
| 312 |
+
|
| 313 |
+
def _make_v2_manifest(num_canvases: int = 3, with_service: bool = False) -> dict:
|
| 314 |
+
"""Fabrique un manifeste IIIF v2 minimal de test."""
|
| 315 |
+
canvases = []
|
| 316 |
+
for i in range(num_canvases):
|
| 317 |
+
resource: dict
|
| 318 |
+
if with_service:
|
| 319 |
+
resource = {
|
| 320 |
+
"@type": "dctypes:Image",
|
| 321 |
+
"service": {"@id": f"https://example.com/iiif/img{i+1}"},
|
| 322 |
+
}
|
| 323 |
+
else:
|
| 324 |
+
resource = {
|
| 325 |
+
"@type": "dctypes:Image",
|
| 326 |
+
"@id": f"https://example.com/images/img{i+1}.jpg",
|
| 327 |
+
}
|
| 328 |
+
canvases.append({
|
| 329 |
+
"@id": f"https://example.com/canvas/{i+1}",
|
| 330 |
+
"@type": "sc:Canvas",
|
| 331 |
+
"label": f"f. {i+1}r",
|
| 332 |
+
"width": 2000,
|
| 333 |
+
"height": 3000,
|
| 334 |
+
"images": [
|
| 335 |
+
{
|
| 336 |
+
"@type": "oa:Annotation",
|
| 337 |
+
"motivation": "sc:painting",
|
| 338 |
+
"resource": resource,
|
| 339 |
+
"on": f"https://example.com/canvas/{i+1}",
|
| 340 |
+
}
|
| 341 |
+
],
|
| 342 |
+
})
|
| 343 |
+
return {
|
| 344 |
+
"@context": "http://iiif.io/api/presentation/2/context.json",
|
| 345 |
+
"@type": "sc:Manifest",
|
| 346 |
+
"@id": "https://example.com/manifest.json",
|
| 347 |
+
"label": "Manuscript de test",
|
| 348 |
+
"sequences": [
|
| 349 |
+
{
|
| 350 |
+
"@type": "sc:Sequence",
|
| 351 |
+
"canvases": canvases,
|
| 352 |
+
}
|
| 353 |
+
],
|
| 354 |
+
}
|
| 355 |
+
|
| 356 |
+
|
| 357 |
+
def _make_v3_manifest(num_canvases: int = 3) -> dict:
|
| 358 |
+
"""Fabrique un manifeste IIIF v3 minimal de test."""
|
| 359 |
+
items = []
|
| 360 |
+
for i in range(num_canvases):
|
| 361 |
+
items.append({
|
| 362 |
+
"id": f"https://example.com/canvas/{i+1}",
|
| 363 |
+
"type": "Canvas",
|
| 364 |
+
"label": {"fr": [f"Page {i+1}"]},
|
| 365 |
+
"width": 1500,
|
| 366 |
+
"height": 2200,
|
| 367 |
+
"items": [
|
| 368 |
+
{
|
| 369 |
+
"id": f"https://example.com/canvas/{i+1}/ap",
|
| 370 |
+
"type": "AnnotationPage",
|
| 371 |
+
"items": [
|
| 372 |
+
{
|
| 373 |
+
"id": f"https://example.com/canvas/{i+1}/ap/a",
|
| 374 |
+
"type": "Annotation",
|
| 375 |
+
"motivation": "painting",
|
| 376 |
+
"body": {
|
| 377 |
+
"id": f"https://example.com/images/{i+1}/full/max/0/default.jpg",
|
| 378 |
+
"type": "Image",
|
| 379 |
+
"format": "image/jpeg",
|
| 380 |
+
},
|
| 381 |
+
"target": f"https://example.com/canvas/{i+1}",
|
| 382 |
+
}
|
| 383 |
+
],
|
| 384 |
+
}
|
| 385 |
+
],
|
| 386 |
+
})
|
| 387 |
+
return {
|
| 388 |
+
"@context": "http://iiif.io/api/presentation/3/context.json",
|
| 389 |
+
"id": "https://example.com/manifest.json",
|
| 390 |
+
"type": "Manifest",
|
| 391 |
+
"label": {"fr": ["Manuscrit v3 de test"]},
|
| 392 |
+
"items": items,
|
| 393 |
+
}
|
| 394 |
+
|
| 395 |
+
|
| 396 |
+
class TestIIIFManifestParserV2:
    """Parsing of IIIF Presentation API v2 manifests."""

    def test_version_detection(self):
        assert IIIFManifestParser(_make_v2_manifest()).version == 2

    def test_canvases_count(self):
        assert len(IIIFManifestParser(_make_v2_manifest(5)).canvases()) == 5

    def test_canvas_label(self):
        labels = [c.label for c in IIIFManifestParser(_make_v2_manifest()).canvases()]
        assert labels[:2] == ["f. 1r", "f. 2r"]

    def test_canvas_image_url_direct(self):
        first = IIIFManifestParser(_make_v2_manifest()).canvases()[0]
        assert first.image_url == "https://example.com/images/img1.jpg"

    def test_canvas_image_url_via_service(self):
        first = IIIFManifestParser(_make_v2_manifest(with_service=True)).canvases()[0]
        assert "/full/max/0/default.jpg" in first.image_url

    def test_canvas_dimensions(self):
        first = IIIFManifestParser(_make_v2_manifest()).canvases()[0]
        assert (first.width, first.height) == (2000, 3000)

    def test_canvas_index(self):
        indices = [c.index for c in IIIFManifestParser(_make_v2_manifest(3)).canvases()]
        assert indices == [0, 1, 2]

    def test_label(self):
        assert IIIFManifestParser(_make_v2_manifest()).label == "Manuscript de test"

    def test_empty_sequences(self):
        manifest = {
            "@context": "http://iiif.io/api/presentation/2/context.json",
            "@type": "sc:Manifest",
            "label": "Empty",
            "sequences": [],
        }
        assert IIIFManifestParser(manifest).canvases() == []
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
class TestIIIFManifestParserV3:
    """Parsing of IIIF Presentation API v3 manifests."""

    def test_version_detection(self):
        assert IIIFManifestParser(_make_v3_manifest()).version == 3

    def test_canvases_count(self):
        assert len(IIIFManifestParser(_make_v3_manifest(4)).canvases()) == 4

    def test_canvas_label_from_language_map(self):
        first = IIIFManifestParser(_make_v3_manifest()).canvases()[0]
        assert "Page 1" in first.label

    def test_canvas_image_url(self):
        first = IIIFManifestParser(_make_v3_manifest()).canvases()[0]
        assert "default.jpg" in first.image_url

    def test_manifest_label_language_map(self):
        label = IIIFManifestParser(_make_v3_manifest()).label.lower()
        assert "v3" in label or "test" in label

    def test_type_manifest_triggers_v3(self):
        """A manifest whose type is 'Manifest' is detected as v3."""
        assert IIIFManifestParser({"type": "Manifest", "items": []}).version == 3
|
| 480 |
+
|
| 481 |
+
|
| 482 |
+
class TestExtractLabel:
    """_extract_label() handles strings, lists and v3 language maps."""

    def test_string(self):
        assert _extract_label("Page 1") == "Page 1"

    def test_list(self):
        # First entry wins
        assert _extract_label(["Page 1", "Page 2"]) == "Page 1"

    def test_dict_fr(self):
        assert _extract_label({"fr": ["Folio 1r"]}) == "Folio 1r"

    def test_dict_en(self):
        assert _extract_label({"en": ["Folio 1r"]}) == "Folio 1r"

    def test_dict_none_key(self):
        # "none" is the IIIF v3 language-neutral key
        assert _extract_label({"none": ["Label"]}) == "Label"

    def test_empty_string(self):
        assert _extract_label("") == ""

    def test_none_value(self):
        assert isinstance(_extract_label(None), str)
|
| 505 |
+
|
| 506 |
+
|
| 507 |
+
class TestBestImageUrlV2:
    """Image URL selection for IIIF v2 resources."""

    def test_direct_id(self):
        chosen = _best_image_url_v2({"@id": "https://example.com/img.jpg"}, {})
        assert chosen == "https://example.com/img.jpg"

    def test_service_id(self):
        resource = {
            "@id": "https://example.com/info.json",
            "service": {"@id": "https://example.com/iiif/img1"},
        }
        # The image service wins over the direct @id
        assert _best_image_url_v2(resource, {}) == "https://example.com/iiif/img1/full/max/0/default.jpg"

    def test_service_list(self):
        resource = {"service": [{"@id": "https://example.com/iiif/img2"}]}
        assert _best_image_url_v2(resource, {}) == "https://example.com/iiif/img2/full/max/0/default.jpg"
|
| 530 |
+
|
| 531 |
+
|
| 532 |
+
class TestBestImageUrlV3:
    """Image URL selection for IIIF v3 canvases."""

    def test_direct_body_image(self):
        canvas = {
            "items": [
                {
                    "type": "AnnotationPage",
                    "items": [
                        {
                            "type": "Annotation",
                            "motivation": "painting",
                            "body": {"id": "https://example.com/img.jpg", "type": "Image"},
                        }
                    ],
                }
            ]
        }
        assert _best_image_url_v3(canvas) == "https://example.com/img.jpg"

    def test_body_via_service(self):
        # Empty body id forces the fall-back onto the image service
        body = {
            "type": "Image",
            "id": "",
            "service": [{"id": "https://example.com/iiif/3/img1"}],
        }
        canvas = {"items": [{"items": [{"body": body}]}]}
        assert "/full/max/0/default.jpg" in _best_image_url_v3(canvas)

    def test_empty_canvas(self):
        assert _best_image_url_v3({}) == ""
|
| 577 |
+
|
| 578 |
+
|
| 579 |
+
class TestGuessExtension:
    """File-extension guessing from image URLs."""

    def test_jpg(self):
        assert _guess_extension("https://example.com/img.jpg") == ".jpg"

    def test_png(self):
        assert _guess_extension("https://example.com/img.png") == ".png"

    def test_tiff(self):
        assert _guess_extension("https://example.com/img.tiff") == ".tiff"

    def test_iiif_default(self):
        # A standard IIIF image URL ends in /default.jpg
        iiif_url = "https://example.com/iiif/img/full/max/0/default.jpg"
        assert _guess_extension(iiif_url) == ".jpg"

    def test_unknown_defaults_to_jpg(self):
        assert _guess_extension("https://example.com/resource/123") == ".jpg"
|
| 597 |
+
|
| 598 |
+
|
| 599 |
+
class TestSlugify:
    """Filesystem-safe slugs derived from canvas labels."""

    def test_simple(self):
        assert _slugify("Page 1") == "Page_1"

    def test_special_chars_removed(self):
        slug = _slugify("f. 1r (recto)")
        assert "/" not in slug
        assert "." not in slug

    def test_max_length(self):
        # Slugs are capped at 60 characters
        assert len(_slugify("x" * 100)) <= 60

    def test_empty(self):
        assert _slugify("") == ""
|
| 615 |
+
|
| 616 |
+
|
| 617 |
+
# ===========================================================================
|
| 618 |
+
# Tests structure des nouveaux moteurs OCR (sans appel réseau)
|
| 619 |
+
# ===========================================================================
|
| 620 |
+
|
| 621 |
+
class TestMistralOCREngine:
    """Structure of the Mistral OCR adapter (no network calls)."""

    def test_import(self):
        from picarones.engines.mistral_ocr import MistralOCREngine
        assert MistralOCREngine is not None

    def test_name(self):
        from picarones.engines.mistral_ocr import MistralOCREngine
        assert MistralOCREngine().name == "mistral_ocr"

    def test_version_default_model(self):
        from picarones.engines.mistral_ocr import MistralOCREngine
        assert "pixtral" in MistralOCREngine().version()

    def test_version_custom_model(self):
        from picarones.engines.mistral_ocr import MistralOCREngine
        engine = MistralOCREngine({"model": "pixtral-large-latest"})
        assert engine.version() == "pixtral-large-latest"

    def test_missing_api_key_raises(self, monkeypatch, tmp_path):
        from picarones.engines.mistral_ocr import MistralOCREngine
        monkeypatch.delenv("MISTRAL_API_KEY", raising=False)
        # A minimal JPEG header is enough: the key check happens first
        fake_img = tmp_path / "test.jpg"
        fake_img.write_bytes(b"\xff\xd8\xff")
        with pytest.raises(RuntimeError, match="MISTRAL_API_KEY"):
            MistralOCREngine()._run_ocr(fake_img)

    def test_exported_from_engines(self):
        from picarones.engines import MistralOCREngine
        assert MistralOCREngine is not None
|
| 655 |
+
|
| 656 |
+
|
| 657 |
+
class TestGoogleVisionEngine:
    """Structure of the Google Vision adapter (no network calls)."""

    def test_import(self):
        from picarones.engines.google_vision import GoogleVisionEngine
        assert GoogleVisionEngine is not None

    def test_name(self):
        from picarones.engines.google_vision import GoogleVisionEngine
        assert GoogleVisionEngine().name == "google_vision"

    def test_version(self):
        from picarones.engines.google_vision import GoogleVisionEngine
        assert GoogleVisionEngine().version() == "v1"

    def test_missing_credentials_raises(self, monkeypatch, tmp_path):
        from picarones.engines.google_vision import GoogleVisionEngine
        # Neither a service account nor an API key configured
        monkeypatch.delenv("GOOGLE_APPLICATION_CREDENTIALS", raising=False)
        monkeypatch.delenv("GOOGLE_API_KEY", raising=False)
        fake_img = tmp_path / "test.jpg"
        fake_img.write_bytes(b"\xff\xd8\xff")
        with pytest.raises(RuntimeError):
            GoogleVisionEngine()._run_ocr(fake_img)

    def test_exported_from_engines(self):
        from picarones.engines import GoogleVisionEngine
        assert GoogleVisionEngine is not None
|
| 686 |
+
|
| 687 |
+
|
| 688 |
+
class TestAzureDocIntelEngine:
    """Structure of the Azure Document Intelligence adapter (no network calls)."""

    def test_import(self):
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine
        assert AzureDocIntelEngine is not None

    def test_name(self):
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine
        assert AzureDocIntelEngine().name == "azure_doc_intel"

    def test_missing_key_raises(self, monkeypatch, tmp_path):
        from picarones.engines.azure_doc_intel import AzureDocIntelEngine
        # Neither the key nor the endpoint configured
        monkeypatch.delenv("AZURE_DOC_INTEL_KEY", raising=False)
        monkeypatch.delenv("AZURE_DOC_INTEL_ENDPOINT", raising=False)
        fake_img = tmp_path / "test.jpg"
        fake_img.write_bytes(b"\xff\xd8\xff")
        with pytest.raises(RuntimeError):
            AzureDocIntelEngine()._run_ocr(fake_img)

    def test_exported_from_engines(self):
        from picarones.engines import AzureDocIntelEngine
        assert AzureDocIntelEngine is not None
|
| 712 |
+
|
| 713 |
+
|
| 714 |
+
# ===========================================================================
|
| 715 |
+
# Tests CLI — commande import iiif
|
| 716 |
+
# ===========================================================================
|
| 717 |
+
|
| 718 |
+
class TestCLIImportIIIF:
    """CLI wiring for `picarones import iiif` (help output only, no network)."""

    @staticmethod
    def _invoke(args):
        # Local imports keep the module importable without click installed
        from click.testing import CliRunner
        from picarones.cli import cli
        return CliRunner().invoke(cli, args)

    def test_import_group_exists(self):
        assert self._invoke(["import", "--help"]).exit_code == 0

    def test_import_iiif_command_exists(self):
        result = self._invoke(["import", "iiif", "--help"])
        assert result.exit_code == 0
        assert "manifest_url" in result.output.lower() or "MANIFEST_URL" in result.output

    def test_import_iiif_options(self):
        help_text = self._invoke(["import", "iiif", "--help"]).output
        assert "--pages" in help_text
        assert "--output" in help_text

    def test_import_iiif_requires_url(self):
        # Without a URL the command must fail
        assert self._invoke(["import", "iiif"]).exit_code != 0
|
| 750 |
+
|
| 751 |
+
|
| 752 |
+
# ===========================================================================
|
| 753 |
+
# Tests fixtures Sprint 4 (CER diplomatique dans la démo)
|
| 754 |
+
# ===========================================================================
|
| 755 |
+
|
| 756 |
+
class TestFixturesDiplomaticCER:
    """Demo fixtures exercise the diplomatic CER."""

    def test_gt_texts_contain_medieval_graphies(self):
        """The demo GT texts must contain medieval graphies."""
        from picarones.fixtures import _GT_TEXTS
        joined = " ".join(_GT_TEXTS)
        # At least one of ſ, &, æ, œ must appear somewhere in the GT
        has_medieval_chars = any(c in joined for c in "ſ&æœ")
        assert has_medieval_chars, "Les GT de démo doivent inclure des graphies médiévales pour illustrer le CER diplomatique"

    def test_benchmark_results_have_diplomatic_cer(self):
        """The synthetic benchmark results must include the diplomatic CER."""
        from picarones.fixtures import generate_sample_benchmark
        for engine_report in generate_sample_benchmark().engine_reports:
            for doc_result in engine_report.document_results:
                if doc_result.metrics.error is not None:
                    continue
                assert doc_result.metrics.cer_diplomatic is not None, (
                    f"CER diplomatique manquant pour {engine_report.engine_name}"
                )
                break  # one successful document per engine is enough

    def test_diplomatic_cer_lower_for_medieval_graphies(self):
        """For text containing ſ, the diplomatic CER must be ≤ the exact CER."""
        medieval_gt = "maiſon & jardin"
        modernized_ocr = "maison et jardin"
        result = compute_metrics(medieval_gt, modernized_ocr)
        assert result.cer_diplomatic is not None
        assert result.cer_diplomatic <= result.cer
|
| 788 |
+
|
| 789 |
+
|
| 790 |
+
# ===========================================================================
|
| 791 |
+
# Tests rapport HTML Sprint 4 (CER diplomatique affiché)
|
| 792 |
+
# ===========================================================================
|
| 793 |
+
|
| 794 |
+
class TestReportDiplomaticCER:
    """Diplomatic CER surfaced in the HTML report."""

    @staticmethod
    def _render(tmp_path) -> str:
        # Generate the demo report once and return its HTML text
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import ReportGenerator

        out = tmp_path / "report_test.html"
        ReportGenerator(generate_sample_benchmark()).generate(out)
        return out.read_text(encoding="utf-8")

    def test_report_data_has_cer_diplomatic(self):
        """_build_report_data must expose cer_diplomatic per engine entry."""
        from picarones.fixtures import generate_sample_benchmark
        from picarones.report.generator import _build_report_data

        data = _build_report_data(generate_sample_benchmark(), images_b64={})
        assert "engines" in data
        for engine_data in data["engines"]:
            assert "cer_diplomatic" in engine_data, (
                f"cer_diplomatic manquant dans {engine_data.get('name', '?')}"
            )

    def test_html_contains_cer_diplo_column(self, tmp_path):
        """The generated HTML must contain the diplomatic-CER column."""
        html = self._render(tmp_path)
        assert "diplo" in html.lower() or "diplomatique" in html.lower(), (
            "Le rapport HTML doit mentionner le CER diplomatique"
        )

    def test_html_contains_medieval_graphie_indicator(self, tmp_path):
        """The report must mention the medieval mappings (ſ=s or u=v)."""
        html = self._render(tmp_path)
        # The tooltip or legend must mention the diplomatic correspondences
        assert "ſ=s" in html or "u=v" in html or "diplomatique" in html.lower()
|