Spaces:
Sleeping
Sprint 8 : intégration eScriptorium, import Gallica, suivi longitudinal et analyse de robustesse
Nouveaux modules
----------------
- picarones/importers/escriptorium.py : client API REST eScriptorium
- Authentification par token, listing projets/documents/pages
- Import de documents avec transcriptions comme corpus Picarones
- Export de résultats benchmark comme couche OCR dans eScriptorium
- EScriptoriumClient, EScriptoriumProject, EScriptoriumDocument
- connect_escriptorium() : connexion avec validation
- picarones/importers/gallica.py : client Gallica (BnF)
- Recherche via API SRU BnF (cote/titre/auteur/date/langue/type)
- Récupération OCR Gallica texte brut (f{n}.texteBrut)
- Import via IIIF Gallica avec enrichissement OCR comme GT
- GallicaClient, GallicaRecord, search_gallica(), import_gallica_document()
- picarones/core/history.py : suivi longitudinal SQLite optionnel
- Base SQLite horodatée par run, moteur, corpus, CER/WER
- BenchmarkHistory.query() avec filtres engine/corpus/since
- get_cer_curve() pour tracer l'évolution du CER dans le temps
- detect_regression() et detect_all_regressions() : seuil configurable
- record() depuis BenchmarkResult, record_single() pour imports manuels
- export_json(), generate_demo_history() (8 runs fictifs avec régression simulée)
- picarones/core/robustness.py : analyse de robustesse
- 5 types de dégradation : bruit gaussien, flou, rotation, résolution, binarisation
- degrade_image_bytes() : Pillow (préféré) ou fallback pur Python
- RobustnessAnalyzer : OCR sur chaque niveau, calcul CER, seuil critique
- DegradationCurve, RobustnessReport, _build_summary()
- generate_demo_robustness_report() pour la démo sans moteur réel
CLI Sprint 8
------------
- picarones history : consulte l'historique des benchmarks
- Filtres --engine, --corpus, --since, --limit
- --regression : détection automatique avec seuil configurable
- --demo : données fictives (8 runs, 3 moteurs, régression au run 5)
- --export-json : export complet de l'historique
- picarones robustness : analyse de robustesse sur un corpus
- --degradations : choix des types (noise, blur, rotation, resolution, binarization)
- --cer-threshold : seuil critique configurable
- --demo : rapport fictif sans OCR réel
- --output-json : export du rapport en JSON
- picarones demo : mis à jour avec --with-history et --with-robustness
Tests Sprint 8 (160 tests, 743 total)
--------------------------------------
- tests/test_sprint8_escriptorium_gallica.py (74 tests)
- TestEScriptoriumClient, TestEScriptoriumConnect, TestEScriptoriumExport
- TestGallicaRecord, TestGallicaClient, TestGallicaSearchQuery, TestGallicaOCR
- TestImportersInit, TestCLIHistory, TestCLIRobustness
- tests/test_sprint8_longitudinal_robustness.py (86 tests)
- TestBenchmarkHistory, TestHistoryEntry, TestRegressionResult
- TestGenerateDemoHistory, TestDegradationLevels, TestDegradationFunctions
- TestDegradationCurve, TestRobustnessReport, TestRobustnessAnalyzer
- TestGenerateDemoRobustness, TestCLIDemo
https://claude.ai/code/session_017gXea9mxBQqDTAsSQd7aAq
- picarones/cli.py +418 -7
- picarones/core/history.py +612 -0
- picarones/core/robustness.py +711 -0
- picarones/importers/__init__.py +15 -2
- picarones/importers/escriptorium.py +532 -0
- picarones/importers/gallica.py +540 -0
- tests/test_sprint8_escriptorium_gallica.py +678 -0
- tests/test_sprint8_longitudinal_robustness.py +734 -0
|
@@ -2,17 +2,21 @@
|
|
| 2 |
|
| 3 |
Commandes disponibles
|
| 4 |
---------------------
|
| 5 |
-
picarones run
|
| 6 |
-
picarones report
|
| 7 |
-
picarones demo
|
| 8 |
-
picarones metrics
|
| 9 |
-
picarones engines
|
| 10 |
-
picarones info
|
|
|
|
|
|
|
| 11 |
|
| 12 |
Exemples d'usage
|
| 13 |
----------------
|
| 14 |
picarones run --corpus ./corpus/ --engines tesseract --output results.json
|
| 15 |
picarones metrics --reference gt.txt --hypothesis ocr.txt
|
|
|
|
|
|
|
| 16 |
picarones engines
|
| 17 |
"""
|
| 18 |
|
|
@@ -360,10 +364,35 @@ def report_cmd(results: str, output: str, verbose: bool) -> None:
|
|
| 360 |
type=click.Path(resolve_path=True),
|
| 361 |
help="Exporte aussi les résultats JSON",
|
| 362 |
)
|
| 363 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 364 |
"""Génère un rapport de démonstration avec des données fictives réalistes.
|
| 365 |
|
| 366 |
Utile pour tester le rendu HTML sans installer Tesseract ni Pero OCR.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 367 |
"""
|
| 368 |
from picarones.fixtures import generate_sample_benchmark
|
| 369 |
from picarones.report.generator import ReportGenerator
|
|
@@ -380,6 +409,52 @@ def demo_cmd(output: str, docs: int, json_output: str | None) -> None:
|
|
| 380 |
click.echo(f"Rapport de démonstration : {path}")
|
| 381 |
click.echo(f"Ouvrez-le dans un navigateur : file://{path}")
|
| 382 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 383 |
|
| 384 |
# ---------------------------------------------------------------------------
|
| 385 |
# picarones import (groupe de sous-commandes)
|
|
@@ -546,5 +621,341 @@ def serve_cmd(host: str, port: int, reload: bool, verbose: bool) -> None:
|
|
| 546 |
)
|
| 547 |
|
| 548 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
if __name__ == "__main__":
|
| 550 |
cli()
|
|
|
|
| 2 |
|
| 3 |
Commandes disponibles
|
| 4 |
---------------------
|
| 5 |
+
picarones run — Lance un benchmark complet
|
| 6 |
+
picarones report — Génère le rapport HTML depuis un JSON de résultats
|
| 7 |
+
picarones demo — Génère un rapport de démonstration avec données fictives
|
| 8 |
+
picarones metrics — Calcule CER/WER entre deux fichiers texte
|
| 9 |
+
picarones engines — Liste les moteurs disponibles
|
| 10 |
+
picarones info — Informations de version
|
| 11 |
+
picarones history — Consulte l'historique des benchmarks (suivi longitudinal)
|
| 12 |
+
picarones robustness — Lance une analyse de robustesse sur un corpus
|
| 13 |
|
| 14 |
Exemples d'usage
|
| 15 |
----------------
|
| 16 |
picarones run --corpus ./corpus/ --engines tesseract --output results.json
|
| 17 |
picarones metrics --reference gt.txt --hypothesis ocr.txt
|
| 18 |
+
picarones history --engine tesseract
|
| 19 |
+
picarones robustness --corpus ./gt/ --engine tesseract
|
| 20 |
picarones engines
|
| 21 |
"""
|
| 22 |
|
|
|
|
| 364 |
type=click.Path(resolve_path=True),
|
| 365 |
help="Exporte aussi les résultats JSON",
|
| 366 |
)
|
| 367 |
+
@click.option(
|
| 368 |
+
"--with-history",
|
| 369 |
+
is_flag=True,
|
| 370 |
+
default=False,
|
| 371 |
+
help="Inclut une démonstration du suivi longitudinal (8 runs fictifs)",
|
| 372 |
+
)
|
| 373 |
+
@click.option(
|
| 374 |
+
"--with-robustness",
|
| 375 |
+
is_flag=True,
|
| 376 |
+
default=False,
|
| 377 |
+
help="Inclut une démonstration de l'analyse de robustesse",
|
| 378 |
+
)
|
| 379 |
+
def demo_cmd(
|
| 380 |
+
output: str,
|
| 381 |
+
docs: int,
|
| 382 |
+
json_output: str | None,
|
| 383 |
+
with_history: bool,
|
| 384 |
+
with_robustness: bool,
|
| 385 |
+
) -> None:
|
| 386 |
"""Génère un rapport de démonstration avec des données fictives réalistes.
|
| 387 |
|
| 388 |
Utile pour tester le rendu HTML sans installer Tesseract ni Pero OCR.
|
| 389 |
+
|
| 390 |
+
\b
|
| 391 |
+
Exemples :
|
| 392 |
+
picarones demo
|
| 393 |
+
picarones demo --with-history
|
| 394 |
+
picarones demo --with-robustness
|
| 395 |
+
picarones demo --with-history --with-robustness --docs 8
|
| 396 |
"""
|
| 397 |
from picarones.fixtures import generate_sample_benchmark
|
| 398 |
from picarones.report.generator import ReportGenerator
|
|
|
|
| 409 |
click.echo(f"Rapport de démonstration : {path}")
|
| 410 |
click.echo(f"Ouvrez-le dans un navigateur : file://{path}")
|
| 411 |
|
| 412 |
+
# Suivi longitudinal
|
| 413 |
+
if with_history:
|
| 414 |
+
click.echo("\n── Démonstration suivi longitudinal ──────────────")
|
| 415 |
+
from picarones.core.history import BenchmarkHistory, generate_demo_history
|
| 416 |
+
history = BenchmarkHistory(":memory:")
|
| 417 |
+
generate_demo_history(history, n_runs=8)
|
| 418 |
+
entries = history.query(engine="tesseract")
|
| 419 |
+
click.echo(f" {history.count()} entrées générées (8 runs, 3 moteurs).")
|
| 420 |
+
click.echo("\n Évolution du CER — tesseract :")
|
| 421 |
+
for e in entries:
|
| 422 |
+
cer_str = f"{e.cer_percent:.2f}%" if e.cer_percent is not None else "N/A"
|
| 423 |
+
bar = "█" * int((e.cer_percent or 0) * 2)
|
| 424 |
+
click.echo(f" {e.timestamp[:10]} {cer_str:<8} {bar}")
|
| 425 |
+
regression = history.detect_regression("tesseract", threshold=0.01)
|
| 426 |
+
if regression and regression.is_regression:
|
| 427 |
+
click.echo(
|
| 428 |
+
click.style(
|
| 429 |
+
f"\n RÉGRESSION détectée ! delta CER = +{regression.delta_cer * 100:.2f}%",
|
| 430 |
+
fg="red",
|
| 431 |
+
)
|
| 432 |
+
)
|
| 433 |
+
else:
|
| 434 |
+
click.echo(click.style("\n Aucune régression détectée.", fg="green"))
|
| 435 |
+
|
| 436 |
+
# Analyse de robustesse
|
| 437 |
+
if with_robustness:
|
| 438 |
+
click.echo("\n── Démonstration analyse de robustesse ───────────")
|
| 439 |
+
from picarones.core.robustness import generate_demo_robustness_report
|
| 440 |
+
report = generate_demo_robustness_report(
|
| 441 |
+
engine_names=["tesseract", "pero_ocr"]
|
| 442 |
+
)
|
| 443 |
+
for curve in report.curves:
|
| 444 |
+
if curve.degradation_type == "noise":
|
| 445 |
+
click.echo(f"\n {curve.engine_name} / bruit gaussien :")
|
| 446 |
+
for label, cer in zip(curve.labels, curve.cer_values):
|
| 447 |
+
cer_pct = f"{(cer or 0) * 100:.1f}%"
|
| 448 |
+
bar = "█" * int((cer or 0) * 40)
|
| 449 |
+
click.echo(f" {label:<12} {cer_pct:<8} {bar}")
|
| 450 |
+
if curve.critical_threshold_level is not None:
|
| 451 |
+
click.echo(
|
| 452 |
+
click.style(
|
| 453 |
+
f" Niveau critique (CER>20%) : σ={curve.critical_threshold_level}",
|
| 454 |
+
fg="yellow",
|
| 455 |
+
)
|
| 456 |
+
)
|
| 457 |
+
|
| 458 |
|
| 459 |
# ---------------------------------------------------------------------------
|
| 460 |
# picarones import (groupe de sous-commandes)
|
|
|
|
| 621 |
)
|
| 622 |
|
| 623 |
|
| 624 |
+
# ---------------------------------------------------------------------------
|
| 625 |
+
# picarones history
|
| 626 |
+
# ---------------------------------------------------------------------------
|
| 627 |
+
|
| 628 |
+
@cli.command("history")
|
| 629 |
+
@click.option(
|
| 630 |
+
"--db",
|
| 631 |
+
default="~/.picarones/history.db",
|
| 632 |
+
show_default=True,
|
| 633 |
+
type=click.Path(resolve_path=False),
|
| 634 |
+
help="Chemin vers la base SQLite d'historique",
|
| 635 |
+
)
|
| 636 |
+
@click.option(
|
| 637 |
+
"--engine", "-e",
|
| 638 |
+
default=None,
|
| 639 |
+
help="Filtre sur le nom du moteur",
|
| 640 |
+
)
|
| 641 |
+
@click.option(
|
| 642 |
+
"--corpus", "-c",
|
| 643 |
+
default=None,
|
| 644 |
+
help="Filtre sur le nom du corpus",
|
| 645 |
+
)
|
| 646 |
+
@click.option(
|
| 647 |
+
"--since",
|
| 648 |
+
default=None,
|
| 649 |
+
metavar="DATE",
|
| 650 |
+
help="Date minimale ISO 8601 (ex: 2025-01-01)",
|
| 651 |
+
)
|
| 652 |
+
@click.option(
|
| 653 |
+
"--limit", "-n",
|
| 654 |
+
default=50,
|
| 655 |
+
show_default=True,
|
| 656 |
+
type=click.IntRange(1, 10000),
|
| 657 |
+
help="Nombre maximum d'entrées à afficher",
|
| 658 |
+
)
|
| 659 |
+
@click.option(
|
| 660 |
+
"--regression",
|
| 661 |
+
is_flag=True,
|
| 662 |
+
default=False,
|
| 663 |
+
help="Détecter automatiquement les régressions (compare au run précédent)",
|
| 664 |
+
)
|
| 665 |
+
@click.option(
|
| 666 |
+
"--regression-threshold",
|
| 667 |
+
default=0.01,
|
| 668 |
+
show_default=True,
|
| 669 |
+
type=float,
|
| 670 |
+
metavar="DELTA",
|
| 671 |
+
help="Seuil de régression en points de CER absolus (ex: 0.01 = 1%)",
|
| 672 |
+
)
|
| 673 |
+
@click.option(
|
| 674 |
+
"--export-json",
|
| 675 |
+
default=None,
|
| 676 |
+
type=click.Path(resolve_path=True),
|
| 677 |
+
help="Exporte l'historique complet en JSON",
|
| 678 |
+
)
|
| 679 |
+
@click.option(
|
| 680 |
+
"--demo",
|
| 681 |
+
is_flag=True,
|
| 682 |
+
default=False,
|
| 683 |
+
help="Pré-remplir la base avec des données fictives de démonstration",
|
| 684 |
+
)
|
| 685 |
+
@click.option("--verbose", "-v", is_flag=True, default=False, help="Mode verbeux")
|
| 686 |
+
def history_cmd(
|
| 687 |
+
db: str,
|
| 688 |
+
engine: str | None,
|
| 689 |
+
corpus: str | None,
|
| 690 |
+
since: str | None,
|
| 691 |
+
limit: int,
|
| 692 |
+
regression: bool,
|
| 693 |
+
regression_threshold: float,
|
| 694 |
+
export_json: str | None,
|
| 695 |
+
demo: bool,
|
| 696 |
+
verbose: bool,
|
| 697 |
+
) -> None:
|
| 698 |
+
"""Consulte l'historique des benchmarks (suivi longitudinal).
|
| 699 |
+
|
| 700 |
+
Affiche l'évolution du CER dans le temps pour chaque moteur et corpus.
|
| 701 |
+
Permet de détecter automatiquement les régressions entre deux runs.
|
| 702 |
+
|
| 703 |
+
\b
|
| 704 |
+
Exemples :
|
| 705 |
+
picarones history
|
| 706 |
+
picarones history --engine tesseract --corpus "Chroniques BnF"
|
| 707 |
+
picarones history --regression --regression-threshold 0.02
|
| 708 |
+
picarones history --demo # données fictives de démonstration
|
| 709 |
+
picarones history --export-json historique.json
|
| 710 |
+
"""
|
| 711 |
+
_setup_logging(verbose)
|
| 712 |
+
|
| 713 |
+
from picarones.core.history import BenchmarkHistory, generate_demo_history
|
| 714 |
+
|
| 715 |
+
history = BenchmarkHistory(db)
|
| 716 |
+
|
| 717 |
+
if demo:
|
| 718 |
+
click.echo("Insertion de données fictives de démonstration dans l'historique…")
|
| 719 |
+
generate_demo_history(history, n_runs=8)
|
| 720 |
+
click.echo(f" {history.count()} entrées insérées.")
|
| 721 |
+
|
| 722 |
+
if export_json:
|
| 723 |
+
path = history.export_json(export_json)
|
| 724 |
+
click.echo(f"Historique exporté : {path}")
|
| 725 |
+
return
|
| 726 |
+
|
| 727 |
+
entries = history.query(engine=engine, corpus=corpus, since=since, limit=limit)
|
| 728 |
+
|
| 729 |
+
if not entries:
|
| 730 |
+
click.echo("Aucun benchmark dans l'historique.")
|
| 731 |
+
click.echo(
|
| 732 |
+
"\nPour enregistrer automatiquement les runs, utilisez :\n"
|
| 733 |
+
" picarones run --corpus ./gt/ --engines tesseract --save-history\n"
|
| 734 |
+
"\nOu pour tester avec des données fictives :\n"
|
| 735 |
+
" picarones history --demo"
|
| 736 |
+
)
|
| 737 |
+
return
|
| 738 |
+
|
| 739 |
+
# Regrouper par moteur
|
| 740 |
+
by_engine: dict[str, list] = {}
|
| 741 |
+
for entry in entries:
|
| 742 |
+
by_engine.setdefault(entry.engine_name, []).append(entry)
|
| 743 |
+
|
| 744 |
+
click.echo(f"\n── Historique des benchmarks ({'filtré' if engine or corpus else 'tous'}) ──")
|
| 745 |
+
click.echo(f" Base : {history.db_path}")
|
| 746 |
+
click.echo(f" Total entrées : {len(entries)}\n")
|
| 747 |
+
|
| 748 |
+
for eng_name, eng_entries in by_engine.items():
|
| 749 |
+
click.echo(click.style(f" Moteur : {eng_name}", bold=True))
|
| 750 |
+
for e in eng_entries:
|
| 751 |
+
cer_str = f"{e.cer_percent:.2f}%" if e.cer_percent is not None else "N/A"
|
| 752 |
+
wer_str = f"{e.wer_mean * 100:.2f}%" if e.wer_mean is not None else "N/A"
|
| 753 |
+
ts = e.timestamp[:10] # date uniquement
|
| 754 |
+
click.echo(f" {ts} CER={cer_str:<8} WER={wer_str:<8} docs={e.doc_count} corpus={e.corpus_name}")
|
| 755 |
+
click.echo()
|
| 756 |
+
|
| 757 |
+
# Détection de régression
|
| 758 |
+
if regression:
|
| 759 |
+
click.echo("── Détection de régressions ──────────────────────")
|
| 760 |
+
regressions = history.detect_all_regressions(threshold=regression_threshold)
|
| 761 |
+
if not regressions:
|
| 762 |
+
click.echo(
|
| 763 |
+
click.style(
|
| 764 |
+
f" Aucune régression détectée (seuil={regression_threshold*100:.1f}%)",
|
| 765 |
+
fg="green",
|
| 766 |
+
)
|
| 767 |
+
)
|
| 768 |
+
else:
|
| 769 |
+
for r in regressions:
|
| 770 |
+
delta_str = f"+{r.delta_cer * 100:.2f}%" if r.delta_cer else "N/A"
|
| 771 |
+
click.echo(
|
| 772 |
+
click.style(
|
| 773 |
+
f" RÉGRESSION {r.engine_name} / {r.corpus_name} : "
|
| 774 |
+
f"delta CER={delta_str} "
|
| 775 |
+
f"({r.baseline_timestamp[:10]} → {r.current_timestamp[:10]})",
|
| 776 |
+
fg="red",
|
| 777 |
+
)
|
| 778 |
+
)
|
| 779 |
+
|
| 780 |
+
|
| 781 |
+
# ---------------------------------------------------------------------------
|
| 782 |
+
# picarones robustness
|
| 783 |
+
# ---------------------------------------------------------------------------
|
| 784 |
+
|
| 785 |
+
@cli.command("robustness")
|
| 786 |
+
@click.option(
|
| 787 |
+
"--corpus", "-c",
|
| 788 |
+
required=True,
|
| 789 |
+
type=click.Path(exists=True, file_okay=False, resolve_path=True),
|
| 790 |
+
help="Dossier contenant les paires image / .gt.txt",
|
| 791 |
+
)
|
| 792 |
+
@click.option(
|
| 793 |
+
"--engine", "-e",
|
| 794 |
+
default="tesseract",
|
| 795 |
+
show_default=True,
|
| 796 |
+
help="Moteur OCR à tester (tesseract, pero_ocr…)",
|
| 797 |
+
)
|
| 798 |
+
@click.option(
|
| 799 |
+
"--degradations", "-d",
|
| 800 |
+
default="noise,blur,rotation,resolution,binarization",
|
| 801 |
+
show_default=True,
|
| 802 |
+
help="Types de dégradation séparés par des virgules",
|
| 803 |
+
)
|
| 804 |
+
@click.option(
|
| 805 |
+
"--cer-threshold",
|
| 806 |
+
default=0.20,
|
| 807 |
+
show_default=True,
|
| 808 |
+
type=float,
|
| 809 |
+
metavar="THRESHOLD",
|
| 810 |
+
help="Seuil CER pour définir le niveau critique (0-1)",
|
| 811 |
+
)
|
| 812 |
+
@click.option(
|
| 813 |
+
"--max-docs",
|
| 814 |
+
default=10,
|
| 815 |
+
show_default=True,
|
| 816 |
+
type=click.IntRange(1, 1000),
|
| 817 |
+
help="Nombre maximum de documents à traiter",
|
| 818 |
+
)
|
| 819 |
+
@click.option(
|
| 820 |
+
"--output-json", "-o",
|
| 821 |
+
default=None,
|
| 822 |
+
type=click.Path(resolve_path=True),
|
| 823 |
+
help="Exporte le rapport de robustesse en JSON",
|
| 824 |
+
)
|
| 825 |
+
@click.option(
|
| 826 |
+
"--lang", "-l",
|
| 827 |
+
default="fra",
|
| 828 |
+
show_default=True,
|
| 829 |
+
help="Code langue Tesseract",
|
| 830 |
+
)
|
| 831 |
+
@click.option("--no-progress", is_flag=True, default=False, help="Désactive la barre de progression")
|
| 832 |
+
@click.option("--demo", is_flag=True, default=False, help="Mode démo avec données fictives (sans OCR réel)")
|
| 833 |
+
@click.option("--verbose", "-v", is_flag=True, default=False, help="Mode verbeux")
|
| 834 |
+
def robustness_cmd(
|
| 835 |
+
corpus: str,
|
| 836 |
+
engine: str,
|
| 837 |
+
degradations: str,
|
| 838 |
+
cer_threshold: float,
|
| 839 |
+
max_docs: int,
|
| 840 |
+
output_json: str | None,
|
| 841 |
+
lang: str,
|
| 842 |
+
no_progress: bool,
|
| 843 |
+
demo: bool,
|
| 844 |
+
verbose: bool,
|
| 845 |
+
) -> None:
|
| 846 |
+
"""Lance une analyse de robustesse d'un moteur OCR face aux dégradations d'image.
|
| 847 |
+
|
| 848 |
+
Génère des versions dégradées des images (bruit, flou, rotation,
|
| 849 |
+
réduction de résolution, binarisation) et mesure le CER à chaque niveau.
|
| 850 |
+
|
| 851 |
+
\b
|
| 852 |
+
Exemples :
|
| 853 |
+
picarones robustness --corpus ./gt/ --engine tesseract
|
| 854 |
+
picarones robustness --corpus ./gt/ --engine pero_ocr --degradations noise,blur
|
| 855 |
+
picarones robustness --corpus ./gt/ --engine tesseract --output-json robustness.json
|
| 856 |
+
picarones robustness --corpus ./gt/ --engine tesseract --demo
|
| 857 |
+
"""
|
| 858 |
+
_setup_logging(verbose)
|
| 859 |
+
|
| 860 |
+
import json as _json
|
| 861 |
+
|
| 862 |
+
deg_types = [d.strip() for d in degradations.split(",") if d.strip()]
|
| 863 |
+
|
| 864 |
+
from picarones.core.robustness import (
|
| 865 |
+
RobustnessAnalyzer, ALL_DEGRADATION_TYPES, generate_demo_robustness_report
|
| 866 |
+
)
|
| 867 |
+
|
| 868 |
+
# Valider les types de dégradation
|
| 869 |
+
invalid = [d for d in deg_types if d not in ALL_DEGRADATION_TYPES]
|
| 870 |
+
if invalid:
|
| 871 |
+
click.echo(
|
| 872 |
+
f"Types de dégradation invalides : {', '.join(invalid)}\n"
|
| 873 |
+
f"Types valides : {', '.join(ALL_DEGRADATION_TYPES)}",
|
| 874 |
+
err=True,
|
| 875 |
+
)
|
| 876 |
+
sys.exit(1)
|
| 877 |
+
|
| 878 |
+
click.echo(f"Corpus : {corpus}")
|
| 879 |
+
click.echo(f"Moteur : {engine}")
|
| 880 |
+
click.echo(f"Dégradations : {', '.join(deg_types)}")
|
| 881 |
+
click.echo(f"Seuil CER : {cer_threshold * 100:.0f}%")
|
| 882 |
+
|
| 883 |
+
if demo:
|
| 884 |
+
click.echo("\nMode démo : génération d'un rapport fictif réaliste…")
|
| 885 |
+
report = generate_demo_robustness_report(engine_names=[engine])
|
| 886 |
+
else:
|
| 887 |
+
# Charger le corpus
|
| 888 |
+
from picarones.core.corpus import load_corpus_from_directory
|
| 889 |
+
try:
|
| 890 |
+
corp = load_corpus_from_directory(corpus)
|
| 891 |
+
except (FileNotFoundError, ValueError) as exc:
|
| 892 |
+
click.echo(f"Erreur corpus : {exc}", err=True)
|
| 893 |
+
sys.exit(1)
|
| 894 |
+
|
| 895 |
+
click.echo(f"\n{len(corp)} documents chargés. Début de l'analyse…\n")
|
| 896 |
+
|
| 897 |
+
# Instancier le moteur
|
| 898 |
+
try:
|
| 899 |
+
ocr_engine = _engine_from_name(engine, lang=lang, psm=6)
|
| 900 |
+
except click.BadParameter as exc:
|
| 901 |
+
click.echo(f"Erreur moteur : {exc}", err=True)
|
| 902 |
+
sys.exit(1)
|
| 903 |
+
|
| 904 |
+
from picarones.core.robustness import RobustnessAnalyzer
|
| 905 |
+
analyzer = RobustnessAnalyzer(
|
| 906 |
+
engines=[ocr_engine],
|
| 907 |
+
degradation_types=deg_types,
|
| 908 |
+
cer_threshold=cer_threshold,
|
| 909 |
+
)
|
| 910 |
+
report = analyzer.analyze(
|
| 911 |
+
corpus=corp,
|
| 912 |
+
show_progress=not no_progress,
|
| 913 |
+
max_docs=max_docs,
|
| 914 |
+
)
|
| 915 |
+
|
| 916 |
+
# Affichage des résultats
|
| 917 |
+
click.echo("\n── Résultats de robustesse ──────────────────────────")
|
| 918 |
+
for curve in report.curves:
|
| 919 |
+
click.echo(f"\n {curve.engine_name} / {curve.degradation_type}")
|
| 920 |
+
for label, cer in zip(curve.labels, curve.cer_values):
|
| 921 |
+
if cer is not None:
|
| 922 |
+
bar_len = int(cer * 40)
|
| 923 |
+
bar = "█" * bar_len
|
| 924 |
+
cer_pct = f"{cer * 100:.1f}%"
|
| 925 |
+
threshold_marker = " ← CRITIQUE" if curve.critical_threshold_level is not None and \
|
| 926 |
+
curve.levels[curve.labels.index(label)] == curve.critical_threshold_level else ""
|
| 927 |
+
click.echo(f" {label:<12} {cer_pct:<8} {bar}{threshold_marker}")
|
| 928 |
+
if curve.critical_threshold_level is not None:
|
| 929 |
+
click.echo(
|
| 930 |
+
click.style(
|
| 931 |
+
f" Niveau critique (CER>{cer_threshold*100:.0f}%) : {curve.critical_threshold_level}",
|
| 932 |
+
fg="yellow",
|
| 933 |
+
)
|
| 934 |
+
)
|
| 935 |
+
else:
|
| 936 |
+
click.echo(click.style(f" Robuste jusqu'au niveau max.", fg="green"))
|
| 937 |
+
|
| 938 |
+
# Résumé
|
| 939 |
+
click.echo("\n── Résumé ──────────────────────────────────────────")
|
| 940 |
+
for key, val in report.summary.items():
|
| 941 |
+
if key.startswith("most_robust_"):
|
| 942 |
+
deg = key.replace("most_robust_", "")
|
| 943 |
+
click.echo(f" Moteur le plus robuste ({deg}) : {val}")
|
| 944 |
+
|
| 945 |
+
# Export JSON
|
| 946 |
+
if output_json:
|
| 947 |
+
report_dict = report.as_dict()
|
| 948 |
+
Path(output_json).write_text(
|
| 949 |
+
_json.dumps(report_dict, ensure_ascii=False, indent=2),
|
| 950 |
+
encoding="utf-8",
|
| 951 |
+
)
|
| 952 |
+
click.echo(f"\nRapport JSON exporté : {output_json}")
|
| 953 |
+
|
| 954 |
+
|
| 955 |
+
# ---------------------------------------------------------------------------
|
| 956 |
+
# Mise à jour de picarones demo pour illustrer suivi longitudinal + robustesse
|
| 957 |
+
# ---------------------------------------------------------------------------
|
| 958 |
+
|
| 959 |
+
|
| 960 |
if __name__ == "__main__":
|
| 961 |
cli()
|
|
@@ -0,0 +1,612 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Suivi longitudinal des benchmarks — base SQLite optionnelle.
|
| 2 |
+
|
| 3 |
+
Fonctionnement
|
| 4 |
+
--------------
|
| 5 |
+
- Chaque run de benchmark est enregistré dans une table SQLite avec horodatage,
|
| 6 |
+
corpus, moteurs, métriques agrégées.
|
| 7 |
+
- L'historique permet de tracer des courbes d'évolution du CER dans le temps.
|
| 8 |
+
- La détection de régression compare le dernier run à une baseline configurable.
|
| 9 |
+
|
| 10 |
+
Structure de la base
|
| 11 |
+
--------------------
|
| 12 |
+
Table ``runs`` :
|
| 13 |
+
run_id TEXT PRIMARY KEY — UUID ou hash du run
|
| 14 |
+
timestamp TEXT — ISO 8601
|
| 15 |
+
corpus_name TEXT
|
| 16 |
+
engine_name TEXT
|
| 17 |
+
cer_mean REAL
|
| 18 |
+
wer_mean REAL
|
| 19 |
+
doc_count INTEGER
|
| 20 |
+
metadata TEXT — JSON
|
| 21 |
+
|
| 22 |
+
Usage
|
| 23 |
+
-----
|
| 24 |
+
>>> from picarones.core.history import BenchmarkHistory
|
| 25 |
+
>>> history = BenchmarkHistory("~/.picarones/history.db")
|
| 26 |
+
>>> history.record(benchmark_result)
|
| 27 |
+
>>> df = history.query(engine="tesseract", corpus="chroniques")
|
| 28 |
+
>>> regression = history.detect_regression(engine="tesseract", threshold=0.02)
|
| 29 |
+
"""
|
| 30 |
+
|
| 31 |
+
from __future__ import annotations
|
| 32 |
+
|
| 33 |
+
import json
|
| 34 |
+
import logging
|
| 35 |
+
import sqlite3
|
| 36 |
+
import uuid
|
| 37 |
+
from dataclasses import dataclass, field
|
| 38 |
+
from datetime import datetime, timezone
|
| 39 |
+
from pathlib import Path
|
| 40 |
+
from typing import Optional
|
| 41 |
+
|
| 42 |
+
logger = logging.getLogger(__name__)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
# Structures de données
|
| 47 |
+
# ---------------------------------------------------------------------------
|
| 48 |
+
|
| 49 |
+
@dataclass
class HistoryEntry:
    """A single benchmark record stored in the history database."""
    run_id: str
    timestamp: str
    corpus_name: str
    engine_name: str
    cer_mean: Optional[float]
    wer_mean: Optional[float]
    doc_count: int
    metadata: dict = field(default_factory=dict)

    @property
    def cer_percent(self) -> Optional[float]:
        """Mean CER expressed as a percentage, or None when unknown."""
        if self.cer_mean is None:
            return None
        return self.cer_mean * 100

    def as_dict(self) -> dict:
        """Return a JSON-serialisable mapping of this entry."""
        keys = (
            "run_id",
            "timestamp",
            "corpus_name",
            "engine_name",
            "cer_mean",
            "wer_mean",
            "doc_count",
            "metadata",
        )
        return {key: getattr(self, key) for key in keys}
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
@dataclass
class RegressionResult:
    """Result of a CER regression check between two benchmark runs.

    A regression is flagged when the CER delta (current minus baseline)
    exceeds the configured threshold.
    """
    engine_name: str
    corpus_name: str
    baseline_run_id: str
    baseline_timestamp: str
    baseline_cer: Optional[float]
    current_run_id: str
    current_timestamp: str
    current_cer: Optional[float]
    # CER delta (current - baseline); positive means a regression.
    # NOTE: the original used a bare string after this field ("attribute
    # docstring"), which is a no-op statement — replaced with a comment.
    delta_cer: Optional[float]
    is_regression: bool
    threshold: float

    def as_dict(self) -> dict:
        """Return a JSON-serialisable mapping of this result."""
        return {
            "engine_name": self.engine_name,
            "corpus_name": self.corpus_name,
            "baseline_run_id": self.baseline_run_id,
            "baseline_timestamp": self.baseline_timestamp,
            "baseline_cer": self.baseline_cer,
            "current_run_id": self.current_run_id,
            "current_timestamp": self.current_timestamp,
            "current_cer": self.current_cer,
            "delta_cer": self.delta_cer,
            "is_regression": self.is_regression,
            "threshold": self.threshold,
        }
|
| 108 |
+
|
| 109 |
+
|
| 110 |
+
# ---------------------------------------------------------------------------
|
| 111 |
+
# BenchmarkHistory
|
| 112 |
+
# ---------------------------------------------------------------------------
|
| 113 |
+
|
| 114 |
+
class BenchmarkHistory:
    """Benchmark history manager backed by SQLite.

    Stores one row per (run, engine) pair so CER/WER evolution can be
    tracked over time and regressions detected.

    Parameters
    ----------
    db_path:
        Path to the SQLite file. Use ``":memory:"`` for tests.

    Examples
    --------
    >>> history = BenchmarkHistory("~/.picarones/history.db")
    >>> history.record(benchmark)
    >>> entries = history.query(engine="tesseract")
    >>> for e in entries:
    ...     print(e.timestamp, f"CER={e.cer_percent:.2f}%")
    """

    # Executed via executescript(): table plus indexes for the common filters.
    _CREATE_TABLE = """
    CREATE TABLE IF NOT EXISTS runs (
        run_id TEXT PRIMARY KEY,
        timestamp TEXT NOT NULL,
        corpus_name TEXT NOT NULL,
        engine_name TEXT NOT NULL,
        cer_mean REAL,
        wer_mean REAL,
        doc_count INTEGER,
        metadata TEXT
    );
    CREATE INDEX IF NOT EXISTS idx_engine ON runs (engine_name);
    CREATE INDEX IF NOT EXISTS idx_corpus ON runs (corpus_name);
    CREATE INDEX IF NOT EXISTS idx_timestamp ON runs (timestamp);
    """

    def __init__(self, db_path: str = "~/.picarones/history.db") -> None:
        if db_path != ":memory:":
            path = Path(db_path).expanduser()
            # Create the parent directory so sqlite3.connect() does not fail.
            path.parent.mkdir(parents=True, exist_ok=True)
            self.db_path = str(path)
        else:
            self.db_path = ":memory:"
        self._conn: Optional[sqlite3.Connection] = None
        self._init_db()

    def _connect(self) -> sqlite3.Connection:
        """Return the lazily-created shared connection."""
        if self._conn is None:
            self._conn = sqlite3.connect(self.db_path)
            # Row objects allow access by column name in query().
            self._conn.row_factory = sqlite3.Row
        return self._conn

    def _init_db(self) -> None:
        """Create the schema if it does not exist yet."""
        conn = self._connect()
        conn.executescript(self._CREATE_TABLE)
        conn.commit()

    def close(self) -> None:
        """Close the SQLite connection."""
        if self._conn:
            self._conn.close()
            self._conn = None

    # ------------------------------------------------------------------
    # Recording
    # ------------------------------------------------------------------

    def record(
        self,
        benchmark_result: "BenchmarkResult",
        run_id: Optional[str] = None,
        extra_metadata: Optional[dict] = None,
    ) -> str:
        """Record the results of a benchmark into the history.

        One row is written per engine report; the stored row id is
        ``f"{run_id}_{engine_name}"``.

        Parameters
        ----------
        benchmark_result:
            Results to record (``BenchmarkResult``).
        run_id:
            Run identifier (auto-generated if None).
        extra_metadata:
            Extra metadata to store alongside each row.

        Returns
        -------
        str
            The (base) identifier of the recorded run.
        """
        if run_id is None:
            run_id = str(uuid.uuid4())

        timestamp = datetime.now(timezone.utc).isoformat()
        conn = self._connect()

        # FIX: ranking() is loop-invariant (it aggregates all engine
        # reports) — the original recomputed it once per report.
        ranking = benchmark_result.ranking()

        for report in benchmark_result.engine_reports:
            engine_entry = next(
                (r for r in ranking if r["engine"] == report.engine_name),
                None,
            )
            cer_mean = engine_entry["mean_cer"] if engine_entry else None
            wer_mean = engine_entry["mean_wer"] if engine_entry else None

            meta = {
                "engine_version": report.engine_version,
                "engine_config": report.engine_config,
                "picarones_version": benchmark_result.metadata.get("picarones_version", ""),
                **(extra_metadata or {}),
            }

            conn.execute(
                """
                INSERT OR REPLACE INTO runs
                (run_id, timestamp, corpus_name, engine_name,
                 cer_mean, wer_mean, doc_count, metadata)
                VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                """,
                (
                    f"{run_id}_{report.engine_name}",
                    timestamp,
                    benchmark_result.corpus_name,
                    report.engine_name,
                    cer_mean,
                    wer_mean,
                    benchmark_result.document_count,
                    json.dumps(meta, ensure_ascii=False),
                ),
            )

        conn.commit()
        logger.info("Benchmark enregistré dans l'historique : run_id=%s", run_id)
        return run_id

    def record_single(
        self,
        run_id: str,
        corpus_name: str,
        engine_name: str,
        cer_mean: Optional[float],
        wer_mean: Optional[float],
        doc_count: int,
        timestamp: Optional[str] = None,
        metadata: Optional[dict] = None,
    ) -> str:
        """Manually record a single entry in the history.

        Useful for tests, imports of external data, or results computed
        outside of Picarones.

        Returns
        -------
        str
            The recorded run_id.
        """
        if timestamp is None:
            timestamp = datetime.now(timezone.utc).isoformat()

        conn = self._connect()
        conn.execute(
            """
            INSERT OR REPLACE INTO runs
            (run_id, timestamp, corpus_name, engine_name,
             cer_mean, wer_mean, doc_count, metadata)
            VALUES (?, ?, ?, ?, ?, ?, ?, ?)
            """,
            (
                run_id,
                timestamp,
                corpus_name,
                engine_name,
                cer_mean,
                wer_mean,
                doc_count,
                json.dumps(metadata or {}, ensure_ascii=False),
            ),
        )
        conn.commit()
        return run_id

    # ------------------------------------------------------------------
    # Queries
    # ------------------------------------------------------------------

    def query(
        self,
        engine: Optional[str] = None,
        corpus: Optional[str] = None,
        since: Optional[str] = None,
        limit: int = 100,
    ) -> list[HistoryEntry]:
        """Return the run history, with optional filters.

        Parameters
        ----------
        engine:
            Filter on the engine name.
        corpus:
            Filter on the corpus name.
        since:
            Minimal ISO 8601 date (``"2025-01-01"``).
        limit:
            Maximum number of returned entries.

        Returns
        -------
        list[HistoryEntry]
            Entries sorted by ascending timestamp.
        """
        clauses: list[str] = []
        params: list = []

        if engine:
            clauses.append("engine_name = ?")
            params.append(engine)
        if corpus:
            clauses.append("corpus_name = ?")
            params.append(corpus)
        if since:
            # ISO 8601 strings compare correctly lexicographically.
            clauses.append("timestamp >= ?")
            params.append(since)

        where = f"WHERE {' AND '.join(clauses)}" if clauses else ""
        params.append(limit)

        conn = self._connect()
        rows = conn.execute(
            f"SELECT * FROM runs {where} ORDER BY timestamp ASC LIMIT ?",
            params,
        ).fetchall()

        return [
            HistoryEntry(
                run_id=row["run_id"],
                timestamp=row["timestamp"],
                corpus_name=row["corpus_name"],
                engine_name=row["engine_name"],
                cer_mean=row["cer_mean"],
                wer_mean=row["wer_mean"],
                doc_count=row["doc_count"],
                metadata=json.loads(row["metadata"] or "{}"),
            )
            for row in rows
        ]

    def list_engines(self) -> list[str]:
        """Return the list of engines present in the history."""
        conn = self._connect()
        rows = conn.execute(
            "SELECT DISTINCT engine_name FROM runs ORDER BY engine_name"
        ).fetchall()
        return [row[0] for row in rows]

    def list_corpora(self) -> list[str]:
        """Return the list of corpora present in the history."""
        conn = self._connect()
        rows = conn.execute(
            "SELECT DISTINCT corpus_name FROM runs ORDER BY corpus_name"
        ).fetchall()
        return [row[0] for row in rows]

    def count(self) -> int:
        """Total number of entries in the history."""
        conn = self._connect()
        return conn.execute("SELECT COUNT(*) FROM runs").fetchone()[0]

    # ------------------------------------------------------------------
    # Evolution curves
    # ------------------------------------------------------------------

    def get_cer_curve(
        self,
        engine: str,
        corpus: Optional[str] = None,
    ) -> list[dict]:
        """Return the data points for the CER evolution curve.

        Entries without a CER value are skipped.

        Parameters
        ----------
        engine:
            Engine name.
        corpus:
            Specific corpus (None = every corpus for this engine).

        Returns
        -------
        list[dict]
            Each dict contains ``{"timestamp": str, "cer": float, "run_id": str}``.
        """
        entries = self.query(engine=engine, corpus=corpus, limit=1000)
        return [
            {
                "timestamp": e.timestamp,
                "cer": e.cer_mean,
                "cer_percent": e.cer_percent,
                "run_id": e.run_id,
                "corpus_name": e.corpus_name,
            }
            for e in entries
            if e.cer_mean is not None
        ]

    # ------------------------------------------------------------------
    # Regression detection
    # ------------------------------------------------------------------

    def detect_regression(
        self,
        engine: str,
        corpus: Optional[str] = None,
        threshold: float = 0.01,
        baseline_run_id: Optional[str] = None,
    ) -> Optional[RegressionResult]:
        """Detect a CER regression between two runs.

        Compares the most recent run against a baseline (the previous
        run, or a specific one).

        Parameters
        ----------
        engine:
            Engine name to monitor.
        corpus:
            Specific corpus (None = all).
        threshold:
            Regression threshold in absolute CER points (e.g. 0.01 = 1%).
            If delta_cer > threshold, a regression is reported.
        baseline_run_id:
            Reference run_id. If None, the second-to-last run is used.

        Returns
        -------
        RegressionResult | None
            None if fewer than 2 runs are available.
        """
        entries = self.query(engine=engine, corpus=corpus, limit=1000)
        if len(entries) < 2:
            logger.info("Pas assez de runs pour détecter une régression (moteur=%s)", engine)
            return None

        current = entries[-1]

        if baseline_run_id:
            # Fall back to the previous run when the requested id is absent.
            baseline_list = [e for e in entries[:-1] if e.run_id == baseline_run_id]
            baseline = baseline_list[0] if baseline_list else entries[-2]
        else:
            baseline = entries[-2]

        delta = None
        is_regression = False
        if current.cer_mean is not None and baseline.cer_mean is not None:
            delta = current.cer_mean - baseline.cer_mean
            is_regression = delta > threshold

        return RegressionResult(
            engine_name=engine,
            corpus_name=corpus or "tous",
            baseline_run_id=baseline.run_id,
            baseline_timestamp=baseline.timestamp,
            baseline_cer=baseline.cer_mean,
            current_run_id=current.run_id,
            current_timestamp=current.timestamp,
            current_cer=current.cer_mean,
            delta_cer=delta,
            is_regression=is_regression,
            threshold=threshold,
        )

    def detect_all_regressions(
        self,
        threshold: float = 0.01,
    ) -> list[RegressionResult]:
        """Detect regressions for every known engine and corpus.

        Parameters
        ----------
        threshold:
            Regression threshold.

        Returns
        -------
        list[RegressionResult]
            Only the engine/corpus pairs where a regression was detected.
        """
        results: list[RegressionResult] = []
        engines = self.list_engines()
        corpora = self.list_corpora()

        for engine in engines:
            for corpus in corpora:
                result = self.detect_regression(engine, corpus, threshold)
                if result and result.is_regression:
                    results.append(result)

        return results

    # ------------------------------------------------------------------
    # Export
    # ------------------------------------------------------------------

    def export_json(self, output_path: str) -> Path:
        """Export the full history as JSON.

        Parameters
        ----------
        output_path:
            Path of the output JSON file.

        Returns
        -------
        Path
            Path to the created file.
        """
        entries = self.query(limit=100_000)
        path = Path(output_path)
        data = {
            "picarones_history": True,
            "exported_at": datetime.now(timezone.utc).isoformat(),
            "total_runs": len(entries),
            "engines": self.list_engines(),
            "corpora": self.list_corpora(),
            "runs": [e.as_dict() for e in entries],
        }
        path.write_text(json.dumps(data, ensure_ascii=False, indent=2), encoding="utf-8")
        return path

    def __repr__(self) -> str:
        return f"BenchmarkHistory(db='{self.db_path}', runs={self.count()})"
|
| 539 |
+
|
| 540 |
+
|
| 541 |
+
# ---------------------------------------------------------------------------
|
| 542 |
+
# Données de démonstration longitudinale
|
| 543 |
+
# ---------------------------------------------------------------------------
|
| 544 |
+
|
| 545 |
+
def generate_demo_history(
    db: BenchmarkHistory,
    n_runs: int = 8,
    seed: int = 42,
) -> None:
    """Populate the history with synthetic longitudinal data for the demo.

    Simulates the progressive improvement of three OCR engines over
    ``n_runs`` bi-weekly runs, with a small deliberate regression of the
    tesseract engine at run 5.

    Parameters
    ----------
    db:
        History database to fill.
    n_runs:
        Number of runs to generate.
    seed:
        Random seed (results are fully reproducible).
    """
    import random
    from datetime import timedelta

    prng = random.Random(seed)

    corpus_label = "Chroniques médiévales BnF"
    # engine -> (starting CER, per-run CER drift); insertion order matters
    # because the PRNG is consumed in engine order.
    trajectories = {
        "tesseract": (0.15, -0.008),
        "pero_ocr": (0.09, -0.005),
        "ancien_moteur": (0.28, -0.003),
    }

    start = datetime(2024, 9, 1, tzinfo=timezone.utc)

    for idx in range(n_runs):
        when = start + timedelta(weeks=idx * 2)
        label = f"demo_run_{idx + 1:02d}"

        for name, (start_cer, drift) in trajectories.items():
            jitter = prng.gauss(0, 0.005)
            if idx == 4 and name == "tesseract":
                jitter += 0.02  # simulated regression at run 5
            cer_value = max(0.01, min(0.5, start_cer + drift * idx + jitter))

            wer_value = cer_value * 1.8 + prng.gauss(0, 0.01)
            wer_value = max(0.01, min(0.9, wer_value))

            db.record_single(
                run_id=f"{label}_{name}",
                corpus_name=corpus_label,
                engine_name=name,
                cer_mean=round(cer_value, 4),
                wer_mean=round(wer_value, 4),
                doc_count=12,
                timestamp=when.isoformat(),
                metadata={
                    "note": f"Run de démonstration #{idx + 1}",
                    "engine_version": f"5.{idx}.0" if name == "tesseract" else "0.7.2",
                },
            )
|
|
@@ -0,0 +1,711 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Analyse de robustesse des moteurs OCR face aux dégradations d'image.
|
| 2 |
+
|
| 3 |
+
Fonctionnement
|
| 4 |
+
--------------
|
| 5 |
+
1. Génération de versions dégradées des images du corpus à différents niveaux :
|
| 6 |
+
- Bruit gaussien (sigma croissant)
|
| 7 |
+
- Flou gaussien (kernel size croissant)
|
| 8 |
+
- Rotation (angle croissant)
|
| 9 |
+
- Réduction de résolution (facteur de downscaling)
|
| 10 |
+
- Binarisation (seuillage Otsu ou fixe)
|
| 11 |
+
2. Exécution du moteur OCR sur chaque version dégradée
|
| 12 |
+
3. Calcul du CER pour chaque niveau de dégradation
|
| 13 |
+
4. Génération de courbes de robustesse (CER en fonction du niveau)
|
| 14 |
+
5. Identification du seuil critique (niveau à partir duquel CER > seuil)
|
| 15 |
+
|
| 16 |
+
Usage
|
| 17 |
+
-----
|
| 18 |
+
>>> from picarones.core.robustness import RobustnessAnalyzer
|
| 19 |
+
>>> analyzer = RobustnessAnalyzer(engine, degradation_types=["noise", "blur"])
|
| 20 |
+
>>> report = analyzer.analyze(corpus)
|
| 21 |
+
>>> print(report.critical_thresholds)
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import logging
|
| 27 |
+
import math
|
| 28 |
+
import os
|
| 29 |
+
import tempfile
|
| 30 |
+
from dataclasses import dataclass, field
|
| 31 |
+
from pathlib import Path
|
| 32 |
+
from typing import Optional
|
| 33 |
+
|
| 34 |
+
logger = logging.getLogger(__name__)
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# ---------------------------------------------------------------------------
|
| 38 |
+
# Paramètres de dégradation
|
| 39 |
+
# ---------------------------------------------------------------------------
|
| 40 |
+
|
| 41 |
+
# Degradation levels for each degradation type (index 0 = pristine image).
DEGRADATION_LEVELS: dict[str, list] = {
    "noise": [0, 5, 15, 30, 50, 80],  # sigma of the Gaussian noise
    "blur": [0, 1, 2, 3, 5, 8],  # Gaussian blur radius (pixels)
    "rotation": [0, 1, 2, 5, 10, 20],  # rotation angle (degrees)
    "resolution": [1.0, 0.75, 0.5, 0.33, 0.25, 0.1],  # resolution factor
    "binarization": [0, 64, 96, 128, 160, 192],  # binarization threshold (0 = Otsu)
}

# Human-readable labels, parallel (index for index) with DEGRADATION_LEVELS.
DEGRADATION_LABELS: dict[str, list[str]] = {
    "noise": ["original", "σ=5", "σ=15", "σ=30", "σ=50", "σ=80"],
    "blur": ["original", "r=1", "r=2", "r=3", "r=5", "r=8"],
    "rotation": ["0°", "1°", "2°", "5°", "10°", "20°"],
    "resolution": ["100%", "75%", "50%", "33%", "25%", "10%"],
    "binarization": ["original", "seuil=64", "seuil=96", "seuil=128", "seuil=160", "seuil=192"],
}

# Every supported degradation type, in declaration order.
ALL_DEGRADATION_TYPES = list(DEGRADATION_LEVELS.keys())
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
# ---------------------------------------------------------------------------
|
| 62 |
+
# Dégradation d'image (pure Python + stdlib, optionnellement Pillow/NumPy)
|
| 63 |
+
# ---------------------------------------------------------------------------
|
| 64 |
+
|
| 65 |
+
def _apply_gaussian_noise(pixels: list[list[list[int]]], sigma: float, rng_seed: int = 0) -> list[list[list[int]]]:
|
| 66 |
+
"""Applique du bruit gaussien (pure Python)."""
|
| 67 |
+
import random
|
| 68 |
+
rng = random.Random(rng_seed)
|
| 69 |
+
h = len(pixels)
|
| 70 |
+
w = len(pixels[0]) if h > 0 else 0
|
| 71 |
+
result = []
|
| 72 |
+
for y in range(h):
|
| 73 |
+
row = []
|
| 74 |
+
for x in range(w):
|
| 75 |
+
pixel = []
|
| 76 |
+
for c in pixels[y][x]:
|
| 77 |
+
noise = rng.gauss(0, sigma)
|
| 78 |
+
val = int(c + noise)
|
| 79 |
+
pixel.append(max(0, min(255, val)))
|
| 80 |
+
row.append(pixel)
|
| 81 |
+
result.append(row)
|
| 82 |
+
return result
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def _apply_box_blur(pixels: list[list[list[int]]], radius: int) -> list[list[list[int]]]:
|
| 86 |
+
"""Applique un flou de boîte (approximation du flou gaussien, pure Python)."""
|
| 87 |
+
if radius <= 0:
|
| 88 |
+
return pixels
|
| 89 |
+
h = len(pixels)
|
| 90 |
+
w = len(pixels[0]) if h > 0 else 0
|
| 91 |
+
channels = len(pixels[0][0]) if h > 0 and w > 0 else 3
|
| 92 |
+
|
| 93 |
+
def blur_pass(data: list[list[list[int]]]) -> list[list[list[int]]]:
|
| 94 |
+
out = []
|
| 95 |
+
for y in range(h):
|
| 96 |
+
row = []
|
| 97 |
+
for x in range(w):
|
| 98 |
+
totals = [0] * channels
|
| 99 |
+
count = 0
|
| 100 |
+
for dy in range(-radius, radius + 1):
|
| 101 |
+
for dx in range(-radius, radius + 1):
|
| 102 |
+
ny, nx = y + dy, x + dx
|
| 103 |
+
if 0 <= ny < h and 0 <= nx < w:
|
| 104 |
+
for c in range(channels):
|
| 105 |
+
totals[c] += data[ny][nx][c]
|
| 106 |
+
count += 1
|
| 107 |
+
row.append([t // count for t in totals])
|
| 108 |
+
out.append(row)
|
| 109 |
+
return out
|
| 110 |
+
|
| 111 |
+
return blur_pass(pixels)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def _apply_rotation_simple(pixels: list[list[list[int]]], angle_deg: float) -> list[list[list[int]]]:
|
| 115 |
+
"""Rotation avec interpolation au plus proche voisin (pure Python).
|
| 116 |
+
|
| 117 |
+
Pour des angles faibles, l'effet est réaliste.
|
| 118 |
+
"""
|
| 119 |
+
if angle_deg == 0:
|
| 120 |
+
return pixels
|
| 121 |
+
h = len(pixels)
|
| 122 |
+
w = len(pixels[0]) if h > 0 else 0
|
| 123 |
+
channels = len(pixels[0][0]) if h > 0 and w > 0 else 3
|
| 124 |
+
|
| 125 |
+
angle_rad = math.radians(angle_deg)
|
| 126 |
+
cos_a = math.cos(angle_rad)
|
| 127 |
+
sin_a = math.sin(angle_rad)
|
| 128 |
+
cx, cy = w / 2, h / 2
|
| 129 |
+
|
| 130 |
+
result = [[[245, 240, 232][:channels] for _ in range(w)] for _ in range(h)]
|
| 131 |
+
for y in range(h):
|
| 132 |
+
for x in range(w):
|
| 133 |
+
# Coordonnées source
|
| 134 |
+
sx = cos_a * (x - cx) + sin_a * (y - cy) + cx
|
| 135 |
+
sy = -sin_a * (x - cx) + cos_a * (y - cy) + cy
|
| 136 |
+
ix, iy = int(round(sx)), int(round(sy))
|
| 137 |
+
if 0 <= ix < w and 0 <= iy < h:
|
| 138 |
+
result[y][x] = list(pixels[iy][ix])
|
| 139 |
+
return result
|
| 140 |
+
|
| 141 |
+
|
| 142 |
+
def _apply_resolution_reduction(
|
| 143 |
+
pixels: list[list[list[int]]], factor: float
|
| 144 |
+
) -> list[list[list[int]]]:
|
| 145 |
+
"""Réduit la résolution puis remonte à la taille originale (pixelisation)."""
|
| 146 |
+
if factor >= 1.0:
|
| 147 |
+
return pixels
|
| 148 |
+
h = len(pixels)
|
| 149 |
+
w = len(pixels[0]) if h > 0 else 0
|
| 150 |
+
new_h = max(1, int(h * factor))
|
| 151 |
+
new_w = max(1, int(w * factor))
|
| 152 |
+
|
| 153 |
+
# Downscale
|
| 154 |
+
small = []
|
| 155 |
+
for y in range(new_h):
|
| 156 |
+
row = []
|
| 157 |
+
src_y = int(y / factor)
|
| 158 |
+
for x in range(new_w):
|
| 159 |
+
src_x = int(x / factor)
|
| 160 |
+
row.append(list(pixels[min(src_y, h - 1)][min(src_x, w - 1)]))
|
| 161 |
+
small.append(row)
|
| 162 |
+
|
| 163 |
+
# Upscale (nearest-neighbor)
|
| 164 |
+
result = []
|
| 165 |
+
for y in range(h):
|
| 166 |
+
row = []
|
| 167 |
+
src_y = min(int(y * factor), new_h - 1)
|
| 168 |
+
for x in range(w):
|
| 169 |
+
src_x = min(int(x * factor), new_w - 1)
|
| 170 |
+
row.append(list(small[src_y][src_x]))
|
| 171 |
+
result.append(row)
|
| 172 |
+
return result
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def _apply_binarization(
    pixels: list[list[list[int]]], threshold: int
) -> list[list[list[int]]]:
    """Binarize the image with a fixed luminance threshold.

    A ``threshold`` of 0 means "auto": an Otsu threshold is computed from
    the luminance histogram first. Pixels whose luminance is >= threshold
    become white (255 on every channel), the rest black (0).
    """
    h = len(pixels)
    w = len(pixels[0]) if h > 0 else 0
    result = []

    # Compute the Otsu threshold when threshold == 0.
    if threshold == 0:
        histogram = [0] * 256
        total = h * w
        for y in range(h):
            for x in range(w):
                p = pixels[y][x]
                # Luminance from standard luma weights (0.299/0.587/0.114);
                # grayscale pixels (fewer than 3 channels) use channel 0.
                lum = int(0.299 * p[0] + 0.587 * p[1] + 0.114 * p[2]) if len(p) >= 3 else p[0]
                histogram[lum] += 1
        # Simplified Otsu: pick the threshold maximising the (unnormalised)
        # between-class variance w0*w1*(mean0-mean1)^2.
        best_thresh = 128
        best_var = -1.0
        total_sum = sum(i * histogram[i] for i in range(256))
        w0, w1, sum0 = 0, total, 0.0
        for t in range(256):
            w0 += histogram[t]
            if w0 == 0:
                # No pixels at or below t yet; class 0 is empty.
                continue
            w1 = total - w0
            if w1 == 0:
                # Class 1 empty for every remaining t; nothing more to test.
                break
            sum0 += t * histogram[t]
            mean0 = sum0 / w0
            mean1 = (total_sum - sum0) / w1
            var = w0 * w1 * (mean0 - mean1) ** 2
            if var > best_var:
                best_var = var
                best_thresh = t
        threshold = best_thresh

    # Apply the (fixed or computed) threshold on luminance.
    for y in range(h):
        row = []
        for x in range(w):
            p = pixels[y][x]
            lum = int(0.299 * p[0] + 0.587 * p[1] + 0.114 * p[2]) if len(p) >= 3 else p[0]
            val = 255 if lum >= threshold else 0
            # Keep the original channel count so image shape is unchanged.
            row.append([val] * len(p))
        result.append(row)
    return result
|
| 222 |
+
|
| 223 |
+
|
| 224 |
+
def degrade_image_bytes(
    png_bytes: bytes,
    degradation_type: str,
    level: float,
) -> bytes:
    """Degrade a PNG image and return the modified PNG bytes.

    Prefers the Pillow implementation when Pillow is installed, and
    otherwise falls back to the pure-Python stub.

    Parameters
    ----------
    png_bytes:
        Bytes of the source PNG image.
    degradation_type:
        Degradation type (``"noise"``, ``"blur"``, ``"rotation"``,
        ``"resolution"``, ``"binarization"``).
    level:
        Degradation level (numeric value whose meaning depends on the type).

    Returns
    -------
    bytes
        Bytes of the degraded PNG image.
    """
    try:
        degraded = _degrade_pillow(png_bytes, degradation_type, level)
    except ImportError:
        # Pillow is missing: use the degraded-mode fallback.
        degraded = _degrade_pure_python(png_bytes, degradation_type, level)
    return degraded
|
| 252 |
+
|
| 253 |
+
|
| 254 |
+
def _degrade_pillow(png_bytes: bytes, degradation_type: str, level: float) -> bytes:
    """Apply the requested degradation using Pillow (best-quality path).

    Parameters
    ----------
    png_bytes:
        Bytes of the source PNG image.
    degradation_type:
        One of ``"noise"``, ``"blur"``, ``"rotation"``, ``"resolution"``,
        ``"binarization"``. Any other value returns the image unchanged
        (merely re-encoded as PNG).
    level:
        Numeric level; meaning depends on the type (sigma for noise/blur,
        degrees for rotation, scale factor for resolution, threshold for
        binarization where 0 selects a fixed threshold of 128).

    Returns
    -------
    bytes
        PNG bytes of the degraded image.

    Raises
    ------
    ImportError
        When Pillow is not installed (the caller falls back to the
        pure-Python stub).
    """
    import io

    from PIL import Image, ImageFilter

    img = Image.open(io.BytesIO(png_bytes)).convert("RGB")

    if degradation_type == "noise":
        if level > 0:
            import random

            # Fixed seed: repeated runs degrade identically, keeping
            # benchmark results comparable.
            data = list(img.getdata())
            rng = random.Random(0)
            noisy = []
            for r, g, b in data:
                noisy.append((
                    max(0, min(255, int(r + rng.gauss(0, level)))),
                    max(0, min(255, int(g + rng.gauss(0, level)))),
                    max(0, min(255, int(b + rng.gauss(0, level)))),
                ))
            img.putdata(noisy)

    elif degradation_type == "blur":
        if level > 0:
            img = img.filter(ImageFilter.GaussianBlur(radius=level))

    elif degradation_type == "rotation":
        if level != 0:
            # Negate the angle (Pillow rotates counter-clockwise for positive
            # values); fill corners with a parchment tone instead of black.
            img = img.rotate(-level, expand=False, fillcolor=(245, 240, 232))

    elif degradation_type == "resolution":
        if level < 1.0:
            w, h = img.size
            new_w, new_h = max(1, int(w * level)), max(1, int(h * level))
            # Downscale then upscale back: detail is lost while the image
            # keeps its original dimensions.
            img = img.resize((new_w, new_h), Image.NEAREST)
            img = img.resize((w, h), Image.NEAREST)

    elif degradation_type == "binarization":
        img = img.convert("L")  # grayscale
        # NOTE: level == 0 uses a fixed threshold of 128 here — this is NOT
        # a real Otsu estimate (only the pure-Python path implements Otsu).
        if level == 0:
            threshold = 128
        else:
            threshold = int(level)
        img = img.point(lambda p: 255 if p >= threshold else 0, "1").convert("RGB")

    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
def _degrade_pure_python(png_bytes: bytes, degradation_type: str, level: float) -> bytes:
    """Pure-Python degradation fallback (no Pillow).

    Does not implement full PNG decoding — this is a stub. It logs a
    warning and hands the source bytes back untouched; in practice Pillow
    is almost always available in the Picarones environment.
    """
    logger.warning(
        "Pillow non disponible : dégradation '%s' appliquée en mode dégradé (stub)",
        degradation_type,
    )
    # Simulated degradation: the original image is returned as-is.
    return png_bytes
|
| 320 |
+
|
| 321 |
+
|
| 322 |
+
# ---------------------------------------------------------------------------
|
| 323 |
+
# Structures de résultats
|
| 324 |
+
# ---------------------------------------------------------------------------
|
| 325 |
+
|
| 326 |
+
@dataclass
class DegradationCurve:
    """CER-versus-level curve for one engine and one degradation type."""

    engine_name: str
    degradation_type: str
    levels: list[float]
    labels: list[str]
    # Mean CER (0-1) at each level; None when it could not be computed.
    cer_values: list[Optional[float]]
    # First level at which CER exceeds ``cer_threshold`` (None if never).
    critical_threshold_level: Optional[float] = None
    # CER threshold used to determine the critical level.
    cer_threshold: float = 0.20

    def as_dict(self) -> dict:
        """Return a JSON-serializable representation of the curve."""
        keys = (
            "engine_name",
            "degradation_type",
            "levels",
            "labels",
            "cer_values",
            "critical_threshold_level",
            "cer_threshold",
        )
        return {key: getattr(self, key) for key in keys}
|
| 350 |
+
|
| 351 |
+
|
| 352 |
+
@dataclass
class RobustnessReport:
    """Complete robustness-analysis report for one or more engines."""

    engine_names: list[str]
    corpus_name: str
    degradation_types: list[str]
    curves: list[DegradationCurve]
    # Summary: most robust engine per degradation type, critical levels, etc.
    summary: dict = field(default_factory=dict)

    def get_curves_for_engine(self, engine_name: str) -> list[DegradationCurve]:
        """Return every curve measured for *engine_name*."""
        return [curve for curve in self.curves if curve.engine_name == engine_name]

    def get_curves_for_type(self, degradation_type: str) -> list[DegradationCurve]:
        """Return every curve measured for *degradation_type*."""
        return [curve for curve in self.curves if curve.degradation_type == degradation_type]

    def as_dict(self) -> dict:
        """Return a JSON-serializable representation of the report."""
        return {
            "engine_names": self.engine_names,
            "corpus_name": self.corpus_name,
            "degradation_types": self.degradation_types,
            "curves": [curve.as_dict() for curve in self.curves],
            "summary": self.summary,
        }
|
| 376 |
+
|
| 377 |
+
|
| 378 |
+
# ---------------------------------------------------------------------------
|
| 379 |
+
# Analyseur de robustesse
|
| 380 |
+
# ---------------------------------------------------------------------------
|
| 381 |
+
|
| 382 |
+
class RobustnessAnalyzer:
    """Run a robustness analysis over a corpus.

    For every engine and degradation type, each document image is degraded
    at increasing levels, re-OCRed, and the mean CER recorded, producing
    one :class:`DegradationCurve` per (engine, type) pair.

    Parameters
    ----------
    engines:
        One or more OCR engines (``BaseOCREngine``). A single engine may
        be passed directly instead of a list.
    degradation_types:
        Degradation types to test. Defaults to all of them
        (``"noise"``, ``"blur"``, ``"rotation"``, ``"resolution"``,
        ``"binarization"``).
    cer_threshold:
        CER threshold defining the critical level (default: 0.20 = 20%).
    custom_levels:
        Custom levels per type (overrides the defaults).

    Examples
    --------
    >>> from picarones.engines.tesseract import TesseractEngine
    >>> from picarones.core.robustness import RobustnessAnalyzer
    >>> engine = TesseractEngine(config={"lang": "fra"})
    >>> analyzer = RobustnessAnalyzer([engine], degradation_types=["noise", "blur"])
    >>> report = analyzer.analyze(corpus)
    """

    def __init__(
        self,
        engines: "list[BaseOCREngine]",
        degradation_types: Optional[list[str]] = None,
        cer_threshold: float = 0.20,
        custom_levels: Optional[dict[str, list]] = None,
    ) -> None:
        # Accept a bare engine for convenience.
        if not isinstance(engines, list):
            engines = [engines]
        self.engines = engines
        self.degradation_types = degradation_types or ALL_DEGRADATION_TYPES
        self.cer_threshold = cer_threshold
        # Copy the defaults so custom levels never mutate the module constant.
        self.levels = dict(DEGRADATION_LEVELS)
        if custom_levels:
            self.levels.update(custom_levels)

    def analyze(
        self,
        corpus: "Corpus",
        show_progress: bool = True,
        max_docs: int = 10,
    ) -> RobustnessReport:
        """Run the robustness analysis on the corpus.

        Parameters
        ----------
        corpus:
            Picarones corpus with images and ground truth.
        show_progress:
            Display a progress bar (requires ``tqdm``; silently skipped
            when unavailable).
        max_docs:
            Maximum number of documents to process (for speed).

        Returns
        -------
        RobustnessReport
        """
        from picarones.core.metrics import compute_metrics

        docs = corpus.documents[:max_docs]
        curves: list[DegradationCurve] = []

        for engine in self.engines:
            for deg_type in self.degradation_types:
                levels = self.levels[deg_type]
                labels = DEGRADATION_LABELS.get(deg_type, [str(l) for l in levels])
                level_iter = self._level_iterator(engine, deg_type, levels, show_progress)

                cer_per_level: list[Optional[float]] = []
                for _lvl_idx, level in level_iter:
                    doc_cers: list[float] = []

                    for doc in docs:
                        gt = doc.ground_truth.strip()
                        if not gt:
                            # No ground truth: CER would be meaningless.
                            continue

                        degraded_bytes = self._get_degraded_image(doc, deg_type, level)
                        if degraded_bytes is None:
                            continue

                        # OCR engines read from disk: write a throwaway PNG.
                        with tempfile.NamedTemporaryFile(
                            suffix=".png", delete=False
                        ) as tmp:
                            tmp.write(degraded_bytes)
                            tmp_path = tmp.name

                        try:
                            hypothesis = engine.process_image(tmp_path)
                            metrics = compute_metrics(gt, hypothesis)
                            doc_cers.append(metrics.cer)
                        except Exception as exc:
                            # One failed OCR run must not abort the whole analysis.
                            logger.debug(
                                "Erreur OCR %s niveau %s=%s: %s",
                                engine.name, deg_type, level, exc,
                            )
                        finally:
                            try:
                                os.unlink(tmp_path)
                            except OSError:
                                pass

                    cer_per_level.append(
                        sum(doc_cers) / len(doc_cers) if doc_cers else None
                    )

                critical = self._find_critical_level(
                    levels, cer_per_level, self.cer_threshold
                )
                curves.append(DegradationCurve(
                    engine_name=engine.name,
                    degradation_type=deg_type,
                    levels=levels,
                    labels=labels[:len(levels)],
                    cer_values=cer_per_level,
                    critical_threshold_level=critical,
                    cer_threshold=self.cer_threshold,
                ))

        return RobustnessReport(
            engine_names=[e.name for e in self.engines],
            corpus_name=corpus.name,
            degradation_types=self.degradation_types,
            curves=curves,
            summary=self._build_summary(curves),
        )

    @staticmethod
    def _level_iterator(engine, deg_type, levels, show_progress):
        """Return an ``enumerate`` over *levels*, wrapped in tqdm when requested."""
        if show_progress:
            try:
                from tqdm import tqdm
                return tqdm(
                    list(enumerate(levels)),
                    desc=f"{engine.name} / {deg_type}",
                )
            except ImportError:
                pass
        return enumerate(levels)

    def _get_degraded_image(
        self,
        doc: "Document",
        degradation_type: str,
        level: float,
    ) -> Optional[bytes]:
        """Return the PNG bytes of the degraded image (None if unloadable).

        A zero-strength level (0 for noise/blur/rotation, >= 1.0 for
        resolution) is a no-op and returns the original image unchanged.
        Binarization is always applied, because level 0 there means an
        automatic (Otsu) threshold, not "no degradation".
        """
        original_bytes = self._load_image(doc)
        if original_bytes is None:
            return None

        # The previous implementation carried an unreachable binarization
        # clause and a redundant inner re-check; this condition expresses
        # the same effective behavior directly.
        is_noop = (
            (degradation_type in ("noise", "blur", "rotation") and level == 0)
            or (degradation_type == "resolution" and level >= 1.0)
        )
        if is_noop:
            return original_bytes

        return degrade_image_bytes(original_bytes, degradation_type, level)

    def _load_image(self, doc: "Document") -> Optional[bytes]:
        """Load the PNG bytes of a document image (data URI or local file)."""
        img_path = doc.image_path

        # Base64 data URI
        if img_path.startswith("data:image/"):
            import base64
            try:
                _, b64 = img_path.split(",", 1)
                return base64.b64decode(b64)
            except Exception as exc:
                logger.debug("Impossible de décoder data URI: %s", exc)
                return None

        # Local file
        path = Path(img_path)
        if path.exists():
            return path.read_bytes()

        logger.debug("Image introuvable : %s", img_path)
        return None

    @staticmethod
    def _find_critical_level(
        levels: list[float],
        cer_values: list[Optional[float]],
        threshold: float,
    ) -> Optional[float]:
        """Return the first level whose CER strictly exceeds *threshold* (None if none)."""
        for level, cer in zip(levels, cer_values):
            if cer is not None and cer > threshold:
                return level
        return None

    @staticmethod
    def _build_summary(curves: list[DegradationCurve]) -> dict:
        """Build the analysis summary (most robust engine per type, critical levels)."""
        summary: dict = {}

        # Per degradation type: engine with the lowest mean CER over all levels.
        by_type: dict[str, dict[str, list]] = {}
        for curve in curves:
            dt = curve.degradation_type
            by_type.setdefault(dt, {})
            valid_cers = [c for c in curve.cer_values if c is not None]
            if valid_cers:
                by_type[dt][curve.engine_name] = valid_cers

        for dt, engine_cers in by_type.items():
            if not engine_cers:
                continue
            # Robustness = mean CER over all levels (lower = more robust).
            best_engine = min(
                engine_cers,
                key=lambda e: sum(engine_cers[e]) / len(engine_cers[e]),
            )
            summary[f"most_robust_{dt}"] = best_engine

        # Critical levels per engine/type.
        for curve in curves:
            key = f"critical_{curve.engine_name}_{curve.degradation_type}"
            summary[key] = curve.critical_threshold_level

        return summary
|
| 621 |
+
|
| 622 |
+
|
| 623 |
+
# ---------------------------------------------------------------------------
|
| 624 |
+
# Données de démonstration de robustesse
|
| 625 |
+
# ---------------------------------------------------------------------------
|
| 626 |
+
|
| 627 |
+
def generate_demo_robustness_report(
    engine_names: Optional[list[str]] = None,
    seed: int = 42,
) -> RobustnessReport:
    """Generate a fictitious but realistic robustness report for the demo.

    Parameters
    ----------
    engine_names:
        Engine names to simulate (default: tesseract, pero_ocr).
    seed:
        Random seed.

    Returns
    -------
    RobustnessReport
    """
    import random

    rng = random.Random(seed)

    if engine_names is None:
        engine_names = ["tesseract", "pero_ocr"]

    # Baseline CER per engine.
    base_cer = {
        "tesseract": 0.12,
        "pero_ocr": 0.07,
        "ancien_moteur": 0.25,
    }

    # Sensitivity per degradation type (additive factor per level step).
    sensitivity = {
        "tesseract": {
            "noise": 0.04, "blur": 0.05, "rotation": 0.06,
            "resolution": 0.12, "binarization": 0.03,
        },
        "pero_ocr": {
            "noise": 0.02, "blur": 0.03, "rotation": 0.04,
            "resolution": 0.08, "binarization": 0.02,
        },
        "ancien_moteur": {
            "noise": 0.06, "blur": 0.08, "rotation": 0.10,
            "resolution": 0.15, "binarization": 0.05,
        },
    }

    deg_types = ALL_DEGRADATION_TYPES
    curves: list[DegradationCurve] = []

    for engine_name in engine_names:
        cer_base = base_cer.get(engine_name, 0.15)
        sens = sensitivity.get(engine_name, {dt: 0.05 for dt in deg_types})

        for deg_type in deg_types:
            levels = DEGRADATION_LEVELS[deg_type]
            labels = DEGRADATION_LABELS[deg_type]
            slope = sens.get(deg_type, 0.05)

            # CER grows linearly with the level index, plus a little jitter.
            # NOTE: one rng.gauss call per level, in order, to keep the
            # seeded sequence identical run after run.
            cer_values = [
                round(max(0.0, min(1.0, cer_base + slope * i + rng.gauss(0, 0.005))), 4)
                for i, _level in enumerate(levels)
            ]

            critical = RobustnessAnalyzer._find_critical_level(levels, cer_values, 0.20)

            curves.append(DegradationCurve(
                engine_name=engine_name,
                degradation_type=deg_type,
                levels=list(levels),
                labels=labels[:len(levels)],
                cer_values=cer_values,
                critical_threshold_level=critical,
                cer_threshold=0.20,
            ))

    return RobustnessReport(
        engine_names=engine_names,
        corpus_name="Corpus de démonstration — Chroniques médiévales",
        degradation_types=deg_types,
        curves=curves,
        summary=RobustnessAnalyzer._build_summary(curves),
    )
|
|
@@ -1,5 +1,18 @@
|
|
| 1 |
-
"""Importeurs de corpus depuis des sources distantes (IIIF, HuggingFace, HTR-United…)."""
|
| 2 |
|
| 3 |
from picarones.importers.iiif import IIIFImporter, import_iiif_manifest
|
|
|
|
|
|
|
| 4 |
|
| 5 |
-
__all__ = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Importeurs de corpus depuis des sources distantes (IIIF, HuggingFace, HTR-United, Gallica, eScriptorium…)."""
|
| 2 |
|
| 3 |
from picarones.importers.iiif import IIIFImporter, import_iiif_manifest
|
| 4 |
+
from picarones.importers.gallica import GallicaClient, GallicaRecord, search_gallica, import_gallica_document
|
| 5 |
+
from picarones.importers.escriptorium import EScriptoriumClient, EScriptoriumProject, EScriptoriumDocument, connect_escriptorium
|
| 6 |
|
| 7 |
+
__all__ = [
|
| 8 |
+
"IIIFImporter",
|
| 9 |
+
"import_iiif_manifest",
|
| 10 |
+
"GallicaClient",
|
| 11 |
+
"GallicaRecord",
|
| 12 |
+
"search_gallica",
|
| 13 |
+
"import_gallica_document",
|
| 14 |
+
"EScriptoriumClient",
|
| 15 |
+
"EScriptoriumProject",
|
| 16 |
+
"EScriptoriumDocument",
|
| 17 |
+
"connect_escriptorium",
|
| 18 |
+
]
|
|
@@ -0,0 +1,532 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Intégration eScriptorium — import et export via API REST.
|
| 2 |
+
|
| 3 |
+
Fonctionnement
|
| 4 |
+
--------------
|
| 5 |
+
1. Authentification par token (settings → API key dans eScriptorium)
|
| 6 |
+
2. Listing et import de projets, documents et transcriptions
|
| 7 |
+
3. Export des résultats de benchmark Picarones comme couche OCR dans eScriptorium
|
| 8 |
+
|
| 9 |
+
API eScriptorium
|
| 10 |
+
----------------
|
| 11 |
+
eScriptorium expose une API REST documentée à /api/.
|
| 12 |
+
Les endpoints principaux utilisés ici :
|
| 13 |
+
- GET /api/projects/ → liste des projets
|
| 14 |
+
- GET /api/documents/ → liste des documents (filtrables par projet)
|
| 15 |
+
- GET /api/documents/{pk}/parts/ → liste des pages d'un document
|
| 16 |
+
- GET /api/documents/{pk}/parts/{pk}/transcriptions/ → transcriptions d'une page
|
| 17 |
+
- POST /api/documents/{pk}/parts/{pk}/transcriptions/ → créer une couche OCR
|
| 18 |
+
|
| 19 |
+
Usage
|
| 20 |
+
-----
|
| 21 |
+
>>> from picarones.importers.escriptorium import EScriptoriumClient
|
| 22 |
+
>>> client = EScriptoriumClient("https://escriptorium.example.org", token="abc123")
|
| 23 |
+
>>> projects = client.list_projects()
|
| 24 |
+
>>> corpus = client.import_document(doc_id=42, transcription_layer="manual")
|
| 25 |
+
"""
|
| 26 |
+
|
| 27 |
+
from __future__ import annotations
|
| 28 |
+
|
| 29 |
+
import json
|
| 30 |
+
import logging
|
| 31 |
+
import time
|
| 32 |
+
import urllib.error
|
| 33 |
+
import urllib.parse
|
| 34 |
+
import urllib.request
|
| 35 |
+
from dataclasses import dataclass, field
|
| 36 |
+
from pathlib import Path
|
| 37 |
+
from typing import Optional
|
| 38 |
+
|
| 39 |
+
from picarones.core.corpus import Corpus, Document
|
| 40 |
+
|
| 41 |
+
logger = logging.getLogger(__name__)
|
| 42 |
+
|
| 43 |
+
# ---------------------------------------------------------------------------
|
| 44 |
+
# Structures de données eScriptorium
|
| 45 |
+
# ---------------------------------------------------------------------------
|
| 46 |
+
|
| 47 |
+
@dataclass
class EScriptoriumProject:
    """An eScriptorium project as returned by the REST API."""

    pk: int  # primary key on the eScriptorium instance
    name: str
    slug: str
    owner: str = ""
    document_count: int = 0

    def as_dict(self) -> dict:
        """Return a plain-dict representation."""
        keys = ("pk", "name", "slug", "owner", "document_count")
        return {key: getattr(self, key) for key in keys}
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
@dataclass
class EScriptoriumDocument:
    """An eScriptorium document as returned by the REST API."""

    pk: int  # primary key on the eScriptorium instance
    name: str
    project: str = ""
    part_count: int = 0
    # Names of the transcription layers available on this document.
    transcription_layers: list[str] = field(default_factory=list)

    def as_dict(self) -> dict:
        """Return a plain-dict representation."""
        keys = ("pk", "name", "project", "part_count", "transcription_layers")
        return {key: getattr(self, key) for key in keys}
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
@dataclass
class EScriptoriumPart:
    """A single page ("part") of an eScriptorium document."""

    pk: int
    title: str
    image_url: str
    order: int = 0  # position of the page within the document
    # Raw transcription payloads attached to this page.
    transcriptions: list[dict] = field(default_factory=list)
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ---------------------------------------------------------------------------
|
| 96 |
+
# Client API eScriptorium
|
| 97 |
+
# ---------------------------------------------------------------------------
|
| 98 |
+
|
| 99 |
+
class EScriptoriumClient:
|
| 100 |
+
"""Client pour l'API REST d'eScriptorium.
|
| 101 |
+
|
| 102 |
+
Parameters
|
| 103 |
+
----------
|
| 104 |
+
base_url:
|
| 105 |
+
URL racine de l'instance (ex : ``"https://escriptorium.bnf.fr"``).
|
| 106 |
+
token:
|
| 107 |
+
Token d'authentification API (depuis Settings > API dans eScriptorium).
|
| 108 |
+
timeout:
|
| 109 |
+
Timeout HTTP en secondes.
|
| 110 |
+
|
| 111 |
+
Examples
|
| 112 |
+
--------
|
| 113 |
+
>>> client = EScriptoriumClient("https://escriptorium.example.org", token="abc123")
|
| 114 |
+
>>> projects = client.list_projects()
|
| 115 |
+
>>> corpus = client.import_document(42, transcription_layer="manual")
|
| 116 |
+
"""
|
| 117 |
+
|
| 118 |
+
def __init__(
|
| 119 |
+
self,
|
| 120 |
+
base_url: str,
|
| 121 |
+
token: str,
|
| 122 |
+
timeout: int = 30,
|
| 123 |
+
) -> None:
|
| 124 |
+
self.base_url = base_url.rstrip("/")
|
| 125 |
+
self.token = token
|
| 126 |
+
self.timeout = timeout
|
| 127 |
+
|
| 128 |
+
# ------------------------------------------------------------------
|
| 129 |
+
# HTTP helpers
|
| 130 |
+
# ------------------------------------------------------------------
|
| 131 |
+
|
| 132 |
+
def _headers(self) -> dict[str, str]:
|
| 133 |
+
return {
|
| 134 |
+
"Authorization": f"Token {self.token}",
|
| 135 |
+
"Accept": "application/json",
|
| 136 |
+
"Content-Type": "application/json",
|
| 137 |
+
}
|
| 138 |
+
|
| 139 |
+
def _get(self, path: str, params: Optional[dict] = None) -> dict:
|
| 140 |
+
"""Effectue une requête GET et retourne le JSON."""
|
| 141 |
+
url = f"{self.base_url}/api/{path.lstrip('/')}"
|
| 142 |
+
if params:
|
| 143 |
+
url += "?" + urllib.parse.urlencode(params)
|
| 144 |
+
req = urllib.request.Request(url, headers=self._headers())
|
| 145 |
+
try:
|
| 146 |
+
with urllib.request.urlopen(req, timeout=self.timeout) as resp:
|
| 147 |
+
return json.loads(resp.read().decode("utf-8"))
|
| 148 |
+
except urllib.error.HTTPError as exc:
|
| 149 |
+
raise RuntimeError(
|
| 150 |
+
f"eScriptorium API erreur {exc.code} sur {url}: {exc.reason}"
|
| 151 |
+
) from exc
|
| 152 |
+
except urllib.error.URLError as exc:
|
| 153 |
+
raise RuntimeError(
|
| 154 |
+
f"Impossible de joindre {self.base_url}: {exc.reason}"
|
| 155 |
+
) from exc
|
| 156 |
+
|
| 157 |
+
def _post(self, path: str, payload: dict) -> dict:
|
| 158 |
+
"""Effectue une requête POST avec payload JSON."""
|
| 159 |
+
url = f"{self.base_url}/api/{path.lstrip('/')}"
|
| 160 |
+
data = json.dumps(payload).encode("utf-8")
|
| 161 |
+
req = urllib.request.Request(
|
| 162 |
+
url, data=data, headers=self._headers(), method="POST"
|
| 163 |
+
)
|
| 164 |
+
try:
|
| 165 |
+
with urllib.request.urlopen(req, timeout=self.timeout) as resp:
|
| 166 |
+
body = resp.read().decode("utf-8")
|
| 167 |
+
return json.loads(body) if body else {}
|
| 168 |
+
except urllib.error.HTTPError as exc:
|
| 169 |
+
raise RuntimeError(
|
| 170 |
+
f"eScriptorium API erreur {exc.code} sur {url}: {exc.reason}"
|
| 171 |
+
) from exc
|
| 172 |
+
except urllib.error.URLError as exc:
|
| 173 |
+
raise RuntimeError(
|
| 174 |
+
f"Impossible de joindre {self.base_url}: {exc.reason}"
|
| 175 |
+
) from exc
|
| 176 |
+
|
| 177 |
+
def _paginate(self, path: str, params: Optional[dict] = None) -> list[dict]:
|
| 178 |
+
"""Parcourt toutes les pages de résultats paginés."""
|
| 179 |
+
results: list[dict] = []
|
| 180 |
+
current_params = dict(params or {})
|
| 181 |
+
current_params.setdefault("page_size", 100)
|
| 182 |
+
page_num = 1
|
| 183 |
+
while True:
|
| 184 |
+
current_params["page"] = page_num
|
| 185 |
+
data = self._get(path, current_params)
|
| 186 |
+
if isinstance(data, list):
|
| 187 |
+
results.extend(data)
|
| 188 |
+
break
|
| 189 |
+
results.extend(data.get("results", []))
|
| 190 |
+
if not data.get("next"):
|
| 191 |
+
break
|
| 192 |
+
page_num += 1
|
| 193 |
+
return results
|
| 194 |
+
|
| 195 |
+
# ------------------------------------------------------------------
|
| 196 |
+
# API publique
|
| 197 |
+
# ------------------------------------------------------------------
|
| 198 |
+
|
| 199 |
+
def test_connection(self) -> bool:
|
| 200 |
+
"""Vérifie que l'URL et le token sont valides.
|
| 201 |
+
|
| 202 |
+
Returns
|
| 203 |
+
-------
|
| 204 |
+
bool
|
| 205 |
+
True si l'authentification réussit.
|
| 206 |
+
"""
|
| 207 |
+
try:
|
| 208 |
+
self._get("projects/", {"page_size": 1})
|
| 209 |
+
return True
|
| 210 |
+
except RuntimeError:
|
| 211 |
+
return False
|
| 212 |
+
|
| 213 |
+
def list_projects(self) -> list[EScriptoriumProject]:
|
| 214 |
+
"""Retourne la liste des projets accessibles.
|
| 215 |
+
|
| 216 |
+
Returns
|
| 217 |
+
-------
|
| 218 |
+
list[EScriptoriumProject]
|
| 219 |
+
"""
|
| 220 |
+
raw = self._paginate("projects/")
|
| 221 |
+
projects = []
|
| 222 |
+
for item in raw:
|
| 223 |
+
projects.append(EScriptoriumProject(
|
| 224 |
+
pk=item["pk"],
|
| 225 |
+
name=item.get("name", ""),
|
| 226 |
+
slug=item.get("slug", ""),
|
| 227 |
+
owner=item.get("owner", {}).get("username", "") if isinstance(item.get("owner"), dict) else str(item.get("owner", "")),
|
| 228 |
+
document_count=item.get("documents_count", 0),
|
| 229 |
+
))
|
| 230 |
+
return projects
|
| 231 |
+
|
| 232 |
+
def list_documents(
|
| 233 |
+
self,
|
| 234 |
+
project_pk: Optional[int] = None,
|
| 235 |
+
) -> list[EScriptoriumDocument]:
|
| 236 |
+
"""Retourne la liste des documents, filtrés par projet si fourni.
|
| 237 |
+
|
| 238 |
+
Parameters
|
| 239 |
+
----------
|
| 240 |
+
project_pk:
|
| 241 |
+
PK du projet eScriptorium (optionnel).
|
| 242 |
+
|
| 243 |
+
Returns
|
| 244 |
+
-------
|
| 245 |
+
list[EScriptoriumDocument]
|
| 246 |
+
"""
|
| 247 |
+
params: dict = {}
|
| 248 |
+
if project_pk is not None:
|
| 249 |
+
params["project"] = project_pk
|
| 250 |
+
raw = self._paginate("documents/", params)
|
| 251 |
+
docs = []
|
| 252 |
+
for item in raw:
|
| 253 |
+
layers = [
|
| 254 |
+
t.get("name", "") if isinstance(t, dict) else str(t)
|
| 255 |
+
for t in item.get("transcriptions", [])
|
| 256 |
+
]
|
| 257 |
+
docs.append(EScriptoriumDocument(
|
| 258 |
+
pk=item["pk"],
|
| 259 |
+
name=item.get("name", ""),
|
| 260 |
+
project=str(item.get("project", "")),
|
| 261 |
+
part_count=item.get("parts_count", 0),
|
| 262 |
+
transcription_layers=layers,
|
| 263 |
+
))
|
| 264 |
+
return docs
|
| 265 |
+
|
| 266 |
+
def list_parts(self, doc_pk: int) -> list[EScriptoriumPart]:
|
| 267 |
+
"""Retourne les pages (parts) d'un document.
|
| 268 |
+
|
| 269 |
+
Parameters
|
| 270 |
+
----------
|
| 271 |
+
doc_pk:
|
| 272 |
+
PK du document eScriptorium.
|
| 273 |
+
|
| 274 |
+
Returns
|
| 275 |
+
-------
|
| 276 |
+
list[EScriptoriumPart]
|
| 277 |
+
"""
|
| 278 |
+
raw = self._paginate(f"documents/{doc_pk}/parts/")
|
| 279 |
+
parts = []
|
| 280 |
+
for item in raw:
|
| 281 |
+
parts.append(EScriptoriumPart(
|
| 282 |
+
pk=item["pk"],
|
| 283 |
+
title=item.get("title", "") or f"Part {item.get('order', 0) + 1}",
|
| 284 |
+
image_url=item.get("image", "") or "",
|
| 285 |
+
order=item.get("order", 0),
|
| 286 |
+
))
|
| 287 |
+
return parts
|
| 288 |
+
|
| 289 |
+
def get_transcriptions(self, doc_pk: int, part_pk: int) -> list[dict]:
|
| 290 |
+
"""Retourne les transcriptions disponibles pour une page.
|
| 291 |
+
|
| 292 |
+
Parameters
|
| 293 |
+
----------
|
| 294 |
+
doc_pk:
|
| 295 |
+
PK du document.
|
| 296 |
+
part_pk:
|
| 297 |
+
PK de la page.
|
| 298 |
+
|
| 299 |
+
Returns
|
| 300 |
+
-------
|
| 301 |
+
list[dict]
|
| 302 |
+
Chaque dict contient ``{"name": str, "content": str}``.
|
| 303 |
+
"""
|
| 304 |
+
raw = self._get(f"documents/{doc_pk}/parts/{part_pk}/transcriptions/")
|
| 305 |
+
if isinstance(raw, list):
|
| 306 |
+
return raw
|
| 307 |
+
return raw.get("results", [])
|
| 308 |
+
|
| 309 |
+
def import_document(
|
| 310 |
+
self,
|
| 311 |
+
doc_pk: int,
|
| 312 |
+
transcription_layer: str = "manual",
|
| 313 |
+
output_dir: Optional[str] = None,
|
| 314 |
+
download_images: bool = True,
|
| 315 |
+
show_progress: bool = True,
|
| 316 |
+
) -> Corpus:
|
| 317 |
+
"""Importe un document eScriptorium comme corpus Picarones.
|
| 318 |
+
|
| 319 |
+
Télécharge les images et récupère les transcriptions de la couche
|
| 320 |
+
spécifiée comme vérité terrain.
|
| 321 |
+
|
| 322 |
+
Parameters
|
| 323 |
+
----------
|
| 324 |
+
doc_pk:
|
| 325 |
+
PK du document dans eScriptorium.
|
| 326 |
+
transcription_layer:
|
| 327 |
+
Nom de la couche de transcription à utiliser comme GT.
|
| 328 |
+
output_dir:
|
| 329 |
+
Dossier local pour les images téléchargées. Si None, les images
|
| 330 |
+
sont stockées en mémoire (pas de sauvegarde sur disque).
|
| 331 |
+
download_images:
|
| 332 |
+
Si True, télécharge les images dans output_dir.
|
| 333 |
+
show_progress:
|
| 334 |
+
Affiche une barre de progression tqdm.
|
| 335 |
+
|
| 336 |
+
Returns
|
| 337 |
+
-------
|
| 338 |
+
Corpus
|
| 339 |
+
Corpus Picarones avec documents et GT.
|
| 340 |
+
"""
|
| 341 |
+
# Récupérer les métadonnées du document
|
| 342 |
+
doc_info = self._get(f"documents/{doc_pk}/")
|
| 343 |
+
doc_name = doc_info.get("name", f"document_{doc_pk}")
|
| 344 |
+
|
| 345 |
+
parts = self.list_parts(doc_pk)
|
| 346 |
+
if not parts:
|
| 347 |
+
raise ValueError(f"Aucune page trouvée dans le document {doc_pk}")
|
| 348 |
+
|
| 349 |
+
if show_progress:
|
| 350 |
+
try:
|
| 351 |
+
from tqdm import tqdm
|
| 352 |
+
iterator = tqdm(parts, desc=f"Import {doc_name}")
|
| 353 |
+
except ImportError:
|
| 354 |
+
iterator = iter(parts)
|
| 355 |
+
else:
|
| 356 |
+
iterator = iter(parts)
|
| 357 |
+
|
| 358 |
+
out_path: Optional[Path] = None
|
| 359 |
+
if output_dir and download_images:
|
| 360 |
+
out_path = Path(output_dir)
|
| 361 |
+
out_path.mkdir(parents=True, exist_ok=True)
|
| 362 |
+
|
| 363 |
+
documents: list[Document] = []
|
| 364 |
+
for part in iterator:
|
| 365 |
+
# Récupérer les transcriptions
|
| 366 |
+
transcriptions = self.get_transcriptions(doc_pk, part.pk)
|
| 367 |
+
gt_text = ""
|
| 368 |
+
for t in transcriptions:
|
| 369 |
+
layer_name = t.get("transcription", {}).get("name", "") if isinstance(t.get("transcription"), dict) else t.get("name", "")
|
| 370 |
+
if layer_name == transcription_layer or not transcription_layer:
|
| 371 |
+
# Le contenu est dans "content" ou dans les lignes
|
| 372 |
+
lines = t.get("lines", []) or []
|
| 373 |
+
if lines:
|
| 374 |
+
gt_text = "\n".join(
|
| 375 |
+
line.get("content", "") or ""
|
| 376 |
+
for line in lines
|
| 377 |
+
if line.get("content")
|
| 378 |
+
)
|
| 379 |
+
else:
|
| 380 |
+
gt_text = t.get("content", "") or ""
|
| 381 |
+
break
|
| 382 |
+
|
| 383 |
+
# Image
|
| 384 |
+
image_path = part.image_url or f"escriptorium://doc{doc_pk}/part{part.pk}"
|
| 385 |
+
if out_path and part.image_url and download_images:
|
| 386 |
+
ext = Path(urllib.parse.urlparse(part.image_url).path).suffix or ".jpg"
|
| 387 |
+
local_img = out_path / f"part_{part.pk:05d}{ext}"
|
| 388 |
+
try:
|
| 389 |
+
urllib.request.urlretrieve(part.image_url, local_img)
|
| 390 |
+
image_path = str(local_img)
|
| 391 |
+
except Exception as exc:
|
| 392 |
+
logger.warning("Impossible de télécharger l'image %s: %s", part.image_url, exc)
|
| 393 |
+
|
| 394 |
+
# Sauvegarder la GT
|
| 395 |
+
gt_path = out_path / f"part_{part.pk:05d}.gt.txt"
|
| 396 |
+
gt_path.write_text(gt_text, encoding="utf-8")
|
| 397 |
+
|
| 398 |
+
documents.append(Document(
|
| 399 |
+
doc_id=f"part_{part.pk:05d}",
|
| 400 |
+
image_path=image_path,
|
| 401 |
+
ground_truth=gt_text,
|
| 402 |
+
metadata={
|
| 403 |
+
"source": "escriptorium",
|
| 404 |
+
"doc_pk": doc_pk,
|
| 405 |
+
"part_pk": part.pk,
|
| 406 |
+
"part_title": part.title,
|
| 407 |
+
"transcription_layer": transcription_layer,
|
| 408 |
+
},
|
| 409 |
+
))
|
| 410 |
+
|
| 411 |
+
return Corpus(
|
| 412 |
+
name=doc_name,
|
| 413 |
+
source=f"{self.base_url}/document/{doc_pk}/",
|
| 414 |
+
documents=documents,
|
| 415 |
+
metadata={
|
| 416 |
+
"escriptorium_url": self.base_url,
|
| 417 |
+
"doc_pk": doc_pk,
|
| 418 |
+
"transcription_layer": transcription_layer,
|
| 419 |
+
},
|
| 420 |
+
)
|
| 421 |
+
|
| 422 |
+
def export_benchmark_as_layer(
|
| 423 |
+
self,
|
| 424 |
+
benchmark_result: "BenchmarkResult",
|
| 425 |
+
doc_pk: int,
|
| 426 |
+
engine_name: str,
|
| 427 |
+
layer_name: Optional[str] = None,
|
| 428 |
+
part_mapping: Optional[dict[str, int]] = None,
|
| 429 |
+
) -> int:
|
| 430 |
+
"""Exporte les résultats Picarones comme couche OCR dans eScriptorium.
|
| 431 |
+
|
| 432 |
+
Parameters
|
| 433 |
+
----------
|
| 434 |
+
benchmark_result:
|
| 435 |
+
Résultats du benchmark Picarones.
|
| 436 |
+
doc_pk:
|
| 437 |
+
PK du document cible dans eScriptorium.
|
| 438 |
+
engine_name:
|
| 439 |
+
Nom du moteur dont on exporte les transcriptions.
|
| 440 |
+
layer_name:
|
| 441 |
+
Nom de la couche à créer (défaut : ``"picarones_{engine_name}"``).
|
| 442 |
+
part_mapping:
|
| 443 |
+
Correspondance ``doc_id → part_pk`` eScriptorium. Si None,
|
| 444 |
+
la correspondance est inférée depuis les métadonnées des documents.
|
| 445 |
+
|
| 446 |
+
Returns
|
| 447 |
+
-------
|
| 448 |
+
int
|
| 449 |
+
Nombre de pages exportées avec succès.
|
| 450 |
+
"""
|
| 451 |
+
if layer_name is None:
|
| 452 |
+
layer_name = f"picarones_{engine_name}"
|
| 453 |
+
|
| 454 |
+
# Trouver le rapport du moteur
|
| 455 |
+
engine_report = None
|
| 456 |
+
for report in benchmark_result.engine_reports:
|
| 457 |
+
if report.engine_name == engine_name:
|
| 458 |
+
engine_report = report
|
| 459 |
+
break
|
| 460 |
+
if engine_report is None:
|
| 461 |
+
raise ValueError(f"Moteur '{engine_name}' introuvable dans les résultats.")
|
| 462 |
+
|
| 463 |
+
exported = 0
|
| 464 |
+
for doc_result in engine_report.document_results:
|
| 465 |
+
if doc_result.engine_error:
|
| 466 |
+
continue
|
| 467 |
+
|
| 468 |
+
# Déterminer le part_pk
|
| 469 |
+
part_pk: Optional[int] = None
|
| 470 |
+
if part_mapping and doc_result.doc_id in part_mapping:
|
| 471 |
+
part_pk = part_mapping[doc_result.doc_id]
|
| 472 |
+
else:
|
| 473 |
+
# Essayer d'extraire depuis doc_id (ex: "part_00042")
|
| 474 |
+
try:
|
| 475 |
+
part_pk = int(doc_result.doc_id.replace("part_", "").lstrip("0") or "0")
|
| 476 |
+
except ValueError:
|
| 477 |
+
logger.warning("Impossible de déterminer part_pk pour %s", doc_result.doc_id)
|
| 478 |
+
continue
|
| 479 |
+
|
| 480 |
+
try:
|
| 481 |
+
self._post(
|
| 482 |
+
f"documents/{doc_pk}/parts/{part_pk}/transcriptions/",
|
| 483 |
+
{
|
| 484 |
+
"name": layer_name,
|
| 485 |
+
"content": doc_result.hypothesis,
|
| 486 |
+
"source": "picarones",
|
| 487 |
+
},
|
| 488 |
+
)
|
| 489 |
+
exported += 1
|
| 490 |
+
logger.debug("Exporté part %d → couche '%s'", part_pk, layer_name)
|
| 491 |
+
except RuntimeError as exc:
|
| 492 |
+
logger.warning("Erreur export part %d: %s", part_pk, exc)
|
| 493 |
+
|
| 494 |
+
return exported
|
| 495 |
+
|
| 496 |
+
|
| 497 |
+
# ---------------------------------------------------------------------------
|
| 498 |
+
# Interface de niveau module
|
| 499 |
+
# ---------------------------------------------------------------------------
|
| 500 |
+
|
| 501 |
+
def connect_escriptorium(
    base_url: str,
    token: str,
    timeout: int = 30,
) -> EScriptoriumClient:
    """Create and return an authenticated eScriptorium client.

    Parameters
    ----------
    base_url:
        URL of the eScriptorium instance.
    token:
        API token.
    timeout:
        HTTP timeout in seconds.

    Returns
    -------
    EScriptoriumClient

    Raises
    ------
    RuntimeError
        If the connection fails (bad URL, bad token, unreachable server).
    """
    client = EScriptoriumClient(base_url, token, timeout)
    if client.test_connection():
        return client
    raise RuntimeError(
        f"Impossible de se connecter à {base_url}. "
        "Vérifiez l'URL et le token API."
    )
|
|
@@ -0,0 +1,540 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Import de corpus depuis Gallica (BnF) via l'API SRU et IIIF.
|
| 2 |
+
|
| 3 |
+
Fonctionnement
|
| 4 |
+
--------------
|
| 5 |
+
1. Recherche dans Gallica par cote (ark), titre, auteur ou date via l'API SRU BnF
|
| 6 |
+
2. Récupération des images via l'API IIIF Gallica
|
| 7 |
+
3. Récupération de l'OCR Gallica existant (texte brut ou ALTO) comme concurrent de référence
|
| 8 |
+
|
| 9 |
+
API utilisées
|
| 10 |
+
-------------
|
| 11 |
+
- SRU BnF : https://gallica.bnf.fr/SRU?operation=searchRetrieve&query=...
|
| 12 |
+
- IIIF Gallica : https://gallica.bnf.fr/ark:/12148/{ark}/manifest.json
|
| 13 |
+
- OCR texte brut : https://gallica.bnf.fr/ark:/12148/{ark}/f{n}.texteBrut
|
| 14 |
+
- Métadonnées OAI-PMH : https://gallica.bnf.fr/services/OAIRecord?ark={ark}
|
| 15 |
+
|
| 16 |
+
Usage
|
| 17 |
+
-----
|
| 18 |
+
>>> from picarones.importers.gallica import GallicaClient
|
| 19 |
+
>>> client = GallicaClient()
|
| 20 |
+
>>> results = client.search(title="Froissart", date_from=1380, date_to=1420, max_results=10)
|
| 21 |
+
>>> corpus = client.import_document(results[0].ark, pages="1-5", include_gallica_ocr=True)
|
| 22 |
+
"""
|
| 23 |
+
|
| 24 |
+
from __future__ import annotations
|
| 25 |
+
|
| 26 |
+
import json
|
| 27 |
+
import logging
|
| 28 |
+
import re
|
| 29 |
+
import time
|
| 30 |
+
import urllib.error
|
| 31 |
+
import urllib.parse
|
| 32 |
+
import urllib.request
|
| 33 |
+
import xml.etree.ElementTree as ET
|
| 34 |
+
from dataclasses import dataclass, field
|
| 35 |
+
from pathlib import Path
|
| 36 |
+
from typing import Optional
|
| 37 |
+
|
| 38 |
+
from picarones.core.corpus import Corpus, Document
|
| 39 |
+
|
| 40 |
+
logger = logging.getLogger(__name__)
|
| 41 |
+
|
| 42 |
+
# XML namespaces used by SRU / OAI-PMH responses. NOTE(review): the parsers
# below match on tag *suffixes* rather than fully-qualified names, so these
# constants appear to be kept for reference only — confirm before removing.
_NS_SRU = "http://www.loc.gov/zing/srw/"
_NS_DC = "http://purl.org/dc/elements/1.1/"
_NS_OAI = "http://www.openarchives.org/OAI/2.0/"

# Gallica endpoints. The {ark} placeholder is expected in the NAAN-prefixed
# form ("12148/..."), which is why these templates do not hard-code "12148/".
_GALLICA_BASE = "https://gallica.bnf.fr"
_SRU_URL = f"{_GALLICA_BASE}/SRU"
_IIIF_MANIFEST_TPL = f"{_GALLICA_BASE}/ark:/{{ark}}/manifest.json"
_OCR_BRUT_TPL = f"{_GALLICA_BASE}/ark:/{{ark}}/f{{page}}.texteBrut"
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
# ---------------------------------------------------------------------------
|
| 54 |
+
# Structures de données
|
| 55 |
+
# ---------------------------------------------------------------------------
|
| 56 |
+
|
| 57 |
+
@dataclass
class GallicaRecord:
    """One Gallica search result (subset of the Dublin Core metadata).

    Populated by the SRU parser; only ``ark`` and ``title`` are guaranteed
    to be non-empty.
    """

    # ARK identifier, normally WITH the BnF NAAN prefix
    # (e.g. "12148/btv1b8453561w"), as produced by the SRU parser.
    ark: str
    title: str
    creator: str = ""
    date: str = ""
    description: str = ""
    type_doc: str = ""
    language: str = ""
    rights: str = ""
    # True when Gallica is expected to provide OCR for this document
    # (heuristic based on the Dublin Core document type).
    has_ocr: bool = False

    @staticmethod
    def _ark_path(ark: str) -> str:
        """Return *ark* with its NAAN prefix, accepting both bare and prefixed forms."""
        return ark if "/" in ark else f"12148/{ark}"

    @property
    def url(self) -> str:
        """Gallica viewer URL for this document."""
        # BUGFIX: the original hard-coded an extra "ark:/12148/" segment even
        # though `ark` already carries the "12148/" prefix, which produced
        # doubled "ark:/12148/12148/..." URLs for parser-built records.
        return f"{_GALLICA_BASE}/ark:/{self._ark_path(self.ark)}"

    @property
    def manifest_url(self) -> str:
        """IIIF Presentation manifest URL for this document."""
        return f"{_GALLICA_BASE}/ark:/{self._ark_path(self.ark)}/manifest.json"

    def as_dict(self) -> dict:
        """Serialize the record to a plain dict (e.g. for JSON output).

        Note: ``rights`` is intentionally omitted, matching the historical
        serialization format.
        """
        return {
            "ark": self.ark,
            "title": self.title,
            "creator": self.creator,
            "date": self.date,
            "description": self.description,
            "type_doc": self.type_doc,
            "language": self.language,
            "has_ocr": self.has_ocr,
            "url": self.url,
            "manifest_url": self.manifest_url,
        }
|
| 93 |
+
|
| 94 |
+
|
| 95 |
+
# ---------------------------------------------------------------------------
|
| 96 |
+
# Client Gallica
|
| 97 |
+
# ---------------------------------------------------------------------------
|
| 98 |
+
|
| 99 |
+
class GallicaClient:
    """Client for the Gallica APIs (SRU search, IIIF, raw-text OCR).

    Parameters
    ----------
    timeout:
        HTTP timeout in seconds.
    delay_between_requests:
        Delay in seconds inserted after every request, to comply with
        Gallica's usage conditions.

    Examples
    --------
    >>> client = GallicaClient()
    >>> results = client.search(author="Froissart", max_results=5)
    >>> for r in results:
    ...     print(r.title, r.date)
    >>> corpus = client.import_document(results[0].ark, pages="1-3")
    """

    def __init__(
        self,
        timeout: int = 30,
        delay_between_requests: float = 0.5,
    ) -> None:
        self.timeout = timeout
        self.delay = delay_between_requests

    @staticmethod
    def _full_ark(ark: str) -> str:
        """Return *ark* with its NAAN prefix (``'12148/...'``).

        Public methods document ARKs as carrying the ``12148/`` prefix, but
        some call sites used to re-prepend it, producing doubled
        ``ark:/12148/12148/...`` URLs. Normalizing here accepts both forms.
        """
        return ark if "/" in ark else f"12148/{ark}"

    def _fetch_url(self, url: str) -> bytes:
        """Download *url* and return the raw body.

        Raises
        ------
        RuntimeError
            On any HTTP or network error.
        """
        req = urllib.request.Request(
            url,
            headers={"User-Agent": "Picarones/1.0 (BnF; research tool)"},
        )
        try:
            with urllib.request.urlopen(req, timeout=self.timeout) as resp:
                return resp.read()
        except urllib.error.HTTPError as exc:
            raise RuntimeError(
                f"HTTP {exc.code} sur {url}: {exc.reason}"
            ) from exc
        except urllib.error.URLError as exc:
            raise RuntimeError(
                f"Impossible de joindre {url}: {exc.reason}"
            ) from exc
        finally:
            # Rate-limit even on failure: the request still hit Gallica.
            if self.delay > 0:
                time.sleep(self.delay)

    def _build_sru_query(
        self,
        ark: Optional[str] = None,
        title: Optional[str] = None,
        author: Optional[str] = None,
        date_from: Optional[int] = None,
        date_to: Optional[int] = None,
        doc_type: Optional[str] = None,
        language: Optional[str] = None,
    ) -> str:
        """Build a CQL query string for the BnF SRU API."""
        clauses: list[str] = []

        if ark:
            # Lookup by ARK identifier.
            clauses.append(f'dc.identifier any "{ark}"')
        if title:
            clauses.append(f'dc.title all "{title}"')
        if author:
            clauses.append(f'dc.creator all "{author}"')
        if date_from and date_to:
            clauses.append(f'dc.date >= "{date_from}" and dc.date <= "{date_to}"')
        elif date_from:
            clauses.append(f'dc.date >= "{date_from}"')
        elif date_to:
            clauses.append(f'dc.date <= "{date_to}"')
        if doc_type:
            clauses.append(f'dc.type all "{doc_type}"')
        if language:
            clauses.append(f'dc.language all "{language}"')

        if not clauses:
            # Catch-all query when no criterion was given.
            return 'gallica all "document"'
        return " and ".join(clauses)

    def search(
        self,
        ark: Optional[str] = None,
        title: Optional[str] = None,
        author: Optional[str] = None,
        date_from: Optional[int] = None,
        date_to: Optional[int] = None,
        doc_type: Optional[str] = None,
        language: Optional[str] = None,
        max_results: int = 20,
    ) -> list[GallicaRecord]:
        """Search Gallica through the BnF SRU API.

        Parameters
        ----------
        ark:
            ARK identifier (e.g. ``'12148/btv1b8453561w'``).
        title:
            Keywords in the title.
        author:
            Keywords in the author/creator.
        date_from:
            Lower date bound (year).
        date_to:
            Upper date bound (year).
        doc_type:
            Document type (``'monographie'``, ``'périodique'``, ``'manuscrit'``…).
        language:
            ISO 639 language code (``'fre'``, ``'lat'``, ``'ger'``…).
        max_results:
            Maximum number of results to return.

        Returns
        -------
        list[GallicaRecord]
            Documents found (empty list on network/parse errors).
        """
        query = self._build_sru_query(
            ark=ark,
            title=title,
            author=author,
            date_from=date_from,
            date_to=date_to,
            doc_type=doc_type,
            language=language,
        )

        params = urllib.parse.urlencode({
            "operation": "searchRetrieve",
            "version": "1.2",
            "query": query,
            # SRU caps page size; clamp to stay within one request.
            "maximumRecords": min(max_results, 50),
            "startRecord": 1,
            "recordSchema": "unimarcXchange",
        })
        url = f"{_SRU_URL}?{params}"

        try:
            raw = self._fetch_url(url)
        except RuntimeError as exc:
            logger.error("Erreur recherche SRU Gallica: %s", exc)
            return []

        return self._parse_sru_response(raw, max_results)

    def _parse_sru_response(self, xml_bytes: bytes, max_results: int) -> list[GallicaRecord]:
        """Parse Gallica's SRU XML response into records."""
        records: list[GallicaRecord] = []
        try:
            root = ET.fromstring(xml_bytes)
        except ET.ParseError as exc:
            logger.error("Impossible de parser la réponse SRU: %s", exc)
            return records

        # Records live under srw:records/srw:record/srw:recordData; match on
        # the tag suffix to stay namespace-agnostic.
        for rec_elem in root.iter():
            if rec_elem.tag.endswith("}record") or rec_elem.tag == "record":
                record = self._parse_record_element(rec_elem)
                if record:
                    records.append(record)
                if len(records) >= max_results:
                    break

        return records

    def _parse_record_element(self, elem: ET.Element) -> Optional[GallicaRecord]:
        """Extract Dublin Core metadata from one SRU record element."""
        def find_text(tag_suffix: str) -> str:
            for child in elem.iter():
                if child.tag.endswith(tag_suffix) and child.text:
                    return child.text.strip()
            return ""

        def find_all_text(tag_suffix: str) -> list[str]:
            return [
                child.text.strip()
                for child in elem.iter()
                if child.tag.endswith(tag_suffix) and child.text
            ]

        # The ARK is embedded in a dc:identifier URL.
        identifiers = find_all_text("identifier")
        ark = ""
        for ident in identifiers:
            # Typical form: "https://gallica.bnf.fr/ark:/12148/btv1b8453561w"
            m = re.search(r"ark:/(\d+/\w+)", ident)
            if m:
                ark = m.group(1)
                break

        if not ark:
            return None

        title = find_text("title") or "Sans titre"
        creator = find_text("creator")
        date = find_text("date")

        # Heuristic: text-bearing document types usually have Gallica OCR.
        doc_types = find_all_text("type")
        has_ocr = any(
            t.lower() in ("monographie", "fascicule", "texte", "text")
            for t in doc_types
        )

        return GallicaRecord(
            ark=ark,
            title=title,
            creator=creator,
            date=date,
            description=find_text("description"),
            type_doc=", ".join(doc_types),
            language=find_text("language"),
            has_ocr=has_ocr,
        )

    def get_ocr_text(self, ark: str, page: int) -> str:
        """Fetch Gallica's raw-text OCR for one page.

        Parameters
        ----------
        ark:
            ARK identifier (e.g. ``'12148/btv1b8453561w'``).
        page:
            1-based page number.

        Returns
        -------
        str
            Gallica OCR text for the page (empty string when unavailable).
        """
        url = _OCR_BRUT_TPL.format(ark=self._full_ark(ark), page=page)
        try:
            raw = self._fetch_url(url)
            text = raw.decode("utf-8", errors="replace").strip()
            # Gallica sometimes returns an HTML page for pages without OCR.
            if text.startswith("<!") or "<html" in text[:100].lower():
                return ""
            return text
        except RuntimeError as exc:
            logger.debug("OCR non disponible pour %s f%d: %s", ark, page, exc)
            return ""

    def import_document(
        self,
        ark: str,
        pages: str = "all",
        output_dir: Optional[str] = None,
        include_gallica_ocr: bool = True,
        max_resolution: int = 0,
        show_progress: bool = True,
    ) -> Corpus:
        """Import a Gallica document as a Picarones corpus.

        Uses the Gallica IIIF manifest to enumerate pages and download
        images. Gallica's own OCR is optionally fetched as ground truth /
        reference transcription.

        Parameters
        ----------
        ark:
            ARK identifier (e.g. ``'12148/btv1b8453561w'``).
        pages:
            Page selector (``'all'``, ``'1-10'``, ``'1,3,5'``…).
        output_dir:
            Local folder for images and GT files.
        include_gallica_ocr:
            If True, fetch Gallica's OCR as reference text.
        max_resolution:
            Maximum image width to download (0 = best available).
        show_progress:
            Show a progress bar.

        Returns
        -------
        Corpus
            Corpus with images and Gallica OCR as GT (when available).
        """
        from picarones.importers.iiif import IIIFImporter

        # BUGFIX: normalize the ARK once, then use the shared URL templates.
        # The original prepended "12148/" to an ARK that already carried it,
        # yielding "ark:/12148/12148/..." URLs for parser-built records.
        ark = self._full_ark(ark)
        manifest_url = _IIIF_MANIFEST_TPL.format(ark=ark)
        logger.info("Import Gallica ARK %s via IIIF : %s", ark, manifest_url)

        # Delegate image handling to the existing IIIF importer.
        importer = IIIFImporter(manifest_url, max_resolution=max_resolution)
        importer.load()

        corpus = importer.import_corpus(
            pages=pages,
            output_dir=output_dir or f"./corpus_gallica_{ark.split('/')[-1]}/",
            show_progress=show_progress,
        )

        # Enrich with Gallica's OCR if requested.
        if include_gallica_ocr:
            selected_indices = importer.list_canvases(pages)
            for i, doc in enumerate(corpus.documents):
                # Map corpus position back to the 1-based Gallica page number.
                page_num = selected_indices[i] + 1 if i < len(selected_indices) else i + 1
                gallica_ocr = self.get_ocr_text(ark, page_num)
                if gallica_ocr:
                    doc.metadata["gallica_ocr"] = gallica_ocr
                    # Without a manual GT, fall back to Gallica OCR as reference.
                    if not doc.ground_truth.strip():
                        doc.ground_truth = gallica_ocr
                        doc.metadata["gt_source"] = "gallica_ocr"

        # Attach Gallica provenance metadata.
        corpus.metadata.update({
            "source": "gallica",
            "ark": ark,
            "manifest_url": manifest_url,
            "gallica_url": f"{_GALLICA_BASE}/ark:/{ark}",
            "include_gallica_ocr": include_gallica_ocr,
        })

        return corpus

    def get_metadata(self, ark: str) -> dict:
        """Fetch the OAI-PMH metadata of a Gallica document.

        Parameters
        ----------
        ark:
            ARK identifier.

        Returns
        -------
        dict
            Dublin Core metadata of the document (``{"ark": ark}`` only, on
            error).
        """
        # BUGFIX: same ARK normalization as import_document (no doubled prefix).
        ark = self._full_ark(ark)
        url = f"{_GALLICA_BASE}/services/OAIRecord?ark=ark:/{ark}"
        try:
            raw = self._fetch_url(url)
            root = ET.fromstring(raw)
        except (RuntimeError, ET.ParseError) as exc:
            logger.error("Erreur métadonnées OAI %s: %s", ark, exc)
            return {"ark": ark}

        def find_text(tag_suffix: str) -> str:
            for elem in root.iter():
                if elem.tag.endswith(tag_suffix) and elem.text:
                    return elem.text.strip()
            return ""

        return {
            "ark": ark,
            "title": find_text("title"),
            "creator": find_text("creator"),
            "date": find_text("date"),
            "description": find_text("description"),
            "subject": find_text("subject"),
            "language": find_text("language"),
            "type": find_text("type"),
            "format": find_text("format"),
            "source": find_text("source"),
            "url": f"{_GALLICA_BASE}/ark:/{ark}",
        }
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
# ---------------------------------------------------------------------------
|
| 463 |
+
# Fonctions de commodité
|
| 464 |
+
# ---------------------------------------------------------------------------
|
| 465 |
+
|
| 466 |
+
def search_gallica(
    title: Optional[str] = None,
    author: Optional[str] = None,
    ark: Optional[str] = None,
    date_from: Optional[int] = None,
    date_to: Optional[int] = None,
    max_results: int = 20,
) -> list[GallicaRecord]:
    """Quick search in Gallica.

    Builds a throw-away :class:`GallicaClient` and delegates to its
    ``search`` method.

    Parameters
    ----------
    title, author, ark, date_from, date_to:
        Search criteria.
    max_results:
        Maximum number of results.

    Returns
    -------
    list[GallicaRecord]

    Examples
    --------
    >>> results = search_gallica(title="Froissart", date_from=1380, date_to=1430)
    >>> for r in results[:3]:
    ...     print(r.title, r.ark)
    """
    return GallicaClient().search(
        ark=ark,
        title=title,
        author=author,
        date_from=date_from,
        date_to=date_to,
        max_results=max_results,
    )
|
| 504 |
+
|
| 505 |
+
|
| 506 |
+
def import_gallica_document(
    ark: str,
    pages: str = "all",
    output_dir: Optional[str] = None,
    include_gallica_ocr: bool = True,
) -> Corpus:
    """One-call import of a Gallica document.

    Parameters
    ----------
    ark:
        ARK identifier (``'12148/btv1b8453561w'`` or full URL).
    pages:
        Page selector (``'all'``, ``'1-10'``…).
    output_dir:
        Output directory.
    include_gallica_ocr:
        Include the Gallica OCR as GT.

    Returns
    -------
    Corpus
    """
    # Accept a full Gallica URL: keep only the "naan/name" part of the ARK.
    if (match := re.search(r"ark:/(\d+/\w+)", ark)) is not None:
        ark = match.group(1)

    return GallicaClient().import_document(
        ark=ark,
        pages=pages,
        output_dir=output_dir,
        include_gallica_ocr=include_gallica_ocr,
    )
|
|
@@ -0,0 +1,678 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests Sprint 8 — Intégration eScriptorium et import Gallica.
|
| 2 |
+
|
| 3 |
+
Classes de tests
|
| 4 |
+
----------------
|
| 5 |
+
TestEScriptoriumClient (12 tests) — client API eScriptorium (mocks HTTP)
|
| 6 |
+
TestEScriptoriumConnect (4 tests) — fonction connect_escriptorium
|
| 7 |
+
TestEScriptoriumExport (8 tests) — export benchmark → couche OCR eScriptorium
|
| 8 |
+
TestGallicaRecord (6 tests) — structure GallicaRecord
|
| 9 |
+
TestGallicaClient (12 tests) — client Gallica (mocks HTTP)
|
| 10 |
+
TestGallicaSearchQuery (8 tests) — construction de requêtes SRU
|
| 11 |
+
TestGallicaOCR (6 tests) — récupération OCR Gallica
|
| 12 |
+
TestImportersInit (4 tests) — __init__.py importers
|
| 13 |
+
TestCLIHistory (6 tests) — commande picarones history
|
| 14 |
+
TestCLIRobustness (6 tests) — commande picarones robustness
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
from __future__ import annotations
|
| 18 |
+
|
| 19 |
+
import json
|
| 20 |
+
import unittest
|
| 21 |
+
from unittest.mock import MagicMock, patch
|
| 22 |
+
import pytest
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
# ===========================================================================
|
| 26 |
+
# TestEScriptoriumClient
|
| 27 |
+
# ===========================================================================
|
| 28 |
+
|
| 29 |
+
class TestEScriptoriumClient:
    """Unit tests for the eScriptorium REST client (all HTTP mocked out)."""

    def test_import_module(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        assert EScriptoriumClient is not None

    def test_init_attributes(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://escriptorium.example.org", token="tok123", timeout=60)
        assert cli.base_url == "https://escriptorium.example.org"
        assert cli.token == "tok123"
        assert cli.timeout == 60

    def test_base_url_trailing_slash_stripped(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://escriptorium.example.org/", token="tok")
        assert not cli.base_url.endswith("/")

    def test_headers_contain_token(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        hdrs = EScriptoriumClient("https://example.org", token="mytoken")._headers()
        assert "Token mytoken" in hdrs.get("Authorization", "")

    def test_headers_contain_accept_json(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        hdrs = EScriptoriumClient("https://example.org", token="tok")._headers()
        assert "application/json" in hdrs.get("Accept", "")

    def test_test_connection_success(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://example.org", token="tok")
        with patch.object(cli, "_get", return_value={"results": [], "count": 0}):
            assert cli.test_connection() is True

    def test_test_connection_failure(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://example.org", token="bad")
        with patch.object(cli, "_get", side_effect=RuntimeError("403")):
            assert cli.test_connection() is False

    def test_list_projects_empty(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://example.org", token="tok")
        with patch.object(cli, "_paginate", return_value=[]):
            assert cli.list_projects() == []

    def test_list_projects_parses_items(self):
        from picarones.importers.escriptorium import EScriptoriumClient, EScriptoriumProject
        cli = EScriptoriumClient("https://example.org", token="tok")
        payload = [{
            "pk": 1,
            "name": "Projet BnF",
            "slug": "projet-bnf",
            "owner": {"username": "user1"},
            "documents_count": 5,
        }]
        with patch.object(cli, "_paginate", return_value=payload):
            found = cli.list_projects()
        assert len(found) == 1
        first = found[0]
        assert isinstance(first, EScriptoriumProject)
        assert first.pk == 1
        assert first.name == "Projet BnF"
        assert first.document_count == 5

    def test_list_documents_with_project_filter(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://example.org", token="tok")
        with patch.object(cli, "_paginate", return_value=[]) as fake:
            cli.list_documents(project_pk=42)
            recorded = fake.call_args
        # Second positional argument of _paginate carries the query params.
        assert recorded[0][1]["project"] == 42

    def test_list_parts_returns_list(self):
        from picarones.importers.escriptorium import EScriptoriumClient, EScriptoriumPart
        cli = EScriptoriumClient("https://example.org", token="tok")
        payload = [
            {"pk": 10, "title": "f. 1r", "image": "https://example.org/img/1.jpg", "order": 0},
            {"pk": 11, "title": "f. 1v", "image": "https://example.org/img/2.jpg", "order": 1},
        ]
        with patch.object(cli, "_paginate", return_value=payload):
            found = cli.list_parts(doc_pk=5)
        assert len(found) == 2
        assert isinstance(found[0], EScriptoriumPart)
        assert found[0].pk == 10

    def test_escriptorium_project_as_dict(self):
        from picarones.importers.escriptorium import EScriptoriumProject
        proj = EScriptoriumProject(pk=1, name="Test", slug="test", owner="user", document_count=3)
        as_dict = proj.as_dict()
        assert as_dict["pk"] == 1
        assert as_dict["name"] == "Test"
        assert as_dict["document_count"] == 3
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
# ===========================================================================
|
| 124 |
+
# TestEScriptoriumConnect
|
| 125 |
+
# ===========================================================================
|
| 126 |
+
|
| 127 |
+
class TestEScriptoriumConnect:
    """Tests for the connect_escriptorium() convenience function."""

    def test_connect_success(self):
        from picarones.importers.escriptorium import connect_escriptorium, EScriptoriumClient
        with patch.object(EScriptoriumClient, "test_connection", return_value=True):
            cli = connect_escriptorium("https://example.org", token="tok")
            assert isinstance(cli, EScriptoriumClient)

    def test_connect_failure_raises(self):
        from picarones.importers.escriptorium import connect_escriptorium, EScriptoriumClient
        with patch.object(EScriptoriumClient, "test_connection", return_value=False):
            # A failed connection probe must surface as a RuntimeError.
            with pytest.raises(RuntimeError, match="Impossible de se connecter"):
                connect_escriptorium("https://example.org", token="bad")

    def test_connect_returns_client_with_correct_url(self):
        from picarones.importers.escriptorium import connect_escriptorium, EScriptoriumClient
        with patch.object(EScriptoriumClient, "test_connection", return_value=True):
            cli = connect_escriptorium("https://myinstance.org", token="tok")
            assert "myinstance.org" in cli.base_url

    def test_connect_timeout_passed(self):
        from picarones.importers.escriptorium import connect_escriptorium, EScriptoriumClient
        with patch.object(EScriptoriumClient, "test_connection", return_value=True):
            cli = connect_escriptorium("https://example.org", token="tok", timeout=120)
            assert cli.timeout == 120
|
| 152 |
+
|
| 153 |
+
|
| 154 |
+
# ===========================================================================
|
| 155 |
+
# TestEScriptoriumExport
|
| 156 |
+
# ===========================================================================
|
| 157 |
+
|
| 158 |
+
class TestEScriptoriumExport:
    """Tests for exporting a benchmark as an eScriptorium OCR layer."""

    def _make_benchmark(self, engine_name: str = "tesseract") -> "BenchmarkResult":
        # Minimal single-document benchmark fixture used by most tests below.
        from picarones.core.results import BenchmarkResult, EngineReport, DocumentResult
        from picarones.core.metrics import MetricsResult
        metrics = MetricsResult(
            cer=0.05, wer=0.10, cer_nfc=0.05,
            cer_caseless=0.04, cer_diplomatic=0.04,
            wer_normalized=0.09, mer=0.09, wil=0.05,
            reference_length=100, hypothesis_length=100,
        )
        doc_result = DocumentResult(
            doc_id="part_00001",
            image_path="/img/1.jpg",
            ground_truth="texte gt",
            hypothesis="texte ocr",
            metrics=metrics,
            duration_seconds=1.0,
        )
        engine_report = EngineReport(
            engine_name=engine_name,
            engine_version="5.3",
            engine_config={},
            document_results=[doc_result],
        )
        return BenchmarkResult(
            corpus_name="Test",
            corpus_source="/test/",
            document_count=1,
            engine_reports=[engine_report],
        )

    def test_export_unknown_engine_raises(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://example.org", token="tok")
        benchmark = self._make_benchmark("tesseract")
        with pytest.raises(ValueError, match="unknown_engine"):
            cli.export_benchmark_as_layer(benchmark, doc_pk=1, engine_name="unknown_engine")

    def test_export_returns_count(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://example.org", token="tok")
        benchmark = self._make_benchmark("tesseract")
        with patch.object(cli, "_post", return_value={}):
            exported = cli.export_benchmark_as_layer(
                benchmark, doc_pk=1, engine_name="tesseract"
            )
        assert exported == 1

    def test_export_layer_name_default(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://example.org", token="tok")
        benchmark = self._make_benchmark("tesseract")
        sent = []

        def fake_post(path, payload):
            sent.append(payload)
            return {}

        with patch.object(cli, "_post", side_effect=fake_post):
            cli.export_benchmark_as_layer(benchmark, doc_pk=1, engine_name="tesseract")
        assert sent[0]["name"] == "picarones_tesseract"

    def test_export_custom_layer_name(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://example.org", token="tok")
        benchmark = self._make_benchmark("tesseract")
        sent = []

        def fake_post(path, payload):
            sent.append(payload)
            return {}

        with patch.object(cli, "_post", side_effect=fake_post):
            cli.export_benchmark_as_layer(
                benchmark, doc_pk=1, engine_name="tesseract", layer_name="my_layer"
            )
        assert sent[0]["name"] == "my_layer"

    def test_export_skips_error_docs(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        from picarones.core.results import BenchmarkResult, EngineReport, DocumentResult
        from picarones.core.metrics import MetricsResult
        metrics = MetricsResult(
            cer=0.1, wer=0.2, cer_nfc=0.1, cer_caseless=0.1,
            cer_diplomatic=0.1, wer_normalized=0.2, mer=0.2, wil=0.1,
            reference_length=50, hypothesis_length=50,
        )
        doc_results = [
            DocumentResult("part_00001", "/img/1.jpg", "gt", "hyp", metrics, 1.0),
            DocumentResult("part_00002", "/img/2.jpg", "gt", "", metrics, 0.5, engine_error="timeout"),
        ]
        benchmark = BenchmarkResult(
            "C", "/", 2, [EngineReport("tesseract", "5.3", {}, doc_results)]
        )
        cli = EScriptoriumClient("https://example.org", token="tok")
        with patch.object(cli, "_post", return_value={}):
            exported = cli.export_benchmark_as_layer(benchmark, doc_pk=1, engine_name="tesseract")
        assert exported == 1  # only the error-free document is exported

    def test_export_with_part_mapping(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://example.org", token="tok")
        benchmark = self._make_benchmark("tesseract")
        posted_paths = []

        def fake_post(path, payload):
            posted_paths.append(path)
            return {}

        with patch.object(cli, "_post", side_effect=fake_post):
            cli.export_benchmark_as_layer(
                benchmark, doc_pk=1, engine_name="tesseract",
                part_mapping={"part_00001": 999},
            )
        assert "999" in posted_paths[0]

    def test_export_post_error_is_logged_not_raised(self):
        from picarones.importers.escriptorium import EScriptoriumClient
        cli = EScriptoriumClient("https://example.org", token="tok")
        benchmark = self._make_benchmark("tesseract")
        with patch.object(cli, "_post", side_effect=RuntimeError("500")):
            exported = cli.export_benchmark_as_layer(benchmark, doc_pk=1, engine_name="tesseract")
        assert exported == 0

    def test_document_result_as_dict_used(self):
        from picarones.importers.escriptorium import EScriptoriumDocument
        doc = EScriptoriumDocument(
            pk=42, name="Doc", project="1", part_count=10,
            transcription_layers=["manual", "auto"],
        )
        as_dict = doc.as_dict()
        assert as_dict["pk"] == 42
        assert "manual" in as_dict["transcription_layers"]
|
| 270 |
+
|
| 271 |
+
|
| 272 |
+
# ===========================================================================
|
| 273 |
+
# TestGallicaRecord
|
| 274 |
+
# ===========================================================================
|
| 275 |
+
|
| 276 |
+
class TestGallicaRecord:
    """Tests for the GallicaRecord data structure."""

    def test_import_module(self):
        from picarones.importers.gallica import GallicaRecord
        assert GallicaRecord is not None

    def test_ark_property(self):
        from picarones.importers.gallica import GallicaRecord
        record = GallicaRecord(ark="12148/btv1b8453561w", title="Test")
        assert "12148/btv1b8453561w" in record.url

    def test_manifest_url(self):
        from picarones.importers.gallica import GallicaRecord
        record = GallicaRecord(ark="12148/btv1b8453561w", title="Test")
        manifest = record.manifest_url
        assert "manifest.json" in manifest
        assert "12148/btv1b8453561w" in manifest

    def test_as_dict_keys(self):
        from picarones.importers.gallica import GallicaRecord
        record = GallicaRecord(ark="12148/btv1b8453561w", title="Froissart", creator="Froissart")
        as_dict = record.as_dict()
        for key in ("ark", "title", "manifest_url", "url"):
            assert key in as_dict

    def test_has_ocr_default_false(self):
        from picarones.importers.gallica import GallicaRecord
        assert GallicaRecord(ark="12148/xxx", title="Test").has_ocr is False

    def test_has_ocr_true(self):
        from picarones.importers.gallica import GallicaRecord
        assert GallicaRecord(ark="12148/xxx", title="Test", has_ocr=True).has_ocr is True
|
| 311 |
+
|
| 312 |
+
|
| 313 |
+
# ===========================================================================
|
| 314 |
+
# TestGallicaClient
|
| 315 |
+
# ===========================================================================
|
| 316 |
+
|
| 317 |
+
class TestGallicaClient:
    """Tests for the Gallica client (all HTTP mocked out)."""

    def test_import_module(self):
        from picarones.importers.gallica import GallicaClient
        assert GallicaClient is not None

    def test_init_defaults(self):
        from picarones.importers.gallica import GallicaClient
        cli = GallicaClient()
        assert cli.timeout == 30
        assert cli.delay >= 0

    def test_search_returns_list(self):
        from picarones.importers.gallica import GallicaClient
        cli = GallicaClient(delay_between_requests=0)
        with patch.object(cli, "_fetch_url", side_effect=RuntimeError("network")):
            found = cli.search(title="Froissart", max_results=5)
        assert isinstance(found, list)

    def test_search_empty_on_network_error(self):
        from picarones.importers.gallica import GallicaClient
        cli = GallicaClient(delay_between_requests=0)
        with patch.object(cli, "_fetch_url", side_effect=RuntimeError("timeout")):
            assert cli.search(title="test") == []

    def test_get_ocr_text_returns_string(self):
        from picarones.importers.gallica import GallicaClient
        cli = GallicaClient(delay_between_requests=0)
        with patch.object(cli, "_fetch_url", return_value=b"Froissart transcription"):
            text = cli.get_ocr_text("12148/btv1b8453561w", page=1)
        assert isinstance(text, str)
        assert "Froissart" in text

    def test_get_ocr_text_empty_on_html_response(self):
        from picarones.importers.gallica import GallicaClient
        cli = GallicaClient(delay_between_requests=0)
        # An HTML error page must not be mistaken for OCR text.
        html = b"<!DOCTYPE html><html><body>Page non disponible</body></html>"
        with patch.object(cli, "_fetch_url", return_value=html):
            assert cli.get_ocr_text("12148/xxx", page=1) == ""

    def test_get_ocr_text_empty_on_error(self):
        from picarones.importers.gallica import GallicaClient
        cli = GallicaClient(delay_between_requests=0)
        with patch.object(cli, "_fetch_url", side_effect=RuntimeError("404")):
            assert cli.get_ocr_text("12148/xxx", page=99) == ""

    def test_get_metadata_returns_dict(self):
        from picarones.importers.gallica import GallicaClient
        cli = GallicaClient(delay_between_requests=0)
        xml_bytes = b"""<?xml version="1.0" encoding="UTF-8"?>
<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/"
           xmlns:dc="http://purl.org/dc/elements/1.1/">
  <dc:title>Chroniques de France</dc:title>
  <dc:creator>Jean Froissart</dc:creator>
  <dc:date>1380</dc:date>
</oai_dc:dc>"""
        with patch.object(cli, "_fetch_url", return_value=xml_bytes):
            meta = cli.get_metadata("12148/btv1b8453561w")
        assert "ark" in meta
        assert meta["title"] == "Chroniques de France"
        assert meta["creator"] == "Jean Froissart"

    def test_get_metadata_on_error_returns_ark_dict(self):
        from picarones.importers.gallica import GallicaClient
        cli = GallicaClient(delay_between_requests=0)
        with patch.object(cli, "_fetch_url", side_effect=RuntimeError("500")):
            assert cli.get_metadata("12148/xxx") == {"ark": "12148/xxx"}

    def test_parse_sru_empty_xml(self):
        from picarones.importers.gallica import GallicaClient
        cli = GallicaClient(delay_between_requests=0)
        xml = b"""<?xml version="1.0"?>
<searchRetrieveResponse xmlns="http://www.loc.gov/zing/srw/">
  <numberOfRecords>0</numberOfRecords>
  <records/>
</searchRetrieveResponse>"""
        assert cli._parse_sru_response(xml, max_results=10) == []

    def test_parse_sru_invalid_xml_returns_empty(self):
        from picarones.importers.gallica import GallicaClient
        cli = GallicaClient(delay_between_requests=0)
        assert cli._parse_sru_response(b"not xml at all !!!", max_results=10) == []

    def test_client_has_delay_attribute(self):
        from picarones.importers.gallica import GallicaClient
        assert GallicaClient(delay_between_requests=0.1).delay == 0.1
|
| 410 |
+
|
| 411 |
+
|
| 412 |
+
# ===========================================================================
|
| 413 |
+
# TestGallicaSearchQuery
|
| 414 |
+
# ===========================================================================
|
| 415 |
+
|
| 416 |
+
class TestGallicaSearchQuery:
    """Tests for BnF SRU query construction."""

    def test_build_query_title(self):
        from picarones.importers.gallica import GallicaClient
        query = GallicaClient()._build_sru_query(title="Froissart")
        assert "Froissart" in query
        assert "dc.title" in query

    def test_build_query_author(self):
        from picarones.importers.gallica import GallicaClient
        assert "dc.creator" in GallicaClient()._build_sru_query(author="Froissart")

    def test_build_query_date_range(self):
        from picarones.importers.gallica import GallicaClient
        query = GallicaClient()._build_sru_query(date_from=1380, date_to=1420)
        assert "1380" in query
        assert "1420" in query

    def test_build_query_date_from_only(self):
        from picarones.importers.gallica import GallicaClient
        query = GallicaClient()._build_sru_query(date_from=1400)
        assert "1400" in query
        assert ">=" in query

    def test_build_query_ark(self):
        from picarones.importers.gallica import GallicaClient
        query = GallicaClient()._build_sru_query(ark="12148/btv1b8453561w")
        assert "12148/btv1b8453561w" in query

    def test_build_query_empty_returns_default(self):
        from picarones.importers.gallica import GallicaClient
        # With no criteria, a non-empty default query is still produced.
        assert len(GallicaClient()._build_sru_query()) > 0

    def test_build_query_combined(self):
        from picarones.importers.gallica import GallicaClient
        query = GallicaClient()._build_sru_query(title="Froissart", author="Jean", date_from=1380)
        for fragment in ("Froissart", "Jean", "1380"):
            assert fragment in query

    def test_search_gallica_function(self):
        from picarones.importers.gallica import search_gallica, GallicaClient
        with patch.object(GallicaClient, "search", return_value=[]):
            found = search_gallica(title="test")
        assert isinstance(found, list)
|
| 470 |
+
|
| 471 |
+
|
| 472 |
+
# ===========================================================================
|
| 473 |
+
# TestGallicaOCR
|
| 474 |
+
# ===========================================================================
|
| 475 |
+
|
| 476 |
+
class TestGallicaOCR:
    """Tests around Gallica OCR retrieval URLs and ARK handling."""

    def test_ocr_url_format(self):
        from picarones.importers import gallica as g
        url = g._OCR_BRUT_TPL.format(ark="12148/btv1b8453561w", page=3)
        for fragment in ("12148/btv1b8453561w", "f3", "texteBrut"):
            assert fragment in url

    def test_import_gallica_document_function_exists(self):
        from picarones.importers.gallica import import_gallica_document
        assert callable(import_gallica_document)

    def test_gallica_base_url(self):
        from picarones.importers import gallica as g
        assert "gallica.bnf.fr" in g._GALLICA_BASE

    def test_ark_normalization_in_import(self):
        from picarones.importers.gallica import import_gallica_document, GallicaClient
        import re
        # The ARK must be normalisable from a full Gallica URL.
        full_url = "https://gallica.bnf.fr/ark:/12148/btv1b8453561w"
        match = re.search(r"ark:/(\d+/\w+)", full_url)
        assert match is not None
        assert match.group(1) == "12148/btv1b8453561w"

    def test_iiif_manifest_url_pattern(self):
        from picarones.importers import gallica as g
        url = g._IIIF_MANIFEST_TPL.format(ark="12148/btv1b8453561w")
        assert "manifest.json" in url
        assert "12148/btv1b8453561w" in url

    def test_gallica_record_url_structure(self):
        from picarones.importers.gallica import GallicaRecord
        record = GallicaRecord(ark="12148/btv1b8453561w", title="Test")
        assert record.url.startswith("https://gallica.bnf.fr")
        assert "12148/btv1b8453561w" in record.url
|
| 513 |
+
|
| 514 |
+
|
| 515 |
+
# ===========================================================================
|
| 516 |
+
# TestImportersInit
|
| 517 |
+
# ===========================================================================
|
| 518 |
+
|
| 519 |
+
class TestImportersInit:
    """Public names re-exported at the picarones.importers package level."""

    def test_escriptorium_client_exported(self):
        from picarones.importers import EScriptoriumClient
        assert EScriptoriumClient is not None

    def test_gallica_client_exported(self):
        from picarones.importers import GallicaClient
        assert GallicaClient is not None

    def test_search_gallica_exported(self):
        from picarones.importers import search_gallica
        assert callable(search_gallica)

    def test_connect_escriptorium_exported(self):
        from picarones.importers import connect_escriptorium
        assert callable(connect_escriptorium)
|
| 536 |
+
|
| 537 |
+
|
| 538 |
+
# ===========================================================================
|
| 539 |
+
# TestCLIHistory (tests Click runner)
|
| 540 |
+
# ===========================================================================
|
| 541 |
+
|
| 542 |
+
class TestCLIHistory:
    """End-to-end tests of the ``picarones history`` Click command."""

    def test_history_command_exists(self):
        from picarones.cli import cli
        assert "history" in [cmd.name for cmd in cli.commands.values()]

    def test_history_demo_mode(self):
        from click.testing import CliRunner
        from picarones.cli import cli
        runner = CliRunner()
        result = runner.invoke(cli, ["history", "--demo", "--db", ":memory:"])
        assert result.exit_code == 0
        assert "entrées" in result.output

    def test_history_empty_db(self):
        from click.testing import CliRunner
        from picarones.cli import cli
        import os
        import tempfile
        runner = CliRunner()
        # Pre-create an empty on-disk database file; the command must report no data.
        with tempfile.NamedTemporaryFile(suffix=".db", delete=False) as f:
            db_path = f.name
        try:
            result = runner.invoke(cli, ["history", "--db", db_path])
            assert result.exit_code == 0
            # "Aucun benchmark" starts with "Aucun", so a single check covers
            # both message variants (the former `or` clause was redundant).
            assert "Aucun" in result.output
        finally:
            os.unlink(db_path)

    def test_history_with_regression_flag(self):
        from click.testing import CliRunner
        from picarones.cli import cli
        runner = CliRunner()
        result = runner.invoke(cli, ["history", "--demo", "--db", ":memory:", "--regression"])
        assert result.exit_code == 0

    def test_history_engine_filter(self):
        from click.testing import CliRunner
        from picarones.cli import cli
        runner = CliRunner()
        result = runner.invoke(cli, [
            "history", "--demo", "--db", ":memory:", "--engine", "tesseract"
        ])
        assert result.exit_code == 0

    def test_history_export_json(self):
        from click.testing import CliRunner
        from picarones.cli import cli
        import os
        import tempfile
        runner = CliRunner()
        with tempfile.NamedTemporaryFile(suffix=".json", delete=False) as f:
            json_path = f.name
        try:
            result = runner.invoke(cli, [
                "history", "--demo", "--db", ":memory:", "--export-json", json_path
            ])
            assert result.exit_code == 0
            assert os.path.exists(json_path)
            # Fix: read through a context manager instead of a leaked open() handle.
            with open(json_path, encoding="utf-8") as fh:
                data = json.load(fh)
            assert "runs" in data
        finally:
            os.unlink(json_path)
|
| 603 |
+
|
| 604 |
+
|
| 605 |
+
# ===========================================================================
|
| 606 |
+
# TestCLIRobustness
|
| 607 |
+
# ===========================================================================
|
| 608 |
+
|
| 609 |
+
class TestCLIRobustness:
    """End-to-end tests of the ``picarones robustness`` Click command."""

    def test_robustness_command_exists(self):
        from picarones.cli import cli
        assert "robustness" in [cmd.name for cmd in cli.commands.values()]

    def test_robustness_demo_mode(self):
        from click.testing import CliRunner
        from picarones.cli import cli
        runner = CliRunner()
        with runner.isolated_filesystem():
            import os
            os.makedirs("corpus")
            result = runner.invoke(cli, [
                "robustness", "--corpus", "corpus", "--engine", "tesseract", "--demo"
            ])
            assert result.exit_code == 0

    def test_robustness_invalid_degradation(self):
        from click.testing import CliRunner
        from picarones.cli import cli
        runner = CliRunner()
        with runner.isolated_filesystem():
            import os
            os.makedirs("corpus")
            # An unknown degradation type must be rejected with a non-zero exit code.
            result = runner.invoke(cli, [
                "robustness", "--corpus", "corpus", "--engine", "tesseract",
                "--degradations", "invalid_type", "--demo"
            ])
            assert result.exit_code != 0

    def test_robustness_shows_results(self):
        from click.testing import CliRunner
        from picarones.cli import cli
        runner = CliRunner()
        with runner.isolated_filesystem():
            import os
            os.makedirs("corpus")
            result = runner.invoke(cli, [
                "robustness", "--corpus", "corpus", "--engine", "tesseract",
                "--demo", "--degradations", "noise"
            ])
            assert result.exit_code == 0
            assert "robustesse" in result.output.lower() or "noise" in result.output.lower()

    def test_robustness_json_export(self):
        from click.testing import CliRunner
        from picarones.cli import cli
        runner = CliRunner()
        with runner.isolated_filesystem():
            import os
            os.makedirs("corpus")
            result = runner.invoke(cli, [
                "robustness", "--corpus", "corpus", "--engine", "tesseract",
                "--demo", "--output-json", "robustness.json"
            ])
            assert result.exit_code == 0
            assert os.path.exists("robustness.json")
            # Fix: read through a context manager instead of a leaked open() handle.
            with open("robustness.json", encoding="utf-8") as fh:
                data = json.load(fh)
            assert "curves" in data

    def test_robustness_single_degradation_type(self):
        from click.testing import CliRunner
        from picarones.cli import cli
        runner = CliRunner()
        with runner.isolated_filesystem():
            import os
            os.makedirs("corpus")
            result = runner.invoke(cli, [
                "robustness", "--corpus", "corpus", "--engine", "tesseract",
                "--demo", "--degradations", "blur"
            ])
            assert result.exit_code == 0
|
|
@@ -0,0 +1,734 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests Sprint 8 — Suivi longitudinal et analyse de robustesse.
|
| 2 |
+
|
| 3 |
+
Classes de tests
|
| 4 |
+
----------------
|
| 5 |
+
TestBenchmarkHistory (15 tests) — base SQLite historique
|
| 6 |
+
TestHistoryEntry (6 tests) — structure HistoryEntry
|
| 7 |
+
TestRegressionResult (8 tests) — détection de régression
|
| 8 |
+
TestGenerateDemoHistory (5 tests) — données fictives longitudinales
|
| 9 |
+
TestDegradationLevels (6 tests) — paramètres de dégradation
|
| 10 |
+
TestDegradationFunctions (10 tests) — fonctions de dégradation image
|
| 11 |
+
TestDegradationCurve (6 tests) — structure DegradationCurve
|
| 12 |
+
TestRobustnessReport (8 tests) — rapport de robustesse
|
| 13 |
+
TestRobustnessAnalyzer (8 tests) — analyseur statique
|
| 14 |
+
TestGenerateDemoRobustness (10 tests) — données fictives robustesse
|
| 15 |
+
TestCLIDemo (5 tests) — picarones demo --with-history --with-robustness
|
| 16 |
+
"""
|
| 17 |
+
|
| 18 |
+
from __future__ import annotations
|
| 19 |
+
|
| 20 |
+
import json
|
| 21 |
+
import pytest
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
# ===========================================================================
|
| 25 |
+
# TestBenchmarkHistory
|
| 26 |
+
# ===========================================================================
|
| 27 |
+
|
| 28 |
+
class TestBenchmarkHistory:
    """SQLite-backed longitudinal history of benchmark runs."""

    @pytest.fixture
    def db(self):
        from picarones.core.history import BenchmarkHistory
        return BenchmarkHistory(":memory:")

    def test_import_module(self):
        from picarones.core.history import BenchmarkHistory
        assert BenchmarkHistory is not None

    def test_init_in_memory(self, db):
        assert db.db_path == ":memory:"
        assert db.count() == 0

    def test_record_single(self, db):
        db.record_single(
            run_id="run001",
            corpus_name="BnF Test",
            engine_name="tesseract",
            cer_mean=0.12,
            wer_mean=0.20,
            doc_count=10,
        )
        assert db.count() == 1

    def test_record_single_idempotent(self, db):
        # Re-recording the same run_id must REPLACE the row, not duplicate it.
        db.record_single("run001", "C", "tesseract", 0.12, 0.20, 10)
        db.record_single("run001", "C", "tesseract", 0.10, 0.18, 10)
        assert db.count() == 1

    def test_query_returns_entries(self, db):
        db.record_single("r1", "C", "tesseract", 0.10, 0.18, 5)
        db.record_single("r2", "C", "pero_ocr", 0.07, 0.12, 5)
        assert len(db.query()) == 2

    def test_query_filter_engine(self, db):
        db.record_single("r1", "C", "tesseract", 0.10, 0.18, 5)
        db.record_single("r2", "C", "pero_ocr", 0.07, 0.12, 5)
        rows = db.query(engine="tesseract")
        assert len(rows) == 1
        assert rows[0].engine_name == "tesseract"

    def test_query_filter_corpus(self, db):
        db.record_single("r1", "CorpusA", "tesseract", 0.10, 0.18, 5)
        db.record_single("r2", "CorpusB", "tesseract", 0.07, 0.12, 5)
        rows = db.query(corpus="CorpusA")
        assert len(rows) == 1
        assert rows[0].corpus_name == "CorpusA"

    def test_query_filter_since(self, db):
        db.record_single("r1", "C", "tesseract", 0.12, 0.20, 5, timestamp="2024-01-01T00:00:00+00:00")
        db.record_single("r2", "C", "tesseract", 0.10, 0.18, 5, timestamp="2025-06-01T00:00:00+00:00")
        # Only the run recorded after the cut-off date should come back.
        rows = db.query(since="2025-01-01")
        assert len(rows) == 1
        assert "2025" in rows[0].timestamp

    def test_list_engines(self, db):
        db.record_single("r1", "C", "tesseract", 0.10, 0.18, 5)
        db.record_single("r2", "C", "pero_ocr", 0.07, 0.12, 5)
        known = db.list_engines()
        for engine in ("tesseract", "pero_ocr"):
            assert engine in known

    def test_list_corpora(self, db):
        db.record_single("r1", "CorpusA", "tesseract", 0.10, 0.18, 5)
        db.record_single("r2", "CorpusB", "pero_ocr", 0.07, 0.12, 5)
        known = db.list_corpora()
        for corpus in ("CorpusA", "CorpusB"):
            assert corpus in known

    def test_get_cer_curve(self, db):
        db.record_single("r1", "C", "tesseract", 0.15, 0.25, 5, timestamp="2024-01-01T00:00:00+00:00")
        db.record_single("r2", "C", "tesseract", 0.12, 0.20, 5, timestamp="2024-06-01T00:00:00+00:00")
        db.record_single("r3", "C", "tesseract", 0.10, 0.18, 5, timestamp="2025-01-01T00:00:00+00:00")
        curve = db.get_cer_curve("tesseract")
        assert len(curve) == 3
        # Every point carries both the CER value and its timestamp.
        assert all("cer" in point and "timestamp" in point for point in curve)

    def test_get_cer_curve_filters_engine(self, db):
        db.record_single("r1", "C", "tesseract", 0.10, 0.18, 5)
        db.record_single("r2", "C", "pero_ocr", 0.07, 0.12, 5)
        curve = db.get_cer_curve("tesseract")
        assert all(point["cer"] is not None for point in curve)

    def test_export_json(self, db, tmp_path):
        db.record_single("r1", "C", "tesseract", 0.10, 0.18, 5)
        exported = db.export_json(str(tmp_path / "history.json"))
        assert exported.exists()
        payload = json.loads(exported.read_text())
        assert payload["picarones_history"] is True
        assert "runs" in payload
        assert len(payload["runs"]) == 1

    def test_record_benchmark_result(self, db):
        from picarones.fixtures import generate_sample_benchmark
        bm = generate_sample_benchmark(n_docs=3, seed=0)
        run_id = db.record(bm)
        assert isinstance(run_id, str)
        # One history entry per engine present in the benchmark.
        assert db.count() == len(bm.engine_reports)

    def test_repr(self, db):
        text = repr(db)
        assert "BenchmarkHistory" in text
        assert ":memory:" in text
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
# ===========================================================================
|
| 139 |
+
# TestHistoryEntry
|
| 140 |
+
# ===========================================================================
|
| 141 |
+
|
| 142 |
+
class TestHistoryEntry:
    """Shape and helpers of the HistoryEntry record."""

    def test_import(self):
        from picarones.core.history import HistoryEntry
        assert HistoryEntry is not None

    def test_cer_percent(self):
        from picarones.core.history import HistoryEntry
        row = HistoryEntry(
            run_id="r1", timestamp="2025-01-01T00:00:00+00:00",
            corpus_name="C", engine_name="tesseract",
            cer_mean=0.12, wer_mean=0.20, doc_count=10,
        )
        # A 0.12 CER ratio is exposed as 12 percent.
        assert abs(row.cer_percent - 12.0) < 0.01

    def test_cer_percent_none(self):
        from picarones.core.history import HistoryEntry
        # With no CER recorded, the percentage is None as well.
        row = HistoryEntry("r", "2025", "C", "e", None, None, 0)
        assert row.cer_percent is None

    def test_as_dict_keys(self):
        from picarones.core.history import HistoryEntry
        serialized = HistoryEntry("r1", "2025-01-01", "C", "tesseract", 0.10, 0.18, 5).as_dict()
        for key in ("run_id", "cer_mean", "engine_name"):
            assert key in serialized

    def test_as_dict_metadata(self):
        from picarones.core.history import HistoryEntry
        row = HistoryEntry("r1", "2025-01-01", "C", "tesseract", 0.10, 0.18, 5,
                           metadata={"key": "value"})
        assert row.as_dict()["metadata"] == {"key": "value"}

    def test_query_result_is_history_entry(self):
        from picarones.core.history import BenchmarkHistory, HistoryEntry
        store = BenchmarkHistory(":memory:")
        store.record_single("r1", "C", "tesseract", 0.10, 0.18, 5)
        assert isinstance(store.query()[0], HistoryEntry)
|
| 183 |
+
|
| 184 |
+
|
| 185 |
+
# ===========================================================================
|
| 186 |
+
# TestRegressionResult
|
| 187 |
+
# ===========================================================================
|
| 188 |
+
|
| 189 |
+
class TestRegressionResult:
    """CER regression detection between consecutive runs."""

    @pytest.fixture
    def db_with_runs(self):
        """Two tesseract runs where the CER rises from 0.12 to 0.15."""
        from picarones.core.history import BenchmarkHistory
        db = BenchmarkHistory(":memory:")
        db.record_single("r1", "C", "tesseract", 0.12, 0.20, 10, timestamp="2025-01-01T00:00:00+00:00")
        db.record_single("r2", "C", "tesseract", 0.15, 0.25, 10, timestamp="2025-06-01T00:00:00+00:00")
        return db

    def test_detect_regression_is_detected(self, db_with_runs):
        verdict = db_with_runs.detect_regression("tesseract", threshold=0.01)
        assert verdict is not None
        assert verdict.is_regression is True

    def test_detect_regression_delta_positive(self, db_with_runs):
        assert db_with_runs.detect_regression("tesseract").delta_cer > 0

    def test_detect_regression_fields(self, db_with_runs):
        verdict = db_with_runs.detect_regression("tesseract")
        assert verdict.engine_name == "tesseract"
        assert verdict.baseline_cer is not None
        assert verdict.current_cer is not None

    def test_detect_no_regression(self):
        from picarones.core.history import BenchmarkHistory
        db = BenchmarkHistory(":memory:")
        # A falling CER is an improvement, never a regression.
        db.record_single("r1", "C", "tesseract", 0.15, 0.25, 5, timestamp="2025-01-01T00:00:00+00:00")
        db.record_single("r2", "C", "tesseract", 0.10, 0.18, 5, timestamp="2025-06-01T00:00:00+00:00")
        verdict = db.detect_regression("tesseract", threshold=0.01)
        assert verdict is not None
        assert verdict.is_regression is False

    def test_detect_regression_none_if_single_run(self):
        from picarones.core.history import BenchmarkHistory
        db = BenchmarkHistory(":memory:")
        # One run gives nothing to compare against.
        db.record_single("r1", "C", "tesseract", 0.12, 0.20, 5)
        assert db.detect_regression("tesseract") is None

    def test_detect_all_regressions(self):
        from picarones.core.history import BenchmarkHistory
        db = BenchmarkHistory(":memory:")
        db.record_single("r1", "C", "tesseract", 0.10, 0.18, 5, timestamp="2025-01-01T00:00:00+00:00")
        db.record_single("r2", "C", "tesseract", 0.20, 0.35, 5, timestamp="2025-06-01T00:00:00+00:00")
        assert len(db.detect_all_regressions(threshold=0.01)) >= 1

    def test_regression_result_as_dict(self, db_with_runs):
        payload = db_with_runs.detect_regression("tesseract").as_dict()
        for key in ("is_regression", "delta_cer", "engine_name"):
            assert key in payload

    def test_regression_threshold_respected(self):
        from picarones.core.history import BenchmarkHistory
        db = BenchmarkHistory(":memory:")
        db.record_single("r1", "C", "tesseract", 0.100, 0.18, 5, timestamp="2025-01-01T00:00:00+00:00")
        db.record_single("r2", "C", "tesseract", 0.105, 0.19, 5, timestamp="2025-06-01T00:00:00+00:00")
        # Delta is 0.005 (half a CER point): below a 0.01 threshold → no regression.
        loose = db.detect_regression("tesseract", threshold=0.01)
        assert loose is not None
        assert loose.is_regression is False
        # With a 0.001 threshold the same delta counts as a regression.
        strict = db.detect_regression("tesseract", threshold=0.001)
        assert strict.is_regression is True
|
| 258 |
+
|
| 259 |
+
|
| 260 |
+
# ===========================================================================
|
| 261 |
+
# TestGenerateDemoHistory
|
| 262 |
+
# ===========================================================================
|
| 263 |
+
|
| 264 |
+
class TestGenerateDemoHistory:
    """Synthetic longitudinal data backing the --demo CLI flag."""

    def test_generate_fills_db(self):
        from picarones.core.history import BenchmarkHistory, generate_demo_history
        store = BenchmarkHistory(":memory:")
        generate_demo_history(store, n_runs=5)
        assert store.count() > 0

    def test_generate_creates_multiple_engines(self):
        from picarones.core.history import BenchmarkHistory, generate_demo_history
        store = BenchmarkHistory(":memory:")
        generate_demo_history(store, n_runs=4)
        assert len(store.list_engines()) >= 2

    def test_generate_n_runs(self):
        from picarones.core.history import BenchmarkHistory, generate_demo_history
        store = BenchmarkHistory(":memory:")
        generate_demo_history(store, n_runs=8)
        # 8 runs × 3 engines = 24 entries
        assert store.count() == 8 * 3

    def test_cer_values_in_range(self):
        from picarones.core.history import BenchmarkHistory, generate_demo_history
        store = BenchmarkHistory(":memory:")
        generate_demo_history(store, n_runs=5)
        # Every recorded CER must be a valid ratio.
        assert all(
            0.0 <= row.cer_mean <= 1.0
            for row in store.query()
            if row.cer_mean is not None
        )

    def test_regression_detectable_in_demo(self):
        """The demo data includes a simulated regression at run 5 (tesseract)."""
        from picarones.core.history import BenchmarkHistory, generate_demo_history
        store = BenchmarkHistory(":memory:")
        generate_demo_history(store, n_runs=8, seed=42)
        # The history was created…
        assert store.count() > 0
        # …and a CER curve exists for tesseract.
        assert len(store.get_cer_curve("tesseract")) > 0
|
| 305 |
+
|
| 306 |
+
|
| 307 |
+
# ===========================================================================
|
| 308 |
+
# TestDegradationLevels
|
| 309 |
+
# ===========================================================================
|
| 310 |
+
|
| 311 |
+
class TestDegradationLevels:
    """Module-level degradation constants of picarones.core.robustness."""

    def test_import_constants(self):
        from picarones.core.robustness import DEGRADATION_LEVELS, ALL_DEGRADATION_TYPES
        assert len(DEGRADATION_LEVELS) > 0
        assert len(ALL_DEGRADATION_TYPES) > 0

    def test_all_types_in_levels(self):
        from picarones.core.robustness import DEGRADATION_LEVELS, ALL_DEGRADATION_TYPES
        # Every declared type must have a level table.
        assert all(kind in DEGRADATION_LEVELS for kind in ALL_DEGRADATION_TYPES)

    def test_noise_levels(self):
        from picarones.core.robustness import DEGRADATION_LEVELS
        noise_levels = DEGRADATION_LEVELS["noise"]
        assert len(noise_levels) >= 2
        assert 0 in noise_levels  # level 0 = original image

    def test_blur_levels(self):
        from picarones.core.robustness import DEGRADATION_LEVELS
        assert 0 in DEGRADATION_LEVELS["blur"]

    def test_resolution_levels_include_1(self):
        from picarones.core.robustness import DEGRADATION_LEVELS
        assert 1.0 in DEGRADATION_LEVELS["resolution"]  # 1.0 = original resolution

    def test_labels_match_levels(self):
        from picarones.core.robustness import DEGRADATION_LEVELS, DEGRADATION_LABELS
        # Where labels exist, there must be exactly one label per level.
        for kind, levels in DEGRADATION_LEVELS.items():
            if kind in DEGRADATION_LABELS:
                assert len(DEGRADATION_LABELS[kind]) == len(levels)
|
| 344 |
+
|
| 345 |
+
|
| 346 |
+
# ===========================================================================
|
| 347 |
+
# TestDegradationFunctions
|
| 348 |
+
# ===========================================================================
|
| 349 |
+
|
| 350 |
+
class TestDegradationFunctions:
|
| 351 |
+
|
| 352 |
+
def _make_png(self) -> bytes:
|
| 353 |
+
"""Génère un PNG minimal valide (10×10 pixels)."""
|
| 354 |
+
from picarones.fixtures import _make_placeholder_png
|
| 355 |
+
return _make_placeholder_png(40, 30)
|
| 356 |
+
|
| 357 |
+
def test_degrade_image_bytes_imports(self):
|
| 358 |
+
from picarones.core.robustness import degrade_image_bytes
|
| 359 |
+
assert callable(degrade_image_bytes)
|
| 360 |
+
|
| 361 |
+
def test_degrade_noise_returns_bytes(self):
|
| 362 |
+
from picarones.core.robustness import degrade_image_bytes
|
| 363 |
+
png = self._make_png()
|
| 364 |
+
result = degrade_image_bytes(png, "noise", 0)
|
| 365 |
+
assert isinstance(result, bytes)
|
| 366 |
+
assert len(result) > 0
|
| 367 |
+
|
| 368 |
+
def test_degrade_blur_returns_bytes(self):
|
| 369 |
+
from picarones.core.robustness import degrade_image_bytes
|
| 370 |
+
png = self._make_png()
|
| 371 |
+
result = degrade_image_bytes(png, "blur", 0)
|
| 372 |
+
assert isinstance(result, bytes)
|
| 373 |
+
|
| 374 |
+
def test_degrade_rotation_returns_bytes(self):
|
| 375 |
+
from picarones.core.robustness import degrade_image_bytes
|
| 376 |
+
png = self._make_png()
|
| 377 |
+
result = degrade_image_bytes(png, "rotation", 0)
|
| 378 |
+
assert isinstance(result, bytes)
|
| 379 |
+
|
| 380 |
+
def test_degrade_resolution_returns_bytes(self):
|
| 381 |
+
from picarones.core.robustness import degrade_image_bytes
|
| 382 |
+
png = self._make_png()
|
| 383 |
+
result = degrade_image_bytes(png, "resolution", 1.0)
|
| 384 |
+
assert isinstance(result, bytes)
|
| 385 |
+
|
| 386 |
+
def test_degrade_binarization_returns_bytes(self):
|
| 387 |
+
from picarones.core.robustness import degrade_image_bytes
|
| 388 |
+
png = self._make_png()
|
| 389 |
+
result = degrade_image_bytes(png, "binarization", 0)
|
| 390 |
+
assert isinstance(result, bytes)
|
| 391 |
+
|
| 392 |
+
def test_degrade_noise_level_5(self):
|
| 393 |
+
from picarones.core.robustness import degrade_image_bytes
|
| 394 |
+
png = self._make_png()
|
| 395 |
+
result = degrade_image_bytes(png, "noise", 5)
|
| 396 |
+
assert isinstance(result, bytes)
|
| 397 |
+
|
| 398 |
+
def test_degrade_blur_level_2(self):
|
| 399 |
+
from picarones.core.robustness import degrade_image_bytes
|
| 400 |
+
png = self._make_png()
|
| 401 |
+
result = degrade_image_bytes(png, "blur", 2)
|
| 402 |
+
assert isinstance(result, bytes)
|
| 403 |
+
|
| 404 |
+
def test_degrade_resolution_half(self):
|
| 405 |
+
from picarones.core.robustness import degrade_image_bytes
|
| 406 |
+
png = self._make_png()
|
| 407 |
+
result = degrade_image_bytes(png, "resolution", 0.5)
|
| 408 |
+
assert isinstance(result, bytes)
|
| 409 |
+
|
| 410 |
+
def test_degrade_rotation_10_degrees(self):
    """A 10-degree rotation still yields a bytes payload."""
    from picarones.core.robustness import degrade_image_bytes

    degraded = degrade_image_bytes(self._make_png(), "rotation", 10)
    assert isinstance(degraded, bytes)
|
| 415 |
+
|
| 416 |
+
|
| 417 |
+
# ===========================================================================
|
| 418 |
+
# TestDegradationCurve
|
| 419 |
+
# ===========================================================================
|
| 420 |
+
|
| 421 |
+
class TestDegradationCurve:
    """Unit tests for the DegradationCurve value object."""

    def test_import(self):
        """The class is importable from the robustness module."""
        from picarones.core.robustness import DegradationCurve

        assert DegradationCurve is not None

    def test_as_dict_keys(self):
        """as_dict() exposes the main curve attributes as top-level keys."""
        from picarones.core.robustness import DegradationCurve

        dc = DegradationCurve(
            engine_name="tesseract",
            degradation_type="noise",
            levels=[0, 5, 15],
            labels=["original", "σ=5", "σ=15"],
            cer_values=[0.10, 0.15, 0.25],
        )
        payload = dc.as_dict()
        for key in ("engine_name", "degradation_type", "levels", "cer_values"):
            assert key in payload

    def test_critical_threshold(self):
        """An explicit critical_threshold_level is stored as given."""
        from picarones.core.robustness import DegradationCurve

        dc = DegradationCurve(
            engine_name="tesseract",
            degradation_type="noise",
            levels=[0, 5, 15, 30],
            labels=["o", "σ=5", "σ=15", "σ=30"],
            cer_values=[0.10, 0.15, 0.22, 0.35],
            critical_threshold_level=15,
            cer_threshold=0.20,
        )
        assert dc.critical_threshold_level == 15

    def test_none_cer_allowed(self):
        """A None entry in cer_values (failed OCR run) is accepted."""
        from picarones.core.robustness import DegradationCurve

        dc = DegradationCurve(
            engine_name="e",
            degradation_type="blur",
            levels=[0, 2],
            labels=["o", "r=2"],
            cer_values=[None, 0.15],
        )
        assert dc.cer_values[0] is None

    def test_default_cer_threshold(self):
        """When omitted, cer_threshold defaults to 0.20."""
        from picarones.core.robustness import DegradationCurve

        dc = DegradationCurve("e", "noise", [0], ["o"], [0.1])
        assert dc.cer_threshold == 0.20

    def test_engine_name_preserved(self):
        """The engine name given at construction is kept verbatim."""
        from picarones.core.robustness import DegradationCurve

        dc = DegradationCurve("pero_ocr", "blur", [0, 1], ["o", "r=1"], [0.05, 0.08])
        assert dc.engine_name == "pero_ocr"

    def test_as_dict_roundtrip(self):
        """Lists survive as_dict() unchanged (no copy/conversion surprises)."""
        from picarones.core.robustness import DegradationCurve

        payload = DegradationCurve(
            engine_name="tesseract",
            degradation_type="rotation",
            levels=[0, 5, 10],
            labels=["0°", "5°", "10°"],
            cer_values=[0.10, 0.18, 0.30],
            critical_threshold_level=10,
        ).as_dict()
        assert payload["levels"] == [0, 5, 10]
        assert payload["cer_values"] == [0.10, 0.18, 0.30]
|
| 489 |
+
|
| 490 |
+
|
| 491 |
+
# ===========================================================================
|
| 492 |
+
# TestRobustnessReport
|
| 493 |
+
# ===========================================================================
|
| 494 |
+
|
| 495 |
+
class TestRobustnessReport:
    """Unit tests for the RobustnessReport aggregate."""

    def test_import(self):
        """The class is importable from the robustness module."""
        from picarones.core.robustness import RobustnessReport

        assert RobustnessReport is not None

    def test_get_curves_for_engine(self):
        """get_curves_for_engine filters curves by engine name."""
        from picarones.core.robustness import DegradationCurve, RobustnessReport

        curves = [
            DegradationCurve("tesseract", "noise", [0, 5], ["o", "σ=5"], [0.10, 0.15]),
            DegradationCurve("pero_ocr", "noise", [0, 5], ["o", "σ=5"], [0.07, 0.10]),
        ]
        rpt = RobustnessReport(["tesseract", "pero_ocr"], "C", ["noise"], curves)
        selected = rpt.get_curves_for_engine("tesseract")
        assert len(selected) == 1
        assert selected[0].engine_name == "tesseract"

    def test_get_curves_for_type(self):
        """get_curves_for_type filters curves by degradation type."""
        from picarones.core.robustness import DegradationCurve, RobustnessReport

        curves = [
            DegradationCurve("tesseract", "noise", [0, 5], ["o", "σ=5"], [0.10, 0.15]),
            DegradationCurve("tesseract", "blur", [0, 2], ["o", "r=2"], [0.10, 0.14]),
        ]
        rpt = RobustnessReport(["tesseract"], "C", ["noise", "blur"], curves)
        selected = rpt.get_curves_for_type("noise")
        assert len(selected) == 1
        assert selected[0].degradation_type == "noise"

    def test_as_dict_keys(self):
        """as_dict() exposes engine names, curves and summary."""
        from picarones.core.robustness import RobustnessReport

        payload = RobustnessReport(["tesseract"], "C", ["noise"], []).as_dict()
        for key in ("engine_names", "curves", "summary"):
            assert key in payload

    def test_as_dict_json_serializable(self):
        """as_dict() output must serialize to JSON without raising."""
        from picarones.core.robustness import DegradationCurve, RobustnessReport

        curve = DegradationCurve("e", "noise", [0, 5], ["o", "n5"], [0.1, 0.2])
        rpt = RobustnessReport(["e"], "C", ["noise"], [curve])
        serialized = json.dumps(rpt.as_dict())
        assert len(serialized) > 0

    def test_summary_populated(self):
        """The demo generator fills in a non-empty summary dict."""
        from picarones.core.robustness import generate_demo_robustness_report

        rpt = generate_demo_robustness_report(engine_names=["tesseract"], seed=1)
        assert isinstance(rpt.summary, dict)
        assert len(rpt.summary) > 0

    def test_corpus_name_preserved(self):
        """The corpus name given at construction is kept verbatim."""
        from picarones.core.robustness import RobustnessReport

        rpt = RobustnessReport(["e"], "Mon Corpus", ["noise"], [])
        assert rpt.corpus_name == "Mon Corpus"

    def test_engine_names_list(self):
        """All engine names passed in are exposed on the report."""
        from picarones.core.robustness import RobustnessReport

        rpt = RobustnessReport(["tesseract", "pero_ocr"], "C", [], [])
        for engine in ("tesseract", "pero_ocr"):
            assert engine in rpt.engine_names
|
| 552 |
+
|
| 553 |
+
|
| 554 |
+
# ===========================================================================
|
| 555 |
+
# TestRobustnessAnalyzer
|
| 556 |
+
# ===========================================================================
|
| 557 |
+
|
| 558 |
+
class TestRobustnessAnalyzer:
    """Tests for RobustnessAnalyzer construction and its static helpers."""

    def test_import(self):
        """The class is importable from the robustness module."""
        from picarones.core.robustness import RobustnessAnalyzer

        assert RobustnessAnalyzer is not None

    def test_init_single_engine(self):
        """A single engine (not wrapped in a list) is normalized to one entry."""
        from picarones.core.robustness import RobustnessAnalyzer

        fake_engine = type("E", (), {"name": "tesseract"})()
        assert len(RobustnessAnalyzer(fake_engine).engines) == 1

    def test_init_list_engines(self):
        """A list of engines is kept as-is."""
        from picarones.core.robustness import RobustnessAnalyzer

        fake_engines = [
            type("E", (), {"name": "tesseract"})(),
            type("E", (), {"name": "pero_ocr"})(),
        ]
        assert len(RobustnessAnalyzer(fake_engines).engines) == 2

    def test_default_degradation_types(self):
        """Without an explicit list, all degradation types are analyzed."""
        from picarones.core.robustness import ALL_DEGRADATION_TYPES, RobustnessAnalyzer

        fake_engine = type("E", (), {"name": "e"})()
        analyzer = RobustnessAnalyzer(fake_engine)
        assert set(analyzer.degradation_types) == set(ALL_DEGRADATION_TYPES)

    def test_custom_degradation_types(self):
        """An explicit degradation_types list is preserved in order."""
        from picarones.core.robustness import RobustnessAnalyzer

        fake_engine = type("E", (), {"name": "e"})()
        analyzer = RobustnessAnalyzer(fake_engine, degradation_types=["noise", "blur"])
        assert analyzer.degradation_types == ["noise", "blur"]

    def test_find_critical_level_found(self):
        """The first level whose CER crosses the threshold is returned."""
        from picarones.core.robustness import RobustnessAnalyzer

        found = RobustnessAnalyzer._find_critical_level(
            [0, 5, 15, 30], [0.10, 0.15, 0.22, 0.35], 0.20
        )
        assert found == 15

    def test_find_critical_level_none(self):
        """When no CER crosses the threshold, None is returned."""
        from picarones.core.robustness import RobustnessAnalyzer

        found = RobustnessAnalyzer._find_critical_level(
            [0, 5, 15], [0.05, 0.10, 0.15], 0.20
        )
        assert found is None

    def test_build_summary(self):
        """_build_summary elects the engine with the lowest mean CER per type."""
        from picarones.core.robustness import DegradationCurve, RobustnessAnalyzer

        summary = RobustnessAnalyzer._build_summary([
            DegradationCurve("tesseract", "noise", [0, 5], ["o", "n5"], [0.10, 0.20]),
            DegradationCurve("pero_ocr", "noise", [0, 5], ["o", "n5"], [0.07, 0.12]),
        ])
        assert "most_robust_noise" in summary
        # pero_ocr has the lowest mean CER, so it should be elected.
        assert summary["most_robust_noise"] == "pero_ocr"
|
| 614 |
+
|
| 615 |
+
|
| 616 |
+
# ===========================================================================
|
| 617 |
+
# TestGenerateDemoRobustness
|
| 618 |
+
# ===========================================================================
|
| 619 |
+
|
| 620 |
+
class TestGenerateDemoRobustness:
    """Tests for the synthetic robustness-report generator."""

    def test_import(self):
        """The generator is importable and callable."""
        from picarones.core.robustness import generate_demo_robustness_report

        assert callable(generate_demo_robustness_report)

    def test_returns_report(self):
        """The generator yields a RobustnessReport instance."""
        from picarones.core.robustness import (
            RobustnessReport,
            generate_demo_robustness_report,
        )

        assert isinstance(generate_demo_robustness_report(), RobustnessReport)

    def test_default_engines(self):
        """The default report covers the built-in demo engines."""
        from picarones.core.robustness import generate_demo_robustness_report

        rpt = generate_demo_robustness_report()
        for engine in ("tesseract", "pero_ocr"):
            assert engine in rpt.engine_names

    def test_custom_engines(self):
        """A custom engine_names list is honored."""
        from picarones.core.robustness import generate_demo_robustness_report

        rpt = generate_demo_robustness_report(engine_names=["moteur_custom"])
        assert "moteur_custom" in rpt.engine_names

    def test_all_degradation_types_present(self):
        """Every known degradation type shows up as at least one curve."""
        from picarones.core.robustness import (
            ALL_DEGRADATION_TYPES,
            generate_demo_robustness_report,
        )

        rpt = generate_demo_robustness_report()
        assert {c.degradation_type for c in rpt.curves} == set(ALL_DEGRADATION_TYPES)

    def test_cer_values_in_range(self):
        """All non-None CER values fall within [0.0, 1.0]."""
        from picarones.core.robustness import generate_demo_robustness_report

        rpt = generate_demo_robustness_report(seed=99)
        for dc in rpt.curves:
            for cer in dc.cer_values:
                if cer is not None:
                    assert 0.0 <= cer <= 1.0

    def test_cer_increases_with_degradation(self):
        """For most degradation types, CER should grow with the level."""
        from picarones.core.robustness import generate_demo_robustness_report

        report = generate_demo_robustness_report(seed=42)
        for curve in report.curves:
            valid = [c for c in curve.cer_values if c is not None]
            if len(valid) >= 3:
                # At minimum, the last level must not beat the first one.
                assert valid[-1] >= valid[0], (
                    f"CER devrait augmenter pour {curve.engine_name}/{curve.degradation_type}: "
                    f"{valid[0]} → {valid[-1]}"
                )

    def test_reproducible_with_seed(self):
        """Two runs with the same seed produce identical curves."""
        from picarones.core.robustness import generate_demo_robustness_report

        first = generate_demo_robustness_report(seed=7)
        second = generate_demo_robustness_report(seed=7)
        assert first.curves[0].cer_values == second.curves[0].cer_values

    def test_summary_contains_most_robust(self):
        """At least one summary key names the most robust engine."""
        from picarones.core.robustness import generate_demo_robustness_report

        rpt = generate_demo_robustness_report()
        assert any("most_robust" in key for key in rpt.summary)

    def test_json_serializable(self):
        """The report round-trips through JSON and keeps its curves."""
        from picarones.core.robustness import generate_demo_robustness_report

        payload = generate_demo_robustness_report().as_dict()
        serialized = json.dumps(payload, ensure_ascii=False)
        assert len(serialized) > 0
        reparsed = json.loads(serialized)
        assert "curves" in reparsed
|
| 688 |
+
|
| 689 |
+
|
| 690 |
+
# ===========================================================================
|
| 691 |
+
# TestCLIDemo
|
| 692 |
+
# ===========================================================================
|
| 693 |
+
|
| 694 |
+
class TestCLIDemo:
    """Smoke tests for the `demo` CLI command and its Sprint 8 flags."""

    def _run_demo(self, extra_args):
        """Invoke `picarones demo` with *extra_args* via Click's test runner."""
        from click.testing import CliRunner
        from picarones.cli import cli

        return CliRunner().invoke(cli, ["demo", *extra_args])

    def test_demo_with_history_flag(self):
        """--with-history succeeds and mentions the longitudinal tracking."""
        result = self._run_demo(["--with-history", "--docs", "3"])
        assert result.exit_code == 0
        lowered = result.output.lower()
        assert "longitudinal" in lowered or "suivi" in lowered or "CER" in result.output

    def test_demo_with_robustness_flag(self):
        """--with-robustness succeeds and mentions the robustness analysis."""
        result = self._run_demo(["--with-robustness", "--docs", "3"])
        assert result.exit_code == 0
        lowered = result.output.lower()
        assert "robustesse" in lowered or "robustness" in lowered or "bruit" in lowered

    def test_demo_with_both_flags(self):
        """Both Sprint 8 flags can be combined without error."""
        result = self._run_demo(["--with-history", "--with-robustness", "--docs", "3"])
        assert result.exit_code == 0

    def test_demo_without_flags(self):
        """The plain demo (no Sprint 8 flags) still works."""
        result = self._run_demo(["--docs", "3"])
        assert result.exit_code == 0

    def test_demo_generates_html_file(self):
        """--output writes the requested HTML report to disk."""
        import os

        from click.testing import CliRunner
        from picarones.cli import cli

        runner = CliRunner()
        with runner.isolated_filesystem():
            result = runner.invoke(cli, ["demo", "--docs", "3", "--output", "test_demo.html"])
            assert result.exit_code == 0
            assert os.path.exists("test_demo.html")
|