Spaces:
Sleeping
feat(services): Phase B3-final — helper prepare_preset_args (Option 10)
Browse files
Phase finale du chantier Option B (mai 2026). Pattern en 3 étapes pour
invoquer RunOrchestrator depuis du code Python qui instancie ses
adapters en mémoire (vs charger un YAML).
picarones/app/services/python_helpers.py (nouveau)
- PresetArgs : dataclass agrégeant les objets domain prêts à
passer à RunOrchestrator.execute_preset (spec, corpus_spec,
extracted_dir, pipeline_specs, adapter_resolver, adapter_kwargs).
- prepare_preset_args(corpus, engines, **kwargs) : helper unique
qui convertit (Corpus legacy + instances) en PresetArgs.
Absorbe les conventions hétérogènes (char_exclude frozenset →
string, normalization_profile objet → nom).
picarones/app/services/__init__.py
- Expose PresetArgs, prepare_preset_args, run_result_to_benchmark_result
comme API publique du package.
Tests : 9 cas dans test_python_helpers.py (nominal, multi-engines,
conversions hétérogènes, end-to-end avec execute_preset, defaults).
Suite : 9 passed (helper isolé). La migration des call sites
existants (CLI/Web/tests) se fait dans les commits suivants.
Pattern d'usage cible (3 étapes explicites) :
args = prepare_preset_args(corpus, engines, workspace_dir=...)
orch_result = RunOrchestrator(out).execute_preset(**vars(args))
bm = run_result_to_benchmark_result(orch_result.run_result, ...)
|
@@ -42,6 +42,10 @@ from picarones.app.services.path_security import (
|
|
| 42 |
validated_path,
|
| 43 |
validated_prompt_filename,
|
| 44 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
from picarones.app.services.registry_service import (
|
| 46 |
RegistriesBundle,
|
| 47 |
RegistryService,
|
|
@@ -52,6 +56,15 @@ from picarones.app.services.run_orchestrator import (
|
|
| 52 |
RunOrchestrator,
|
| 53 |
)
|
| 54 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 55 |
# Le rendu HTML vit dans la couche ``reports/`` (cible documentée
|
| 56 |
# du rewrite — un rapport est un format de sortie, pas un service).
|
| 57 |
# Un caller qui veut juste générer un HTML l'importe directement
|
|
@@ -68,6 +81,10 @@ __all__ = [
|
|
| 68 |
"OrchestrationResult",
|
| 69 |
"PathValidationError",
|
| 70 |
"PipelineInputsFactory",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
"RegistriesBundle",
|
| 72 |
"RegistryService",
|
| 73 |
"RunOrchestrator",
|
|
|
|
| 42 |
validated_path,
|
| 43 |
validated_prompt_filename,
|
| 44 |
)
|
| 45 |
+
from picarones.app.services.python_helpers import (
|
| 46 |
+
PresetArgs,
|
| 47 |
+
prepare_preset_args,
|
| 48 |
+
)
|
| 49 |
from picarones.app.services.registry_service import (
|
| 50 |
RegistriesBundle,
|
| 51 |
RegistryService,
|
|
|
|
| 56 |
RunOrchestrator,
|
| 57 |
)
|
| 58 |
|
| 59 |
+
# Phase B3-final (mai 2026) — re-export du converter pour les
|
| 60 |
+
# callers Python qui veulent un ``BenchmarkResult`` legacy après
|
| 61 |
+
# ``RunOrchestrator.execute_preset()``. Le converter reste dans
|
| 62 |
+
# son module privé ``_benchmark_converter`` mais est exposé
|
| 63 |
+
# publiquement via le package.
|
| 64 |
+
from picarones.app.services._benchmark_converter import (
|
| 65 |
+
run_result_to_benchmark_result,
|
| 66 |
+
)
|
| 67 |
+
|
| 68 |
# Le rendu HTML vit dans la couche ``reports/`` (cible documentée
|
| 69 |
# du rewrite — un rapport est un format de sortie, pas un service).
|
| 70 |
# Un caller qui veut juste générer un HTML l'importe directement
|
|
|
|
| 81 |
"OrchestrationResult",
|
| 82 |
"PathValidationError",
|
| 83 |
"PipelineInputsFactory",
|
| 84 |
+
# Phase B3-final — helpers pour callers Python
|
| 85 |
+
"PresetArgs",
|
| 86 |
+
"prepare_preset_args",
|
| 87 |
+
"run_result_to_benchmark_result",
|
| 88 |
"RegistriesBundle",
|
| 89 |
"RegistryService",
|
| 90 |
"RunOrchestrator",
|
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Helpers pour invoquer ``RunOrchestrator`` depuis du code Python qui
|
| 2 |
+
instancie ses adapters en mémoire (par opposition au chargement depuis
|
| 3 |
+
un YAML via :class:`RunSpec`).
|
| 4 |
+
|
| 5 |
+
API publique
|
| 6 |
+
------------
|
| 7 |
+
- :class:`PresetArgs` — dataclass qui agrège les objets domain prêts
|
| 8 |
+
à passer à :meth:`RunOrchestrator.execute_preset`.
|
| 9 |
+
- :func:`prepare_preset_args` — convertit ``(Corpus legacy + liste
|
| 10 |
+
d'instances d'adapters)`` en :class:`PresetArgs`.
|
| 11 |
+
|
| 12 |
+
Pattern d'usage canonique
|
| 13 |
+
-------------------------
|
| 14 |
+
|
| 15 |
+
::
|
| 16 |
+
|
| 17 |
+
from picarones import RunOrchestrator
|
| 18 |
+
from picarones.app.services import (
|
| 19 |
+
prepare_preset_args,
|
| 20 |
+
run_result_to_benchmark_result,
|
| 21 |
+
)
|
| 22 |
+
import tempfile
|
| 23 |
+
from pathlib import Path
|
| 24 |
+
|
| 25 |
+
with tempfile.TemporaryDirectory() as ws:
|
| 26 |
+
ws_path = Path(ws)
|
| 27 |
+
args = prepare_preset_args(
|
| 28 |
+
corpus, engines,
|
| 29 |
+
workspace_dir=ws_path / "gt",
|
| 30 |
+
views=("text_final", "alto_documentary"),
|
| 31 |
+
normalization_profile="caseless",
|
| 32 |
+
profile="standard",
|
| 33 |
+
)
|
| 34 |
+
orch_result = RunOrchestrator(ws_path / "run").execute_preset(
|
| 35 |
+
spec=args.spec,
|
| 36 |
+
corpus_spec=args.corpus_spec,
|
| 37 |
+
extracted_dir=args.extracted_dir,
|
| 38 |
+
pipeline_specs=args.pipeline_specs,
|
| 39 |
+
adapter_resolver=args.adapter_resolver,
|
| 40 |
+
adapter_kwargs=args.adapter_kwargs,
|
| 41 |
+
progress_callback=cb, # optionnel
|
| 42 |
+
cancel_event=ev, # optionnel
|
| 43 |
+
)
|
| 44 |
+
# Si l'on veut un BenchmarkResult legacy (rapport HTML, etc.) :
|
| 45 |
+
benchmark = run_result_to_benchmark_result(
|
| 46 |
+
orch_result.run_result,
|
| 47 |
+
corpus=corpus, engines=engines,
|
| 48 |
+
normalization_profile="caseless", profile="standard",
|
| 49 |
+
)
|
| 50 |
+
|
| 51 |
+
Pourquoi 3 étapes et pas une seule fonction ?
|
| 52 |
+
---------------------------------------------
|
| 53 |
+
Volontairement explicite : chaque étape (préparation → exécution →
|
| 54 |
+
conversion legacy) est visible dans le call site et testable
|
| 55 |
+
isolément. Un caller qui n'a pas besoin du ``BenchmarkResult``
|
| 56 |
+
legacy peut sauter la 3e étape et consommer directement le
|
| 57 |
+
``RunResult`` typé du :class:`OrchestrationResult`.
|
| 58 |
+
|
| 59 |
+
Pour les callers YAML (CI, scripts reproductibles), passer par
|
| 60 |
+
:meth:`RunOrchestrator.execute(spec)` avec un :class:`RunSpec`
|
| 61 |
+
sérialisable plutôt que par ce helper.
|
| 62 |
+
"""
|
| 63 |
+
|
| 64 |
+
from __future__ import annotations
|
| 65 |
+
|
| 66 |
+
from dataclasses import dataclass
|
| 67 |
+
from pathlib import Path
|
| 68 |
+
from typing import TYPE_CHECKING, Any, Callable
|
| 69 |
+
|
| 70 |
+
if TYPE_CHECKING:
|
| 71 |
+
from picarones.app.schemas.run_spec import RunSpec
|
| 72 |
+
from picarones.domain.corpus import CorpusSpec
|
| 73 |
+
from picarones.domain.pipeline_spec import PipelineSpec
|
| 74 |
+
from picarones.evaluation.corpus import Corpus
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
@dataclass(frozen=True)
class PresetArgs:
    """Bundle of pre-built domain objects for ``RunOrchestrator.execute_preset``.

    The field names match the keyword arguments used when calling
    ``execute_preset``, so each attribute can be forwarded one-to-one.

    Attributes
    ----------
    spec:
        ``RunSpec`` carrying the run parameters (views, char_exclude,
        normalization_profile, partial_dir, entity_extractor, profile,
        output_json, timeout, code_version). Its ``corpus_dir`` and
        ``pipelines`` entries are placeholders required by validation;
        ``execute_preset`` ignores them.
    corpus_spec:
        Domain-layer ``CorpusSpec`` built from the legacy ``Corpus`` via
        ``corpus_to_corpus_spec``.
    extracted_dir:
        Directory in which the corpus source images are reachable (used
        by the legacy converter when ``output_json`` is set).
    pipeline_specs:
        One domain-layer ``PipelineSpec`` per supplied engine, built via
        ``engine_to_pipeline_spec``.
    adapter_resolver:
        ``name -> StepExecutor`` lookup built via
        ``build_adapter_resolver``; maps each adapter name to its
        in-memory instance for ``PipelineExecutor``.
    adapter_kwargs:
        ``adapter_name -> kwargs dict`` mapping recorded in the manifest.
        Empty by default.
    """

    spec: "RunSpec"
    corpus_spec: "CorpusSpec"
    extracted_dir: Path
    pipeline_specs: list["PipelineSpec"]
    adapter_resolver: Callable[[str], Any]
    adapter_kwargs: dict[str, Any]
|
| 115 |
+
|
| 116 |
+
|
| 117 |
+
def _dummy_pipeline_yaml(name: str = "preset_pipeline") -> Any:
    """Build a minimal placeholder ``PipelineSpecYaml``.

    ``RunSpec.pipelines`` is validated with ``min_length=1``, so a
    ``RunSpec`` cannot be built without at least one pipeline entry.
    The entry returned here only satisfies that validator: its content
    is ignored by ``execute_preset``, which uses the ``pipeline_specs``
    of :class:`PresetArgs` instead.

    Parameters
    ----------
    name:
        Name given to the placeholder pipeline.

    Returns
    -------
    A single-step ``PipelineSpecYaml`` (image in, raw text out).
    """
    # Imported lazily, mirroring the rest of this module.
    from picarones.app.schemas.run_spec import PipelineSpecYaml, StepSpec
    from picarones.domain.artifacts import ArtifactType

    # ``adapter_class`` is a dotted path used purely as a marker value.
    placeholder_step = StepSpec(
        id="ocr",
        adapter_class="picarones.app.services.python_helpers.IgnoredByPreset",
        adapter_kwargs={},
        input_types=(ArtifactType.IMAGE,),
        output_types=(ArtifactType.RAW_TEXT,),
    )
    return PipelineSpecYaml(
        name=name,
        initial_inputs=(ArtifactType.IMAGE,),
        steps=(placeholder_step,),
    )
|
| 136 |
+
|
| 137 |
+
|
| 138 |
+
def _resolve_code_version() -> str:
    """Return ``picarones.__version__``, or ``"unknown"`` if unavailable."""
    import importlib

    try:
        return importlib.import_module("picarones").__version__
    except (ImportError, AttributeError):
        return "unknown"


def _char_exclude_to_str(char_exclude: Any | None) -> str | None:
    """Normalize ``char_exclude`` (``str`` or iterable of chars) to a string."""
    if char_exclude is None or isinstance(char_exclude, str):
        return char_exclude
    # Sorting makes the result deterministic for unordered inputs (sets).
    return "".join(sorted(char_exclude))


def _normalization_profile_name(normalization_profile: Any | None) -> str | None:
    """Return the profile name for a ``str`` or an object exposing ``.name``."""
    if normalization_profile is None or isinstance(normalization_profile, str):
        return normalization_profile
    profile_name = getattr(normalization_profile, "name", None)
    if not isinstance(profile_name, str):
        # Falling back to ``None`` here would silently run the benchmark
        # without the normalization the caller asked for.
        raise TypeError(
            "normalization_profile must be a str or an object exposing a "
            f"string 'name' attribute, got {type(normalization_profile).__name__}"
        )
    return profile_name


def prepare_preset_args(
    corpus: "Corpus",
    engines: list[Any],
    *,
    workspace_dir: Path,
    views: tuple[str, ...] = ("text_final",),
    char_exclude: Any | None = None,
    normalization_profile: Any | None = None,
    partial_dir: str | Path | None = None,
    entity_extractor: str | None = None,
    profile: str = "standard",
    output_json: str | Path | None = None,
    timeout_seconds_per_doc: float = 60.0,
    code_version: str | None = None,
    output_dir: str | Path | None = None,
) -> PresetArgs:
    """Convert a legacy ``Corpus`` plus adapter instances into the domain
    objects expected by :meth:`RunOrchestrator.execute_preset`.

    Parameters
    ----------
    corpus:
        Legacy in-memory ``picarones.evaluation.corpus.Corpus``.
    engines:
        Adapter instances (``BaseOCRAdapter`` or ``OCRLLMPipelineConfig``).
        Each one is expected to expose a unique ``.name``. The iterable
        is materialized once because it is consumed twice (pipeline
        specs, then resolver).
    workspace_dir:
        Directory handed to ``corpus_to_corpus_spec`` (GT serialization,
        per that helper's contract) and reused as ``extracted_dir``.
        Created, parents included, when it does not exist yet.
    views:
        Canonical view names to apply. Defaults to ``("text_final",)``.
    char_exclude:
        ``str`` or iterable of characters (e.g. ``frozenset[str]``);
        non-string values are joined into a sorted string.
    normalization_profile:
        Profile name, or an object exposing a string ``name`` attribute
        (the name is extracted).
    partial_dir, entity_extractor, profile, output_json,
    timeout_seconds_per_doc:
        Forwarded to ``RunSpec``; path-like values are converted to
        ``str`` and falsy paths become ``None``.
    code_version:
        Forwarded to ``RunSpec``. When ``None``, ``picarones.__version__``
        is used, or ``"unknown"`` if it cannot be read.
    output_dir:
        Output directory recorded in the ``RunSpec``. When falsy,
        defaults to ``workspace_dir.parent / "run"``.

    Returns
    -------
    :class:`PresetArgs`

    Raises
    ------
    TypeError
        If ``normalization_profile`` is neither ``None``, a ``str``, nor
        an object with a string ``name`` attribute.

    Notes
    -----
    No temporary directory is allocated here: the caller owns the
    lifecycle of ``workspace_dir`` (typically through
    ``tempfile.TemporaryDirectory``). The function does, however, create
    ``workspace_dir`` itself when it is missing.
    """
    # Lazy project imports, as in the rest of this module.
    from picarones.app.schemas.run_spec import RunSpec
    from picarones.app.services._benchmark_adapter_resolver import (
        build_adapter_resolver,
        engine_to_pipeline_spec,
    )
    from picarones.app.services._benchmark_conversions import (
        corpus_to_corpus_spec,
    )

    if code_version is None:
        code_version = _resolve_code_version()

    # Validate and normalize the heterogeneous legacy parameters before
    # touching the filesystem, so bad input fails without side effects.
    char_exclude_str = _char_exclude_to_str(char_exclude)
    norm_profile_str = _normalization_profile_name(normalization_profile)

    # ``engines`` is walked twice below; a one-shot iterable would leave
    # the resolver empty.
    engines = list(engines)

    workspace_dir = Path(workspace_dir)
    workspace_dir.mkdir(parents=True, exist_ok=True)
    effective_output_dir = (
        Path(output_dir) if output_dir
        else workspace_dir.parent / "run"
    )

    corpus_spec = corpus_to_corpus_spec(corpus, workspace_dir=workspace_dir)
    pipeline_specs = [engine_to_pipeline_spec(e) for e in engines]
    adapter_resolver = build_adapter_resolver(engines)

    spec = RunSpec(
        corpus_dir=str(workspace_dir.parent),  # ignored by execute_preset
        pipelines=(_dummy_pipeline_yaml(),),  # placeholder, also ignored
        views=views,
        output_dir=str(effective_output_dir),
        char_exclude=char_exclude_str,
        normalization_profile=norm_profile_str,
        partial_dir=str(partial_dir) if partial_dir else None,
        entity_extractor=entity_extractor,
        profile=profile,
        output_json=str(output_json) if output_json else None,
        code_version=code_version,
        timeout_seconds_per_doc=timeout_seconds_per_doc,
    )

    return PresetArgs(
        spec=spec,
        corpus_spec=corpus_spec,
        extracted_dir=workspace_dir,
        pipeline_specs=pipeline_specs,
        adapter_resolver=adapter_resolver,
        adapter_kwargs={},
    )
|
| 263 |
+
|
| 264 |
+
|
| 265 |
+
__all__ = ["PresetArgs", "prepare_preset_args"]
|
|
@@ -0,0 +1,242 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests du helper ``prepare_preset_args`` (Phase B3-final).
|
| 2 |
+
|
| 3 |
+
Vérifie la conversion ``(Corpus legacy + instances d'adapters)`` →
|
| 4 |
+
``PresetArgs`` prête à passer à ``RunOrchestrator.execute_preset()``.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
from __future__ import annotations
|
| 8 |
+
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
from picarones.adapters.ocr.base import BaseOCRAdapter
|
| 13 |
+
from picarones.app.services import PresetArgs, prepare_preset_args
|
| 14 |
+
from picarones.domain.artifacts import Artifact, ArtifactType
|
| 15 |
+
from picarones.evaluation.corpus import Corpus, Document
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 19 |
+
# Mock minimal
|
| 20 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
class _MockOCR(BaseOCRAdapter):
    """OCR adapter stub that writes the fixed text ``"hello"`` per document."""

    def __init__(self, name: str = "mock") -> None:
        # Only the name is stored; ``super().__init__()`` is not invoked.
        self._name = name

    @property
    def name(self) -> str:
        """Adapter name supplied at construction time."""
        return self._name

    def execute(self, inputs, params, context):
        """Write ``<document_id>.txt`` under the workspace and return it
        as a ``RAW_TEXT`` artifact keyed by artifact type."""
        target_dir = Path(context.workspace_uri)
        target_dir.mkdir(parents=True, exist_ok=True)
        transcript = target_dir / f"{context.document_id}.txt"
        transcript.write_text("hello", encoding="utf-8")

        artifact = Artifact(
            id=f"{context.document_id}:{self._name}:raw_text",
            document_id=context.document_id,
            type=ArtifactType.RAW_TEXT,
            produced_by_step="ocr",
            uri=str(transcript),
        )
        return {ArtifactType.RAW_TEXT: artifact}
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def _make_corpus(tmp_path: Path, n: int = 1) -> Corpus:
    """Build a ``Corpus`` named ``"helper_test"`` with ``n`` stub documents.

    Each document gets a one-byte ``doc<i>.png`` file under ``tmp_path``
    and the ground truth ``"hello"``.
    """

    def _stub_document(index: int) -> Document:
        image = tmp_path / f"doc{index}.png"
        image.write_bytes(b"x")
        return Document(
            image_path=image,
            ground_truth="hello",
            doc_id=f"doc{index}",
        )

    return Corpus(
        name="helper_test",
        documents=[_stub_document(i) for i in range(n)],
    )
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 59 |
+
# Cas nominal — un engine seul
|
| 60 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
class TestNominal:
    """Single-engine happy path and propagation of ``views``."""

    @staticmethod
    def _workspace(root: Path) -> Path:
        """Create and return the ``ws`` directory under ``root``."""
        ws = root / "ws"
        ws.mkdir()
        return ws

    def test_returns_preset_args_with_all_fields_populated(
        self, tmp_path: Path,
    ) -> None:
        corpus = _make_corpus(tmp_path)
        engine = _MockOCR()
        workspace = self._workspace(tmp_path)

        result = prepare_preset_args(corpus, [engine], workspace_dir=workspace)

        assert isinstance(result, PresetArgs)
        assert result.corpus_spec.name == "helper_test"
        assert len(result.pipeline_specs) == 1
        # The resolver hands back the very same adapter instance when
        # queried with its canonical name.
        assert result.adapter_resolver(engine.name) is engine
        assert result.extracted_dir == workspace
        assert result.adapter_kwargs == {}

    def test_default_views_is_text_final(self, tmp_path: Path) -> None:
        corpus = _make_corpus(tmp_path)
        result = prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=self._workspace(tmp_path),
        )
        assert result.spec.views == ("text_final",)

    def test_custom_views_propagated(self, tmp_path: Path) -> None:
        corpus = _make_corpus(tmp_path)
        result = prepare_preset_args(
            corpus,
            [_MockOCR()],
            workspace_dir=self._workspace(tmp_path),
            views=("text_final", "alto_documentary", "searchability"),
        )
        assert result.spec.views == (
            "text_final", "alto_documentary", "searchability",
        )
|
| 105 |
+
|
| 106 |
+
|
| 107 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 108 |
+
# Multi-engines
|
| 109 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 110 |
+
|
| 111 |
+
|
| 112 |
+
class TestMultipleEngines:
    """Several engines yield one pipeline spec and one resolver entry each."""

    def test_two_engines_produce_two_pipeline_specs(
        self, tmp_path: Path,
    ) -> None:
        corpus = _make_corpus(tmp_path)
        workspace = tmp_path / "ws"
        workspace.mkdir()
        engines = {"a": _MockOCR(name="a"), "b": _MockOCR(name="b")}

        result = prepare_preset_args(
            corpus, list(engines.values()), workspace_dir=workspace,
        )

        assert len(result.pipeline_specs) == 2
        # The resolver must answer for both names with the matching instance.
        assert result.adapter_resolver("a") is engines["a"]
        assert result.adapter_resolver("b") is engines["b"]
|
| 128 |
+
|
| 129 |
+
|
| 130 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 131 |
+
# Conversions hétérogènes (char_exclude frozenset, normalization objet)
|
| 132 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
class TestConversions:
    """Heterogeneous legacy inputs are normalized to ``RunSpec`` strings."""

    @staticmethod
    def _prepare(tmp_path: Path, **overrides):
        """Run ``prepare_preset_args`` on a one-document corpus with a
        fresh ``ws`` workspace and the given keyword overrides."""
        corpus = _make_corpus(tmp_path)
        workspace = tmp_path / "ws"
        workspace.mkdir()
        return prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=workspace, **overrides,
        )

    def test_char_exclude_frozenset_converted_to_string(
        self, tmp_path: Path,
    ) -> None:
        excluded = frozenset({"!", ".", ","})
        result = self._prepare(tmp_path, char_exclude=excluded)
        # RunSpec expects a string; the helper performs the conversion.
        assert result.spec.char_exclude is not None
        assert set(result.spec.char_exclude) == {"!", ".", ","}

    def test_normalization_profile_object_converted_to_name(
        self, tmp_path: Path,
    ) -> None:
        from picarones.formats.text.normalization import get_builtin_profile

        caseless = get_builtin_profile("caseless")
        result = self._prepare(tmp_path, normalization_profile=caseless)
        assert result.spec.normalization_profile == "caseless"

    def test_normalization_profile_string_passthrough(
        self, tmp_path: Path,
    ) -> None:
        result = self._prepare(
            tmp_path, normalization_profile="medieval_french",
        )
        assert result.spec.normalization_profile == "medieval_french"
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 182 |
+
# Intégration avec execute_preset (cas bout-en-bout)
|
| 183 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 184 |
+
|
| 185 |
+
|
| 186 |
+
class TestEndToEnd:
    """Integration with ``RunOrchestrator.execute_preset``."""

    def test_args_can_be_consumed_by_execute_preset(
        self, tmp_path: Path,
    ) -> None:
        """Full pattern: prepare -> execute_preset -> legacy converter."""
        from picarones.app.services import (
            RunOrchestrator,
            run_result_to_benchmark_result,
        )

        corpus = _make_corpus(tmp_path, n=2)
        engine = _MockOCR()
        run_dir = tmp_path / "run"

        # The ``gt`` workspace is not created beforehand in this test.
        prepared = prepare_preset_args(
            corpus,
            [engine],
            workspace_dir=tmp_path / "gt",
            output_dir=run_dir,
        )

        orchestrator = RunOrchestrator(run_dir)
        outcome = orchestrator.execute_preset(
            spec=prepared.spec,
            corpus_spec=prepared.corpus_spec,
            extracted_dir=prepared.extracted_dir,
            pipeline_specs=prepared.pipeline_specs,
            adapter_resolver=prepared.adapter_resolver,
            adapter_kwargs=prepared.adapter_kwargs,
        )
        assert outcome.run_result.n_documents == 2

        # Optional third step: convert to the legacy BenchmarkResult.
        benchmark = run_result_to_benchmark_result(
            outcome.run_result,
            corpus=corpus,
            engines=[engine],
            char_exclude=None,
            normalization_profile=None,
            profile="standard",
        )
        assert benchmark.document_count == 2
        assert len(benchmark.engine_reports) == 1
        assert benchmark.engine_reports[0].engine_name == "mock"
|
| 225 |
+
|
| 226 |
+
|
| 227 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 228 |
+
# Default output_dir
|
| 229 |
+
# ──────────────────────────────────────────────────────────────────────
|
| 230 |
+
|
| 231 |
+
|
| 232 |
+
class TestDefaultOutputDir:
    """Fallback value of ``output_dir``."""

    def test_defaults_to_workspace_parent_run(self, tmp_path: Path) -> None:
        """Without ``output_dir`` the helper records
        ``workspace_dir.parent / "run"`` in the spec."""
        gt_dir = tmp_path / "gt"
        gt_dir.mkdir()
        corpus = _make_corpus(tmp_path)

        result = prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=gt_dir,
        )

        expected = str(tmp_path / "run")
        assert result.spec.output_dir == expected
|