"""Tests for the ``prepare_preset_args`` helper (Phase B3-final).

Checks the conversion ``(legacy Corpus + adapter instances)`` →
``PresetArgs`` ready to be passed to ``RunOrchestrator.execute_preset()``.
"""

from __future__ import annotations

from pathlib import Path

from picarones.adapters.ocr.base import BaseOCRAdapter
from picarones.app.services import PresetArgs, prepare_preset_args
from picarones.domain.artifacts import Artifact, ArtifactType
from picarones.evaluation.corpus import Corpus, Document


# ──────────────────────────────────────────────────────────────────────
# Minimal mock
# ──────────────────────────────────────────────────────────────────────


class _MockOCR(BaseOCRAdapter):
    """OCR adapter stub that always "recognises" the text ``hello``.

    ``execute`` writes ``hello`` to ``<workspace_uri>/<document_id>.txt``
    and returns a single ``RAW_TEXT`` artifact pointing at that file.
    """

    def __init__(self, name: str = "mock") -> None:
        # NOTE(review): ``super().__init__()`` is not called here; presumably
        # ``BaseOCRAdapter`` needs no initialisation for these tests — confirm.
        self._name = name

    @property
    def name(self) -> str:
        """Name given at construction time (``"mock"`` by default)."""
        return self._name

    def execute(self, inputs, params, context):
        """Write the fixed transcription and describe it as an artifact.

        ``inputs`` and ``params`` are ignored; only ``context.workspace_uri``
        and ``context.document_id`` are read.
        """
        out_dir = Path(context.workspace_uri)
        out_dir.mkdir(parents=True, exist_ok=True)
        out_path = out_dir / f"{context.document_id}.txt"
        out_path.write_text("hello", encoding="utf-8")
        return {ArtifactType.RAW_TEXT: Artifact(
            id=f"{context.document_id}:{self._name}:raw_text",
            document_id=context.document_id,
            type=ArtifactType.RAW_TEXT,
            produced_by_step="ocr",
            uri=str(out_path),
        )}


def _make_corpus(tmp_path: Path, n: int = 1) -> Corpus:
    """Build a ``Corpus`` named ``helper_test`` holding ``n`` documents.

    Each document gets a placeholder image file ``doc<i>.png`` (a single
    byte, not a decodable PNG) created under ``tmp_path``, the ground truth
    ``"hello"`` and the id ``doc<i>``.
    """
    docs = []
    for i in range(n):
        img = tmp_path / f"doc{i}.png"
        img.write_bytes(b"x")
        docs.append(Document(
            image_path=img,
            ground_truth="hello",
            doc_id=f"doc{i}",
        ))
    return Corpus(name="helper_test", documents=docs)


# ──────────────────────────────────────────────────────────────────────
# Nominal case — a single engine
# ──────────────────────────────────────────────────────────────────────


class TestNominal:
    """Single-engine calls: returned fields and ``views`` handling."""

    def test_returns_preset_args_with_all_fields_populated(
        self, tmp_path: Path,
    ) -> None:
        corpus = _make_corpus(tmp_path)
        engine = _MockOCR()
        workspace = tmp_path / "ws"
        workspace.mkdir()
        args = prepare_preset_args(
            corpus, [engine], workspace_dir=workspace,
        )
        assert isinstance(args, PresetArgs)
        assert args.corpus_spec.name == "helper_test"
        assert len(args.pipeline_specs) == 1
        # The resolver returns the adapter when asked for its canonical name.
        assert args.adapter_resolver(engine.name) is engine
        assert args.extracted_dir == workspace
        assert args.adapter_kwargs == {}

    def test_default_views_is_text_final(self, tmp_path: Path) -> None:
        corpus = _make_corpus(tmp_path)
        workspace = tmp_path / "ws"
        workspace.mkdir()
        args = prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=workspace,
        )
        assert args.spec.views == ("text_final",)

    def test_custom_views_propagated(self, tmp_path: Path) -> None:
        corpus = _make_corpus(tmp_path)
        workspace = tmp_path / "ws"
        workspace.mkdir()
        args = prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=workspace,
            views=("text_final", "alto_documentary", "searchability"),
        )
        assert args.spec.views == (
            "text_final", "alto_documentary", "searchability",
        )


# ──────────────────────────────────────────────────────────────────────
# Phase B3-final corr-A/D — check that the test helper propagates `views`
# ──────────────────────────────────────────────────────────────────────


class TestMigrationHelperViewsPropagation:
    """Guarantee that ``tests/_migration_helpers.run_via_orchestrator``
    forwards the ``views`` parameter to ``prepare_preset_args``.

    The Phase B3-final audit identified a test↔prod divergence: the test
    helper did not forward ``views``, so no B4 test covered multi-view runs
    through the helper. Corr-D: test helper aligned, explicit propagation
    test.
    """

    def test_helper_propagates_views_to_run_result(
        self, tmp_path: Path,
    ) -> None:
        from tests._migration_helpers import run_via_orchestrator

        corpus = _make_corpus(tmp_path, n=1)
        engine = _MockOCR()

        # Without the ``views`` parameter → default is text_final only.
        bm_default = run_via_orchestrator(corpus, [engine])
        assert "text_final" in bm_default.view_results
        assert "alto_documentary" not in bm_default.view_results
        # The view requested below must be absent by default, otherwise its
        # presence in the second run would not demonstrate propagation.
        assert "searchability" not in bm_default.view_results

        # With ``views=...`` → the extra view actually reaches the result.
        bm_multi = run_via_orchestrator(
            corpus, [engine], views=("text_final", "searchability"),
        )
        assert "text_final" in bm_multi.view_results
        assert "searchability" in bm_multi.view_results


# ──────────────────────────────────────────────────────────────────────
# Multiple engines
# ──────────────────────────────────────────────────────────────────────


class TestMultipleEngines:
    """Several adapters passed at once."""

    def test_two_engines_produce_two_pipeline_specs(
        self, tmp_path: Path,
    ) -> None:
        corpus = _make_corpus(tmp_path)
        workspace = tmp_path / "ws"
        workspace.mkdir()
        a = _MockOCR(name="a")
        b = _MockOCR(name="b")
        args = prepare_preset_args(corpus, [a, b], workspace_dir=workspace)
        assert len(args.pipeline_specs) == 2
        # The resolver is able to answer for both names.
        assert args.adapter_resolver("a") is a
        assert args.adapter_resolver("b") is b


# ──────────────────────────────────────────────────────────────────────
# Heterogeneous conversions (char_exclude frozenset, normalization object)
# ──────────────────────────────────────────────────────────────────────


class TestConversions:
    """Argument forms that the helper must normalise for the run spec."""

    def test_char_exclude_frozenset_converted_to_string(
        self, tmp_path: Path,
    ) -> None:
        corpus = _make_corpus(tmp_path)
        workspace = tmp_path / "ws"
        workspace.mkdir()
        args = prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=workspace,
            char_exclude=frozenset({"!", ".", ","}),
        )
        # The RunSpec expects a string; the helper converts.
        assert args.spec.char_exclude is not None
        # Pin the type explicitly: the set comparison below would also hold
        # for an unconverted frozenset, so on its own it proves nothing about
        # the conversion this test is named after.
        assert isinstance(args.spec.char_exclude, str)
        assert set(args.spec.char_exclude) == {"!", ".", ","}

    def test_normalization_profile_object_converted_to_name(
        self, tmp_path: Path,
    ) -> None:
        from picarones.formats.text.normalization import get_builtin_profile

        corpus = _make_corpus(tmp_path)
        workspace = tmp_path / "ws"
        workspace.mkdir()
        profile = get_builtin_profile("caseless")
        args = prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=workspace,
            normalization_profile=profile,
        )
        assert args.spec.normalization_profile == "caseless"

    def test_normalization_profile_string_passthrough(
        self, tmp_path: Path,
    ) -> None:
        corpus = _make_corpus(tmp_path)
        workspace = tmp_path / "ws"
        workspace.mkdir()
        args = prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=workspace,
            normalization_profile="medieval_french",
        )
        assert args.spec.normalization_profile == "medieval_french"


# ──────────────────────────────────────────────────────────────────────
# Integration with execute_preset (end-to-end case)
# ──────────────────────────────────────────────────────────────────────


class TestEndToEnd:
    """Feed the prepared arguments into a real orchestrator run."""

    def test_args_can_be_consumed_by_execute_preset(
        self, tmp_path: Path,
    ) -> None:
        """Full pattern: prepare → execute_preset → converter."""
        from picarones.app.services import (
            RunOrchestrator,
            run_result_to_benchmark_result,
        )

        corpus = _make_corpus(tmp_path, n=2)
        engine = _MockOCR()
        # NOTE(review): unlike the other tests, the workspace directory is
        # not created before the call — presumably it does not need to
        # exist beforehand; confirm.
        workspace = tmp_path / "gt"
        out_dir = tmp_path / "run"
        args = prepare_preset_args(
            corpus, [engine], workspace_dir=workspace, output_dir=out_dir,
        )
        orch_result = RunOrchestrator(out_dir).execute_preset(
            spec=args.spec,
            corpus_spec=args.corpus_spec,
            extracted_dir=args.extracted_dir,
            pipeline_specs=args.pipeline_specs,
            adapter_resolver=args.adapter_resolver,
            adapter_kwargs=args.adapter_kwargs,
        )
        assert orch_result.run_result.n_documents == 2

        # Optional third step: convert to the legacy BenchmarkResult.
        bm = run_result_to_benchmark_result(
            orch_result.run_result,
            corpus=corpus,
            engines=[engine],
            char_exclude=None,
            normalization_profile=None,
            profile="standard",
        )
        assert bm.document_count == 2
        assert len(bm.engine_reports) == 1
        assert bm.engine_reports[0].engine_name == "mock"


# ──────────────────────────────────────────────────────────────────────
# Default output_dir
# ──────────────────────────────────────────────────────────────────────


class TestDefaultOutputDir:
    """Behaviour when ``output_dir`` is omitted."""

    def test_defaults_to_workspace_parent_run(self, tmp_path: Path) -> None:
        """Without ``output_dir``, the helper uses ``workspace_dir.parent / "run"``."""
        corpus = _make_corpus(tmp_path)
        workspace = tmp_path / "gt"
        workspace.mkdir()
        args = prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=workspace,
        )
        assert args.spec.output_dir == str(tmp_path / "run")