Spaces:
Running
Running
Claude
feat(web,tests): Corr-A/B/C β exposer views + expose_alto + B2 features Web/test helper
50b07b8 unverified | """Tests du helper ``prepare_preset_args`` (Phase B3-final). | |
| VΓ©rifie la conversion ``(Corpus legacy + instances d'adapters)`` β | |
| ``PresetArgs`` prΓͺte Γ passer Γ ``RunOrchestrator.execute_preset()``. | |
| """ | |
| from __future__ import annotations | |
| from pathlib import Path | |
| from picarones.adapters.ocr.base import BaseOCRAdapter | |
| from picarones.app.services import PresetArgs, prepare_preset_args | |
| from picarones.domain.artifacts import Artifact, ArtifactType | |
| from picarones.evaluation.corpus import Corpus, Document | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Mock minimal | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class _MockOCR(BaseOCRAdapter): | |
| def __init__(self, name: str = "mock") -> None: | |
| self._name = name | |
| def name(self) -> str: | |
| return self._name | |
| def execute(self, inputs, params, context): | |
| out_dir = Path(context.workspace_uri) | |
| out_dir.mkdir(parents=True, exist_ok=True) | |
| out_path = out_dir / f"{context.document_id}.txt" | |
| out_path.write_text("hello", encoding="utf-8") | |
| return {ArtifactType.RAW_TEXT: Artifact( | |
| id=f"{context.document_id}:{self._name}:raw_text", | |
| document_id=context.document_id, | |
| type=ArtifactType.RAW_TEXT, | |
| produced_by_step="ocr", | |
| uri=str(out_path), | |
| )} | |
| def _make_corpus(tmp_path: Path, n: int = 1) -> Corpus: | |
| docs = [] | |
| for i in range(n): | |
| img = tmp_path / f"doc{i}.png" | |
| img.write_bytes(b"x") | |
| docs.append(Document( | |
| image_path=img, | |
| ground_truth="hello", | |
| doc_id=f"doc{i}", | |
| )) | |
| return Corpus(name="helper_test", documents=docs) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Cas nominal β un engine seul | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestNominal: | |
| def test_returns_preset_args_with_all_fields_populated( | |
| self, tmp_path: Path, | |
| ) -> None: | |
| corpus = _make_corpus(tmp_path) | |
| engine = _MockOCR() | |
| workspace = tmp_path / "ws" | |
| workspace.mkdir() | |
| args = prepare_preset_args( | |
| corpus, [engine], workspace_dir=workspace, | |
| ) | |
| assert isinstance(args, PresetArgs) | |
| assert args.corpus_spec.name == "helper_test" | |
| assert len(args.pipeline_specs) == 1 | |
| # Resolver retourne l'adapter quand on demande son name canonique. | |
| assert args.adapter_resolver(engine.name) is engine | |
| assert args.extracted_dir == workspace | |
| assert args.adapter_kwargs == {} | |
| def test_default_views_is_text_final(self, tmp_path: Path) -> None: | |
| corpus = _make_corpus(tmp_path) | |
| workspace = tmp_path / "ws" | |
| workspace.mkdir() | |
| args = prepare_preset_args( | |
| corpus, [_MockOCR()], workspace_dir=workspace, | |
| ) | |
| assert args.spec.views == ("text_final",) | |
| def test_custom_views_propagated(self, tmp_path: Path) -> None: | |
| corpus = _make_corpus(tmp_path) | |
| workspace = tmp_path / "ws" | |
| workspace.mkdir() | |
| args = prepare_preset_args( | |
| corpus, [_MockOCR()], | |
| workspace_dir=workspace, | |
| views=("text_final", "alto_documentary", "searchability"), | |
| ) | |
| assert args.spec.views == ( | |
| "text_final", "alto_documentary", "searchability", | |
| ) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Phase B3-final corr-A/D β vΓ©rifier que le helper test propage `views` | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestMigrationHelperViewsPropagation: | |
| """Garantie que ``tests/_migration_helpers.run_via_orchestrator`` | |
| propage le param ``views`` Γ ``prepare_preset_args``. | |
| Audit Phase B3-final a identifiΓ© une divergence testβprod : le | |
| helper de test ne transmettait pas ``views``, donc aucun test B4 | |
| ne couvrait le multi-vues via le helper. Corr-D : helper test | |
| alignΓ©, test de propagation explicite. | |
| """ | |
| def test_helper_propagates_views_to_run_result( | |
| self, tmp_path: Path, | |
| ) -> None: | |
| from tests._migration_helpers import run_via_orchestrator | |
| corpus = _make_corpus(tmp_path, n=1) | |
| engine = _MockOCR() | |
| # Sans param ``views`` β dΓ©faut text_final seulement. | |
| bm_default = run_via_orchestrator(corpus, [engine]) | |
| assert "text_final" in bm_default.view_results | |
| assert "alto_documentary" not in bm_default.view_results | |
| # Avec ``views=...`` β propagation effective. | |
| bm_multi = run_via_orchestrator( | |
| corpus, [engine], | |
| views=("text_final", "searchability"), | |
| ) | |
| assert "text_final" in bm_multi.view_results | |
| assert "searchability" in bm_multi.view_results | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Multi-engines | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestMultipleEngines: | |
| def test_two_engines_produce_two_pipeline_specs( | |
| self, tmp_path: Path, | |
| ) -> None: | |
| corpus = _make_corpus(tmp_path) | |
| workspace = tmp_path / "ws" | |
| workspace.mkdir() | |
| a = _MockOCR(name="a") | |
| b = _MockOCR(name="b") | |
| args = prepare_preset_args(corpus, [a, b], workspace_dir=workspace) | |
| assert len(args.pipeline_specs) == 2 | |
| # Resolver est capable de rΓ©pondre aux 2 noms. | |
| assert args.adapter_resolver("a") is a | |
| assert args.adapter_resolver("b") is b | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Conversions hétérogènes (char_exclude frozenset, normalization objet) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestConversions: | |
| def test_char_exclude_frozenset_converted_to_string( | |
| self, tmp_path: Path, | |
| ) -> None: | |
| corpus = _make_corpus(tmp_path) | |
| workspace = tmp_path / "ws" | |
| workspace.mkdir() | |
| args = prepare_preset_args( | |
| corpus, [_MockOCR()], | |
| workspace_dir=workspace, | |
| char_exclude=frozenset({"!", ".", ","}), | |
| ) | |
| # Le RunSpec attend une string ; le helper convertit. | |
| assert args.spec.char_exclude is not None | |
| assert set(args.spec.char_exclude) == {"!", ".", ","} | |
| def test_normalization_profile_object_converted_to_name( | |
| self, tmp_path: Path, | |
| ) -> None: | |
| from picarones.formats.text.normalization import get_builtin_profile | |
| corpus = _make_corpus(tmp_path) | |
| workspace = tmp_path / "ws" | |
| workspace.mkdir() | |
| profile = get_builtin_profile("caseless") | |
| args = prepare_preset_args( | |
| corpus, [_MockOCR()], | |
| workspace_dir=workspace, | |
| normalization_profile=profile, | |
| ) | |
| assert args.spec.normalization_profile == "caseless" | |
| def test_normalization_profile_string_passthrough( | |
| self, tmp_path: Path, | |
| ) -> None: | |
| corpus = _make_corpus(tmp_path) | |
| workspace = tmp_path / "ws" | |
| workspace.mkdir() | |
| args = prepare_preset_args( | |
| corpus, [_MockOCR()], | |
| workspace_dir=workspace, | |
| normalization_profile="medieval_french", | |
| ) | |
| assert args.spec.normalization_profile == "medieval_french" | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # IntΓ©gration avec execute_preset (cas bout-en-bout) | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestEndToEnd: | |
| def test_args_can_be_consumed_by_execute_preset( | |
| self, tmp_path: Path, | |
| ) -> None: | |
| """Pattern complet : prepare β execute_preset β converter.""" | |
| from picarones.app.services import ( | |
| RunOrchestrator, | |
| run_result_to_benchmark_result, | |
| ) | |
| corpus = _make_corpus(tmp_path, n=2) | |
| engine = _MockOCR() | |
| workspace = tmp_path / "gt" | |
| out_dir = tmp_path / "run" | |
| args = prepare_preset_args( | |
| corpus, [engine], | |
| workspace_dir=workspace, output_dir=out_dir, | |
| ) | |
| orch_result = RunOrchestrator(out_dir).execute_preset( | |
| spec=args.spec, | |
| corpus_spec=args.corpus_spec, | |
| extracted_dir=args.extracted_dir, | |
| pipeline_specs=args.pipeline_specs, | |
| adapter_resolver=args.adapter_resolver, | |
| adapter_kwargs=args.adapter_kwargs, | |
| ) | |
| assert orch_result.run_result.n_documents == 2 | |
| # 3e Γ©tape optionnelle : convertir en BenchmarkResult legacy. | |
| bm = run_result_to_benchmark_result( | |
| orch_result.run_result, | |
| corpus=corpus, engines=[engine], | |
| char_exclude=None, normalization_profile=None, | |
| profile="standard", | |
| ) | |
| assert bm.document_count == 2 | |
| assert len(bm.engine_reports) == 1 | |
| assert bm.engine_reports[0].engine_name == "mock" | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Default output_dir | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| class TestDefaultOutputDir: | |
| def test_defaults_to_workspace_parent_run(self, tmp_path: Path) -> None: | |
| """Sans ``output_dir``, le helper utilise | |
| ``workspace_dir.parent / "run"``.""" | |
| corpus = _make_corpus(tmp_path) | |
| workspace = tmp_path / "gt" | |
| workspace.mkdir() | |
| args = prepare_preset_args( | |
| corpus, [_MockOCR()], workspace_dir=workspace, | |
| ) | |
| assert args.spec.output_dir == str(tmp_path / "run") | |