Picarones / tests /app /services /test_python_helpers.py
Claude
feat(web,tests): Corr-A/B/C — exposer views + expose_alto + B2 features Web/test helper
50b07b8 unverified
"""Tests du helper ``prepare_preset_args`` (Phase B3-final).
Vérifie la conversion ``(Corpus legacy + instances d'adapters)`` →
``PresetArgs`` prête à passer à ``RunOrchestrator.execute_preset()``.
"""
from __future__ import annotations
from pathlib import Path
from picarones.adapters.ocr.base import BaseOCRAdapter
from picarones.app.services import PresetArgs, prepare_preset_args
from picarones.domain.artifacts import Artifact, ArtifactType
from picarones.evaluation.corpus import Corpus, Document
# ──────────────────────────────────────────────────────────────────────
# Minimal mock
# ──────────────────────────────────────────────────────────────────────
class _MockOCR(BaseOCRAdapter):
    """Minimal OCR adapter stub that always transcribes ``"hello"``.

    The constructor only records the adapter name; it does not call the
    base-class initializer.
    """

    def __init__(self, name: str = "mock") -> None:
        self._name = name

    @property
    def name(self) -> str:
        """Return the name supplied at construction time."""
        return self._name

    def execute(self, inputs, params, context):
        """Write a fixed transcription and describe it as an artifact.

        ``inputs`` and ``params`` are ignored. The text ``"hello"`` is
        written to ``<context.workspace_uri>/<context.document_id>.txt``
        (the directory is created when missing).

        Returns:
            A one-entry dict mapping ``ArtifactType.RAW_TEXT`` to the
            ``Artifact`` whose ``uri`` points at the written file.
        """
        target_dir = Path(context.workspace_uri)
        target_dir.mkdir(parents=True, exist_ok=True)

        text_file = target_dir / f"{context.document_id}.txt"
        text_file.write_text("hello", encoding="utf-8")

        raw_text = Artifact(
            id=f"{context.document_id}:{self._name}:raw_text",
            document_id=context.document_id,
            type=ArtifactType.RAW_TEXT,
            produced_by_step="ocr",
            uri=str(text_file),
        )
        return {ArtifactType.RAW_TEXT: raw_text}
def _make_corpus(tmp_path: Path, n: int = 1) -> Corpus:
    """Build a ``Corpus`` named ``"helper_test"`` holding ``n`` documents.

    Document ``i`` has id ``doc<i>``, ground truth ``"hello"`` and an
    image path ``tmp_path / "doc<i>.png"``; a one-byte placeholder file
    is written at that path.
    """

    def _one_document(index: int) -> Document:
        # Write the placeholder image first, then describe it.
        identifier = f"doc{index}"
        image = tmp_path / f"{identifier}.png"
        image.write_bytes(b"x")
        return Document(
            image_path=image,
            ground_truth="hello",
            doc_id=identifier,
        )

    return Corpus(
        name="helper_test",
        documents=[_one_document(index) for index in range(n)],
    )
# ──────────────────────────────────────────────────────────────────────
# Nominal case — a single engine
# ──────────────────────────────────────────────────────────────────────
class TestNominal:
    """Nominal single-engine behaviour of ``prepare_preset_args``."""

    @staticmethod
    def _new_workspace(tmp_path: Path) -> Path:
        """Create the ``ws`` directory under ``tmp_path`` and return it."""
        ws_dir = tmp_path / "ws"
        ws_dir.mkdir()
        return ws_dir

    def test_returns_preset_args_with_all_fields_populated(
        self, tmp_path: Path,
    ) -> None:
        """Every field of the returned ``PresetArgs`` is populated."""
        corpus = _make_corpus(tmp_path)
        ocr = _MockOCR()
        ws_dir = self._new_workspace(tmp_path)

        prepared = prepare_preset_args(corpus, [ocr], workspace_dir=ws_dir)

        assert isinstance(prepared, PresetArgs)
        assert prepared.corpus_spec.name == "helper_test"
        assert len(prepared.pipeline_specs) == 1
        # The resolver hands back the adapter when asked for its
        # canonical name.
        assert prepared.adapter_resolver(ocr.name) is ocr
        assert prepared.extracted_dir == ws_dir
        assert prepared.adapter_kwargs == {}

    def test_default_views_is_text_final(self, tmp_path: Path) -> None:
        """Without ``views`` the spec carries only ``text_final``."""
        corpus = _make_corpus(tmp_path)
        ws_dir = self._new_workspace(tmp_path)

        prepared = prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=ws_dir,
        )

        assert prepared.spec.views == ("text_final",)

    def test_custom_views_propagated(self, tmp_path: Path) -> None:
        """An explicit ``views`` tuple reaches the spec unchanged."""
        corpus = _make_corpus(tmp_path)
        ws_dir = self._new_workspace(tmp_path)

        prepared = prepare_preset_args(
            corpus,
            [_MockOCR()],
            workspace_dir=ws_dir,
            views=("text_final", "alto_documentary", "searchability"),
        )

        assert prepared.spec.views == (
            "text_final", "alto_documentary", "searchability",
        )
# ──────────────────────────────────────────────────────────────────────
# Phase B3-final corr-A/D — check that the test helper propagates `views`
# ──────────────────────────────────────────────────────────────────────
class TestMigrationHelperViewsPropagation:
    """Ensure ``tests/_migration_helpers.run_via_orchestrator`` forwards
    the ``views`` parameter to ``prepare_preset_args``.

    The Phase B3-final audit found a test↔prod divergence: the test
    helper did not pass ``views`` through, so no B4 test covered the
    multi-view path via the helper. Corr-D: the test helper was aligned
    and this explicit propagation test was added.
    """

    def test_helper_propagates_views_to_run_result(
        self, tmp_path: Path,
    ) -> None:
        """``views`` passed to the helper shows up in ``view_results``."""
        from tests._migration_helpers import run_via_orchestrator

        corpus = _make_corpus(tmp_path, n=1)
        engine = _MockOCR()

        # Without ``views`` -> default: text_final only.
        bm_default = run_via_orchestrator(corpus, [engine])
        assert "text_final" in bm_default.view_results
        assert "alto_documentary" not in bm_default.view_results
        # Negative control for the view requested below: if it were
        # present by default, the positive assertion would prove nothing
        # about propagation.
        assert "searchability" not in bm_default.view_results

        # With ``views=...`` -> the requested views are actually used.
        bm_multi = run_via_orchestrator(
            corpus, [engine],
            views=("text_final", "searchability"),
        )
        assert "text_final" in bm_multi.view_results
        assert "searchability" in bm_multi.view_results
# ──────────────────────────────────────────────────────────────────────
# Multiple engines
# ──────────────────────────────────────────────────────────────────────
class TestMultipleEngines:
    """``prepare_preset_args`` called with more than one engine."""

    def test_two_engines_produce_two_pipeline_specs(
        self, tmp_path: Path,
    ) -> None:
        """Two engines give two pipeline specs, each resolvable by name."""
        corpus = _make_corpus(tmp_path)
        ws_dir = tmp_path / "ws"
        ws_dir.mkdir()
        engines_by_name = {
            "a": _MockOCR(name="a"),
            "b": _MockOCR(name="b"),
        }

        prepared = prepare_preset_args(
            corpus, list(engines_by_name.values()), workspace_dir=ws_dir,
        )

        assert len(prepared.pipeline_specs) == 2
        # The resolver can answer for both names.
        for engine_name, engine in engines_by_name.items():
            assert prepared.adapter_resolver(engine_name) is engine
# ──────────────────────────────────────────────────────────────────────
# Heterogeneous conversions (char_exclude frozenset, normalization object)
# ──────────────────────────────────────────────────────────────────────
class TestConversions:
    """Argument conversions done by ``prepare_preset_args``."""

    @staticmethod
    def _prepare(tmp_path: Path, **options):
        """Call ``prepare_preset_args`` on a one-document corpus.

        Creates the ``ws`` workspace under ``tmp_path``, uses a single
        ``_MockOCR`` engine and forwards ``options`` as keyword arguments.
        """
        corpus = _make_corpus(tmp_path)
        ws_dir = tmp_path / "ws"
        ws_dir.mkdir()
        return prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=ws_dir, **options,
        )

    def test_char_exclude_frozenset_converted_to_string(
        self, tmp_path: Path,
    ) -> None:
        """A ``frozenset`` of characters ends up on the spec, non-None."""
        prepared = self._prepare(
            tmp_path, char_exclude=frozenset({"!", ".", ","}),
        )

        # RunSpec expects a string; the helper does the conversion.
        excluded = prepared.spec.char_exclude
        assert excluded is not None
        assert set(excluded) == {"!", ".", ","}

    def test_normalization_profile_object_converted_to_name(
        self, tmp_path: Path,
    ) -> None:
        """A profile object is reduced to its name on the spec."""
        from picarones.formats.text.normalization import get_builtin_profile

        prepared = self._prepare(
            tmp_path,
            normalization_profile=get_builtin_profile("caseless"),
        )

        assert prepared.spec.normalization_profile == "caseless"

    def test_normalization_profile_string_passthrough(
        self, tmp_path: Path,
    ) -> None:
        """A profile given as a string is kept as-is on the spec."""
        prepared = self._prepare(
            tmp_path, normalization_profile="medieval_french",
        )

        assert prepared.spec.normalization_profile == "medieval_french"
# ──────────────────────────────────────────────────────────────────────
# Integration with execute_preset (end-to-end case)
# ──────────────────────────────────────────────────────────────────────
class TestEndToEnd:
    """End-to-end use of the prepared arguments."""

    def test_args_can_be_consumed_by_execute_preset(
        self, tmp_path: Path,
    ) -> None:
        """Full pattern: prepare → execute_preset → converter."""
        from picarones.app.services import (
            RunOrchestrator,
            run_result_to_benchmark_result,
        )

        corpus = _make_corpus(tmp_path, n=2)
        ocr = _MockOCR()
        gt_dir = tmp_path / "gt"
        run_dir = tmp_path / "run"

        # Step 1: build the arguments.
        prepared = prepare_preset_args(
            corpus, [ocr],
            workspace_dir=gt_dir, output_dir=run_dir,
        )

        # Step 2: run the preset with exactly those arguments.
        orchestrator = RunOrchestrator(run_dir)
        outcome = orchestrator.execute_preset(
            spec=prepared.spec,
            corpus_spec=prepared.corpus_spec,
            extracted_dir=prepared.extracted_dir,
            pipeline_specs=prepared.pipeline_specs,
            adapter_resolver=prepared.adapter_resolver,
            adapter_kwargs=prepared.adapter_kwargs,
        )
        assert outcome.run_result.n_documents == 2

        # Optional step 3: convert to the legacy BenchmarkResult.
        benchmark = run_result_to_benchmark_result(
            outcome.run_result,
            corpus=corpus, engines=[ocr],
            char_exclude=None, normalization_profile=None,
            profile="standard",
        )
        assert benchmark.document_count == 2
        reports = benchmark.engine_reports
        assert len(reports) == 1
        assert reports[0].engine_name == "mock"
# ──────────────────────────────────────────────────────────────────────
# Default output_dir
# ──────────────────────────────────────────────────────────────────────
class TestDefaultOutputDir:
    """Default value of ``spec.output_dir``."""

    def test_defaults_to_workspace_parent_run(self, tmp_path: Path) -> None:
        """Without ``output_dir``, the helper uses
        ``workspace_dir.parent / "run"``."""
        corpus = _make_corpus(tmp_path)
        gt_dir = tmp_path / "gt"
        gt_dir.mkdir()

        prepared = prepare_preset_args(
            corpus, [_MockOCR()], workspace_dir=gt_dir,
        )

        expected_output = str(tmp_path / "run")
        assert prepared.spec.output_dir == expected_output