# Picarones / tests / formats / test_normalization_yaml.py
# Claude
# feat(audit): Phase 3.3 — exposer NormalizationProfile.from_yaml (CLI + API) (S3)
# 9df6424 unverified
"""Phase 3.3 audit code-quality — ``NormalizationProfile.from_yaml``
exposé en CLI et via ``POST /api/normalization/profiles/preview``.
Avant la Phase 3.3 :

- ``NormalizationProfile.from_yaml`` était écrit (formats/text/normalization.py:191)
  mais aucun caller ne l'utilisait : 0 hit dans ``grep -rn from_yaml``,
  0 test associé.
- L'API web exposait seulement les 11 profils builtin via
  ``GET /api/normalization/profiles``.
- La CLI ``picarones run`` n'avait aucune option de normalisation.

Phase 3.3 :

- Option CLI ``--normalization-profile <ID-OR-PATH>`` (identifiant
  builtin ou chemin .yaml versionné).
- Helper ``picarones.interfaces.cli._normalization_arg.resolve_normalization_profile``
  unifiant les deux chemins.
- Endpoint ``POST /api/normalization/profiles/preview`` (validation
  + sérialisation, pas de persistance).
"""
from __future__ import annotations
import textwrap
from pathlib import Path
import pytest
_VALID_YAML = textwrap.dedent(
"""\
name: medieval_custom
description: Français médiéval BnF (test)
caseless: false
nfc: true
exclude_chars: ".,;:!?"
diplomatic:
ſ: s
u: v
v: u
"""
)
# --------------------------------------------------------------------------
# 1. NormalizationProfile.from_yaml — round-trip
# --------------------------------------------------------------------------
class TestFromYAMLDirect:
    """Direct calls to ``NormalizationProfile.from_yaml`` on files written to disk."""

    def test_loads_valid_yaml(self, tmp_path: Path) -> None:
        """Fields of the shared reference YAML are exposed on the loaded profile."""
        from picarones.formats.text.normalization import NormalizationProfile

        yaml_file = tmp_path / "medieval.yaml"
        yaml_file.write_text(_VALID_YAML, encoding="utf-8")

        loaded = NormalizationProfile.from_yaml(yaml_file)

        # Scalar fields.
        assert loaded.name == "medieval_custom"
        assert loaded.description == "Français médiéval BnF (test)"
        assert loaded.caseless is False
        assert loaded.nfc is True
        # Substitution table, then a sample of the excluded characters.
        assert loaded.diplomatic_table["ſ"] == "s"
        for punctuation in (".", "!"):
            assert punctuation in loaded.exclude_chars

    def test_missing_name_uses_filename_stem(self, tmp_path: Path) -> None:
        """Without a ``name`` key, the profile is named after the file stem."""
        from picarones.formats.text.normalization import NormalizationProfile

        yaml_file = tmp_path / "my_corpus.yml"
        yaml_file.write_text("caseless: true\n", encoding="utf-8")

        loaded = NormalizationProfile.from_yaml(yaml_file)

        assert loaded.name == "my_corpus"
        assert loaded.caseless is True
# --------------------------------------------------------------------------
# 2. CLI helper resolve_normalization_profile
# --------------------------------------------------------------------------
class TestResolveCLIArg:
    """Behaviour of the CLI helper ``resolve_normalization_profile``."""

    def test_none_returns_none(self) -> None:
        """``None`` and the empty string both resolve to ``None``."""
        from picarones.interfaces.cli._normalization_arg import (
            resolve_normalization_profile as resolve,
        )

        for absent in (None, ""):
            assert resolve(absent) is None

    def test_builtin_id_resolves(self) -> None:
        """The ``"nfc"`` identifier yields the very object held in the builtin registry."""
        from picarones.evaluation.metrics.normalization import (
            NORMALIZATION_PROFILES as builtin_profiles,
        )
        from picarones.interfaces.cli._normalization_arg import (
            resolve_normalization_profile as resolve,
        )

        resolved = resolve("nfc")
        # Identity, not mere equality: the registry entry itself is returned.
        assert resolved is builtin_profiles["nfc"]

    def test_yaml_path_resolves(self, tmp_path: Path) -> None:
        """A path to an existing YAML file is loaded as a profile."""
        from picarones.interfaces.cli._normalization_arg import (
            resolve_normalization_profile as resolve,
        )

        yaml_file = tmp_path / "custom.yaml"
        yaml_file.write_text(_VALID_YAML, encoding="utf-8")

        resolved = resolve(str(yaml_file))

        assert resolved is not None
        assert resolved.name == "medieval_custom"

    def test_yaml_path_missing_raises(self, tmp_path: Path) -> None:
        """A ``.yaml`` path that does not exist raises ``FileNotFoundError``."""
        from picarones.interfaces.cli._normalization_arg import (
            resolve_normalization_profile as resolve,
        )

        missing = tmp_path / "absent.yaml"
        with pytest.raises(FileNotFoundError, match="introuvable"):
            resolve(str(missing))

    def test_unknown_id_raises_with_help(self) -> None:
        """An unknown identifier raises ``ValueError`` with a helpful message."""
        from picarones.interfaces.cli._normalization_arg import (
            resolve_normalization_profile as resolve,
        )

        with pytest.raises(ValueError, match="inconnu") as caught:
            resolve("not_a_real_profile")

        # The message must list the available identifiers so users can fix
        # their input without having to read the documentation.
        message = str(caught.value)
        assert "nfc" in message
# --------------------------------------------------------------------------
# 3. Option CLI --normalization-profile branchée à ``picarones run``
# --------------------------------------------------------------------------
class TestCLIIntegration:
    """Checks that ``--normalization-profile`` is wired into the ``run`` command."""

    def test_run_cmd_accepts_normalization_profile_option(self) -> None:
        """The ``run`` command (Click) must declare ``--normalization-profile``."""
        from click.testing import CliRunner
        from picarones.interfaces.cli._workflows import run_cmd

        runner = CliRunner()
        result = runner.invoke(run_cmd, ["--help"])

        assert result.exit_code == 0, result.output
        assert "--normalization-profile" in result.output
        assert "ID-OR-PATH" in result.output

    def test_run_cmd_rejects_invalid_profile_with_clean_error(
        self, tmp_path: Path,
    ) -> None:
        """An unknown identifier must fail cleanly, not crash.

        Expected: a non-zero exit code and a readable error message rather
        than a Python stack trace. The corpus directory passed here is
        empty; the test is meant to show that the profile is resolved
        (and rejected) before the corpus is loaded.
        """
        from click.testing import CliRunner
        from picarones.interfaces.cli._workflows import run_cmd

        runner = CliRunner()
        result = runner.invoke(
            run_cmd,
            [
                "--corpus", str(tmp_path),  # empty corpus; its content is irrelevant
                "--engines", "tesseract",
                "--output", str(tmp_path / "out.json"),
                "--normalization-profile", "not_a_real_profile",
            ],
        )

        assert result.exit_code != 0, result.output
        # CliRunner swallows uncaught exceptions and reports exit code 1, so a
        # non-zero exit code alone cannot tell a crash from a deliberate
        # rejection. A deliberate exit (Click error or sys.exit) surfaces as
        # SystemExit; anything else is an unhandled exception.
        assert isinstance(result.exception, SystemExit), repr(result.exception)
        assert "Traceback" not in result.output

        output = result.output.lower()
        assert "profil normalisation" in output or "normalization" in output
# --------------------------------------------------------------------------
# 4. Endpoint POST /api/normalization/profiles/preview
# --------------------------------------------------------------------------
@pytest.fixture
def web_client():
    """Provide a ``TestClient`` for a bare FastAPI app mounting only the normalization router."""
    from fastapi.testclient import TestClient
    from picarones.interfaces.web.routers.normalization import router
    from fastapi import FastAPI

    minimal_app = FastAPI()
    minimal_app.include_router(router)

    client = TestClient(minimal_app)
    return client
class TestPreviewEndpoint:
    """Tests for ``POST /api/normalization/profiles/preview``."""

    _PREVIEW_URL = "/api/normalization/profiles/preview"

    def _post_preview(self, client, yaml_text: str):
        """POST ``yaml_text`` to the preview endpoint and return the response."""
        return client.post(self._PREVIEW_URL, json={"yaml": yaml_text})

    def test_valid_yaml_returns_serialized_profile(self, web_client) -> None:
        """A valid document comes back as a serialized profile with HTTP 200."""
        response = self._post_preview(web_client, _VALID_YAML)
        assert response.status_code == 200, response.text

        body = response.json()
        assert body["name"] == "medieval_custom"
        assert body["caseless"] is False
        assert body["nfc"] is True
        assert body["diplomatic_rules"] == 3
        assert body["diplomatic_table"]["ſ"] == "s"
        assert "." in body["exclude_chars"]

    def test_invalid_yaml_returns_400(self, web_client) -> None:
        """Syntactically invalid YAML is answered with HTTP 400."""
        broken_document = "name: [unclosed_list"
        response = self._post_preview(web_client, broken_document)

        assert response.status_code == 400
        detail = response.json()["detail"]
        assert "invalide" in detail.lower()

    def test_yaml_too_large_rejected_by_pydantic(self, web_client) -> None:
        """A document above the 64 KiB limit must be refused.

        The Pydantic ``max_length`` constraint is expected to reject the
        request during validation, before any YAML parsing takes place.
        """
        oversized = "x: y\n" * 20000  # 100 000 characters, well above 64 KiB
        response = self._post_preview(web_client, oversized)

        # Pydantic answers 422 on ``max_length``; 400 is returned instead if
        # the endpoint's own size check fires first. Either is acceptable.
        assert response.status_code in (400, 422)

    def test_preview_does_not_register_profile(self, web_client) -> None:
        """A previewed profile must NOT show up in ``GET /api/normalization/profiles``.

        The endpoint is a preview only; it does not persist anything.
        """
        # Use a profile name that cannot collide with any builtin.
        unique_profile = "name: zzz_unique_test_profile_xyz\ncaseless: true\n"
        preview_response = self._post_preview(web_client, unique_profile)
        assert preview_response.status_code == 200

        # The listing must not have picked up the previewed profile.
        listing = web_client.get("/api/normalization/profiles")
        assert listing.status_code == 200
        listed_ids = {entry["id"] for entry in listing.json()["profiles"]}
        assert "zzz_unique_test_profile_xyz" not in listed_ids