Picarones / tests /test_metrics.py
Claude
fix(pipeline/metrics): 3 bugs pipelines OCR+LLM
39b4865 unverified
"""Tests unitaires pour le module picarones.core.metrics."""
import pytest
from picarones.core.metrics import aggregate_metrics, compute_metrics, MetricsResult
class TestComputeMetrics:
"""Tests de compute_metrics sur des cas connus."""
def test_perfect_match(self):
"""CER et WER doivent être 0 quand référence == hypothèse."""
result = compute_metrics("Bonjour le monde", "Bonjour le monde")
assert result.cer == pytest.approx(0.0)
assert result.wer == pytest.approx(0.0)
assert result.error is None
def test_complete_mismatch(self):
"""CER proche de 1 quand les textes sont totalement différents."""
result = compute_metrics("abc", "xyz")
assert result.cer > 0.0
assert result.error is None
def test_empty_reference(self):
"""Référence vide : CER = 1.0 si hypothèse non vide."""
result = compute_metrics("", "quelque chose")
assert result.cer == pytest.approx(1.0)
def test_empty_both(self):
"""Référence et hypothèse vides : CER = 0.0."""
result = compute_metrics("", "")
assert result.cer == pytest.approx(0.0)
def test_single_substitution(self):
"""Une seule substitution sur 4 chars → CER = 0.25."""
result = compute_metrics("abcd", "abce")
assert result.cer == pytest.approx(0.25)
def test_case_insensitive_cer(self):
"""CER caseless ignore les différences de casse."""
result = compute_metrics("Bonjour", "bonjour")
assert result.cer_caseless == pytest.approx(0.0)
# CER brut doit être > 0 (B ≠ b)
assert result.cer > 0.0
def test_nfc_normalization(self):
"""CER NFC normalise les séquences unicode équivalentes."""
# é peut être encodé en forme composée (U+00E9) ou décomposée (e + U+0301)
composed = "\u00e9" # é (NFC)
decomposed = "e\u0301" # e + combining accent (NFD)
result = compute_metrics(composed, decomposed)
# Après NFC, les deux sont identiques → cer_nfc = 0
assert result.cer_nfc == pytest.approx(0.0)
def test_wer_one_word_wrong(self):
"""WER = 1/3 pour 1 mot faux sur 3."""
result = compute_metrics("le chat dort", "le chien dort")
assert result.wer == pytest.approx(1 / 3, rel=1e-2)
def test_result_has_lengths(self):
ref = "Texte de référence"
result = compute_metrics(ref, "Texte différent")
assert result.reference_length == len(ref)
assert result.hypothesis_length > 0
def test_metrics_result_as_dict(self):
"""as_dict() doit retourner toutes les clés attendues."""
result = compute_metrics("abc", "abc")
d = result.as_dict()
for key in ["cer", "cer_nfc", "cer_caseless", "wer", "wer_normalized", "mer", "wil"]:
assert key in d
def test_cer_percent_property(self):
result = compute_metrics("abcd", "abce")
assert result.cer_percent == pytest.approx(25.0, rel=1e-2)
# ── Bug fix : hypothèse vide → CER doit être 1.0, pas 0.0 (bug sprint 13) ──
def test_empty_hypothesis_cer_is_one(self):
"""Hypothèse vide avec référence non vide doit donner CER=1.0."""
result = compute_metrics("Bonjour le monde", "")
assert result.cer == pytest.approx(1.0), (
f"CER attendu 1.0 pour hypothèse vide, obtenu {result.cer}"
)
assert result.error is None
def test_empty_hypothesis_wer_is_one(self):
"""WER doit être 1.0 pour hypothèse vide (pas de ZeroDivisionError)."""
result = compute_metrics("hello world", "")
assert result.wer == pytest.approx(1.0)
assert result.mer == pytest.approx(1.0)
assert result.wil == pytest.approx(1.0)
assert result.error is None
def test_empty_hypothesis_whitespace_is_treated_as_empty(self):
"""Hypothèse avec uniquement des espaces est traitée comme vide."""
result = compute_metrics("Bonjour", " ")
assert result.cer == pytest.approx(1.0)
def test_empty_hypothesis_hypothesis_length_is_zero(self):
"""hypothesis_length doit être 0 pour hypothèse vide."""
result = compute_metrics("Bonjour le monde", "")
assert result.hypothesis_length == 0
class TestAggregateMetrics:
"""Tests de aggregate_metrics."""
def _make_result(self, cer: float) -> MetricsResult:
return MetricsResult(
cer=cer, cer_nfc=cer, cer_caseless=cer,
wer=cer, wer_normalized=cer, mer=cer, wil=cer,
reference_length=100,
hypothesis_length=100,
)
def test_empty_list(self):
assert aggregate_metrics([]) == {}
def test_single_result(self):
results = [self._make_result(0.1)]
agg = aggregate_metrics(results)
assert agg["cer"]["mean"] == pytest.approx(0.1)
assert agg["cer"]["min"] == pytest.approx(0.1)
assert agg["cer"]["max"] == pytest.approx(0.1)
def test_multiple_results(self):
results = [self._make_result(0.1), self._make_result(0.3)]
agg = aggregate_metrics(results)
assert agg["cer"]["mean"] == pytest.approx(0.2)
assert agg["document_count"] == 2
assert agg["failed_count"] == 0
def test_failed_results_excluded(self):
ok = self._make_result(0.1)
failed = MetricsResult(
cer=1.0, cer_nfc=1.0, cer_caseless=1.0,
wer=1.0, wer_normalized=1.0, mer=1.0, wil=1.0,
reference_length=50, hypothesis_length=0,
error="Moteur en erreur",
)
agg = aggregate_metrics([ok, failed])
# Les métriques agrégées n'incluent que les résultats sans erreur
assert agg["cer"]["mean"] == pytest.approx(0.1)
assert agg["failed_count"] == 1