Spaces:

Ma-Ri-Ba-Ku
/

Picarones

Sleeping

Picarones / tests /test_metrics.py

Claude

fix(pipeline/metrics): 3 bugs pipelines OCR+LLM

39b4865 unverified about 1 month ago

5.88 kB

	"""Tests unitaires pour le module picarones.core.metrics."""

	import pytest

	from picarones.core.metrics import aggregate_metrics, compute_metrics, MetricsResult


	class TestComputeMetrics:
	    """Unit tests for compute_metrics on inputs with known expected values."""

	    def test_perfect_match(self):
	        """CER and WER must be 0 when reference and hypothesis are identical."""
	        result = compute_metrics("Bonjour le monde", "Bonjour le monde")
	        assert result.cer == pytest.approx(0.0)
	        assert result.wer == pytest.approx(0.0)
	        assert result.error is None

	    def test_complete_mismatch(self):
	        """CER is 1.0 when every reference character is wrong."""
	        result = compute_metrics("abc", "xyz")
	        # 3 substitutions over a 3-character reference -> CER of exactly 1.0.
	        # A bare `> 0.0` check would also accept a nearly perfect score.
	        assert result.cer == pytest.approx(1.0)
	        assert result.error is None

	    def test_empty_reference(self):
	        """Empty reference with a non-empty hypothesis: CER = 1.0."""
	        result = compute_metrics("", "quelque chose")
	        assert result.cer == pytest.approx(1.0)

	    def test_empty_both(self):
	        """Empty reference and empty hypothesis: CER = 0.0."""
	        result = compute_metrics("", "")
	        assert result.cer == pytest.approx(0.0)

	    def test_single_substitution(self):
	        """A single substitution over 4 characters gives CER = 0.25."""
	        result = compute_metrics("abcd", "abce")
	        assert result.cer == pytest.approx(0.25)

	    def test_case_insensitive_cer(self):
	        """The caseless CER ignores differences in letter case."""
	        result = compute_metrics("Bonjour", "bonjour")
	        assert result.cer_caseless == pytest.approx(0.0)
	        # The raw CER must be > 0 because "B" != "b".
	        assert result.cer > 0.0

	    def test_nfc_normalization(self):
	        """The NFC CER treats canonically equivalent Unicode sequences as equal."""
	        # "é" can be encoded precomposed (U+00E9) or decomposed (e + U+0301).
	        composed = "\u00e9"  # é, precomposed (NFC)
	        decomposed = "e\u0301"  # e + combining acute accent (NFD)
	        result = compute_metrics(composed, decomposed)
	        # After NFC normalization both strings are identical -> cer_nfc = 0.
	        assert result.cer_nfc == pytest.approx(0.0)

	    def test_wer_one_word_wrong(self):
	        """WER = 1/3 when 1 word out of 3 is wrong."""
	        result = compute_metrics("le chat dort", "le chien dort")
	        assert result.wer == pytest.approx(1 / 3, rel=1e-2)

	    def test_result_has_lengths(self):
	        """The result exposes the reference length and a non-zero hypothesis length."""
	        ref = "Texte de référence"
	        result = compute_metrics(ref, "Texte différent")
	        assert result.reference_length == len(ref)
	        assert result.hypothesis_length > 0

	    def test_metrics_result_as_dict(self):
	        """as_dict() must return every expected metric key."""
	        result = compute_metrics("abc", "abc")
	        d = result.as_dict()
	        expected_keys = [
	            "cer", "cer_nfc", "cer_caseless", "wer", "wer_normalized", "mer", "wil",
	        ]
	        # Collect every missing key so one failure reports the whole gap.
	        missing = [key for key in expected_keys if key not in d]
	        assert not missing, f"as_dict() is missing keys: {missing}"

	    def test_cer_percent_property(self):
	        """cer_percent expresses the CER as a percentage (0.25 -> 25.0)."""
	        result = compute_metrics("abcd", "abce")
	        assert result.cer_percent == pytest.approx(25.0, rel=1e-2)

	    # -- Regression: empty hypothesis must give CER 1.0, not 0.0 (sprint 13 bug) --

	    def test_empty_hypothesis_cer_is_one(self):
	        """An empty hypothesis with a non-empty reference must give CER = 1.0."""
	        result = compute_metrics("Bonjour le monde", "")
	        assert result.cer == pytest.approx(1.0), (
	            f"CER attendu 1.0 pour hypothèse vide, obtenu {result.cer}"
	        )
	        assert result.error is None

	    def test_empty_hypothesis_wer_is_one(self):
	        """WER, MER and WIL must be 1.0 for an empty hypothesis (no ZeroDivisionError)."""
	        result = compute_metrics("hello world", "")
	        assert result.wer == pytest.approx(1.0)
	        assert result.mer == pytest.approx(1.0)
	        assert result.wil == pytest.approx(1.0)
	        assert result.error is None

	    def test_empty_hypothesis_whitespace_is_treated_as_empty(self):
	        """A whitespace-only hypothesis is treated as empty."""
	        result = compute_metrics("Bonjour", " ")
	        assert result.cer == pytest.approx(1.0)

	    def test_empty_hypothesis_hypothesis_length_is_zero(self):
	        """hypothesis_length must be 0 for an empty hypothesis."""
	        result = compute_metrics("Bonjour le monde", "")
	        assert result.hypothesis_length == 0


	class TestAggregateMetrics:
	    """Unit tests for aggregate_metrics."""

	    def _make_result(self, cer: float) -> MetricsResult:
	        """Build an error-free MetricsResult whose seven rate fields all equal ``cer``."""
	        rate_fields = (
	            "cer", "cer_nfc", "cer_caseless",
	            "wer", "wer_normalized", "mer", "wil",
	        )
	        return MetricsResult(
	            **dict.fromkeys(rate_fields, cer),
	            reference_length=100,
	            hypothesis_length=100,
	        )

	    def test_empty_list(self):
	        """Aggregating no results yields an empty dict."""
	        summary = aggregate_metrics([])
	        assert summary == {}

	    def test_single_result(self):
	        """With one result, mean, min and max of the CER all equal that result's CER."""
	        summary = aggregate_metrics([self._make_result(0.1)])
	        for stat in ("mean", "min", "max"):
	            assert summary["cer"][stat] == pytest.approx(0.1)

	    def test_multiple_results(self):
	        """Two results: the mean CER is their average and both documents are counted."""
	        per_document = [self._make_result(rate) for rate in (0.1, 0.3)]
	        summary = aggregate_metrics(per_document)
	        assert summary["cer"]["mean"] == pytest.approx(0.2)
	        assert summary["document_count"] == 2
	        assert summary["failed_count"] == 0

	    def test_failed_results_excluded(self):
	        """A result carrying an error is left out of the mean and counted as failed."""
	        succeeded = self._make_result(0.1)
	        engine_failure = MetricsResult(
	            reference_length=50,
	            hypothesis_length=0,
	            error="Moteur en erreur",
	            cer=1.0,
	            cer_nfc=1.0,
	            cer_caseless=1.0,
	            wer=1.0,
	            wer_normalized=1.0,
	            mer=1.0,
	            wil=1.0,
	        )
	        summary = aggregate_metrics([succeeded, engine_failure])
	        # Only error-free results contribute to the aggregated metrics.
	        assert summary["cer"]["mean"] == pytest.approx(0.1)
	        assert summary["failed_count"] == 1