Spaces:
Sleeping
Sleeping
"""Sprint 35 tests — inter-engine metrics (Step 2 of the plan).

Covers the two families of measurements in the
``picarones.evaluation.metrics.inter_engine`` module:

1. **Taxonomic divergence**: KL and JS divergence over error-class
   distributions, plus the triangular inter-engine matrix. Tests:
   mathematical invariants (positivity, JS symmetric and bounded,
   KL(p, p) = 0), behaviour on disjoint keys.

2. **Complementarity**: oracle token recall, absolute/relative gap vs.
   the best single engine, pairwise disagreement rate. Tests: perfect
   case (oracle = best = 1), a case where an ensemble brings a real gain,
   perfect-tie case (gap = 0), guard rails (empty reference, empty
   hypotheses).

The functions are pure; no I/O fixtures or real engines are needed.
"""
| from __future__ import annotations | |
| import math | |
| import pytest | |
| from picarones.evaluation.metrics.inter_engine import ( | |
| complementarity_gap, | |
| jensen_shannon_divergence, | |
| kl_divergence, | |
| oracle_token_recall, | |
| pairwise_disagreement_rate, | |
| taxonomy_divergence_matrix, | |
| ) | |
# ──────────────────────────────────────────────────────────────────────────
# 1. KL divergence
# ──────────────────────────────────────────────────────────────────────────
class TestKLDivergence:
    """Invariants expected from ``kl_divergence`` on dict-encoded distributions."""

    def test_self_divergence_is_zero(self) -> None:
        """A distribution compared with itself gives 0 (within 1e-9)."""
        dist = {"a": 0.4, "b": 0.3, "c": 0.3}
        self_kl = kl_divergence(dist, dist)
        assert self_kl == pytest.approx(0.0, abs=1e-9)

    def test_kl_is_non_negative(self) -> None:
        """Both directions are strictly positive for two distinct distributions."""
        skewed_to_a = {"a": 0.7, "b": 0.2, "c": 0.1}
        skewed_to_c = {"a": 0.1, "b": 0.4, "c": 0.5}
        assert kl_divergence(skewed_to_a, skewed_to_c) > 0
        assert kl_divergence(skewed_to_c, skewed_to_a) > 0

    def test_kl_is_asymmetric_in_general(self) -> None:
        """Swapping the arguments changes the value by more than 1e-3."""
        # The pair is chosen so that one distribution is not simply a
        # permutation of the other's probabilities.
        peaked = {"a": 0.9, "b": 0.05, "c": 0.05}
        flatter = {"a": 0.4, "b": 0.4, "c": 0.2}
        forward = kl_divergence(peaked, flatter)
        backward = kl_divergence(flatter, peaked)
        assert forward != pytest.approx(backward, abs=1e-3)

    def test_disjoint_keys_handled(self) -> None:
        """Distributions sharing no key still give a finite, positive value."""
        # Original author's note: finiteness relies on epsilon smoothing
        # inside the implementation (not visible from this file).
        only_a = {"a": 1.0}
        only_b = {"b": 1.0}
        divergence = kl_divergence(only_a, only_b)
        assert math.isfinite(divergence)
        assert divergence > 0

    def test_empty_distributions_return_zero(self) -> None:
        """Two empty distributions are expected to give exactly 0.0."""
        empty_vs_empty = kl_divergence({}, {})
        assert empty_vs_empty == 0.0
# ──────────────────────────────────────────────────────────────────────────
# 2. Jensen-Shannon divergence
# ──────────────────────────────────────────────────────────────────────────
class TestJensenShannonDivergence:
    """Invariants expected from ``jensen_shannon_divergence``."""

    def test_self_divergence_is_zero(self) -> None:
        """A distribution compared with itself gives 0 (within 1e-9)."""
        dist = {"a": 0.4, "b": 0.3, "c": 0.3}
        self_js = jensen_shannon_divergence(dist, dist)
        assert self_js == pytest.approx(0.0, abs=1e-9)

    def test_symmetric(self) -> None:
        """Swapping the two arguments leaves the value unchanged (within 1e-9)."""
        left = {"a": 0.7, "b": 0.2, "c": 0.1}
        right = {"a": 0.1, "b": 0.4, "c": 0.5}
        forward = jensen_shannon_divergence(left, right)
        backward = jensen_shannon_divergence(right, left)
        assert forward == pytest.approx(backward, abs=1e-9)

    def test_bounded_in_unit_interval(self) -> None:
        """The value stays in [0, 1] even for fully disjoint distributions."""
        # Original author's note: JS is expressed in bits, hence the [0, 1]
        # range; disjoint supports are the extreme case.
        only_a = {"a": 1.0}
        only_b = {"b": 1.0}
        js = jensen_shannon_divergence(only_a, only_b)
        assert 0.0 <= js and js <= 1.0
        # Disjoint distributions should land close to 1 (original note: the
        # bound itself is only reached asymptotically).
        assert js > 0.5

    def test_close_distributions_have_small_js(self) -> None:
        """Two nearly identical distributions give a value below 0.01."""
        even = {"a": 0.5, "b": 0.5}
        nudged = {"a": 0.51, "b": 0.49}
        js = jensen_shannon_divergence(even, nudged)
        assert js < 0.01
# ──────────────────────────────────────────────────────────────────────────
# 3. Inter-engine divergence matrix
# ──────────────────────────────────────────────────────────────────────────
class TestDivergenceMatrix:
    """Checks on ``taxonomy_divergence_matrix`` over per-engine distributions."""

    # Registered as a fixture so pytest can inject it into the tests below
    # that declare an ``engines`` parameter; without the decorator those
    # tests cannot resolve the argument.
    @pytest.fixture
    def engines(self) -> dict[str, dict[str, float]]:
        """Return three engines, each mapped to an error-class distribution."""
        return {
            "tesseract": {"visual": 0.5, "casse": 0.3, "abbrev": 0.2},
            "pero": {"visual": 0.2, "casse": 0.3, "abbrev": 0.5},
            "mistral": {"visual": 0.4, "casse": 0.4, "abbrev": 0.2},
        }

    def test_diagonal_is_zero(
        self, engines: dict[str, dict[str, float]]
    ) -> None:
        """Every engine has zero divergence with itself (default metric)."""
        mat = taxonomy_divergence_matrix(engines)
        for name in engines:
            assert mat[name][name] == pytest.approx(0.0, abs=1e-9)

    def test_js_matrix_is_symmetric(
        self, engines: dict[str, dict[str, float]]
    ) -> None:
        """With ``metric="js"`` the entries [a][b] and [b][a] coincide."""
        mat = taxonomy_divergence_matrix(engines, metric="js")
        for a in engines:
            for b in engines:
                assert mat[a][b] == pytest.approx(mat[b][a], abs=1e-9)

    def test_kl_matrix_is_asymmetric(
        self, engines: dict[str, dict[str, float]]
    ) -> None:
        """With ``metric="kl"`` at least one pair differs by direction."""
        mat = taxonomy_divergence_matrix(engines, metric="kl")
        # At least one ordered pair must be asymmetric.
        asymmetric_found = any(
            abs(mat[a][b] - mat[b][a]) > 1e-6
            for a in engines for b in engines if a != b
        )
        assert asymmetric_found

    def test_unknown_metric_raises(
        self, engines: dict[str, dict[str, float]]
    ) -> None:
        """An unsupported metric name raises ``ValueError`` mentioning "metric"."""
        with pytest.raises(ValueError, match="metric"):
            taxonomy_divergence_matrix(engines, metric="hellinger")

    def test_distinguishes_specialized_engines(self) -> None:
        """Two engines with opposite profiles must stand out as ensemble
        candidates (high JS)."""
        engines = {
            "visual_specialist": {"visual": 0.9, "casse": 0.05, "abbrev": 0.05},
            "abbrev_specialist": {"visual": 0.05, "casse": 0.05, "abbrev": 0.9},
            "balanced": {"visual": 0.33, "casse": 0.33, "abbrev": 0.34},
        }
        mat = taxonomy_divergence_matrix(engines, metric="js")
        # The two specialists must diverge more from each other than either
        # of them does from the balanced engine.
        between_specialists = mat["visual_specialist"]["abbrev_specialist"]
        assert between_specialists > mat["visual_specialist"]["balanced"]
        assert between_specialists > mat["abbrev_specialist"]["balanced"]
# ──────────────────────────────────────────────────────────────────────────
# 4. Oracle token recall
# ──────────────────────────────────────────────────────────────────────────
class TestOracleTokenRecall:
    """Checks on ``oracle_token_recall`` and its link to ``complementarity_gap``."""

    def test_perfect_engine_oracle_is_one(self) -> None:
        """An engine that reproduces the reference exactly gives recall 1."""
        ref = "le manuscrit est ancien"
        hyps = {"perfect": ref}
        assert oracle_token_recall(ref, hyps) == pytest.approx(1.0)

    def test_no_engine_recovers_anything(self) -> None:
        """When no hypothesis contains any reference token, recall is 0."""
        ref = "alpha beta gamma"
        hyps = {"a": "x y z", "b": "x y z"}
        assert oracle_token_recall(ref, hyps) == pytest.approx(0.0)

    def test_complementarity_pays_off(self) -> None:
        """A and B complement each other: neither gets everything, together
        they get everything."""
        ref = "alpha beta gamma delta"
        hyps = {
            "a": "alpha beta x y",  # alpha + beta only
            "b": "x y gamma delta",  # gamma + delta only
        }
        assert oracle_token_recall(ref, hyps) == pytest.approx(1.0)
        # Each engine alone only recovers half. ``complementarity_gap`` is
        # the module-level import, so no function-scope import is needed.
        gap = complementarity_gap(ref, hyps)
        assert gap["best_single_recall"] == pytest.approx(0.5)
        assert gap["oracle_recall"] == pytest.approx(1.0)
        assert gap["absolute_gap"] == pytest.approx(0.5)
        # The whole remaining gap is recoverable → relative_gap = 1.
        assert gap["relative_gap"] == pytest.approx(1.0)

    def test_multiplicity_is_respected(self) -> None:
        """If the ground truth has two 'le' and the engine outputs only one,
        recall is 0.5 on that token."""
        ref = "le chat le chien"  # 2× 'le', 1× 'chat', 1× 'chien'
        hyps = {"a": "le chat le chien"}  # perfect
        assert oracle_token_recall(ref, hyps) == pytest.approx(1.0)
        hyps2 = {"a": "le chat chien"}  # a single 'le'
        # Three of the four reference tokens are matched.
        assert oracle_token_recall(ref, hyps2) == pytest.approx(3 / 4)

    def test_empty_reference_returns_one(self) -> None:
        """An empty reference is expected to give recall 1."""
        assert oracle_token_recall("", {"a": "anything"}) == pytest.approx(1.0)

    def test_no_hypotheses_returns_zero(self) -> None:
        """With no hypotheses at all, recall is expected to be 0."""
        assert oracle_token_recall("alpha", {}) == pytest.approx(0.0)

    def test_oracle_is_at_least_best_single(self) -> None:
        """Invariant: the oracle is always ≥ the best single engine."""
        ref = "alpha beta gamma delta epsilon"
        hyps = {
            "a": "alpha beta gamma x y",
            "b": "alpha x gamma delta z",
            "c": "x y z delta epsilon",
        }
        gap = complementarity_gap(ref, hyps)
        assert gap["oracle_recall"] >= gap["best_single_recall"]
# ──────────────────────────────────────────────────────────────────────────
# 5. Gap and pairwise disagreement
# ──────────────────────────────────────────────────────────────────────────
class TestComplementarityGap:
    """Checks on the dictionary returned by ``complementarity_gap``."""

    def test_no_gap_when_engines_are_redundant(self) -> None:
        """Two engines with identical output leave nothing for an ensemble to gain."""
        reference = "alpha beta gamma"
        hypotheses = {"a": "alpha beta x", "b": "alpha beta x"}  # redundant
        result = complementarity_gap(reference, hypotheses)
        # Both miss the same token → oracle = best_single.
        assert result["absolute_gap"] == pytest.approx(0.0)
        assert result["relative_gap"] == pytest.approx(0.0)

    def test_best_engine_named(self) -> None:
        """The engine matching the most reference tokens is reported by name."""
        reference = "alpha beta gamma"
        hypotheses = {
            "tesseract": "alpha x x",  # 1/3
            "pero": "alpha beta x",  # 2/3
        }
        result = complementarity_gap(reference, hypotheses)
        assert result["best_engine"] == "pero"

    def test_empty_reference(self) -> None:
        """An empty reference gives recalls of exactly 1.0 and a gap of exactly 0.0."""
        result = complementarity_gap("", {"a": "anything"})
        assert result["oracle_recall"] == 1.0
        assert result["best_single_recall"] == 1.0
        assert result["absolute_gap"] == 0.0
class TestPairwiseDisagreement:
    """Checks on ``pairwise_disagreement_rate`` for a pair of hypotheses."""

    def test_identical_hypotheses_zero_disagreement(self) -> None:
        """The same hypothesis passed twice yields a rate of 0."""
        reference = "alpha beta gamma"
        hypothesis = "alpha beta x"
        rate = pairwise_disagreement_rate(reference, hypothesis, hypothesis)
        assert rate == pytest.approx(0.0)

    def test_complete_disagreement_when_complementary(self) -> None:
        """Hypotheses that each keep a different reference token yield a rate of 1."""
        reference = "alpha beta"
        # A preserves alpha, B preserves beta → they disagree on both tokens.
        keeps_alpha = "alpha x"
        keeps_beta = "x beta"
        rate = pairwise_disagreement_rate(reference, keeps_alpha, keeps_beta)
        assert rate == pytest.approx(1.0)

    def test_empty_reference_returns_zero(self) -> None:
        """An empty reference is expected to give exactly 0.0."""
        rate = pairwise_disagreement_rate("", "x", "y")
        assert rate == 0.0