Spaces:
Sleeping
Sleeping
File size: 12,053 Bytes
"""Tests Sprint 35 — métriques inter-moteurs (Étape 2 du plan).
Couvre les deux familles de mesures du module ``picarones.evaluation.metrics.inter_engine`` :
1. **Divergence taxonomique** : KL et JS-divergence sur les
distributions de classes d'erreur, plus la matrice triangulaire
inter-moteurs. Tests : invariants mathématiques (positivité, JS
symétrique et bornée, KL(p,p)=0), comportement sur clés disjointes.
2. **Complémentarité** : oracle token recall, gap absolu/relatif vs
meilleur moteur seul, taux de désaccord par paire. Tests : cas
parfait (oracle = best = 1), cas où un ensemble apporte un vrai gain,
cas d'égalité parfaite (gap = 0), garde-fous (référence vide,
hypothèses vides).
Les fonctions sont pures ; pas besoin de fixtures d'I/O ni de moteurs
réels.
"""
from __future__ import annotations
import math
import pytest
from picarones.evaluation.metrics.inter_engine import (
complementarity_gap,
jensen_shannon_divergence,
kl_divergence,
oracle_token_recall,
pairwise_disagreement_rate,
taxonomy_divergence_matrix,
)
# ──────────────────────────────────────────────────────────────────────────
# 1. KL-divergence
# ──────────────────────────────────────────────────────────────────────────
class TestKLDivergence:
    """Checks on ``kl_divergence`` applied to dict-based distributions."""

    def test_self_divergence_is_zero(self) -> None:
        """KL(p, p) must be zero up to a tiny absolute tolerance."""
        dist = {"a": 0.4, "b": 0.3, "c": 0.3}
        self_kl = kl_divergence(dist, dist)
        assert self_kl == pytest.approx(0.0, abs=1e-9)

    def test_kl_is_non_negative(self) -> None:
        """Two distinct distributions yield a strictly positive KL both ways."""
        first = {"a": 0.7, "b": 0.2, "c": 0.1}
        second = {"a": 0.1, "b": 0.4, "c": 0.5}
        forward = kl_divergence(first, second)
        assert forward > 0
        backward = kl_divergence(second, first)
        assert backward > 0

    def test_kl_is_asymmetric_in_general(self) -> None:
        """KL(p, q) and KL(q, p) differ by more than 1e-3 on this pair."""
        # The pair is chosen so that one is not a mere permutation of the
        # other, which would make both directions coincide.
        peaked = {"a": 0.9, "b": 0.05, "c": 0.05}
        flatter = {"a": 0.4, "b": 0.4, "c": 0.2}
        forward = kl_divergence(peaked, flatter)
        backward = kl_divergence(flatter, peaked)
        assert forward != pytest.approx(backward, abs=1e-3)

    def test_disjoint_keys_handled(self) -> None:
        """Distributions without any shared key give a finite, positive KL."""
        # No key in common: the result is expected to remain finite
        # (the original note credits epsilon smoothing in the implementation).
        only_a = {"a": 1.0}
        only_b = {"b": 1.0}
        divergence = kl_divergence(only_a, only_b)
        assert math.isfinite(divergence)
        assert divergence > 0

    def test_empty_distributions_return_zero(self) -> None:
        """Two empty distributions give exactly 0.0."""
        empty_kl = kl_divergence({}, {})
        assert empty_kl == 0.0
# ──────────────────────────────────────────────────────────────────────────
# 2. Jensen-Shannon divergence
# ──────────────────────────────────────────────────────────────────────────
class TestJensenShannonDivergence:
    """Checks on ``jensen_shannon_divergence``: zero, symmetry, bounds."""

    def test_self_divergence_is_zero(self) -> None:
        """JS(p, p) must be zero up to a tiny absolute tolerance."""
        dist = {"a": 0.4, "b": 0.3, "c": 0.3}
        self_js = jensen_shannon_divergence(dist, dist)
        assert self_js == pytest.approx(0.0, abs=1e-9)

    def test_symmetric(self) -> None:
        """Swapping the arguments must not change the result."""
        first = {"a": 0.7, "b": 0.2, "c": 0.1}
        second = {"a": 0.1, "b": 0.4, "c": 0.5}
        forward = jensen_shannon_divergence(first, second)
        backward = jensen_shannon_divergence(second, first)
        assert forward == pytest.approx(backward, abs=1e-9)

    def test_bounded_in_unit_interval(self) -> None:
        """Disjoint distributions stay within [0, 1] and land above 0.5."""
        # JS in bits ∈ [0, 1]. Extreme case: fully disjoint distributions.
        only_a = {"a": 1.0}
        only_b = {"b": 1.0}
        divergence = jensen_shannon_divergence(only_a, only_b)
        assert 0.0 <= divergence <= 1.0
        # Disjoint distributions are expected to give a JS close to 1
        # (the bound being reached asymptotically).
        assert divergence > 0.5

    def test_close_distributions_have_small_js(self) -> None:
        """Nearly identical distributions give a JS below 0.01."""
        even = {"a": 0.5, "b": 0.5}
        nudged = {"a": 0.51, "b": 0.49}
        divergence = jensen_shannon_divergence(even, nudged)
        assert divergence < 0.01
# ──────────────────────────────────────────────────────────────────────────
# 3. Matrice de divergence inter-moteurs
# ──────────────────────────────────────────────────────────────────────────
class TestDivergenceMatrix:
    """Checks on ``taxonomy_divergence_matrix`` over several engine profiles."""

    @pytest.fixture
    def engines(self) -> dict[str, dict[str, float]]:
        """Provide three engines with error-class distributions over the same keys."""
        profiles = {
            "tesseract": {"visual": 0.5, "casse": 0.3, "abbrev": 0.2},
            "pero": {"visual": 0.2, "casse": 0.3, "abbrev": 0.5},
            "mistral": {"visual": 0.4, "casse": 0.4, "abbrev": 0.2},
        }
        return profiles

    def test_diagonal_is_zero(
        self, engines: dict[str, dict[str, float]]
    ) -> None:
        """Every engine compared with itself gives a (near-)zero entry."""
        matrix = taxonomy_divergence_matrix(engines)
        for engine_name in engines:
            diagonal_entry = matrix[engine_name][engine_name]
            assert diagonal_entry == pytest.approx(0.0, abs=1e-9)

    def test_js_matrix_is_symmetric(
        self, engines: dict[str, dict[str, float]]
    ) -> None:
        """With ``metric="js"`` the entries [a][b] and [b][a] coincide."""
        matrix = taxonomy_divergence_matrix(engines, metric="js")
        for row_name in engines:
            for col_name in engines:
                mirrored = matrix[col_name][row_name]
                assert matrix[row_name][col_name] == pytest.approx(
                    mirrored, abs=1e-9
                )

    def test_kl_matrix_is_asymmetric(
        self, engines: dict[str, dict[str, float]]
    ) -> None:
        """With ``metric="kl"`` at least one off-diagonal pair is asymmetric."""
        matrix = taxonomy_divergence_matrix(engines, metric="kl")
        # A single asymmetric pair is enough to pass.
        has_asymmetric_pair = any(
            abs(matrix[left][right] - matrix[right][left]) > 1e-6
            for left in engines
            for right in engines
            if left != right
        )
        assert has_asymmetric_pair

    def test_unknown_metric_raises(
        self, engines: dict[str, dict[str, float]]
    ) -> None:
        """An unsupported metric name raises ``ValueError`` mentioning 'metric'."""
        with pytest.raises(ValueError, match="metric"):
            taxonomy_divergence_matrix(engines, metric="hellinger")

    def test_distinguishes_specialized_engines(self) -> None:
        """Two engines with opposite profiles should stand out as ensemble
        candidates (high JS)."""
        engines = {
            "visual_specialist": {"visual": 0.9, "casse": 0.05, "abbrev": 0.05},
            "abbrev_specialist": {"visual": 0.05, "casse": 0.05, "abbrev": 0.9},
            "balanced": {"visual": 0.33, "casse": 0.33, "abbrev": 0.34},
        }
        matrix = taxonomy_divergence_matrix(engines, metric="js")
        # The two specialists must diverge more from each other than either
        # of them does from the balanced engine.
        between_specialists = matrix["visual_specialist"]["abbrev_specialist"]
        assert between_specialists > matrix["visual_specialist"]["balanced"]
        assert between_specialists > matrix["abbrev_specialist"]["balanced"]
# ──────────────────────────────────────────────────────────────────────────
# 4. Oracle token recall
# ──────────────────────────────────────────────────────────────────────────
class TestOracleTokenRecall:
    """Checks on ``oracle_token_recall`` and its link to ``complementarity_gap``."""

    def test_perfect_engine_oracle_is_one(self) -> None:
        """An engine reproducing the reference verbatim gives an oracle of 1."""
        ref = "le manuscrit est ancien"
        hyps = {"perfect": ref}
        assert oracle_token_recall(ref, hyps) == pytest.approx(1.0)

    def test_no_engine_recovers_anything(self) -> None:
        """When no hypothesis shares a token with the reference, oracle is 0."""
        ref = "alpha beta gamma"
        hyps = {"a": "x y z", "b": "x y z"}
        assert oracle_token_recall(ref, hyps) == pytest.approx(0.0)

    def test_complementarity_pays_off(self) -> None:
        """A and B complement each other: neither recovers everything alone,
        together they recover everything."""
        ref = "alpha beta gamma delta"
        hyps = {
            "a": "alpha beta x y",  # alpha + beta only
            "b": "x y gamma delta",  # gamma + delta only
        }
        assert oracle_token_recall(ref, hyps) == pytest.approx(1.0)
        # Each engine alone only recovers half of the reference tokens.
        # ``complementarity_gap`` comes from the module-level import; no
        # function-scope re-import is needed.
        gap = complementarity_gap(ref, hyps)
        assert gap["best_single_recall"] == pytest.approx(0.5)
        assert gap["oracle_recall"] == pytest.approx(1.0)
        assert gap["absolute_gap"] == pytest.approx(0.5)
        # The whole remaining gap is recoverable → relative_gap = 1
        assert gap["relative_gap"] == pytest.approx(1.0)

    def test_multiplicity_is_respected(self) -> None:
        """If the ground truth has two 'le' and the engine produces only one,
        recall is 0.5 on that token."""
        ref = "le chat le chien"  # 2× 'le', 1× 'chat', 1× 'chien'
        hyps = {"a": "le chat le chien"}  # perfect
        assert oracle_token_recall(ref, hyps) == pytest.approx(1.0)
        hyps2 = {"a": "le chat chien"}  # a single 'le'
        assert oracle_token_recall(ref, hyps2) == pytest.approx(3 / 4)

    def test_empty_reference_returns_one(self) -> None:
        """An empty reference yields an oracle recall of 1."""
        assert oracle_token_recall("", {"a": "anything"}) == pytest.approx(1.0)

    def test_no_hypotheses_returns_zero(self) -> None:
        """With no hypotheses at all, the oracle recall is 0."""
        assert oracle_token_recall("alpha", {}) == pytest.approx(0.0)

    def test_oracle_is_at_least_best_single(self) -> None:
        """Invariant: the oracle is always ≥ the best single engine."""
        ref = "alpha beta gamma delta epsilon"
        hyps = {
            "a": "alpha beta gamma x y",
            "b": "alpha x gamma delta z",
            "c": "x y z delta epsilon",
        }
        gap = complementarity_gap(ref, hyps)
        assert gap["oracle_recall"] >= gap["best_single_recall"]
# ──────────────────────────────────────────────────────────────────────────
# 5. Gap et désaccord par paire
# ──────────────────────────────────────────────────────────────────────────
class TestComplementarityGap:
    """Checks on the result mapping returned by ``complementarity_gap``."""

    def test_no_gap_when_engines_are_redundant(self) -> None:
        """Identical hypotheses bring no gain: both gaps are zero."""
        reference = "alpha beta gamma"
        redundant = {"a": "alpha beta x", "b": "alpha beta x"}  # redundant
        result = complementarity_gap(reference, redundant)
        # Both engines miss the same token → oracle = best_single
        assert result["absolute_gap"] == pytest.approx(0.0)
        assert result["relative_gap"] == pytest.approx(0.0)

    def test_best_engine_named(self) -> None:
        """The engine recovering the most reference tokens is reported by name."""
        reference = "alpha beta gamma"
        outputs = {
            "tesseract": "alpha x x",  # 1/3
            "pero": "alpha beta x",  # 2/3
        }
        result = complementarity_gap(reference, outputs)
        assert result["best_engine"] == "pero"

    def test_empty_reference(self) -> None:
        """An empty reference gives recalls of exactly 1.0 and a gap of 0.0."""
        result = complementarity_gap("", {"a": "anything"})
        assert result["oracle_recall"] == 1.0
        assert result["best_single_recall"] == 1.0
        assert result["absolute_gap"] == 0.0
class TestPairwiseDisagreement:
    """Checks on ``pairwise_disagreement_rate`` for a pair of hypotheses."""

    def test_identical_hypotheses_zero_disagreement(self) -> None:
        """The same hypothesis passed twice yields a zero disagreement rate."""
        reference = "alpha beta gamma"
        shared = "alpha beta x"
        rate = pairwise_disagreement_rate(reference, shared, shared)
        assert rate == pytest.approx(0.0)

    def test_complete_disagreement_when_complementary(self) -> None:
        """Hypotheses preserving disjoint reference tokens disagree fully."""
        reference = "alpha beta"
        # A preserves alpha, B preserves beta → disagreement on both tokens
        observed = pairwise_disagreement_rate(reference, "alpha x", "x beta")
        assert observed == pytest.approx(1.0)

    def test_empty_reference_returns_zero(self) -> None:
        """An empty reference gives a rate of exactly 0.0."""
        rate = pairwise_disagreement_rate("", "x", "y")
        assert rate == 0.0
|