davidjurgens's picture
Deploy: Potato — Codebook Annotation
aceb1b2 verified
Raw
History Blame Contribute Delete
3.38 kB
"""
Krippendorff's alpha wrapper.
Delegates to ``simpledorff`` (already a project dependency). Supports nominal,
ordinal, interval, ratio, and MASI distance metrics. Accepts long-format data:
a list of (annotator, item, label) triples.
When ``simpledorff`` or ``pandas`` is unavailable, returns NaN with a logged warning.
"""
from __future__ import annotations
from typing import Iterable, Sequence, Tuple, Union
import logging
logger = logging.getLogger(__name__)
def _nominal_distance(a, b) -> float:
return 0.0 if a == b else 1.0
def _ordinal_distance(a, b) -> float:
try:
return abs(float(a) - float(b))
except (TypeError, ValueError):
return 0.0 if a == b else 1.0
def _interval_distance(a, b) -> float:
try:
return (float(a) - float(b)) ** 2
except (TypeError, ValueError):
return 0.0 if a == b else 1.0
def _ratio_distance(a, b) -> float:
try:
a, b = float(a), float(b)
if a + b == 0:
return 0.0
return ((a - b) / (a + b)) ** 2
except (TypeError, ValueError):
return 0.0 if a == b else 1.0
def _masi_distance(a, b) -> float:
"""
MASI distance for multi-label sets. ``a`` and ``b`` are iterables of labels.
"""
set_a = frozenset(a) if not isinstance(a, frozenset) else a
set_b = frozenset(b) if not isinstance(b, frozenset) else b
if not set_a and not set_b:
return 0.0
intersection = set_a & set_b
union = set_a | set_b
if not union:
return 0.0
jaccard = len(intersection) / len(union)
if set_a == set_b:
m = 1.0
elif set_a < set_b or set_b < set_a:
m = 2 / 3
elif intersection and set_a != set_b:
m = 1 / 3
else:
m = 0.0
return 1.0 - (jaccard * m)
# Maps each supported measurement level to its pairwise distance function.
_DISTANCES = dict(
    nominal=_nominal_distance,
    ordinal=_ordinal_distance,
    interval=_interval_distance,
    ratio=_ratio_distance,
    masi=_masi_distance,
)
def krippendorff_alpha(
    long_format: Sequence[Tuple[str, str, Union[str, float, frozenset]]],
    level: str = "nominal",
) -> float:
    """
    Compute Krippendorff's alpha from long-format annotations.

    Args:
        long_format: iterable of (annotator_id, item_id, value) tuples.
        level: one of the keys of ``_DISTANCES`` -- 'nominal', 'ordinal',
            'interval', 'ratio', or 'masi'.

    Returns:
        Alpha as a float. NaN is returned when ``simpledorff`` or ``pandas``
        cannot be imported, when there are no rows, when fewer than two
        distinct items or annotators are present, or when the underlying
        computation raises.

    Raises:
        ValueError: if ``level`` is not a supported level.
    """
    if level not in _DISTANCES:
        raise ValueError(f"Unknown level for Krippendorff's alpha: {level!r}")

    undefined = float("nan")

    # Both libraries are imported lazily so a missing one yields NaN, not a crash.
    try:
        import simpledorff
        import pandas as pd
    except ImportError:  # pragma: no cover
        logger.warning("simpledorff/pandas unavailable; krippendorff_alpha returning NaN")
        return undefined

    records = list(long_format)
    if not records:
        return undefined

    frame = pd.DataFrame(records, columns=["annotator", "item", "value"])
    # Needs at least two distinct items and two distinct annotators.
    if not (frame["item"].nunique() >= 2 and frame["annotator"].nunique() >= 2):
        return undefined

    metric = _DISTANCES[level]
    try:
        alpha = simpledorff.calculate_krippendorffs_alpha_for_df(
            frame,
            experiment_col="item",
            annotator_col="annotator",
            class_col="value",
            metric_fn=metric,
        )
        return float(alpha)
    except Exception as exc:  # pragma: no cover
        # Best effort: any failure in the computation is reported as NaN.
        logger.warning("krippendorff_alpha failed: %s", exc)
        return undefined