codebook / potato /server_utils /iaa /nominal.py
davidjurgens's picture
Deploy: Potato — Codebook Annotation
aceb1b2 verified
Raw
History Blame Contribute Delete
4.38 kB
"""
Nominal IAA metrics: percent agreement, Cohen's kappa, Fleiss' kappa.
Inputs are lists keyed by item: for two-annotator metrics, two equal-length
label lists; for multi-annotator metrics, a list of (annotator_id -> label) dicts.
"""
from __future__ import annotations
from collections import Counter
from math import isclose
from typing import Dict, List, Sequence
import logging
logger = logging.getLogger(__name__)
def percent_agreement(labels_a: Sequence, labels_b: Sequence) -> float:
"""Fraction of items on which two annotators agree."""
if len(labels_a) != len(labels_b):
raise ValueError("label lists must be the same length")
if not labels_a:
return float("nan")
agree = sum(1 for a, b in zip(labels_a, labels_b) if a == b)
return agree / len(labels_a)
def cohen_kappa(labels_a: Sequence, labels_b: Sequence) -> float:
"""
Cohen's kappa for two annotators on nominal categories.
Uses sklearn if available (handles ties and edge cases well); falls back
to a direct implementation otherwise.
"""
if len(labels_a) != len(labels_b):
raise ValueError("label lists must be the same length")
if not labels_a:
return float("nan")
try:
from sklearn.metrics import cohen_kappa_score
return float(cohen_kappa_score(list(labels_a), list(labels_b)))
except ImportError: # pragma: no cover
pass
n = len(labels_a)
po = percent_agreement(labels_a, labels_b)
counts_a = Counter(labels_a)
counts_b = Counter(labels_b)
pe = sum(counts_a[c] * counts_b[c] for c in set(counts_a) | set(counts_b)) / (n * n)
if isclose(pe, 1.0):
return 1.0 if isclose(po, 1.0) else 0.0
return (po - pe) / (1 - pe)
def fleiss_kappa(per_item_label_counts: List[Dict[str, int]]) -> float:
"""
Fleiss' kappa for >=2 annotators on nominal categories.
Args:
per_item_label_counts: one dict per item mapping label -> number of
annotators who chose it. Each item dict must sum to the same N
(the number of annotators rating that item). Items where N < 2
are skipped.
Returns:
Fleiss' kappa as a float, or NaN if undefined.
"""
# Use only items rated by at least 2 annotators.
rated = [d for d in per_item_label_counts if sum(d.values()) >= 2]
if not rated:
return float("nan")
ns = [sum(d.values()) for d in rated]
if len(set(ns)) != 1:
# Variable-N Fleiss' kappa is rare in practice; restrict to majority N.
from statistics import mode
majority_n = mode(ns)
rated = [d for d, n in zip(rated, ns) if n == majority_n]
ns = [majority_n] * len(rated)
if not rated:
return float("nan")
n = ns[0]
categories = sorted({c for d in rated for c in d})
if n < 2 or not categories:
return float("nan")
n_items = len(rated)
# Per-item agreement P_i
p_is = []
for d in rated:
total = sum(d.get(c, 0) ** 2 for c in categories)
p_is.append((total - n) / (n * (n - 1)))
p_bar = sum(p_is) / n_items
# Marginal proportions per category
p_js = []
for c in categories:
s = sum(d.get(c, 0) for d in rated)
p_js.append(s / (n_items * n))
p_e = sum(p * p for p in p_js)
if isclose(p_e, 1.0):
return 1.0 if isclose(p_bar, 1.0) else 0.0
return (p_bar - p_e) / (1 - p_e)
def pairwise_cohen_kappa(annotations_by_user: Dict[str, Sequence]) -> float:
"""
Mean Cohen's kappa across every distinct pair of annotators.
annotations_by_user maps user_id -> aligned label sequence (same length per user).
Users contributing fewer than the maximum length are restricted to their
overlap with each partner.
"""
users = list(annotations_by_user)
if len(users) < 2:
return float("nan")
kappas = []
for i in range(len(users)):
for j in range(i + 1, len(users)):
a = list(annotations_by_user[users[i]])
b = list(annotations_by_user[users[j]])
m = min(len(a), len(b))
if m == 0:
continue
try:
kappas.append(cohen_kappa(a[:m], b[:m]))
except ValueError:
continue
if not kappas:
return float("nan")
return sum(kappas) / len(kappas)