Spaces:
Paused
Paused
| """ | |
| Nominal inter-annotator agreement (IAA) metrics: percent agreement, Cohen's kappa, Fleiss' kappa. | |
| Inputs by metric: two-annotator metrics take two equal-length, item-aligned label | |
| sequences; Fleiss' kappa takes one (label -> annotator count) dict per item; | |
| pairwise Cohen's kappa takes a (user_id -> aligned label sequence) dict. | |
| """ | |
| from __future__ import annotations | |
| from collections import Counter | |
| from math import isclose | |
| from typing import Dict, List, Sequence | |
| import logging | |
| logger = logging.getLogger(__name__) | |
def percent_agreement(labels_a: Sequence, labels_b: Sequence) -> float:
    """Return the fraction of items on which two annotators agree.

    Args:
        labels_a: Labels from the first annotator, one per item.
        labels_b: Labels from the second annotator, aligned with ``labels_a``.

    Returns:
        The share of positions whose labels compare equal, or NaN when
        there are no items.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    n_items = len(labels_a)
    if n_items != len(labels_b):
        raise ValueError("label lists must be the same length")
    # Check the length rather than truthiness: array-like inputs such as
    # NumPy arrays raise on ``not arr`` when they hold more than one element.
    if n_items == 0:
        return float("nan")
    matches = sum(1 for a, b in zip(labels_a, labels_b) if a == b)
    return matches / n_items
def cohen_kappa(labels_a: Sequence, labels_b: Sequence) -> float:
    """Compute Cohen's kappa for two annotators on nominal categories.

    Delegates to ``sklearn.metrics.cohen_kappa_score`` when sklearn can be
    imported and falls back to a direct implementation otherwise.

    Args:
        labels_a: Labels from the first annotator, one per item.
        labels_b: Labels from the second annotator, aligned with ``labels_a``.

    Returns:
        Cohen's kappa as a float, NaN when there are no items, and 1.0 when
        both annotators used one identical label for every item.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    n_items = len(labels_a)
    if n_items != len(labels_b):
        raise ValueError("label lists must be the same length")
    # Length check instead of truthiness so array-like inputs (e.g. NumPy
    # arrays, whose truth value is ambiguous) are accepted.
    if n_items == 0:
        return float("nan")

    seq_a = list(labels_a)
    seq_b = list(labels_b)
    counts_a = Counter(seq_a)
    counts_b = Counter(seq_b)

    # Degenerate case: both annotators used the same single label throughout,
    # so expected agreement is 1 and kappa is 0/0. Resolve it here so the
    # answer is the same whether or not sklearn is installed.
    if len(counts_a) == 1 and counts_a == counts_b:
        return 1.0

    # Keep the try body to the import alone so only a missing sklearn
    # triggers the fallback.
    try:
        from sklearn.metrics import cohen_kappa_score
    except ImportError:  # pragma: no cover
        pass
    else:
        return float(cohen_kappa_score(seq_a, seq_b))

    # Observed agreement: share of items with equal labels.
    observed = sum(1 for a, b in zip(seq_a, seq_b) if a == b) / n_items
    # Chance agreement from the two annotators' label marginals; labels used
    # by only one annotator contribute zero, so the shared labels suffice.
    shared = counts_a.keys() & counts_b.keys()
    expected = sum(counts_a[c] * counts_b[c] for c in shared) / (n_items * n_items)
    if isclose(expected, 1.0):
        return 1.0 if isclose(observed, 1.0) else 0.0
    return (observed - expected) / (1 - expected)
def fleiss_kappa(per_item_label_counts: List[Dict[str, int]]) -> float:
    """Compute Fleiss' kappa for >=2 annotators on nominal categories.

    Args:
        per_item_label_counts: One dict per item mapping label -> number of
            annotators who chose it. Items with fewer than two ratings are
            skipped. The remaining items are expected to share one rating
            count N; when they do not, only the items with the most common
            N are used (ties go to the N seen first) and a warning is logged.

    Returns:
        Fleiss' kappa as a float, or NaN when no item has at least two
        ratings. When every rating falls in a single category (expected
        agreement of 1), 1.0 is returned.
    """
    # Pair every item with its rating total so each total is summed once,
    # then drop items with fewer than two ratings.
    sized_items = [
        (sum(counts.values()), counts) for counts in per_item_label_counts
    ]
    sized_items = [(total, counts) for total, counts in sized_items if total >= 2]
    if not sized_items:
        return float("nan")

    # Most common N. Counter keeps first-seen order, so a tie resolves to the
    # N that appears first in the input and never raises.
    n_frequencies = Counter(total for total, _ in sized_items)
    n = n_frequencies.most_common(1)[0][0]
    rated = [counts for total, counts in sized_items if total == n]
    if len(n_frequencies) > 1:
        # Dropping data silently hides annotation gaps, so report it.
        logging.getLogger(__name__).warning(
            "fleiss_kappa: unequal ratings per item; using %d of %d items with N=%d",
            len(rated),
            len(sized_items),
            n,
        )
    n_items = len(rated)

    # Per-item agreement P_i: share of agreeing annotator pairs in the item.
    pair_norm = n * (n - 1)
    p_is = [
        (sum(count ** 2 for count in counts.values()) - n) / pair_norm
        for counts in rated
    ]
    p_bar = sum(p_is) / n_items

    # Marginal proportion of all ratings that fall in each category.
    category_totals = Counter()
    for counts in rated:
        category_totals.update(counts)
    total_ratings = n_items * n
    # Sorted so the floating-point summation order is stable across runs.
    p_js = [category_totals[c] / total_ratings for c in sorted(category_totals)]
    p_e = sum(p * p for p in p_js)

    if isclose(p_e, 1.0):
        return 1.0 if isclose(p_bar, 1.0) else 0.0
    return (p_bar - p_e) / (1 - p_e)
def pairwise_cohen_kappa(annotations_by_user: Dict[str, Sequence]) -> float:
    """Return the mean Cohen's kappa over every distinct pair of annotators.

    Args:
        annotations_by_user: Maps user_id -> label sequence. Sequences are
            compared position by position from the first item; when two
            users' sequences differ in length, the pair is scored on the
            leading items up to the shorter length.

    Returns:
        The unweighted mean of the per-pair kappas, or NaN when fewer than
        two users are given or no pair yields a defined kappa.
    """
    from itertools import combinations
    from math import isnan

    if len(annotations_by_user) < 2:
        return float("nan")

    # Materialise each user's labels once rather than once per pair.
    sequences = [list(labels) for labels in annotations_by_user.values()]

    kappas = []
    for seq_a, seq_b in combinations(sequences, 2):
        overlap = min(len(seq_a), len(seq_b))
        if overlap == 0:
            continue
        try:
            kappa = cohen_kappa(seq_a[:overlap], seq_b[:overlap])
        except ValueError:
            # Best effort: a pair that cannot be scored is left out.
            continue
        # An undefined (NaN) pair would otherwise turn the whole mean to NaN.
        if not isnan(kappa):
            kappas.append(kappa)

    if not kappas:
        return float("nan")
    return sum(kappas) / len(kappas)