codebook / potato /bws_tuple_generator.py
davidjurgens's picture
Deploy: Potato — Codebook Annotation
aceb1b2 verified
Raw
History Blame Contribute Delete
4.69 kB
"""
Best-Worst Scaling Tuple Generator
Generates tuples of K items from a pool for Best-Worst Scaling annotation.
Each tuple is a synthetic instance containing references to K original pool items.
Annotators select the "best" and "worst" item from each tuple.
Key features:
- Reproducible via random seed
- Configurable tuple size and number of tuples
- Auto-calculates num_tuples based on Louviere's guideline (2 * tuple_size appearances per item)
- Each item appears in multiple tuples; no item repeats within a single tuple
"""
import logging
import math
import random
from typing import Any, Dict, List, Optional
logger = logging.getLogger(__name__)
# Position labels: A, B, C, ... Z
POSITION_LABELS = [chr(ord('A') + i) for i in range(26)]
class BwsTupleGenerator:
"""Generate BWS tuples from a pool of items."""
def __init__(
self,
pool_items: List[Dict[str, Any]],
id_key: str,
text_key: str,
tuple_size: int = 4,
num_tuples: Optional[int] = None,
seed: int = 42,
min_item_appearances: Optional[int] = None,
):
self.pool_items = pool_items
self.id_key = id_key
self.text_key = text_key
self.tuple_size = tuple_size
self.seed = seed
self.min_item_appearances = min_item_appearances
self._num_tuples = num_tuples
def validate(self):
"""Validate configuration before generation."""
if self.tuple_size < 2:
raise ValueError(f"tuple_size must be >= 2, got {self.tuple_size}")
if self.tuple_size > len(self.pool_items):
raise ValueError(
f"tuple_size ({self.tuple_size}) exceeds pool size ({len(self.pool_items)})"
)
if self._num_tuples is not None and self._num_tuples < 1:
raise ValueError(f"num_tuples must be >= 1, got {self._num_tuples}")
def _calculate_num_tuples(self) -> int:
"""Auto-calculate number of tuples.
Uses Louviere's guideline: each item should appear at least
2 * tuple_size times across all tuples.
"""
min_appearances = self.min_item_appearances
if min_appearances is None:
min_appearances = 2 * self.tuple_size
pool_size = len(self.pool_items)
# Each tuple uses tuple_size items, so on average each item appears
# (num_tuples * tuple_size) / pool_size times.
# We need: (num_tuples * tuple_size) / pool_size >= min_appearances
num_tuples = math.ceil(pool_size * min_appearances / self.tuple_size)
return max(num_tuples, 1)
def generate(self) -> List[Dict[str, Any]]:
"""Generate tuple instances from pool items.
Returns list of synthetic item dicts, each with:
- id_key: "bws_tuple_001"
- "_bws_items": list of {source_id, text, position} dicts
- "_bws_tuple_size": int
- text_key: "" (empty — BWS JS handles display)
"""
self.validate()
num_tuples = self._num_tuples if self._num_tuples else self._calculate_num_tuples()
rng = random.Random(self.seed)
logger.info(
f"Generating {num_tuples} BWS tuples of size {self.tuple_size} "
f"from pool of {len(self.pool_items)} items (seed={self.seed})"
)
tuples = []
for i in range(num_tuples):
sampled = rng.sample(self.pool_items, self.tuple_size)
bws_items = []
for pos_idx, item in enumerate(sampled):
bws_items.append({
"source_id": str(item[self.id_key]),
"text": str(item.get(self.text_key, "")),
"position": POSITION_LABELS[pos_idx],
})
tuple_id = f"bws_tuple_{i + 1:04d}"
tuple_instance = {
self.id_key: tuple_id,
self.text_key: "",
"_bws_items": bws_items,
"_bws_tuple_size": self.tuple_size,
}
tuples.append(tuple_instance)
# Log coverage statistics
item_counts = {}
for t in tuples:
for bws_item in t["_bws_items"]:
sid = bws_item["source_id"]
item_counts[sid] = item_counts.get(sid, 0) + 1
min_count = min(item_counts.values()) if item_counts else 0
max_count = max(item_counts.values()) if item_counts else 0
avg_count = sum(item_counts.values()) / len(item_counts) if item_counts else 0
logger.info(
f"BWS tuple coverage: min={min_count}, max={max_count}, avg={avg_count:.1f} "
f"appearances per item"
)
return tuples