data-use-annotation / generate_assignments.py
rafmacalaba's picture
feat: multi-corpus support
a2c885c
#!/usr/bin/env python3
"""
generate_assignments.py
Reads corpora.json and annotator_config.yaml, distributes available docs
across annotators with configurable overlap per corpus, and writes back
the updated config.
Usage:
python3 generate_assignments.py # Generate and save
python3 generate_assignments.py --dry-run # Preview only
python3 generate_assignments.py --upload # Upload config to HF
Requires: pyyaml, huggingface_hub (for --upload)
"""
import argparse
import json
import random
import sys
from pathlib import Path
try:
import yaml
except ImportError:
print("❌ pyyaml required: uv pip install pyyaml")
sys.exit(1)
# Both input files live in the ``annotation_data`` directory next to this script.
_ANNOTATION_DIR = Path(__file__).parent / "annotation_data"

# YAML file holding settings and per-annotator assignments (read and rewritten).
CONFIG_PATH = _ANNOTATION_DIR / "annotator_config.yaml"
# JSON file describing the corpora to distribute (read only).
CORPORA_PATH = _ANNOTATION_DIR / "corpora.json"
def load_config():
    """Load the annotator configuration from ``CONFIG_PATH``.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default text encoding.

    Returns:
        The parsed YAML document. An empty file yields an empty dict instead
        of ``None`` (which is what ``yaml.safe_load`` returns for an empty
        document) so that ``.get`` lookups on the result keep working.
    """
    loaded = yaml.safe_load(CONFIG_PATH.read_text(encoding="utf-8"))
    return {} if loaded is None else loaded
def save_config(config):
    """Serialize *config* to YAML and overwrite ``CONFIG_PATH`` with it.

    The document is emitted in block style and keys keep their insertion
    order (no alphabetical sorting).
    """
    rendered = yaml.dump(config, default_flow_style=False, sort_keys=False)
    CONFIG_PATH.write_text(rendered)
def load_corpora():
    """Load the corpus definitions from ``CORPORA_PATH``.

    The file is decoded as UTF-8 explicitly rather than with the platform's
    default encoding, so non-ASCII text in the file parses the same everywhere.

    Returns:
        The parsed JSON value. The rest of this script iterates it as a
        sequence of corpus dicts.
    """
    return json.loads(CORPORA_PATH.read_text(encoding="utf-8"))
def get_available_docs(corpus):
    """Get the sorted list of active doc indices for a given corpus.

    Args:
        corpus: Corpus dict. ``corpus["links_file"]`` names a JSON file
            resolved against the ``annotation_data`` directory next to this
            script; ``corpus["id"]`` is only used in the warning message.

    Returns:
        The ``"index"`` values, ascending and without duplicates, of every
        link entry whose ``has_revalidation`` is truthy and whose ``status``
        equals ``"success"``. An empty list (after printing a warning) when
        the links file does not exist.
    """
    links_path = Path(__file__).parent / "annotation_data" / corpus["links_file"]
    if not links_path.exists():
        print(f" ⚠️ No links file for {corpus['id']}: {links_path}")
        return []

    # Decode explicitly as UTF-8 so parsing does not depend on the locale.
    links = json.loads(links_path.read_text(encoding="utf-8"))

    # A set drops repeated indices: a duplicated entry would otherwise let the
    # same doc be handed out more than once during assignment.
    return sorted({
        entry["index"]
        for entry in links
        if entry.get("has_revalidation") and entry.get("status") == "success"
    })
def generate_assignments(config, corpora, seed=42):
    """Distribute docs across annotators with overlap, per corpus.

    For every corpus the available docs are shuffled with a seeded RNG. The
    first ``n_overlap`` shuffled docs are given to every annotator; the rest
    are cut into consecutive slices, one per annotator, with the first
    ``len(remaining) % n_annotators`` annotators receiving one extra doc.
    A single RNG is shared by all corpora, so the shuffle of one corpus
    depends on the corpora processed before it.

    Args:
        config: Parsed annotator config. ``config["settings"]["overlap_percent"]``
            (default 10) sets the shared fraction; ``config["annotators"]`` is
            the list of annotator dicts, each with a ``"username"``.
        corpora: Iterable of corpus dicts with ``"id"`` and ``"name"`` keys,
            each passed to ``get_available_docs``.
        seed: Seed for the shuffle, making the assignment reproducible.

    Returns:
        The same ``config`` object, modified in place: each annotator's
        ``"docs"`` becomes a dict mapping corpus id to a sorted list of doc
        indices. A non-dict ``"docs"`` value is replaced by an empty dict.
        Corpora with no available docs are skipped and leave any existing
        entry for that corpus id untouched. If no annotators are defined the
        config is returned unchanged.
    """
    # ``settings:`` with no value parses to None in YAML; treat it as empty
    # instead of failing on ``None.get``.
    settings = config.get("settings") or {}
    overlap_pct = settings.get("overlap_percent", 10)
    annotators = config.get("annotators", [])
    if not annotators:
        print("❌ No annotators defined in config.")
        return config

    n_annotators = len(annotators)
    rng = random.Random(seed)

    # Initialize per-corpus doc dicts
    for ann in annotators:
        if not isinstance(ann.get("docs"), dict):
            ann["docs"] = {}

    for corpus in corpora:
        cid = corpus["id"]
        all_docs = get_available_docs(corpus)
        n_docs = len(all_docs)
        if n_docs == 0:
            print(f"\nπŸ“‚ {corpus['name']} ({cid}): no docs available")
            continue

        # Always share at least one doc, but never claim more shared docs than
        # exist (overlap_percent > 100 would otherwise skew the reported counts).
        n_overlap = min(n_docs, max(1, round(n_docs * overlap_pct / 100)))

        shuffled = all_docs.copy()
        rng.shuffle(shuffled)
        overlap_docs = sorted(shuffled[:n_overlap])
        remaining = shuffled[n_overlap:]
        per_annotator, extra = divmod(len(remaining), n_annotators)

        print(f"\nπŸ“‚ {corpus['name']} ({cid}):")
        print(f" Total docs: {n_docs}")
        print(f" Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
        print(f" Per annotator: ~{per_annotator + n_overlap} docs each")
        print(f" Overlap docs: {overlap_docs}")

        start = 0
        for i, ann in enumerate(annotators):
            # The first ``extra`` annotators absorb the division remainder.
            count = per_annotator + (1 if i < extra else 0)
            exclusive = sorted(remaining[start:start + count])
            start += count
            ann["docs"][cid] = sorted(overlap_docs + exclusive)
            print(f" {ann['username']}: {len(ann['docs'][cid])} docs "
                  f"({n_overlap} overlap + {len(exclusive)} exclusive)")

    return config
def upload_config():
    """Upload annotator_config.yaml to HF.

    The token is taken from the ``HF_TOKEN`` environment variable, falling
    back to an ``HF_TOKEN=`` line in a ``.env`` file next to this script.
    Prints an error and returns without uploading when ``huggingface_hub`` is
    not installed or no token can be found. Errors raised by the upload
    itself are not caught here.
    """
    # Only the import is guarded: an ImportError raised later (e.g. from
    # inside the upload call) must not be reported as a missing package.
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("❌ huggingface_hub required: uv pip install huggingface_hub")
        return

    import os

    token = os.environ.get("HF_TOKEN")
    if not token:
        token = _hf_token_from_env_file(Path(__file__).parent / ".env")
    if not token:
        print("❌ No HF_TOKEN found.")
        return

    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=str(CONFIG_PATH),
        path_in_repo="annotation_data/annotator_config.yaml",
        repo_id="ai4data/annotation_data",
        repo_type="dataset",
        commit_message="Update annotator assignments",
    )
    print("βœ… Uploaded annotator_config.yaml to HF")


def _hf_token_from_env_file(env_path):
    """Return the ``HF_TOKEN`` value from a dotenv-style file, or ``None``."""
    if not env_path.exists():
        return None
    token = None
    for line in env_path.read_text(encoding="utf-8").splitlines():
        if line.startswith("HF_TOKEN="):
            # The last assignment wins. Surrounding quotes, as in
            # HF_TOKEN="...", are dotenv syntax and not part of the token.
            token = line.split("=", 1)[1].strip().strip("'\"")
    return token
def main():
    """Command-line entry point: regenerate assignments, then preview or save.

    Flags:
        --dry-run: print the regenerated config instead of writing it. Nothing
            is saved, and nothing is uploaded even if ``--upload`` is given,
            because the file on disk would not contain the previewed result.
        --upload: after saving, upload the config file via ``upload_config``.
        --seed: random seed forwarded to ``generate_assignments``.
    """
    parser = argparse.ArgumentParser(description="Generate document assignments per corpus")
    parser.add_argument("--dry-run", action="store_true", help="Preview only")
    parser.add_argument("--upload", action="store_true", help="Upload config to HF")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()

    corpora = load_corpora()
    # An empty YAML file parses to None; fall back to an empty mapping so the
    # lookups below do not fail.
    config = load_config() or {}
    # ``annotators:`` with no value also parses to None.
    n_annotators = len(config.get("annotators") or [])
    print(f"πŸ“‹ Loaded {len(corpora)} corpora, {n_annotators} annotators")

    config = generate_assignments(config, corpora, seed=args.seed)

    if args.dry_run:
        print("\n[DRY RUN] Would save:")
        print(yaml.dump(config, default_flow_style=False, sort_keys=False))
        if args.upload:
            # Uploading here would push the stale on-disk file, not the preview.
            print("[DRY RUN] Skipping upload.")
    else:
        save_config(config)
        print(f"\nπŸ’Ύ Saved to {CONFIG_PATH}")
        if args.upload:
            upload_config()

    print("\nβœ… Done!")


if __name__ == "__main__":
    main()