#!/usr/bin/env python3
"""
generate_assignments.py

Reads corpora.json and annotator_config.yaml, distributes available docs
across annotators with configurable overlap per corpus, and writes back
the updated config.

Usage:
    python3 generate_assignments.py              # Generate and save
    python3 generate_assignments.py --dry-run    # Preview only
    python3 generate_assignments.py --upload     # Upload config to HF

Requires: pyyaml, huggingface_hub (for --upload)
"""

import argparse
import json
import os
import random
import sys
from pathlib import Path

try:
    import yaml
except ImportError:
    print("❌ pyyaml required: uv pip install pyyaml")
    sys.exit(1)

# All inputs and outputs live in the "annotation_data" directory next to
# this script.
_DATA_DIR = Path(__file__).parent / "annotation_data"
CONFIG_PATH = _DATA_DIR / "annotator_config.yaml"
CORPORA_PATH = _DATA_DIR / "corpora.json"


def load_config():
    """Load the annotator config from ``CONFIG_PATH``.

    Returns:
        The parsed YAML content, or an empty dict when the file is empty
        (``yaml.safe_load`` yields ``None`` for an empty document).
    """
    return yaml.safe_load(CONFIG_PATH.read_text(encoding="utf-8")) or {}


def save_config(config):
    """Write ``config`` to ``CONFIG_PATH`` as block-style YAML.

    Key order is preserved (``sort_keys=False``) so the file keeps the
    layout it was loaded with.
    """
    CONFIG_PATH.write_text(
        yaml.dump(config, default_flow_style=False, sort_keys=False),
        encoding="utf-8",
    )


def load_corpora():
    """Load and return the parsed content of ``CORPORA_PATH``."""
    return json.loads(CORPORA_PATH.read_text(encoding="utf-8"))


def get_available_docs(corpus):
    """Get list of active doc indices for a given corpus.

    Args:
        corpus: Mapping with at least ``"id"`` and ``"links_file"``; the
            links file is resolved relative to the annotation data dir.

    Returns:
        Sorted ``"index"`` values of every link entry that has a truthy
        ``"has_revalidation"`` and ``"status" == "success"``. An empty list
        (plus a printed warning) when the links file does not exist.
    """
    links_path = _DATA_DIR / corpus["links_file"]
    if not links_path.exists():
        print(f" ⚠️ No links file for {corpus['id']}: {links_path}")
        return []

    links = json.loads(links_path.read_text(encoding="utf-8"))
    return sorted(
        link["index"]
        for link in links
        if link.get("has_revalidation") and link.get("status") == "success"
    )


def generate_assignments(config, corpora, seed=42):
    """Distribute docs across annotators with overlap, per corpus.

    For each corpus the available docs are shuffled with a seeded RNG; the
    first ``overlap_percent`` percent (at least one doc) are assigned to
    every annotator, and the rest are split as evenly as possible between
    annotators.

    Args:
        config: Parsed annotator config. Reads ``settings.overlap_percent``
            (default 10) and ``annotators``; each annotator's ``docs``
            mapping is updated in place.
        corpora: Iterable of corpus mappings (``"id"``, ``"name"``,
            ``"links_file"``).
        seed: Seed for the shuffle, so runs are reproducible.

    Returns:
        The same ``config`` object, mutated. Corpora without available docs
        are skipped, leaving any existing assignment for them untouched.
    """
    # `or` fallbacks also cover keys that are present but empty in the YAML
    # (e.g. a bare "settings:" line parses to None).
    settings = config.get("settings") or {}
    overlap_pct = settings.get("overlap_percent", 10)
    annotators = config.get("annotators") or []

    if not annotators:
        print("❌ No annotators defined in config.")
        return config

    n_annotators = len(annotators)
    rng = random.Random(seed)

    # Initialize per-corpus doc dicts
    for ann in annotators:
        if not isinstance(ann.get("docs"), dict):
            ann["docs"] = {}

    for corpus in corpora:
        cid = corpus["id"]
        all_docs = get_available_docs(corpus)
        n_docs = len(all_docs)

        if n_docs == 0:
            print(f"\n📂 {corpus['name']} ({cid}): no docs available")
            continue

        # At least one shared doc, but never more than exist (guards
        # against overlap_percent > 100 producing wrong reported counts).
        n_overlap = min(n_docs, max(1, round(n_docs * overlap_pct / 100)))

        shuffled = all_docs.copy()
        rng.shuffle(shuffled)

        overlap_docs = sorted(shuffled[:n_overlap])
        remaining = shuffled[n_overlap:]

        # The first `extra` annotators each take one leftover doc.
        per_annotator, extra = divmod(len(remaining), n_annotators)

        print(f"\n📂 {corpus['name']} ({cid}):")
        print(f" Total docs: {n_docs}")
        print(f" Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
        print(f" Per annotator: ~{per_annotator + n_overlap} docs each")
        print(f" Overlap docs: {overlap_docs}")

        start = 0
        for i, ann in enumerate(annotators):
            count = per_annotator + (1 if i < extra else 0)
            exclusive = sorted(remaining[start:start + count])
            start += count

            ann["docs"][cid] = sorted(overlap_docs + exclusive)
            print(f" {ann['username']}: {len(ann['docs'][cid])} docs "
                  f"({n_overlap} overlap + {len(exclusive)} exclusive)")

    return config


def upload_config():
    """Upload annotator_config.yaml to HF.

    The token is read from the ``HF_TOKEN`` environment variable, falling
    back to an ``HF_TOKEN=...`` line in a ``.env`` file next to this script.
    Prints a message and returns without uploading when ``huggingface_hub``
    is not installed or no token is found.
    """
    # Keep the try narrow: only a missing package should produce the
    # "huggingface_hub required" message, not an error during the upload.
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("❌ huggingface_hub required: uv pip install huggingface_hub")
        return

    token = os.environ.get("HF_TOKEN")
    if not token:
        env_path = Path(__file__).parent / ".env"
        if env_path.exists():
            for line in env_path.read_text(encoding="utf-8").splitlines():
                if line.startswith("HF_TOKEN="):
                    # .env values may be quoted; drop surrounding quotes.
                    token = line.split("=", 1)[1].strip().strip("\"'")

    if not token:
        print("❌ No HF_TOKEN found.")
        return

    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=str(CONFIG_PATH),
        path_in_repo="annotation_data/annotator_config.yaml",
        repo_id="ai4data/annotation_data",
        repo_type="dataset",
        commit_message="Update annotator assignments",
    )
    print("✅ Uploaded annotator_config.yaml to HF")


def main():
    """Parse CLI arguments, generate assignments, then save and/or upload."""
    parser = argparse.ArgumentParser(
        description="Generate document assignments per corpus"
    )
    parser.add_argument("--dry-run", action="store_true", help="Preview only")
    parser.add_argument("--upload", action="store_true",
                        help="Upload config to HF")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()

    corpora = load_corpora()
    config = load_config()
    n_annotators = len(config.get("annotators") or [])
    print(f"📋 Loaded {len(corpora)} corpora, {n_annotators} annotators")

    config = generate_assignments(config, corpora, seed=args.seed)

    if args.dry_run:
        print("\n[DRY RUN] Would save:")
        print(yaml.dump(config, default_flow_style=False, sort_keys=False))
        if args.upload:
            # Nothing was written, so the file on disk does not contain the
            # previewed assignments; uploading it would publish stale data.
            print("[DRY RUN] Skipping upload.")
    else:
        save_config(config)
        print(f"\n💾 Saved to {CONFIG_PATH}")
        if args.upload:
            upload_config()

    print("\n✅ Done!")


if __name__ == "__main__":
    main()