Spaces:

ai4data
/

data-use-annotation

Running

File size: 5,467 Bytes

#!/usr/bin/env python3
"""
generate_assignments.py

Reads corpora.json and annotator_config.yaml, distributes available docs
across annotators with configurable overlap per corpus, and writes back
the updated config.

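Usage:
    python3 generate_assignments.py                 # Generate and save
    python3 generate_assignments.py --dry-run       # Preview only, nothing is written
    python3 generate_assignments.py --upload        # Generate, save, then upload config to HF
    python3 generate_assignments.py --seed 7        # Use a different shuffle seed (default: 42)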

Requires: pyyaml, huggingface_hub (for --upload)
"""

import argparse
import json
import random
import sys
from pathlib import Path

try:
    import yaml
except ImportError:
    print("❌ pyyaml required: uv pip install pyyaml")
    sys.exit(1)

# Both files live in the "annotation_data" directory that sits next to this script.
CONFIG_PATH = Path(__file__).parent.joinpath("annotation_data", "annotator_config.yaml")
CORPORA_PATH = Path(__file__).parent.joinpath("annotation_data", "corpora.json")


def load_config():
    """Load the annotator configuration from ``CONFIG_PATH``.

    Returns:
        The parsed YAML document, or an empty dict when the file holds no
        document (``yaml.safe_load`` yields ``None`` in that case, which
        would break the ``config.get(...)`` calls made by callers).
    """
    # Decode explicitly as UTF-8 so parsing does not depend on the
    # platform's locale encoding.
    return yaml.safe_load(CONFIG_PATH.read_text(encoding="utf-8")) or {}


def save_config(config):
    """Serialize ``config`` to YAML and overwrite ``CONFIG_PATH`` with it."""
    # sort_keys=False keeps mapping keys in their existing order instead of
    # sorting them alphabetically.
    serialized = yaml.dump(config, sort_keys=False, default_flow_style=False)
    CONFIG_PATH.write_text(serialized)


def load_corpora():
    """Load the corpus definitions from ``CORPORA_PATH``.

    Returns:
        The parsed JSON content of the file.
    """
    # Decode explicitly as UTF-8 (the standard JSON encoding) so the read
    # does not depend on the platform's locale encoding.
    return json.loads(CORPORA_PATH.read_text(encoding="utf-8"))


def get_available_docs(corpus):
    """Return the sorted indices of the usable documents of one corpus.

    An entry of the corpus links file counts as usable when its
    ``has_revalidation`` value is truthy and its ``status`` equals
    ``"success"``.

    Args:
        corpus: Mapping with ``links_file`` (path of the links JSON, resolved
            against the ``annotation_data`` directory next to this script)
            and ``id`` (used only in the warning message).

    Returns:
        Sorted list of the ``index`` values of the usable entries, or an
        empty list (after printing a warning) when the links file is missing.
    """
    links_path = Path(__file__).parent / "annotation_data" / corpus["links_file"]
    if not links_path.exists():
        print(f"  ⚠️  No links file for {corpus['id']}: {links_path}")
        return []

    # Decode explicitly as UTF-8 so the read does not depend on the
    # platform's locale encoding.
    entries = json.loads(links_path.read_text(encoding="utf-8"))
    return sorted(
        entry["index"]
        for entry in entries
        if entry.get("has_revalidation") and entry.get("status") == "success"
    )


def generate_assignments(config, corpora, seed=42):
    """Distribute each corpus's docs across annotators with a shared overlap.

    For every corpus the available docs are shuffled with a ``random.Random``
    seeded by ``seed``. The first ``overlap_percent`` percent of the shuffled
    docs (at least one, at most all of them) are assigned to every annotator;
    the rest are dealt out in contiguous slices so that exclusive shares
    differ by at most one doc.

    Args:
        config: Parsed annotator config. Reads ``settings.overlap_percent``
            (default 10) and ``annotators``. Each annotator's ``docs``
            mapping is updated in place, keyed by corpus id.
        corpora: Iterable of corpus mappings providing ``id``, ``name`` and
            whatever ``get_available_docs`` needs.
        seed: Seed for the shuffle.

    Returns:
        The same ``config`` object that was passed in (mutated in place).
    """
    # An empty `settings:` key in the YAML parses to None; treat that the
    # same as a missing key instead of failing on `None.get`.
    settings = config.get("settings") or {}
    overlap_pct = settings.get("overlap_percent", 10)
    annotators = config.get("annotators", [])

    if not annotators:
        print("❌ No annotators defined in config.")
        return config

    n_annotators = len(annotators)
    rng = random.Random(seed)

    # Any `docs` value that is not a dict is replaced by an empty
    # per-corpus mapping.
    for ann in annotators:
        if not isinstance(ann.get("docs"), dict):
            ann["docs"] = {}

    for corpus in corpora:
        cid = corpus["id"]
        all_docs = get_available_docs(corpus)
        n_docs = len(all_docs)

        if n_docs == 0:
            # Any existing entry for this corpus is left untouched.
            print(f"\n📂 {corpus['name']} ({cid}): no docs available")
            continue

        # At least one shared doc, but never more than exist: without the
        # upper clamp an overlap above 100% would report more overlap docs
        # than were actually assigned.
        n_overlap = min(n_docs, max(1, round(n_docs * overlap_pct / 100)))

        shuffled = all_docs.copy()
        rng.shuffle(shuffled)

        overlap_docs = sorted(shuffled[:n_overlap])
        remaining = shuffled[n_overlap:]

        per_annotator, extra = divmod(len(remaining), n_annotators)

        print(f"\n📂 {corpus['name']} ({cid}):")
        print(f"  Total docs:       {n_docs}")
        print(f"  Overlap ({overlap_pct}%):   {n_overlap} docs shared by all")
        print(f"  Per annotator:    ~{per_annotator + n_overlap} docs each")
        print(f"  Overlap docs:     {overlap_docs}")

        start = 0
        for i, ann in enumerate(annotators):
            # The first `extra` annotators absorb the remainder, one doc each.
            count = per_annotator + (1 if i < extra else 0)
            exclusive = sorted(remaining[start:start + count])
            start += count

            ann["docs"][cid] = sorted(overlap_docs + exclusive)
            print(f"  {ann['username']}: {len(ann['docs'][cid])} docs "
                  f"({n_overlap} overlap + {len(exclusive)} exclusive)")

    return config


def upload_config():
    """Upload ``annotator_config.yaml`` to the Hugging Face dataset repo.

    The token is taken from the ``HF_TOKEN`` environment variable, falling
    back to an ``HF_TOKEN=...`` line in a ``.env`` file next to this script
    (if several such lines exist, the last one wins). Prints a message and
    returns without uploading when ``huggingface_hub`` is not installed or
    when no token is found.
    """
    import os

    # Guard only the import: an ImportError raised later, during the upload
    # itself, must not be misreported as "huggingface_hub required".
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("❌ huggingface_hub required: uv pip install huggingface_hub")
        return

    token = os.environ.get("HF_TOKEN")
    if not token:
        env_path = Path(__file__).parent / ".env"
        if env_path.exists():
            for line in env_path.read_text().splitlines():
                if line.startswith("HF_TOKEN="):
                    # Drop whitespace and optional surrounding quotes, so
                    # both HF_TOKEN=abc and HF_TOKEN="abc" yield abc.
                    token = line.split("=", 1)[1].strip().strip("\"'")

    if not token:
        print("❌ No HF_TOKEN found.")
        return

    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=str(CONFIG_PATH),
        path_in_repo="annotation_data/annotator_config.yaml",
        repo_id="ai4data/annotation_data",
        repo_type="dataset",
        commit_message="Update annotator assignments",
    )
    print("✅ Uploaded annotator_config.yaml to HF")

def main():
    """CLI entry point: load inputs, generate assignments, preview or save.

    Flags:
        --dry-run: print the resulting config as YAML instead of writing it
            (``--upload`` has no effect in this mode).
        --upload: after saving, upload the config via ``upload_config``.
        --seed: seed passed to ``generate_assignments`` (default 42).
    """
    parser = argparse.ArgumentParser(description="Generate document assignments per corpus")
    parser.add_argument("--dry-run", action="store_true", help="Preview only")
    parser.add_argument("--upload", action="store_true", help="Upload config to HF")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()

    corpora = load_corpora()
    config = load_config()

    # An empty `annotators:` key in the YAML parses to None, which len()
    # rejects; count it as zero annotators instead of crashing here.
    n_annotators = len(config.get("annotators") or [])
    print(f"📋 Loaded {len(corpora)} corpora, {n_annotators} annotators")
    config = generate_assignments(config, corpora, seed=args.seed)

    if args.dry_run:
        print("\n[DRY RUN] Would save:")
        print(yaml.dump(config, default_flow_style=False, sort_keys=False))
    else:
        save_config(config)
        print(f"\n💾 Saved to {CONFIG_PATH}")

        if args.upload:
            upload_config()

    print("\n✅ Done!")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()