File size: 5,467 Bytes
79ba9a0
 
 
 
a2c885c
 
 
79ba9a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2c885c
79ba9a0
 
 
 
 
 
 
 
 
 
a2c885c
 
 
 
 
 
 
 
 
 
 
79ba9a0
 
 
 
 
 
a2c885c
 
79ba9a0
 
 
 
 
 
 
 
 
a2c885c
79ba9a0
a2c885c
 
 
 
79ba9a0
a2c885c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79ba9a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a2c885c
79ba9a0
 
 
 
 
a2c885c
79ba9a0
a2c885c
 
 
79ba9a0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#!/usr/bin/env python3
"""
generate_assignments.py

Reads corpora.json and annotator_config.yaml, distributes available docs
across annotators with configurable overlap per corpus, and writes back
the updated config.

Usage:
    python3 generate_assignments.py                 # Generate and save
    python3 generate_assignments.py --dry-run        # Preview only
    python3 generate_assignments.py --upload          # Upload config to HF

Requires: pyyaml, huggingface_hub (for --upload)
"""

import argparse
import json
import random
import sys
from pathlib import Path

try:
    import yaml
except ImportError:
    print("❌ pyyaml required: uv pip install pyyaml")
    sys.exit(1)

# Both data files live in the "annotation_data" directory next to this script.
_ANNOTATION_DIR = Path(__file__).parent / "annotation_data"
CONFIG_PATH = _ANNOTATION_DIR / "annotator_config.yaml"
CORPORA_PATH = _ANNOTATION_DIR / "corpora.json"


def load_config():
    """Read the YAML file at CONFIG_PATH and return the safely parsed result."""
    raw_yaml = CONFIG_PATH.read_text()
    return yaml.safe_load(raw_yaml)


def save_config(config):
    """Write *config* to CONFIG_PATH as block-style YAML, preserving key order."""
    rendered = yaml.dump(config, default_flow_style=False, sort_keys=False)
    CONFIG_PATH.write_text(rendered)


def load_corpora():
    """Read the JSON file at CORPORA_PATH and return the decoded value."""
    raw_json = CORPORA_PATH.read_text()
    return json.loads(raw_json)


def get_available_docs(corpus):
    """Return the sorted ``index`` values of the usable docs of *corpus*.

    The links file named by ``corpus["links_file"]`` is resolved against the
    ``annotation_data`` directory next to this script. An entry is usable when
    its ``has_revalidation`` value is truthy and its ``status`` equals
    ``"success"``. If the links file does not exist, a warning is printed and
    an empty list is returned.
    """
    links_file = Path(__file__).parent / "annotation_data" / corpus["links_file"]
    if not links_file.exists():
        print(f"  ⚠️  No links file for {corpus['id']}: {links_file}")
        return []

    usable = []
    for entry in json.loads(links_file.read_text()):
        if entry.get("has_revalidation") and entry.get("status") == "success":
            usable.append(entry["index"])
    usable.sort()
    return usable


def generate_assignments(config, corpora, seed=42):
    """Distribute docs across annotators with overlap, per corpus.

    For each corpus the available docs are shuffled with a seeded RNG. The
    first ``overlap_percent`` percent of them (always at least one doc) are
    assigned to every annotator; the rest are split into contiguous,
    near-equal exclusive shares, with the first annotators receiving one
    extra doc when the split is uneven. Corpora with no available docs are
    skipped, leaving any existing assignment for that corpus untouched.

    Args:
        config: Parsed annotator config dict. ``settings.overlap_percent``
            (default 10) and ``annotators`` are read; each annotator's
            ``docs`` mapping is updated in place, keyed by corpus id.
        corpora: Iterable of corpus dicts providing ``id`` and ``name`` plus
            whatever ``get_available_docs`` reads.
        seed: Seed for the shuffle, so repeated runs give the same result.

    Returns:
        The same ``config`` object that was passed in.
    """
    # ``or`` (rather than a .get default) also covers keys that are present
    # in the YAML but have a null value.
    settings = config.get("settings") or {}
    overlap_pct = settings.get("overlap_percent", 10)
    if overlap_pct is None:
        overlap_pct = 10
    annotators = config.get("annotators") or []

    if not annotators:
        print("❌ No annotators defined in config.")
        return config

    n_annotators = len(annotators)
    rng = random.Random(seed)

    # Initialize per-corpus doc dicts (replaces any non-dict ``docs`` value).
    for ann in annotators:
        if not isinstance(ann.get("docs"), dict):
            ann["docs"] = {}

    for corpus in corpora:
        cid = corpus["id"]
        all_docs = get_available_docs(corpus)
        n_docs = len(all_docs)

        if n_docs == 0:
            print(f"\n📂 {corpus['name']} ({cid}): no docs available")
            continue

        # At least one shared doc, but never more than the corpus contains,
        # so the reported overlap count matches what is actually assigned.
        n_overlap = min(n_docs, max(1, round(n_docs * overlap_pct / 100)))

        shuffled = list(all_docs)
        rng.shuffle(shuffled)

        overlap_docs = sorted(shuffled[:n_overlap])
        remaining = shuffled[n_overlap:]

        per_annotator, extra = divmod(len(remaining), n_annotators)

        print(f"\n📂 {corpus['name']} ({cid}):")
        print(f"  Total docs:       {n_docs}")
        print(f"  Overlap ({overlap_pct}%):   {n_overlap} docs shared by all")
        print(f"  Per annotator:    ~{per_annotator + n_overlap} docs each")
        print(f"  Overlap docs:     {overlap_docs}")

        start = 0
        for i, ann in enumerate(annotators):
            # The first ``extra`` annotators absorb the remainder, one each.
            count = per_annotator + (1 if i < extra else 0)
            exclusive = sorted(remaining[start:start + count])
            start += count

            ann["docs"][cid] = sorted(overlap_docs + exclusive)
            print(f"  {ann['username']}: {len(ann['docs'][cid])} docs "
                  f"({n_overlap} overlap + {len(exclusive)} exclusive)")

    return config


def upload_config():
    """Upload annotator_config.yaml to the HF dataset repo.

    The token is taken from the ``HF_TOKEN`` environment variable, falling
    back to an ``HF_TOKEN=...`` line in a ``.env`` file next to this script
    (the last such line wins). Prints a message and returns without uploading
    when ``huggingface_hub`` is not installed or no token can be found.
    Errors raised by the upload itself propagate to the caller.
    """
    import os

    # Guard only the import: an ImportError raised later (e.g. from inside the
    # upload call) must not be misreported as a missing dependency.
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("❌ huggingface_hub required: uv pip install huggingface_hub")
        return

    token = os.environ.get("HF_TOKEN")
    if not token:
        env_path = Path(__file__).parent / ".env"
        if env_path.exists():
            for line in env_path.read_text().splitlines():
                if line.startswith("HF_TOKEN="):
                    # .env values are often quoted; the quotes are not part
                    # of the token.
                    token = line.split("=", 1)[1].strip().strip("\"'")

    if not token:
        print("❌ No HF_TOKEN found.")
        return

    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=str(CONFIG_PATH),
        path_in_repo="annotation_data/annotator_config.yaml",
        repo_id="ai4data/annotation_data",
        repo_type="dataset",
        commit_message="Update annotator assignments",
    )
    print("✅ Uploaded annotator_config.yaml to HF")


def main():
    """Parse CLI flags, regenerate the assignments, then preview or save them.

    ``--dry-run`` prints the resulting YAML instead of writing it. ``--upload``
    pushes the saved config to HF and only takes effect when not in dry-run
    mode. ``--seed`` is forwarded to ``generate_assignments``.
    """
    parser = argparse.ArgumentParser(description="Generate document assignments per corpus")
    parser.add_argument("--dry-run", action="store_true", help="Preview only")
    parser.add_argument("--upload", action="store_true", help="Upload config to HF")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()

    corpora = load_corpora()
    # An empty YAML file parses to None; treat it as an empty config.
    config = load_config() or {}

    # ``annotators:`` with no value parses to None, which len() rejects.
    n_annotators = len(config.get("annotators") or [])
    print(f"📋 Loaded {len(corpora)} corpora, {n_annotators} annotators")
    config = generate_assignments(config, corpora, seed=args.seed)

    if args.dry_run:
        print("\n[DRY RUN] Would save:")
        print(yaml.dump(config, default_flow_style=False, sort_keys=False))
    else:
        save_config(config)
        print(f"\n💾 Saved to {CONFIG_PATH}")

        if args.upload:
            upload_config()

    print("\n✅ Done!")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()