#!/usr/bin/env python3
"""
generate_assignments.py
Reads corpora.json and annotator_config.yaml, distributes available docs
across annotators with configurable overlap per corpus, and writes back
the updated config.
Usage:
python3 generate_assignments.py # Generate and save
python3 generate_assignments.py --dry-run # Preview only
python3 generate_assignments.py --upload # Upload config to HF
Requires: pyyaml, huggingface_hub (for --upload)
"""
import argparse
import json
import random
import sys
from pathlib import Path
# Fail fast with an actionable install hint when PyYAML is missing.
try:
    import yaml
except ImportError:
    print("❌ pyyaml required: uv pip install pyyaml")
    sys.exit(1)

# Data files live next to this script under annotation_data/.
CONFIG_PATH = Path(__file__).parent / "annotation_data" / "annotator_config.yaml"
CORPORA_PATH = Path(__file__).parent / "annotation_data" / "corpora.json"
def load_config():
    """Parse annotator_config.yaml and return its contents as a dict."""
    raw = CONFIG_PATH.read_text()
    return yaml.safe_load(raw)
def save_config(config):
    """Serialize *config* back to annotator_config.yaml, preserving key order."""
    serialized = yaml.dump(config, default_flow_style=False, sort_keys=False)
    CONFIG_PATH.write_text(serialized)
def load_corpora():
    """Parse corpora.json and return the list of corpus definitions."""
    text = CORPORA_PATH.read_text()
    return json.loads(text)
def get_available_docs(corpus):
    """Return the sorted doc indices that are active for *corpus*.

    A doc counts as active when its links entry has completed
    revalidation (``has_revalidation``) with ``status == "success"``.
    A missing links file is treated as an empty corpus (with a warning)
    rather than an error, so one bad corpus doesn't abort the run.

    Args:
        corpus: Corpus dict with at least ``id`` and ``links_file`` keys.

    Returns:
        Sorted list of integer doc indices (empty if no links file).
    """
    links_path = Path(__file__).parent / "annotation_data" / corpus["links_file"]
    if not links_path.exists():
        print(f" ⚠️ No links file for {corpus['id']}: {links_path}")
        return []
    links = json.loads(links_path.read_text())
    # Generator instead of a throwaway list; `entry` instead of ambiguous `l`.
    return sorted(
        entry["index"]
        for entry in links
        if entry.get("has_revalidation") and entry.get("status") == "success"
    )
def generate_assignments(config, corpora, seed=42):
    """Distribute available docs across annotators, per corpus.

    A configurable percentage of each corpus (``settings.overlap_percent``,
    default 10) is assigned to every annotator so inter-annotator agreement
    can be measured; the remaining docs are split as evenly as possible.

    Args:
        config: Parsed annotator_config.yaml dict; mutated in place
            (each annotator gains a ``docs`` mapping of corpus id -> indices).
        corpora: List of corpus dicts from corpora.json.
        seed: Seed for the shuffle so assignments are reproducible.

    Returns:
        The (mutated) config dict.
    """
    settings = config.get("settings", {})
    overlap_pct = settings.get("overlap_percent", 10)
    annotators = config.get("annotators", [])
    if not annotators:
        print("❌ No annotators defined in config.")
        return config
    n_annotators = len(annotators)
    # Dedicated RNG: reproducible without disturbing global random state.
    rng = random.Random(seed)
    # Ensure each annotator has a per-corpus docs mapping.
    for ann in annotators:
        if not isinstance(ann.get("docs"), dict):
            ann["docs"] = {}
    for corpus in corpora:
        cid = corpus["id"]
        all_docs = get_available_docs(corpus)
        n_docs = len(all_docs)
        if n_docs == 0:
            print(f"\n📚 {corpus['name']} ({cid}): no docs available")
            continue
        # At least one overlap doc, even for tiny corpora.
        n_overlap = max(1, round(n_docs * overlap_pct / 100))
        shuffled = all_docs.copy()
        rng.shuffle(shuffled)
        overlap_docs = sorted(shuffled[:n_overlap])
        remaining = shuffled[n_overlap:]
        per_annotator = len(remaining) // n_annotators
        # The first `extra` annotators absorb the remainder (one extra doc each).
        extra = len(remaining) % n_annotators
        print(f"\n📚 {corpus['name']} ({cid}):")
        print(f" Total docs: {n_docs}")
        print(f" Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
        print(f" Per annotator: ~{per_annotator + n_overlap} docs each")
        print(f" Overlap docs: {overlap_docs}")
        start = 0
        for i, ann in enumerate(annotators):
            count = per_annotator + (1 if i < extra else 0)
            exclusive = sorted(remaining[start:start + count])
            start += count
            ann["docs"][cid] = sorted(overlap_docs + exclusive)
            print(f" {ann['username']}: {len(ann['docs'][cid])} docs "
                  f"({n_overlap} overlap + {len(exclusive)} exclusive)")
    return config
def upload_config():
    """Upload annotator_config.yaml to the HF dataset repo.

    The token comes from the HF_TOKEN environment variable, falling back
    to an ``HF_TOKEN=`` line in a sibling .env file. Prints a message and
    returns (no raise) when the token or huggingface_hub is missing.
    """
    try:
        from huggingface_hub import HfApi
        import os
        token = os.environ.get("HF_TOKEN")
        if not token:
            # Fallback: read HF_TOKEN from a local .env file.
            env_path = Path(__file__).parent / ".env"
            if env_path.exists():
                for line in env_path.read_text().splitlines():
                    if line.startswith("HF_TOKEN="):
                        token = line.split("=", 1)[1].strip()
                        break  # first match wins; don't keep scanning
        if not token:
            print("❌ No HF_TOKEN found.")
            return
        api = HfApi(token=token)
        api.upload_file(
            path_or_fileobj=str(CONFIG_PATH),
            path_in_repo="annotation_data/annotator_config.yaml",
            repo_id="ai4data/annotation_data",
            repo_type="dataset",
            commit_message="Update annotator assignments",
        )
        print("✅ Uploaded annotator_config.yaml to HF")
    except ImportError:
        print("❌ huggingface_hub required: uv pip install huggingface_hub")
def main():
    """CLI entry point: load data, generate assignments, then save/preview/upload."""
    parser = argparse.ArgumentParser(description="Generate document assignments per corpus")
    parser.add_argument("--dry-run", action="store_true", help="Preview only")
    parser.add_argument("--upload", action="store_true", help="Upload config to HF")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()
    corpora = load_corpora()
    config = load_config()
    print(f"📋 Loaded {len(corpora)} corpora, {len(config.get('annotators', []))} annotators")
    config = generate_assignments(config, corpora, seed=args.seed)
    if args.dry_run:
        print("\n[DRY RUN] Would save:")
        print(yaml.dump(config, default_flow_style=False, sort_keys=False))
    else:
        save_config(config)
        print(f"\n💾 Saved to {CONFIG_PATH}")
        # Only upload what was actually written; --upload is a no-op with --dry-run.
        if args.upload:
            upload_config()
    print("\n✅ Done!")


if __name__ == "__main__":
    main()