Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """ | |
| generate_assignments.py | |
| Reads corpora.json and annotator_config.yaml, distributes available docs | |
| across annotators with configurable overlap per corpus, and writes back | |
| the updated config. | |
| Usage: | |
| python3 generate_assignments.py # Generate and save | |
| python3 generate_assignments.py --dry-run # Preview only | |
| python3 generate_assignments.py --upload # Upload config to HF | |
| Requires: pyyaml, huggingface_hub (for --upload) | |
| """ | |
| import argparse | |
| import json | |
| import random | |
| import sys | |
| from pathlib import Path | |
| try: | |
| import yaml | |
| except ImportError: | |
| print("β pyyaml required: uv pip install pyyaml") | |
| sys.exit(1) | |
# Both data files live under annotation_data/ next to this script.
CONFIG_PATH = Path(__file__).parent / "annotation_data" / "annotator_config.yaml"
CORPORA_PATH = Path(__file__).parent / "annotation_data" / "corpora.json"
def load_config():
    """Parse annotator_config.yaml and return it as a dict."""
    raw = CONFIG_PATH.read_text()
    return yaml.safe_load(raw)
def save_config(config):
    """Serialize *config* back to annotator_config.yaml, keeping key order."""
    serialized = yaml.dump(config, default_flow_style=False, sort_keys=False)
    CONFIG_PATH.write_text(serialized)
def load_corpora():
    """Parse corpora.json and return the list of corpus dicts."""
    with CORPORA_PATH.open() as fh:
        return json.load(fh)
def get_available_docs(corpus):
    """Return the sorted active doc indices for *corpus*.

    A doc counts as active when its link entry has a truthy
    ``has_revalidation`` flag and ``status == "success"``. Returns an
    empty list (with a warning) when the links file is missing.
    """
    links_path = Path(__file__).parent / "annotation_data" / corpus["links_file"]
    if not links_path.exists():
        print(f" β οΈ No links file for {corpus['id']}: {links_path}")
        return []
    entries = json.loads(links_path.read_text())
    active = []
    for entry in entries:
        if entry.get("has_revalidation") and entry.get("status") == "success":
            active.append(entry["index"])
    active.sort()
    return active
def generate_assignments(config, corpora, seed=42):
    """Distribute available docs across annotators with overlap, per corpus.

    For each corpus, a shared "overlap" subset (sized by the
    ``settings.overlap_percent`` key, default 10) is assigned to every
    annotator so inter-annotator agreement can be measured; the remaining
    docs are split as evenly as possible, with the first
    ``len(remaining) % n_annotators`` annotators receiving one extra doc.

    Args:
        config: Parsed annotator_config.yaml dict. Mutated in place —
            each annotator's ``docs[corpus_id]`` list is rewritten.
        corpora: List of corpus dicts from corpora.json.
        seed: Seed for the shuffle, so assignments are reproducible.

    Returns:
        The (mutated) config dict.
    """
    settings = config.get("settings", {})
    overlap_pct = settings.get("overlap_percent", 10)
    annotators = config.get("annotators", [])
    if not annotators:
        print("β No annotators defined in config.")
        return config
    n_annotators = len(annotators)
    rng = random.Random(seed)
    # Ensure every annotator has a per-corpus docs mapping.
    for ann in annotators:
        if not isinstance(ann.get("docs"), dict):
            ann["docs"] = {}
    for corpus in corpora:
        cid = corpus["id"]
        all_docs = get_available_docs(corpus)
        n_docs = len(all_docs)
        if n_docs == 0:
            print(f"\nπ {corpus['name']} ({cid}): no docs available")
            continue
        # Respect an explicit 0% (no shared docs at all); otherwise share
        # at least one doc even when rounding would yield zero.
        n_overlap = max(1, round(n_docs * overlap_pct / 100)) if overlap_pct > 0 else 0
        shuffled = all_docs.copy()
        rng.shuffle(shuffled)
        overlap_docs = sorted(shuffled[:n_overlap])
        remaining = shuffled[n_overlap:]
        per_annotator = len(remaining) // n_annotators
        extra = len(remaining) % n_annotators
        print(f"\nπ {corpus['name']} ({cid}):")
        print(f" Total docs: {n_docs}")
        print(f" Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
        print(f" Per annotator: ~{per_annotator + n_overlap} docs each")
        print(f" Overlap docs: {overlap_docs}")
        start = 0
        for i, ann in enumerate(annotators):
            # The first `extra` annotators absorb the remainder docs.
            count = per_annotator + (1 if i < extra else 0)
            exclusive = sorted(remaining[start:start + count])
            start += count
            ann["docs"][cid] = sorted(overlap_docs + exclusive)
            print(f" {ann['username']}: {len(ann['docs'][cid])} docs "
                  f"({n_overlap} overlap + {len(exclusive)} exclusive)")
    return config
def upload_config():
    """Upload annotator_config.yaml to the HF dataset repo.

    The token is read from the HF_TOKEN environment variable, falling
    back to an ``HF_TOKEN=...`` line in a sibling ``.env`` file. Prints
    an error (instead of raising) when huggingface_hub is not installed
    or no token can be found.
    """
    # Narrow try: only the optional dependency's import is guarded, so a
    # later ImportError from unrelated code is not mis-reported.
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("β huggingface_hub required: uv pip install huggingface_hub")
        return
    import os
    token = os.environ.get("HF_TOKEN")
    if not token:
        env_path = Path(__file__).parent / ".env"
        if env_path.exists():
            for line in env_path.read_text().splitlines():
                if line.startswith("HF_TOKEN="):
                    token = line.split("=", 1)[1].strip()
                    break  # first match wins; no need to keep scanning
    if not token:
        print("β No HF_TOKEN found.")
        return
    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=str(CONFIG_PATH),
        path_in_repo="annotation_data/annotator_config.yaml",
        repo_id="ai4data/annotation_data",
        repo_type="dataset",
        commit_message="Update annotator assignments",
    )
    print("β Uploaded annotator_config.yaml to HF")
def main():
    """CLI entry point: parse args, build assignments, then save/upload."""
    arg_parser = argparse.ArgumentParser(description="Generate document assignments per corpus")
    arg_parser.add_argument("--dry-run", action="store_true", help="Preview only")
    arg_parser.add_argument("--upload", action="store_true", help="Upload config to HF")
    arg_parser.add_argument("--seed", type=int, default=42, help="Random seed")
    opts = arg_parser.parse_args()

    corpora = load_corpora()
    config = load_config()
    print(f"π Loaded {len(corpora)} corpora, {len(config.get('annotators', []))} annotators")
    config = generate_assignments(config, corpora, seed=opts.seed)

    if not opts.dry_run:
        save_config(config)
        print(f"\nπΎ Saved to {CONFIG_PATH}")
        if opts.upload:
            upload_config()
    else:
        # Preview: dump what would have been written, without touching disk.
        print("\n[DRY RUN] Would save:")
        print(yaml.dump(config, default_flow_style=False, sort_keys=False))
    print("\nβ Done!")


if __name__ == "__main__":
    main()