rafmacalaba committed on
Commit
79ba9a0
·
1 Parent(s): 75ee81e

feat: per-annotator document assignments with configurable overlap

Browse files

- annotator_config.yaml: defines annotators and overlap % (default 10%)
- generate_assignments.py: auto-distributes docs with overlap, seed-based
- Documents API reads config, filters docs by logged-in user
- 10 shared docs for inter-annotator agreement
- Fallback: show all docs if user not in config

app/api/documents/route.js CHANGED
@@ -1,7 +1,43 @@
1
  import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
 
2
 
3
- export async function GET() {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  try {
 
 
 
 
 
 
 
5
  // Fetch the index file from HF Datasets
6
  const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
7
  const linksRes = await fetch(linksUrl, {
@@ -21,12 +57,17 @@ export async function GET() {
21
 
22
  const links = await linksRes.json();
23
 
24
- // Filter to docs with revalidation data and take the first N
25
- const successLinks = links
26
- .filter(l => l.status === 'success' && l.has_revalidation === true)
27
- .slice(0, MAX_DOCS_TO_SCAN);
 
 
 
 
 
28
 
29
- // Parallel fetch — much faster than sequential scanning
30
  const results = await Promise.allSettled(
31
  successLinks.map(async (link) => {
32
  const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
@@ -44,7 +85,7 @@ export async function GET() {
44
  if (annotatablePages.length === 0) return null;
45
 
46
  const pdfUrl = link.direct_pdf_url;
47
- if (!pdfUrl) return null; // no PDF URL at all
48
 
49
  return {
50
  index: link.index,
@@ -63,7 +104,7 @@ export async function GET() {
63
  status: 200,
64
  headers: {
65
  'Content-Type': 'application/json',
66
- 'Cache-Control': 'public, s-maxage=3600, stale-while-revalidate=59'
67
  }
68
  });
69
  } catch (error) {
 
1
  import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
2
+ import yaml from 'js-yaml';
3
 
4
/**
 * Fetch annotator_config.yaml and return the doc list for a given user.
 *
 * Returns null (meaning "show all docs") whenever an assignment cannot be
 * determined: no username, the config request is not OK, the config is empty
 * or malformed, the user is not listed, or the user has no docs.
 *
 * @param {string|null|undefined} username - Annotator name to look up.
 * @returns {Promise<Set<*>|null>} Set of assigned doc indices, or null.
 */
async function getUserAssignedDocs(username) {
  if (!username) return null;

  try {
    const configUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;
    const res = await fetch(configUrl, {
      headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
      next: { revalidate: 300 } // cache 5 min
    });
    if (!res.ok) return null;

    const config = yaml.load(await res.text());

    // An empty YAML document does not parse to an object, and `annotators`
    // may be missing or not a list; treat all of these as "no assignments"
    // instead of relying on a TypeError reaching the catch block.
    const annotators = Array.isArray(config?.annotators) ? config.annotators : [];
    const annotator = annotators.find(a => a?.username === username);

    // Only a non-empty array is a usable assignment. A scalar `docs` value
    // (e.g. a string) would otherwise become a Set of characters that matches
    // no document index and hides every document from the user.
    if (!Array.isArray(annotator?.docs) || annotator.docs.length === 0) return null;

    return new Set(annotator.docs);
  } catch (e) {
    console.warn('Could not load annotator_config.yaml:', e.message);
    return null;
  }
}
31
+
32
+ export async function GET(request) {
33
  try {
34
+ // Get username from query param
35
+ const { searchParams } = new URL(request.url);
36
+ const username = searchParams.get('user');
37
+
38
+ // Fetch user's assigned docs (if configured)
39
+ const assignedDocs = await getUserAssignedDocs(username);
40
+
41
  // Fetch the index file from HF Datasets
42
  const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
43
  const linksRes = await fetch(linksUrl, {
 
57
 
58
  const links = await linksRes.json();
59
 
60
+ // Filter to docs with revalidation data, then by user assignment if available
61
+ let successLinks = links
62
+ .filter(l => l.status === 'success' && l.has_revalidation === true);
63
+
64
+ if (assignedDocs) {
65
+ successLinks = successLinks.filter(l => assignedDocs.has(l.index));
66
+ }
67
+
68
+ successLinks = successLinks.slice(0, MAX_DOCS_TO_SCAN);
69
 
70
+ // Parallel fetch
71
  const results = await Promise.allSettled(
72
  successLinks.map(async (link) => {
73
  const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
 
85
  if (annotatablePages.length === 0) return null;
86
 
87
  const pdfUrl = link.direct_pdf_url;
88
+ if (!pdfUrl) return null;
89
 
90
  return {
91
  index: link.index,
 
104
  status: 200,
105
  headers: {
106
  'Content-Type': 'application/json',
107
+ 'Cache-Control': 'public, s-maxage=300, stale-while-revalidate=59'
108
  }
109
  });
110
  } catch (error) {
app/page.js CHANGED
@@ -48,9 +48,14 @@ export default function Home() {
48
  const annotatablePages = currentDoc?.annotatable_pages ?? [];
49
  const currentPageNumber = annotatablePages[pageIdx] ?? null;
50
 
51
- // Load documents on mount
52
  useEffect(() => {
53
- fetch('/api/documents')
 
 
 
 
 
54
  .then(res => res.json())
55
  .then(data => {
56
  setDocuments(data);
@@ -75,7 +80,7 @@ export default function Home() {
75
  console.error("Failed to load documents", err);
76
  setLoading(false);
77
  });
78
- }, []);
79
 
80
  // Read HF OAuth cookie for annotator identity
81
  useEffect(() => {
 
48
  const annotatablePages = currentDoc?.annotatable_pages ?? [];
49
  const currentPageNumber = annotatablePages[pageIdx] ?? null;
50
 
51
+ // Load documents (re-fetches when annotatorName changes to get user-specific assignment)
52
  useEffect(() => {
53
+ setLoading(true);
54
+ const url = annotatorName
55
+ ? `/api/documents?user=${encodeURIComponent(annotatorName)}`
56
+ : '/api/documents';
57
+
58
+ fetch(url)
59
  .then(res => res.json())
60
  .then(data => {
61
  setDocuments(data);
 
80
  console.error("Failed to load documents", err);
81
  setLoading(false);
82
  });
83
+ }, [annotatorName]);
84
 
85
  // Read HF OAuth cookie for annotator identity
86
  useEffect(() => {
generate_assignments.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ generate_assignments.py
4
+
5
+ Reads annotator_config.yaml, distributes available docs across annotators
6
+ with configurable overlap, and writes back the updated config.
7
+
8
+ Usage:
9
+ python3 generate_assignments.py # Generate and save
10
+ python3 generate_assignments.py --dry-run # Preview only
11
+ python3 generate_assignments.py --upload # Upload config to HF
12
+
13
+ Requires: pyyaml, huggingface_hub (for --upload)
14
+ """
15
+
16
+ import argparse
17
+ import json
18
+ import random
19
+ import sys
20
+ from pathlib import Path
21
+
22
+ try:
23
+ import yaml
24
+ except ImportError:
25
+ print("❌ pyyaml required: uv pip install pyyaml")
26
+ sys.exit(1)
27
+
28
+ CONFIG_PATH = Path(__file__).parent / "annotation_data" / "annotator_config.yaml"
29
+ LINKS_PATH = Path(__file__).parent / "annotation_data" / "wbg_data" / "wbg_pdf_links.json"
30
+
31
+
32
def load_config():
    """Parse the YAML file at CONFIG_PATH and return its contents.

    The file is read as UTF-8 so the result does not depend on the platform's
    default encoding. An empty file (which YAML parses to None) is returned as
    an empty dict so callers can use ``config.get(...)`` safely.
    """
    return yaml.safe_load(CONFIG_PATH.read_text(encoding="utf-8")) or {}
34
+
35
+
36
def save_config(config):
    """Serialize ``config`` as block-style YAML and write it to CONFIG_PATH.

    Key order is preserved (``sort_keys=False``). The file is written as UTF-8
    with ``allow_unicode=True`` so non-ASCII text (e.g. in usernames) is stored
    readably and does not depend on the platform's default encoding.
    """
    text = yaml.dump(
        config,
        default_flow_style=False,
        sort_keys=False,
        allow_unicode=True,
    )
    CONFIG_PATH.write_text(text, encoding="utf-8")
38
+
39
+
40
def get_available_docs():
    """Return the sorted indices of docs eligible for assignment.

    A doc is eligible when its entry in the links file at LINKS_PATH has a
    truthy ``has_revalidation`` and ``status == "success"``. No language
    filtering is applied here.
    """
    links = json.loads(LINKS_PATH.read_text(encoding="utf-8"))
    return sorted(
        link["index"]
        for link in links
        if link.get("has_revalidation") and link.get("status") == "success"
    )
47
+
48
+
49
def generate_assignments(config, seed=42, all_docs=None):
    """Distribute docs across annotators with overlap.

    A seeded shuffle of the available docs is split into a shared "overlap"
    set given to every annotator, and a remainder divided as evenly as
    possible between annotators (the first ones receive one extra doc when the
    remainder does not divide evenly).

    Args:
        config: Parsed annotator config. ``settings.overlap_percent`` (default
            10) sets the share of docs given to everyone; ``annotators`` is the
            list of annotator dicts. Each annotator dict gets its ``docs`` key
            overwritten in place.
        seed: Seed for the deterministic shuffle.
        all_docs: Optional iterable of doc indices to distribute. When omitted,
            the indices come from ``get_available_docs()``.

    Returns:
        The same ``config`` object, updated in place. It is returned unchanged
        when no annotators are defined.
    """
    # `or` also covers keys that are present but null in the YAML.
    settings = config.get("settings") or {}
    overlap_pct = settings.get("overlap_percent", 10)
    annotators = config.get("annotators") or []

    if not annotators:
        print("❌ No annotators defined in config.")
        return config

    if all_docs is None:
        all_docs = get_available_docs()
    else:
        # Sort so the seeded shuffle does not depend on the caller's ordering.
        all_docs = sorted(all_docs)
    n_docs = len(all_docs)
    n_annotators = len(annotators)

    # Calculate overlap. A positive percentage always yields at least one
    # shared doc, but never more than exist; an explicit 0 (or no docs at all)
    # yields none, so the summary below matches what is actually assigned.
    if n_docs == 0 or overlap_pct <= 0:
        n_overlap = 0
    else:
        n_overlap = min(n_docs, max(1, round(n_docs * overlap_pct / 100)))

    # Shuffle docs deterministically
    rng = random.Random(seed)
    shuffled = list(all_docs)
    rng.shuffle(shuffled)

    # Pick overlap docs (shared by ALL annotators)
    overlap_docs = sorted(shuffled[:n_overlap])
    remaining = shuffled[n_overlap:]

    # Split remaining docs evenly across annotators
    per_annotator, extra = divmod(len(remaining), n_annotators)

    print(f"\n📊 Assignment Summary:")
    print(f"   Total docs: {n_docs}")
    print(f"   Annotators: {n_annotators}")
    print(f"   Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
    print(f"   Per annotator: ~{per_annotator + n_overlap} docs each")
    print(f"   Overlap docs: {overlap_docs}")
    print()

    start = 0
    for i, ann in enumerate(annotators):
        # Distribute remaining: first `extra` annotators get 1 more
        count = per_annotator + (1 if i < extra else 0)
        exclusive = sorted(remaining[start:start + count])
        start += count

        ann["docs"] = sorted(overlap_docs + exclusive)
        print(f"   {ann['username']}: {len(ann['docs'])} docs "
              f"({n_overlap} overlap + {len(exclusive)} exclusive)")

    return config
99
+
100
+
101
def upload_config():
    """Upload annotator_config.yaml to HF.

    The token is taken from the HF_TOKEN environment variable; if that is not
    set, a ``.env`` file next to this script is scanned for an ``HF_TOKEN=``
    line. Prints a message and returns without uploading when no token is
    found or when huggingface_hub is not installed.
    """
    try:
        from huggingface_hub import HfApi
        import os

        token = os.environ.get("HF_TOKEN")
        if not token:
            env_path = Path(__file__).parent / ".env"
            if env_path.exists():
                for line in env_path.read_text(encoding="utf-8").splitlines():
                    entry = line.strip()
                    if entry.startswith("HF_TOKEN="):
                        # .env values are commonly quoted; drop surrounding
                        # quotes so they are not sent as part of the token.
                        token = entry.split("=", 1)[1].strip().strip("\"'")

        if not token:
            print("❌ No HF_TOKEN found.")
            return

        api = HfApi(token=token)
        api.upload_file(
            path_or_fileobj=str(CONFIG_PATH),
            path_in_repo="annotation_data/annotator_config.yaml",
            repo_id="ai4data/annotation_data",
            repo_type="dataset",
            commit_message="Update annotator assignments",
        )
        print("✅ Uploaded annotator_config.yaml to HF")
    except ImportError:
        print("❌ huggingface_hub required: uv pip install huggingface_hub")
130
+
131
+
132
def main():
    """Command-line entry point: generate assignments, then save and/or upload.

    Flags:
        --dry-run  Print the generated config instead of saving it. Nothing is
                   written or uploaded in this mode.
        --upload   After saving, upload the config file to HF.
        --seed     Random seed for the assignment shuffle (default 42).
    """
    parser = argparse.ArgumentParser(description="Generate document assignments")
    parser.add_argument("--dry-run", action="store_true", help="Preview only")
    parser.add_argument("--upload", action="store_true", help="Upload config to HF")
    parser.add_argument("--seed", type=int, default=42, help="Random seed")
    args = parser.parse_args()

    config = load_config()
    config = generate_assignments(config, seed=args.seed)

    if args.dry_run:
        print("\n[DRY RUN] Would save:")
        print(yaml.dump(config, default_flow_style=False, sort_keys=False))
        if args.upload:
            # The upload sends the file on disk, which a dry run has not
            # updated; uploading here would publish stale assignments.
            print("[DRY RUN] Skipping upload (config was not saved).")
    else:
        save_config(config)
        print(f"\n💾 Saved to {CONFIG_PATH}")
        if args.upload:
            upload_config()

    print("\n✅ Done!")


if __name__ == "__main__":
    main()
package-lock.json CHANGED
@@ -9,6 +9,7 @@
9
  "version": "1.0.0",
10
  "dependencies": {
11
  "@huggingface/hub": "^2.10.3",
 
12
  "next": "14.2.14",
13
  "react": "^18",
14
  "react-dom": "^18",
@@ -286,6 +287,12 @@
286
  "node": ">=8"
287
  }
288
  },
 
 
 
 
 
 
289
  "node_modules/bail": {
290
  "version": "2.0.2",
291
  "resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz",
@@ -634,6 +641,18 @@
634
  "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
635
  "license": "MIT"
636
  },
 
 
 
 
 
 
 
 
 
 
 
 
637
  "node_modules/longest-streak": {
638
  "version": "3.1.0",
639
  "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz",
 
9
  "version": "1.0.0",
10
  "dependencies": {
11
  "@huggingface/hub": "^2.10.3",
12
+ "js-yaml": "^4.1.1",
13
  "next": "14.2.14",
14
  "react": "^18",
15
  "react-dom": "^18",
 
287
  "node": ">=8"
288
  }
289
  },
290
+ "node_modules/argparse": {
291
+ "version": "2.0.1",
292
+ "resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
293
+ "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
294
+ "license": "Python-2.0"
295
+ },
296
  "node_modules/bail": {
297
  "version": "2.0.2",
298
  "resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz",
 
641
  "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
642
  "license": "MIT"
643
  },
644
+ "node_modules/js-yaml": {
645
+ "version": "4.1.1",
646
+ "resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz",
647
+ "integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==",
648
+ "license": "MIT",
649
+ "dependencies": {
650
+ "argparse": "^2.0.1"
651
+ },
652
+ "bin": {
653
+ "js-yaml": "bin/js-yaml.js"
654
+ }
655
+ },
656
  "node_modules/longest-streak": {
657
  "version": "3.1.0",
658
  "resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz",
package.json CHANGED
@@ -9,6 +9,7 @@
9
  },
10
  "dependencies": {
11
  "@huggingface/hub": "^2.10.3",
 
12
  "next": "14.2.14",
13
  "react": "^18",
14
  "react-dom": "^18",
 
9
  },
10
  "dependencies": {
11
  "@huggingface/hub": "^2.10.3",
12
+ "js-yaml": "^4.1.1",
13
  "next": "14.2.14",
14
  "react": "^18",
15
  "react-dom": "^18",