Spaces:
Running
Running
Commit ·
79ba9a0
1
Parent(s): 75ee81e
feat: per-annotator document assignments with configurable overlap
Browse files
- annotator_config.yaml: defines annotators and overlap % (default 10%)
- generate_assignments.py: auto-distributes docs with overlap, seed-based
- Documents API reads config, filters docs by logged-in user
- 10 shared docs for inter-annotator agreement
- Fallback: show all docs if user not in config
- app/api/documents/route.js +49 -8
- app/page.js +8 -3
- generate_assignments.py +156 -0
- package-lock.json +19 -0
- package.json +1 -0
app/api/documents/route.js
CHANGED
|
@@ -1,7 +1,43 @@
|
|
| 1 |
import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
|
|
|
|
| 2 |
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
try {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
// Fetch the index file from HF Datasets
|
| 6 |
const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
|
| 7 |
const linksRes = await fetch(linksUrl, {
|
|
@@ -21,12 +57,17 @@ export async function GET() {
|
|
| 21 |
|
| 22 |
const links = await linksRes.json();
|
| 23 |
|
| 24 |
-
// Filter to docs with revalidation data
|
| 25 |
-
|
| 26 |
-
.filter(l => l.status === 'success' && l.has_revalidation === true)
|
| 27 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
// Parallel fetch
|
| 30 |
const results = await Promise.allSettled(
|
| 31 |
successLinks.map(async (link) => {
|
| 32 |
const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
|
|
@@ -44,7 +85,7 @@ export async function GET() {
|
|
| 44 |
if (annotatablePages.length === 0) return null;
|
| 45 |
|
| 46 |
const pdfUrl = link.direct_pdf_url;
|
| 47 |
-
if (!pdfUrl) return null;
|
| 48 |
|
| 49 |
return {
|
| 50 |
index: link.index,
|
|
@@ -63,7 +104,7 @@ export async function GET() {
|
|
| 63 |
status: 200,
|
| 64 |
headers: {
|
| 65 |
'Content-Type': 'application/json',
|
| 66 |
-
'Cache-Control': 'public, s-maxage=
|
| 67 |
}
|
| 68 |
});
|
| 69 |
} catch (error) {
|
|
|
|
| 1 |
import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
|
| 2 |
+
import yaml from 'js-yaml';
|
| 3 |
|
| 4 |
+
/**
 * Fetch annotator_config.yaml and return the doc list for a given user.
 *
 * Returns null (the caller then shows all docs) when:
 *   - no username is given,
 *   - the config cannot be fetched or parsed,
 *   - the config has no `annotators` list,
 *   - the user is not listed, or has no non-empty `docs` array.
 *
 * @param {string|null|undefined} username - Annotator name to look up.
 * @returns {Promise<Set<*>|null>} Set built from the annotator's `docs`
 *   entries, or null when no assignment applies.
 */
async function getUserAssignedDocs(username) {
  if (!username) return null;

  try {
    const configUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;

    // Only send the Authorization header when a token is configured;
    // interpolating an unset variable would send the literal "Bearer undefined".
    const token = process.env.HF_TOKEN;
    const headers = token ? { Authorization: `Bearer ${token}` } : {};

    const res = await fetch(configUrl, {
      headers,
      next: { revalidate: 300 }, // cache 5 min
    });
    if (!res.ok) return null;

    const config = yaml.load(await res.text());

    // An empty file parses to a nullish value and a malformed one may parse to
    // a scalar, so validate the shape instead of relying on the catch below.
    const annotators = Array.isArray(config?.annotators) ? config.annotators : [];
    const annotator = annotators.find((a) => a?.username === username);

    // `docs` must be a real array: a scalar string would otherwise be split
    // into a Set of its individual characters.
    const docs = annotator?.docs;
    if (!Array.isArray(docs) || docs.length === 0) return null;

    return new Set(docs);
  } catch (e) {
    console.warn('Could not load annotator_config.yaml:', e.message);
    return null;
  }
}
|
| 31 |
+
|
| 32 |
+
export async function GET(request) {
|
| 33 |
try {
|
| 34 |
+
// Get username from query param
|
| 35 |
+
const { searchParams } = new URL(request.url);
|
| 36 |
+
const username = searchParams.get('user');
|
| 37 |
+
|
| 38 |
+
// Fetch user's assigned docs (if configured)
|
| 39 |
+
const assignedDocs = await getUserAssignedDocs(username);
|
| 40 |
+
|
| 41 |
// Fetch the index file from HF Datasets
|
| 42 |
const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
|
| 43 |
const linksRes = await fetch(linksUrl, {
|
|
|
|
| 57 |
|
| 58 |
const links = await linksRes.json();
|
| 59 |
|
| 60 |
+
// Filter to docs with revalidation data, then by user assignment if available
|
| 61 |
+
let successLinks = links
|
| 62 |
+
.filter(l => l.status === 'success' && l.has_revalidation === true);
|
| 63 |
+
|
| 64 |
+
if (assignedDocs) {
|
| 65 |
+
successLinks = successLinks.filter(l => assignedDocs.has(l.index));
|
| 66 |
+
}
|
| 67 |
+
|
| 68 |
+
successLinks = successLinks.slice(0, MAX_DOCS_TO_SCAN);
|
| 69 |
|
| 70 |
+
// Parallel fetch
|
| 71 |
const results = await Promise.allSettled(
|
| 72 |
successLinks.map(async (link) => {
|
| 73 |
const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
|
|
|
|
| 85 |
if (annotatablePages.length === 0) return null;
|
| 86 |
|
| 87 |
const pdfUrl = link.direct_pdf_url;
|
| 88 |
+
if (!pdfUrl) return null;
|
| 89 |
|
| 90 |
return {
|
| 91 |
index: link.index,
|
|
|
|
| 104 |
status: 200,
|
| 105 |
headers: {
|
| 106 |
'Content-Type': 'application/json',
|
| 107 |
+
'Cache-Control': 'public, s-maxage=300, stale-while-revalidate=59'
|
| 108 |
}
|
| 109 |
});
|
| 110 |
} catch (error) {
|
app/page.js
CHANGED
|
@@ -48,9 +48,14 @@ export default function Home() {
|
|
| 48 |
const annotatablePages = currentDoc?.annotatable_pages ?? [];
|
| 49 |
const currentPageNumber = annotatablePages[pageIdx] ?? null;
|
| 50 |
|
| 51 |
-
// Load documents
|
| 52 |
useEffect(() => {
|
| 53 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 54 |
.then(res => res.json())
|
| 55 |
.then(data => {
|
| 56 |
setDocuments(data);
|
|
@@ -75,7 +80,7 @@ export default function Home() {
|
|
| 75 |
console.error("Failed to load documents", err);
|
| 76 |
setLoading(false);
|
| 77 |
});
|
| 78 |
-
}, []);
|
| 79 |
|
| 80 |
// Read HF OAuth cookie for annotator identity
|
| 81 |
useEffect(() => {
|
|
|
|
| 48 |
const annotatablePages = currentDoc?.annotatable_pages ?? [];
|
| 49 |
const currentPageNumber = annotatablePages[pageIdx] ?? null;
|
| 50 |
|
| 51 |
+
// Load documents (re-fetches when annotatorName changes to get user-specific assignment)
|
| 52 |
useEffect(() => {
|
| 53 |
+
setLoading(true);
|
| 54 |
+
const url = annotatorName
|
| 55 |
+
? `/api/documents?user=${encodeURIComponent(annotatorName)}`
|
| 56 |
+
: '/api/documents';
|
| 57 |
+
|
| 58 |
+
fetch(url)
|
| 59 |
.then(res => res.json())
|
| 60 |
.then(data => {
|
| 61 |
setDocuments(data);
|
|
|
|
| 80 |
console.error("Failed to load documents", err);
|
| 81 |
setLoading(false);
|
| 82 |
});
|
| 83 |
+
}, [annotatorName]);
|
| 84 |
|
| 85 |
// Read HF OAuth cookie for annotator identity
|
| 86 |
useEffect(() => {
|
generate_assignments.py
ADDED
|
@@ -0,0 +1,156 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
generate_assignments.py
|
| 4 |
+
|
| 5 |
+
Reads annotator_config.yaml, distributes available docs across annotators
|
| 6 |
+
with configurable overlap, and writes back the updated config.
|
| 7 |
+
|
| 8 |
+
Usage:
|
| 9 |
+
python3 generate_assignments.py # Generate and save
|
| 10 |
+
python3 generate_assignments.py --dry-run # Preview only
|
| 11 |
+
python3 generate_assignments.py --upload # Upload config to HF
|
| 12 |
+
|
| 13 |
+
Requires: pyyaml, huggingface_hub (for --upload)
|
| 14 |
+
"""
|
| 15 |
+
|
| 16 |
+
import argparse
|
| 17 |
+
import json
|
| 18 |
+
import random
|
| 19 |
+
import sys
|
| 20 |
+
from pathlib import Path
|
| 21 |
+
|
| 22 |
+
try:
|
| 23 |
+
import yaml
|
| 24 |
+
except ImportError:
|
| 25 |
+
print("❌ pyyaml required: uv pip install pyyaml")
|
| 26 |
+
sys.exit(1)
|
| 27 |
+
|
| 28 |
+
CONFIG_PATH = Path(__file__).parent / "annotation_data" / "annotator_config.yaml"
|
| 29 |
+
LINKS_PATH = Path(__file__).parent / "annotation_data" / "wbg_data" / "wbg_pdf_links.json"
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def load_config():
    """Parse the YAML file at CONFIG_PATH with yaml.safe_load and return the result.

    The file is decoded as UTF-8 explicitly so that parsing does not depend on
    the platform's default locale encoding.
    """
    return yaml.safe_load(CONFIG_PATH.read_text(encoding="utf-8"))
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
def save_config(config):
    """Serialize `config` with yaml.dump and write it to CONFIG_PATH.

    Block style is used (default_flow_style=False) and key order is preserved
    (sort_keys=False). The file is written as UTF-8 explicitly, matching how
    the config is meant to be read back.
    """
    CONFIG_PATH.write_text(
        yaml.dump(config, default_flow_style=False, sort_keys=False),
        encoding="utf-8",
    )
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def get_available_docs():
    """Return the sorted `index` values of assignable docs listed in LINKS_PATH.

    A doc is assignable when its entry has status == "success" and
    has_revalidation is exactly True. The strict check mirrors the documents
    API filter (`l.has_revalidation === true`), so a doc is never assigned
    here and then filtered out when served.

    No language filtering is performed.
    """
    links = json.loads(LINKS_PATH.read_text(encoding="utf-8"))
    return sorted(
        entry["index"]
        for entry in links
        if entry.get("has_revalidation") is True and entry.get("status") == "success"
    )
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
def generate_assignments(config, seed=42):
    """Distribute docs across annotators with overlap.

    A seeded shuffle of the available docs is split into a shared "overlap"
    group, given to every annotator, and a remainder that is divided into
    consecutive exclusive chunks, one per annotator.

    Args:
        config: Parsed annotator config. Reads settings.overlap_percent
            (default 10) and the `annotators` list; each annotator dict gets
            its "docs" key overwritten in place.
        seed: Seed for the shuffle; the same seed and doc list give the same
            assignment.

    Returns:
        The same `config` object, mutated. It is returned untouched when no
        annotators are defined.
    """
    # `settings:` with no value parses to None, so fall back explicitly.
    settings = config.get("settings") or {}
    overlap_pct = settings.get("overlap_percent", 10)
    annotators = config.get("annotators") or []

    if not annotators:
        print("❌ No annotators defined in config.")
        return config

    all_docs = get_available_docs()
    n_docs = len(all_docs)
    n_annotators = len(annotators)

    if n_docs == 0:
        print("⚠️ No available docs found; every annotator will get an empty list.")

    # At least one shared doc is requested, but never more than actually exist
    # (covers an empty doc list and overlap_percent above 100).
    n_overlap = min(n_docs, max(1, round(n_docs * overlap_pct / 100)))

    # Shuffle docs deterministically
    rng = random.Random(seed)
    shuffled = list(all_docs)
    rng.shuffle(shuffled)

    # Pick overlap docs (shared by ALL annotators)
    overlap_docs = sorted(shuffled[:n_overlap])
    remaining = shuffled[n_overlap:]

    # Split remaining docs evenly across annotators
    per_annotator, extra = divmod(len(remaining), n_annotators)

    print("\n📊 Assignment Summary:")
    print(f" Total docs: {n_docs}")
    print(f" Annotators: {n_annotators}")
    print(f" Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
    print(f" Per annotator: ~{per_annotator + n_overlap} docs each")
    print(f" Overlap docs: {overlap_docs}")
    print()

    start = 0
    for i, ann in enumerate(annotators):
        # Distribute remaining: first `extra` annotators get 1 more
        count = per_annotator + (1 if i < extra else 0)
        exclusive = sorted(remaining[start:start + count])
        start += count

        ann["docs"] = sorted(overlap_docs + exclusive)
        # .get keeps a missing username from aborting the loop half-way,
        # which would leave only some annotators updated.
        print(f" {ann.get('username', '<unnamed>')}: {len(ann['docs'])} docs "
              f"({n_overlap} overlap + {len(exclusive)} exclusive)")

    return config
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def upload_config():
    """Upload annotator_config.yaml to HF.

    The token is taken from the HF_TOKEN environment variable; if that is
    unset or empty, a `.env` file next to this script is scanned for
    `HF_TOKEN=` lines (the last such line wins). Prints a message and returns
    without uploading when huggingface_hub is not installed or no token is
    found.
    """
    # Guard only the import itself, so an ImportError raised later (e.g. from
    # inside the upload call) is not misreported as a missing package.
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("❌ huggingface_hub required: uv pip install huggingface_hub")
        return

    import os

    token = os.environ.get("HF_TOKEN")
    if not token:
        env_path = Path(__file__).parent / ".env"
        if env_path.exists():
            for line in env_path.read_text(encoding="utf-8").splitlines():
                if line.startswith("HF_TOKEN="):
                    # .env values are often quoted; strip whitespace and any
                    # surrounding quotes so they are not sent as part of the token.
                    token = line.split("=", 1)[1].strip().strip("'\"")

    if not token:
        print("❌ No HF_TOKEN found.")
        return

    api = HfApi(token=token)
    api.upload_file(
        path_or_fileobj=str(CONFIG_PATH),
        path_in_repo="annotation_data/annotator_config.yaml",
        repo_id="ai4data/annotation_data",
        repo_type="dataset",
        commit_message="Update annotator assignments",
    )
    print("✅ Uploaded annotator_config.yaml to HF")
|
| 130 |
+
|
| 131 |
+
|
| 132 |
+
def main():
    """Command-line entry point.

    Loads the config, regenerates assignments with the requested seed, then
    either prints the resulting YAML (--dry-run) or saves it. With --upload
    the config file is uploaded afterwards.
    """
    cli = argparse.ArgumentParser(description="Generate document assignments")
    for flag, help_text in (
        ("--dry-run", "Preview only"),
        ("--upload", "Upload config to HF"),
    ):
        cli.add_argument(flag, action="store_true", help=help_text)
    cli.add_argument("--seed", type=int, default=42, help="Random seed")
    opts = cli.parse_args()

    updated = generate_assignments(load_config(), seed=opts.seed)

    if not opts.dry_run:
        save_config(updated)
        print(f"\n💾 Saved to {CONFIG_PATH}")
    else:
        print("\n[DRY RUN] Would save:")
        print(yaml.dump(updated, default_flow_style=False, sort_keys=False))

    if opts.upload:
        upload_config()

    print("\n✅ Done!")
|
| 153 |
+
|
| 154 |
+
|
| 155 |
+
if __name__ == "__main__":
|
| 156 |
+
main()
|
package-lock.json
CHANGED
|
@@ -9,6 +9,7 @@
|
|
| 9 |
"version": "1.0.0",
|
| 10 |
"dependencies": {
|
| 11 |
"@huggingface/hub": "^2.10.3",
|
|
|
|
| 12 |
"next": "14.2.14",
|
| 13 |
"react": "^18",
|
| 14 |
"react-dom": "^18",
|
|
@@ -286,6 +287,12 @@
|
|
| 286 |
"node": ">=8"
|
| 287 |
}
|
| 288 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
"node_modules/bail": {
|
| 290 |
"version": "2.0.2",
|
| 291 |
"resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz",
|
|
@@ -634,6 +641,18 @@
|
|
| 634 |
"integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
|
| 635 |
"license": "MIT"
|
| 636 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 637 |
"node_modules/longest-streak": {
|
| 638 |
"version": "3.1.0",
|
| 639 |
"resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz",
|
|
|
|
| 9 |
"version": "1.0.0",
|
| 10 |
"dependencies": {
|
| 11 |
"@huggingface/hub": "^2.10.3",
|
| 12 |
+
"js-yaml": "^4.1.1",
|
| 13 |
"next": "14.2.14",
|
| 14 |
"react": "^18",
|
| 15 |
"react-dom": "^18",
|
|
|
|
| 287 |
"node": ">=8"
|
| 288 |
}
|
| 289 |
},
|
| 290 |
+
"node_modules/argparse": {
|
| 291 |
+
"version": "2.0.1",
|
| 292 |
+
"resolved": "https://registry.npmjs.org/argparse/-/argparse-2.0.1.tgz",
|
| 293 |
+
"integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==",
|
| 294 |
+
"license": "Python-2.0"
|
| 295 |
+
},
|
| 296 |
"node_modules/bail": {
|
| 297 |
"version": "2.0.2",
|
| 298 |
"resolved": "https://registry.npmjs.org/bail/-/bail-2.0.2.tgz",
|
|
|
|
| 641 |
"integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==",
|
| 642 |
"license": "MIT"
|
| 643 |
},
|
| 644 |
+
"node_modules/js-yaml": {
|
| 645 |
+
"version": "4.1.1",
|
| 646 |
+
"resolved": "https://registry.npmjs.org/js-yaml/-/js-yaml-4.1.1.tgz",
|
| 647 |
+
"integrity": "sha512-qQKT4zQxXl8lLwBtHMWwaTcGfFOZviOJet3Oy/xmGk2gZH677CJM9EvtfdSkgWcATZhj/55JZ0rmy3myCT5lsA==",
|
| 648 |
+
"license": "MIT",
|
| 649 |
+
"dependencies": {
|
| 650 |
+
"argparse": "^2.0.1"
|
| 651 |
+
},
|
| 652 |
+
"bin": {
|
| 653 |
+
"js-yaml": "bin/js-yaml.js"
|
| 654 |
+
}
|
| 655 |
+
},
|
| 656 |
"node_modules/longest-streak": {
|
| 657 |
"version": "3.1.0",
|
| 658 |
"resolved": "https://registry.npmjs.org/longest-streak/-/longest-streak-3.1.0.tgz",
|
package.json
CHANGED
|
@@ -9,6 +9,7 @@
|
|
| 9 |
},
|
| 10 |
"dependencies": {
|
| 11 |
"@huggingface/hub": "^2.10.3",
|
|
|
|
| 12 |
"next": "14.2.14",
|
| 13 |
"react": "^18",
|
| 14 |
"react-dom": "^18",
|
|
|
|
| 9 |
},
|
| 10 |
"dependencies": {
|
| 11 |
"@huggingface/hub": "^2.10.3",
|
| 12 |
+
"js-yaml": "^4.1.1",
|
| 13 |
"next": "14.2.14",
|
| 14 |
"react": "^18",
|
| 15 |
"react-dom": "^18",
|