rafmacalaba committed on
Commit
a2c885c
·
1 Parent(s): aeca117

feat: multi-corpus support

Browse files

- corpora.json registry: add new corpora by adding entries
- All APIs/utils resolve paths via config.js helpers
- Per-corpus doc assignments: docs: { wbg: [...], unhcr: [...] }
- Document selector shows [World Bank] Doc 3 labels
- Leaderboard/progress scan all corpora
- generate_assignments.py handles per-corpus distribution

app/api/document/route.js CHANGED
@@ -1,6 +1,5 @@
1
- import { HF_DATASET_BASE_URL } from '../../../utils/config.js';
2
  import fs from 'fs';
3
- import path from 'path';
4
 
5
  const isHFSpace = () => {
6
  return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
@@ -10,8 +9,8 @@ export async function GET(request) {
10
  const { searchParams } = new URL(request.url);
11
  const index = searchParams.get('index');
12
  const page = searchParams.get('page');
 
13
 
14
- // Validate required params
15
  if (index === null || page === null) {
16
  return new Response(
17
  JSON.stringify({ error: "Missing index or page parameter" }),
@@ -19,7 +18,6 @@ export async function GET(request) {
19
  );
20
  }
21
 
22
- // Validate numeric values
23
  const indexNum = parseInt(index, 10);
24
  const pageNum = parseInt(page, 10);
25
 
@@ -30,46 +28,42 @@ export async function GET(request) {
30
  );
31
  }
32
 
 
 
33
  try {
34
  let pagesData;
35
 
36
  if (isHFSpace()) {
37
- // Production: fetch from HuggingFace
38
- const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${indexNum}/raw/doc_${indexNum}_direct_judged.jsonl`;
39
  const res = await fetch(docUrl, {
40
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
41
  });
42
 
43
  if (!res.ok) {
44
  return new Response(
45
- JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found on HF Datasets` }),
46
  { status: res.status, headers: { 'Content-Type': 'application/json' } }
47
  );
48
  }
49
  pagesData = await res.json();
50
  } else {
51
- // Local dev: read from local file (reflects saved annotations immediately)
52
- const filePath = path.join(
53
- process.cwd(),
54
- 'annotation_data', 'wbg_extractions',
55
- `doc_${indexNum}`, 'raw', `doc_${indexNum}_direct_judged.jsonl`
56
- );
57
 
58
  if (!fs.existsSync(filePath)) {
59
  return new Response(
60
- JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found locally` }),
61
  { status: 404, headers: { 'Content-Type': 'application/json' } }
62
  );
63
  }
64
- const raw = fs.readFileSync(filePath, 'utf-8');
65
- pagesData = JSON.parse(raw);
66
  }
67
 
68
  const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
69
 
70
  if (!pageData) {
71
  return new Response(
72
- JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum}` }),
73
  { status: 404, headers: { 'Content-Type': 'application/json' } }
74
  );
75
  }
 
1
+ import { HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from '../../../utils/config.js';
2
  import fs from 'fs';
 
3
 
4
  const isHFSpace = () => {
5
  return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
 
9
  const { searchParams } = new URL(request.url);
10
  const index = searchParams.get('index');
11
  const page = searchParams.get('page');
12
+ const corpusId = searchParams.get('corpus');
13
 
 
14
  if (index === null || page === null) {
15
  return new Response(
16
  JSON.stringify({ error: "Missing index or page parameter" }),
 
18
  );
19
  }
20
 
 
21
  const indexNum = parseInt(index, 10);
22
  const pageNum = parseInt(page, 10);
23
 
 
28
  );
29
  }
30
 
31
+ const corpus = getCorpus(corpusId);
32
+
33
  try {
34
  let pagesData;
35
 
36
  if (isHFSpace()) {
37
+ const docRepoPath = getDocRepoPath(corpus, indexNum);
38
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
39
  const res = await fetch(docUrl, {
40
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
41
  });
42
 
43
  if (!res.ok) {
44
  return new Response(
45
+ JSON.stringify({ error: `doc_${indexNum} not found on HF (${corpus.id})` }),
46
  { status: res.status, headers: { 'Content-Type': 'application/json' } }
47
  );
48
  }
49
  pagesData = await res.json();
50
  } else {
51
+ const filePath = getDocLocalPath(corpus, indexNum);
 
 
 
 
 
52
 
53
  if (!fs.existsSync(filePath)) {
54
  return new Response(
55
+ JSON.stringify({ error: `doc_${indexNum} not found locally (${corpus.id})` }),
56
  { status: 404, headers: { 'Content-Type': 'application/json' } }
57
  );
58
  }
59
+ pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
 
60
  }
61
 
62
  const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
63
 
64
  if (!pageData) {
65
  return new Response(
66
+ JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum} (${corpus.id})` }),
67
  { status: 404, headers: { 'Content-Type': 'application/json' } }
68
  );
69
  }
app/api/documents/route.js CHANGED
@@ -1,9 +1,10 @@
1
- import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
2
  import yaml from 'js-yaml';
3
 
4
  /**
5
  * Fetch annotator_config.yaml and return the doc list for a given user.
6
  * Returns null if no config or user not found (show all docs).
 
7
  */
8
  async function getUserAssignedDocs(username) {
9
  if (!username) return null;
@@ -12,7 +13,7 @@ async function getUserAssignedDocs(username) {
12
  const configUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;
13
  const res = await fetch(configUrl, {
14
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
15
- next: { revalidate: 300 } // cache 5 min
16
  });
17
  if (!res.ok) return null;
18
 
@@ -20,9 +21,22 @@ async function getUserAssignedDocs(username) {
20
  const config = yaml.load(text);
21
 
22
  const annotator = (config.annotators || []).find(a => a.username === username);
23
- if (!annotator || !annotator.docs || annotator.docs.length === 0) return null;
24
 
25
- return new Set(annotator.docs);
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  } catch (e) {
27
  console.warn('Could not load annotator_config.yaml:', e.message);
28
  return null;
@@ -31,76 +45,94 @@ async function getUserAssignedDocs(username) {
31
 
32
  export async function GET(request) {
33
  try {
34
- // Get username from query param
35
  const { searchParams } = new URL(request.url);
36
  const username = searchParams.get('user');
37
 
38
- // Fetch user's assigned docs (if configured)
39
  const assignedDocs = await getUserAssignedDocs(username);
40
 
41
- // Fetch the index file from HF Datasets
42
- const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
43
- const linksRes = await fetch(linksUrl, {
44
- headers: {
45
- 'Authorization': `Bearer ${process.env.HF_TOKEN}`
46
- },
47
- next: { revalidate: 3600 }
48
- });
49
-
50
- if (!linksRes.ok) {
51
- console.error("Failed to fetch links JSON", await linksRes.text());
52
- return new Response(
53
- JSON.stringify({ error: "Missing wbg_pdf_links.json on HF Datasets" }),
54
- { status: 404, headers: { 'Content-Type': 'application/json' } }
55
- );
56
- }
57
-
58
- const links = await linksRes.json();
59
-
60
- // Filter to docs with revalidation data, then by user assignment if available
61
- let successLinks = links
62
- .filter(l => l.status === 'success' && l.has_revalidation === true);
63
-
64
- if (assignedDocs) {
65
- successLinks = successLinks.filter(l => assignedDocs.has(l.index));
66
- }
67
-
68
- successLinks = successLinks.slice(0, MAX_DOCS_TO_SCAN);
69
 
70
- // Parallel fetch
71
- const results = await Promise.allSettled(
72
- successLinks.map(async (link) => {
73
- const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
74
- const docRes = await fetch(docUrl, {
75
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
76
- });
 
 
 
 
 
77
 
78
- if (!docRes.ok) return null;
79
 
80
- const pagesData = await docRes.json();
81
- const annotatablePages = pagesData
82
- .filter(page => page.datasets && page.datasets.length > 0)
83
- .map(page => page.document.pages[0]);
84
 
85
- if (annotatablePages.length === 0) return null;
 
 
86
 
87
- const pdfUrl = link.direct_pdf_url;
88
- if (!pdfUrl) return null;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- return {
91
- index: link.index,
92
- pdf_url: pdfUrl,
93
- landing_page: link.landing_page_url,
94
- annotatable_pages: annotatablePages
95
- };
96
- })
97
- );
98
 
99
- const documents = results
100
- .filter(r => r.status === 'fulfilled' && r.value !== null)
101
- .map(r => r.value);
102
 
103
- return new Response(JSON.stringify(documents), {
104
  status: 200,
105
  headers: {
106
  'Content-Type': 'application/json',
@@ -110,7 +142,7 @@ export async function GET(request) {
110
  } catch (error) {
111
  console.error(error);
112
  return new Response(
113
- JSON.stringify({ error: "Failed to fetch documents from HF" }),
114
  { status: 500, headers: { 'Content-Type': 'application/json' } }
115
  );
116
  }
 
1
+ import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpus, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
2
  import yaml from 'js-yaml';
3
 
4
  /**
5
  * Fetch annotator_config.yaml and return the doc list for a given user.
6
  * Returns null if no config or user not found (show all docs).
7
+ * Now returns per-corpus assignments: { wbg: Set([1,2]), unhcr: Set([3,4]) }
8
  */
9
  async function getUserAssignedDocs(username) {
10
  if (!username) return null;
 
13
  const configUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;
14
  const res = await fetch(configUrl, {
15
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
16
+ cache: 'no-store'
17
  });
18
  if (!res.ok) return null;
19
 
 
21
  const config = yaml.load(text);
22
 
23
  const annotator = (config.annotators || []).find(a => a.username === username);
24
+ if (!annotator || !annotator.docs) return null;
25
 
26
+ // Support both old format (flat array) and new format (per-corpus object)
27
+ if (Array.isArray(annotator.docs)) {
28
+ // Legacy: flat array β€” treat as default corpus
29
+ return { _flat: new Set(annotator.docs) };
30
+ }
31
+
32
+ // New format: { wbg: [1,2], unhcr: [3,4] }
33
+ const result = {};
34
+ for (const [corpusId, docList] of Object.entries(annotator.docs)) {
35
+ if (Array.isArray(docList)) {
36
+ result[corpusId] = new Set(docList);
37
+ }
38
+ }
39
+ return Object.keys(result).length > 0 ? result : null;
40
  } catch (e) {
41
  console.warn('Could not load annotator_config.yaml:', e.message);
42
  return null;
 
45
 
46
  export async function GET(request) {
47
  try {
 
48
  const { searchParams } = new URL(request.url);
49
  const username = searchParams.get('user');
50
 
 
51
  const assignedDocs = await getUserAssignedDocs(username);
52
 
53
+ // Import corpora list
54
+ const { getCorpora } = await import('../../../utils/config.js');
55
+ const corpora = getCorpora();
56
+
57
+ const allDocuments = [];
58
+
59
+ for (const corpus of corpora) {
60
+ // Determine which doc indices this user has for this corpus
61
+ let userDocSet = null;
62
+ if (assignedDocs) {
63
+ if (assignedDocs._flat) {
64
+ // Legacy flat format β€” only applies to first/default corpus
65
+ userDocSet = corpus === corpora[0] ? assignedDocs._flat : new Set();
66
+ } else {
67
+ userDocSet = assignedDocs[corpus.id] || new Set();
68
+ }
69
+ if (userDocSet.size === 0) continue; // no docs for this corpus
70
+ }
 
 
 
 
 
 
 
 
 
 
71
 
72
+ // Fetch the links file for this corpus
73
+ const linksPath = getLinksRepoPath(corpus);
74
+ const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
75
+ const linksRes = await fetch(linksUrl, {
76
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
77
+ cache: 'no-store'
78
+ });
79
+
80
+ if (!linksRes.ok) {
81
+ console.warn(`No links file for corpus ${corpus.id}: ${linksRes.status}`);
82
+ continue;
83
+ }
84
 
85
+ const links = await linksRes.json();
86
 
87
+ let successLinks = links
88
+ .filter(l => l.status === 'success' && l.has_revalidation === true);
 
 
89
 
90
+ if (userDocSet) {
91
+ successLinks = successLinks.filter(l => userDocSet.has(l.index));
92
+ }
93
 
94
+ successLinks = successLinks.slice(0, MAX_DOCS_TO_SCAN);
95
+
96
+ // Parallel fetch docs
97
+ const results = await Promise.allSettled(
98
+ successLinks.map(async (link) => {
99
+ const docRepoPath = getDocRepoPath(corpus, link.index);
100
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
101
+ const docRes = await fetch(docUrl, {
102
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
103
+ });
104
+
105
+ if (!docRes.ok) return null;
106
+
107
+ const pagesData = await docRes.json();
108
+ const annotatablePages = pagesData
109
+ .filter(page => page.datasets && page.datasets.length > 0)
110
+ .map(page => page.document.pages[0]);
111
+
112
+ if (annotatablePages.length === 0) return null;
113
+
114
+ const pdfUrl = link.direct_pdf_url;
115
+ if (!pdfUrl) return null;
116
+
117
+ return {
118
+ corpus: corpus.id,
119
+ corpus_name: corpus.name,
120
+ index: link.index,
121
+ pdf_url: pdfUrl,
122
+ landing_page: link.landing_page_url,
123
+ annotatable_pages: annotatablePages
124
+ };
125
+ })
126
+ );
127
 
128
+ const docs = results
129
+ .filter(r => r.status === 'fulfilled' && r.value !== null)
130
+ .map(r => r.value);
 
 
 
 
 
131
 
132
+ allDocuments.push(...docs);
133
+ }
 
134
 
135
+ return new Response(JSON.stringify(allDocuments), {
136
  status: 200,
137
  headers: {
138
  'Content-Type': 'application/json',
 
142
  } catch (error) {
143
  console.error(error);
144
  return new Response(
145
+ JSON.stringify({ error: "Failed to fetch documents" }),
146
  { status: 500, headers: { 'Content-Type': 'application/json' } }
147
  );
148
  }
app/api/leaderboard/route.js CHANGED
@@ -1,71 +1,69 @@
1
- import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
2
 
3
- export const dynamic = 'force-dynamic'; // disable Next.js route caching
4
 
5
  /**
6
  * GET /api/leaderboard
7
- * Returns annotator rankings based on validation counts.
8
  */
9
  export async function GET() {
10
  try {
11
- const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
12
- const linksRes = await fetch(linksUrl, {
13
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
14
- cache: 'no-store'
15
- });
16
 
17
- if (!linksRes.ok) {
18
- return new Response(JSON.stringify({ error: 'Failed to fetch links' }), { status: 500 });
19
- }
 
 
 
 
20
 
21
- const links = await linksRes.json();
22
- const activeLinks = links
23
- .filter(l => l.status === 'success' && l.has_revalidation === true)
24
- .slice(0, MAX_DOCS_TO_SCAN);
25
 
26
- // Tally per-annotator stats
27
- const stats = {}; // annotator -> { verified, correct, incorrect, docs, humanAdded }
 
 
28
 
29
- const results = await Promise.allSettled(
30
- activeLinks.map(async (link) => {
31
- const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
32
- const docRes = await fetch(docUrl, {
33
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
34
- cache: 'no-store'
35
- });
36
- if (!docRes.ok) return;
 
37
 
38
- const pagesData = await docRes.json();
39
- const docAnnotators = new Set();
40
 
41
- for (const page of pagesData) {
42
- for (const ds of (page.datasets || [])) {
43
- // Count human-added annotations
44
- if (ds.source === 'human' && ds.annotator) {
45
- if (!stats[ds.annotator]) {
46
- stats[ds.annotator] = { verified: 0, correct: 0, incorrect: 0, docs: new Set(), humanAdded: 0 };
 
 
47
  }
48
- stats[ds.annotator].humanAdded++;
49
- stats[ds.annotator].docs.add(link.index);
50
- }
51
 
52
- // Count validations
53
- for (const v of (ds.validations || [])) {
54
- if (!v.annotator || !v.human_validated) continue;
55
- if (!stats[v.annotator]) {
56
- stats[v.annotator] = { verified: 0, correct: 0, incorrect: 0, docs: new Set(), humanAdded: 0 };
 
 
 
 
57
  }
58
- stats[v.annotator].verified++;
59
- if (v.human_verdict === true) stats[v.annotator].correct++;
60
- else stats[v.annotator].incorrect++;
61
- stats[v.annotator].docs.add(link.index);
62
  }
63
  }
64
- }
65
- })
66
- );
67
 
68
- // Build ranked list
69
  const leaderboard = Object.entries(stats)
70
  .map(([annotator, s]) => ({
71
  annotator,
@@ -74,7 +72,7 @@ export async function GET() {
74
  incorrect: s.incorrect,
75
  humanAdded: s.humanAdded,
76
  docsWorked: s.docs.size,
77
- score: s.verified + s.humanAdded, // total contributions
78
  }))
79
  .sort((a, b) => b.score - a.score);
80
 
 
1
+ import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpora, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
2
 
3
+ export const dynamic = 'force-dynamic';
4
 
5
  /**
6
  * GET /api/leaderboard
7
+ * Scans ALL corpora and returns annotator rankings.
8
  */
9
  export async function GET() {
10
  try {
11
+ const corpora = getCorpora();
12
+ const stats = {}; // annotator -> { verified, correct, incorrect, docs, humanAdded }
 
 
 
13
 
14
+ for (const corpus of corpora) {
15
+ const linksPath = getLinksRepoPath(corpus);
16
+ const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
17
+ const linksRes = await fetch(linksUrl, {
18
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
19
+ cache: 'no-store'
20
+ });
21
 
22
+ if (!linksRes.ok) continue;
 
 
 
23
 
24
+ const links = await linksRes.json();
25
+ const activeLinks = links
26
+ .filter(l => l.status === 'success' && l.has_revalidation === true)
27
+ .slice(0, MAX_DOCS_TO_SCAN);
28
 
29
+ await Promise.allSettled(
30
+ activeLinks.map(async (link) => {
31
+ const docRepoPath = getDocRepoPath(corpus, link.index);
32
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
33
+ const docRes = await fetch(docUrl, {
34
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
35
+ cache: 'no-store'
36
+ });
37
+ if (!docRes.ok) return;
38
 
39
+ const pagesData = await docRes.json();
 
40
 
41
+ for (const page of pagesData) {
42
+ for (const ds of (page.datasets || [])) {
43
+ if (ds.source === 'human' && ds.annotator) {
44
+ if (!stats[ds.annotator]) {
45
+ stats[ds.annotator] = { verified: 0, correct: 0, incorrect: 0, docs: new Set(), humanAdded: 0 };
46
+ }
47
+ stats[ds.annotator].humanAdded++;
48
+ stats[ds.annotator].docs.add(`${corpus.id}:${link.index}`);
49
  }
 
 
 
50
 
51
+ for (const v of (ds.validations || [])) {
52
+ if (!v.annotator || !v.human_validated) continue;
53
+ if (!stats[v.annotator]) {
54
+ stats[v.annotator] = { verified: 0, correct: 0, incorrect: 0, docs: new Set(), humanAdded: 0 };
55
+ }
56
+ stats[v.annotator].verified++;
57
+ if (v.human_verdict === true) stats[v.annotator].correct++;
58
+ else stats[v.annotator].incorrect++;
59
+ stats[v.annotator].docs.add(`${corpus.id}:${link.index}`);
60
  }
 
 
 
 
61
  }
62
  }
63
+ })
64
+ );
65
+ }
66
 
 
67
  const leaderboard = Object.entries(stats)
68
  .map(([annotator, s]) => ({
69
  annotator,
 
72
  incorrect: s.incorrect,
73
  humanAdded: s.humanAdded,
74
  docsWorked: s.docs.size,
75
+ score: s.verified + s.humanAdded,
76
  }))
77
  .sort((a, b) => b.score - a.score);
78
 
app/api/progress/route.js CHANGED
@@ -1,101 +1,104 @@
1
- import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
2
 
3
  /**
4
  * GET /api/progress
5
- * Returns progress stats: total docs, pages, mentions, and how many are verified.
6
  */
7
  export async function GET() {
8
  try {
9
- const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
10
- const linksRes = await fetch(linksUrl, {
11
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
12
- next: { revalidate: 300 } // cache 5 min
13
- });
14
-
15
- if (!linksRes.ok) {
16
- return new Response(JSON.stringify({ error: 'Failed to fetch links' }), { status: 500 });
17
- }
18
-
19
- const links = await linksRes.json();
20
- const activeLinks = links
21
- .filter(l => l.status === 'success' && l.has_revalidation === true)
22
- .slice(0, MAX_DOCS_TO_SCAN);
23
-
24
- // Fetch all docs in parallel
25
- const results = await Promise.allSettled(
26
- activeLinks.map(async (link) => {
27
- const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
28
- const docRes = await fetch(docUrl, {
29
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
30
- });
31
- if (!docRes.ok) return null;
32
-
33
- const pagesData = await docRes.json();
34
-
35
- let totalMentions = 0;
36
- let verifiedMentions = 0;
37
- let totalPages = 0;
38
- let completedPages = 0;
39
- let humanAnnotations = 0;
40
-
41
- for (const page of pagesData) {
42
- const datasets = (page.datasets || []).filter(ds => {
43
- // Exclude consensus non-datasets
44
- if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) {
45
- return false;
46
- }
47
- return true;
48
  });
49
-
50
- if (datasets.length === 0) continue;
51
-
52
- totalPages++;
53
- totalMentions += datasets.length;
54
-
55
- let pageVerified = 0;
56
- for (const ds of datasets) {
57
- if (ds.human_validated === true) {
58
- verifiedMentions++;
59
- pageVerified++;
60
- }
61
- if (ds.source === 'human') {
62
- humanAnnotations++;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  }
64
- }
65
 
66
- // A page is "completed" if all its mentions are verified
67
- if (pageVerified === datasets.length) {
68
- completedPages++;
69
  }
70
- }
71
-
72
- const docComplete = totalPages > 0 && completedPages === totalPages;
73
-
74
- return {
75
- index: link.index,
76
- totalPages,
77
- completedPages,
78
- totalMentions,
79
- verifiedMentions,
80
- humanAnnotations,
81
- complete: docComplete,
82
- };
83
- })
84
- );
85
 
86
- const docs = results
87
- .filter(r => r.status === 'fulfilled' && r.value !== null)
88
- .map(r => r.value);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  const summary = {
91
- totalDocs: docs.length,
92
- completedDocs: docs.filter(d => d.complete).length,
93
- totalPages: docs.reduce((s, d) => s + d.totalPages, 0),
94
- completedPages: docs.reduce((s, d) => s + d.completedPages, 0),
95
- totalMentions: docs.reduce((s, d) => s + d.totalMentions, 0),
96
- verifiedMentions: docs.reduce((s, d) => s + d.verifiedMentions, 0),
97
- humanAnnotations: docs.reduce((s, d) => s + d.humanAnnotations, 0),
98
- docs,
99
  };
100
 
101
  return new Response(JSON.stringify(summary), {
 
1
+ import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpora, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
2
 
3
  /**
4
  * GET /api/progress
5
+ * Returns progress stats across ALL corpora.
6
  */
7
  export async function GET() {
8
  try {
9
+ const corpora = getCorpora();
10
+ const allDocs = [];
11
+
12
+ for (const corpus of corpora) {
13
+ const linksPath = getLinksRepoPath(corpus);
14
+ const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
15
+ const linksRes = await fetch(linksUrl, {
16
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
17
+ next: { revalidate: 300 }
18
+ });
19
+
20
+ if (!linksRes.ok) continue;
21
+
22
+ const links = await linksRes.json();
23
+ const activeLinks = links
24
+ .filter(l => l.status === 'success' && l.has_revalidation === true)
25
+ .slice(0, MAX_DOCS_TO_SCAN);
26
+
27
+ const results = await Promise.allSettled(
28
+ activeLinks.map(async (link) => {
29
+ const docRepoPath = getDocRepoPath(corpus, link.index);
30
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
31
+ const docRes = await fetch(docUrl, {
32
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  });
34
+ if (!docRes.ok) return null;
35
+
36
+ const pagesData = await docRes.json();
37
+
38
+ let totalMentions = 0;
39
+ let verifiedMentions = 0;
40
+ let totalPages = 0;
41
+ let completedPages = 0;
42
+ let humanAnnotations = 0;
43
+
44
+ for (const page of pagesData) {
45
+ const datasets = (page.datasets || []).filter(ds => {
46
+ if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) {
47
+ return false;
48
+ }
49
+ return true;
50
+ });
51
+
52
+ if (datasets.length === 0) continue;
53
+
54
+ totalPages++;
55
+ totalMentions += datasets.length;
56
+
57
+ let pageVerified = 0;
58
+ for (const ds of datasets) {
59
+ if (ds.human_validated === true) {
60
+ verifiedMentions++;
61
+ pageVerified++;
62
+ }
63
+ if (ds.source === 'human') {
64
+ humanAnnotations++;
65
+ }
66
  }
 
67
 
68
+ if (pageVerified === datasets.length) {
69
+ completedPages++;
70
+ }
71
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
+ return {
74
+ corpus: corpus.id,
75
+ index: link.index,
76
+ totalPages,
77
+ completedPages,
78
+ totalMentions,
79
+ verifiedMentions,
80
+ humanAnnotations,
81
+ complete: totalPages > 0 && completedPages === totalPages,
82
+ };
83
+ })
84
+ );
85
+
86
+ const docs = results
87
+ .filter(r => r.status === 'fulfilled' && r.value !== null)
88
+ .map(r => r.value);
89
+
90
+ allDocs.push(...docs);
91
+ }
92
 
93
  const summary = {
94
+ totalDocs: allDocs.length,
95
+ completedDocs: allDocs.filter(d => d.complete).length,
96
+ totalPages: allDocs.reduce((s, d) => s + d.totalPages, 0),
97
+ completedPages: allDocs.reduce((s, d) => s + d.completedPages, 0),
98
+ totalMentions: allDocs.reduce((s, d) => s + d.totalMentions, 0),
99
+ verifiedMentions: allDocs.reduce((s, d) => s + d.verifiedMentions, 0),
100
+ humanAnnotations: allDocs.reduce((s, d) => s + d.humanAnnotations, 0),
101
+ docs: allDocs,
102
  };
103
 
104
  return new Response(JSON.stringify(summary), {
app/api/validate/route.js CHANGED
@@ -1,31 +1,18 @@
1
  import { NextResponse } from 'next/server';
2
  import fs from 'fs';
3
- import path from 'path';
4
  import { commit } from '@huggingface/hub';
5
- import { HF_DATASET_ID, HF_DATASET_BASE_URL } from '../../../utils/config.js';
6
 
7
  const isHFSpace = () => process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
8
 
9
- function getDocFilePath(docIndex) {
10
- return path.join(
11
- process.cwd(),
12
- 'annotation_data', 'wbg_extractions',
13
- `doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
14
- );
15
- }
16
-
17
- function getDocRepoPath(docIndex) {
18
- return `annotation_data/wbg_extractions/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
19
- }
20
-
21
  /**
22
  * PUT /api/validate
23
- * Updates a specific dataset entry within a page by its array index.
24
- * Body: { document_index, page_number, dataset_index, updates }
25
  */
26
  export async function PUT(request) {
27
  try {
28
- const { document_index, page_number, dataset_index, updates } = await request.json();
 
29
 
30
  if (document_index == null || page_number == null || dataset_index == null || !updates) {
31
  return NextResponse.json(
@@ -37,23 +24,23 @@ export async function PUT(request) {
37
  let pagesData;
38
 
39
  if (isHFSpace()) {
40
- const url = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(document_index)}`;
 
41
  const res = await fetch(url, {
42
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
43
  });
44
  if (!res.ok) {
45
- return NextResponse.json({ error: 'Document not found on HF' }, { status: 404 });
46
  }
47
  pagesData = await res.json();
48
  } else {
49
- const filePath = getDocFilePath(document_index);
50
  if (!fs.existsSync(filePath)) {
51
- return NextResponse.json({ error: 'Document not found locally' }, { status: 404 });
52
  }
53
  pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
54
  }
55
 
56
- // Find the page
57
  const pageIdx = pagesData.findIndex(p => p.document?.pages?.[0] === page_number);
58
  if (pageIdx === -1) {
59
  return NextResponse.json({ error: `Page ${page_number} not found` }, { status: 404 });
@@ -64,12 +51,9 @@ export async function PUT(request) {
64
  return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
65
  }
66
 
67
- // Per-annotator validation: store in a `validations` array.
68
- // Each annotator gets their own entry; re-validating updates in-place.
69
  const currentEntry = pagesData[pageIdx].datasets[dataset_index];
70
  const annotator = updates.annotator || 'unknown';
71
-
72
- // Separate validation fields from other updates (like dataset_tag edits)
73
  const validationFields = ['human_validated', 'human_verdict', 'human_notes', 'annotator', 'validated_at'];
74
  const isValidation = validationFields.some(f => f in updates);
75
 
@@ -90,27 +74,19 @@ export async function PUT(request) {
90
  validations.push(validationEntry);
91
  }
92
 
93
- pagesData[pageIdx].datasets[dataset_index] = {
94
- ...currentEntry,
95
- validations,
96
- };
97
  } else {
98
- // Non-validation updates (e.g. dataset_tag edit) go at top level
99
- pagesData[pageIdx].datasets[dataset_index] = {
100
- ...currentEntry,
101
- ...updates,
102
- };
103
  }
104
 
105
  // Save back
106
  if (isHFSpace()) {
107
- const token = process.env.HF_TOKEN;
108
- const repoPath = getDocRepoPath(document_index);
109
  const content = JSON.stringify(pagesData, null, 2);
110
  await commit({
111
  repo: { type: 'dataset', name: HF_DATASET_ID },
112
- credentials: { accessToken: token },
113
- title: `Validate dataset in doc_${document_index} page ${page_number}`,
114
  operations: [{
115
  operation: 'addOrUpdate',
116
  path: repoPath,
@@ -118,7 +94,7 @@ export async function PUT(request) {
118
  }],
119
  });
120
  } else {
121
- const filePath = getDocFilePath(document_index);
122
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
123
  }
124
 
@@ -133,16 +109,18 @@ export async function PUT(request) {
133
  }
134
 
135
  /**
136
- * DELETE /api/validate?doc=X&page=Y&idx=Z
137
- * Removes a dataset entry by its array index.
138
  */
139
  export async function DELETE(request) {
140
  try {
141
  const { searchParams } = new URL(request.url);
 
142
  const document_index = parseInt(searchParams.get('doc'), 10);
143
  const page_number = parseInt(searchParams.get('page'), 10);
144
  const dataset_index = parseInt(searchParams.get('idx'), 10);
145
 
 
 
146
  if (isNaN(document_index) || isNaN(page_number) || isNaN(dataset_index)) {
147
  return NextResponse.json(
148
  { error: 'Missing doc, page, or idx parameter' },
@@ -153,18 +131,19 @@ export async function DELETE(request) {
153
  let pagesData;
154
 
155
  if (isHFSpace()) {
156
- const url = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(document_index)}`;
 
157
  const res = await fetch(url, {
158
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
159
  });
160
  if (!res.ok) {
161
- return NextResponse.json({ error: 'Document not found on HF' }, { status: 404 });
162
  }
163
  pagesData = await res.json();
164
  } else {
165
- const filePath = getDocFilePath(document_index);
166
  if (!fs.existsSync(filePath)) {
167
- return NextResponse.json({ error: 'Document not found locally' }, { status: 404 });
168
  }
169
  pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
170
  }
@@ -179,18 +158,15 @@ export async function DELETE(request) {
179
  return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
180
  }
181
 
182
- // Remove the entry
183
  pagesData[pageIdx].datasets.splice(dataset_index, 1);
184
 
185
- // Save back
186
  if (isHFSpace()) {
187
- const token = process.env.HF_TOKEN;
188
- const repoPath = getDocRepoPath(document_index);
189
  const content = JSON.stringify(pagesData, null, 2);
190
  await commit({
191
  repo: { type: 'dataset', name: HF_DATASET_ID },
192
- credentials: { accessToken: token },
193
- title: `Delete dataset from doc_${document_index} page ${page_number}`,
194
  operations: [{
195
  operation: 'addOrUpdate',
196
  path: repoPath,
@@ -198,7 +174,7 @@ export async function DELETE(request) {
198
  }],
199
  });
200
  } else {
201
- const filePath = getDocFilePath(document_index);
202
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
203
  }
204
 
 
1
  import { NextResponse } from 'next/server';
2
  import fs from 'fs';
 
3
  import { commit } from '@huggingface/hub';
4
+ import { HF_DATASET_ID, HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from '../../../utils/config.js';
5
 
6
  const isHFSpace = () => process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
7
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  /**
9
  * PUT /api/validate
10
+ * Body: { corpus, document_index, page_number, dataset_index, updates }
 
11
  */
12
  export async function PUT(request) {
13
  try {
14
+ const { corpus: corpusId, document_index, page_number, dataset_index, updates } = await request.json();
15
+ const corpus = getCorpus(corpusId);
16
 
17
  if (document_index == null || page_number == null || dataset_index == null || !updates) {
18
  return NextResponse.json(
 
24
  let pagesData;
25
 
26
  if (isHFSpace()) {
27
+ const repoPath = getDocRepoPath(corpus, document_index);
28
+ const url = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
29
  const res = await fetch(url, {
30
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
31
  });
32
  if (!res.ok) {
33
+ return NextResponse.json({ error: `Document not found on HF (${corpus.id})` }, { status: 404 });
34
  }
35
  pagesData = await res.json();
36
  } else {
37
+ const filePath = getDocLocalPath(corpus, document_index);
38
  if (!fs.existsSync(filePath)) {
39
+ return NextResponse.json({ error: `Document not found locally (${corpus.id})` }, { status: 404 });
40
  }
41
  pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
42
  }
43
 
 
44
  const pageIdx = pagesData.findIndex(p => p.document?.pages?.[0] === page_number);
45
  if (pageIdx === -1) {
46
  return NextResponse.json({ error: `Page ${page_number} not found` }, { status: 404 });
 
51
  return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
52
  }
53
 
54
+ // Per-annotator validation
 
55
  const currentEntry = pagesData[pageIdx].datasets[dataset_index];
56
  const annotator = updates.annotator || 'unknown';
 
 
57
  const validationFields = ['human_validated', 'human_verdict', 'human_notes', 'annotator', 'validated_at'];
58
  const isValidation = validationFields.some(f => f in updates);
59
 
 
74
  validations.push(validationEntry);
75
  }
76
 
77
+ pagesData[pageIdx].datasets[dataset_index] = { ...currentEntry, validations };
 
 
 
78
  } else {
79
+ pagesData[pageIdx].datasets[dataset_index] = { ...currentEntry, ...updates };
 
 
 
 
80
  }
81
 
82
  // Save back
83
  if (isHFSpace()) {
84
+ const repoPath = getDocRepoPath(corpus, document_index);
 
85
  const content = JSON.stringify(pagesData, null, 2);
86
  await commit({
87
  repo: { type: 'dataset', name: HF_DATASET_ID },
88
+ credentials: { accessToken: process.env.HF_TOKEN },
89
+ title: `Validate ${corpus.id}/doc_${document_index} page ${page_number}`,
90
  operations: [{
91
  operation: 'addOrUpdate',
92
  path: repoPath,
 
94
  }],
95
  });
96
  } else {
97
+ const filePath = getDocLocalPath(corpus, document_index);
98
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
99
  }
100
 
 
109
  }
110
 
111
  /**
112
+ * DELETE /api/validate?corpus=X&doc=X&page=Y&idx=Z
 
113
  */
114
  export async function DELETE(request) {
115
  try {
116
  const { searchParams } = new URL(request.url);
117
+ const corpusId = searchParams.get('corpus');
118
  const document_index = parseInt(searchParams.get('doc'), 10);
119
  const page_number = parseInt(searchParams.get('page'), 10);
120
  const dataset_index = parseInt(searchParams.get('idx'), 10);
121
 
122
+ const corpus = getCorpus(corpusId);
123
+
124
  if (isNaN(document_index) || isNaN(page_number) || isNaN(dataset_index)) {
125
  return NextResponse.json(
126
  { error: 'Missing doc, page, or idx parameter' },
 
131
  let pagesData;
132
 
133
  if (isHFSpace()) {
134
+ const repoPath = getDocRepoPath(corpus, document_index);
135
+ const url = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
136
  const res = await fetch(url, {
137
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
138
  });
139
  if (!res.ok) {
140
+ return NextResponse.json({ error: `Document not found on HF (${corpus.id})` }, { status: 404 });
141
  }
142
  pagesData = await res.json();
143
  } else {
144
+ const filePath = getDocLocalPath(corpus, document_index);
145
  if (!fs.existsSync(filePath)) {
146
+ return NextResponse.json({ error: `Document not found locally (${corpus.id})` }, { status: 404 });
147
  }
148
  pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
149
  }
 
158
  return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
159
  }
160
 
 
161
  pagesData[pageIdx].datasets.splice(dataset_index, 1);
162
 
 
163
  if (isHFSpace()) {
164
+ const repoPath = getDocRepoPath(corpus, document_index);
 
165
  const content = JSON.stringify(pagesData, null, 2);
166
  await commit({
167
  repo: { type: 'dataset', name: HF_DATASET_ID },
168
+ credentials: { accessToken: process.env.HF_TOKEN },
169
+ title: `Delete from ${corpus.id}/doc_${document_index} page ${page_number}`,
170
  operations: [{
171
  operation: 'addOrUpdate',
172
  path: repoPath,
 
174
  }],
175
  });
176
  } else {
177
+ const filePath = getDocLocalPath(corpus, document_index);
178
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
179
  }
180
 
app/components/DocumentSelector.js CHANGED
@@ -3,23 +3,28 @@
3
  export default function DocumentSelector({
4
  documents,
5
  selectedDocIndex,
 
6
  onDocChange,
7
  }) {
 
 
 
 
8
  return (
9
  <div className="navigation-controls">
10
  <div className="select-group">
11
  <label htmlFor="doc-select">Document</label>
12
  <select
13
  id="doc-select"
14
- value={selectedDocIndex ?? ''}
15
  onChange={(e) => {
16
- const docIdx = parseInt(e.target.value, 10);
17
- onDocChange(docIdx);
18
  }}
19
  >
20
  {documents.map(doc => (
21
- <option key={doc.index} value={doc.index}>
22
- Document {doc.index} ({doc.annotatable_pages.length} pages)
23
  </option>
24
  ))}
25
  </select>
 
3
  export default function DocumentSelector({
4
  documents,
5
  selectedDocIndex,
6
+ selectedCorpus,
7
  onDocChange,
8
  }) {
9
+ const currentValue = selectedCorpus && selectedDocIndex != null
10
+ ? `${selectedCorpus}:${selectedDocIndex}`
11
+ : '';
12
+
13
  return (
14
  <div className="navigation-controls">
15
  <div className="select-group">
16
  <label htmlFor="doc-select">Document</label>
17
  <select
18
  id="doc-select"
19
+ value={currentValue}
20
  onChange={(e) => {
21
+ const [corpus, idx] = e.target.value.split(':');
22
+ onDocChange(corpus, parseInt(idx, 10));
23
  }}
24
  >
25
  {documents.map(doc => (
26
+ <option key={`${doc.corpus}:${doc.index}`} value={`${doc.corpus}:${doc.index}`}>
27
+ [{doc.corpus_name}] Doc {doc.index} ({doc.annotatable_pages.length} pages)
28
  </option>
29
  ))}
30
  </select>
app/page.js CHANGED
@@ -13,6 +13,7 @@ import Leaderboard from './components/Leaderboard';
13
  export default function Home() {
14
  const [documents, setDocuments] = useState([]);
15
  const [selectedDocIndex, setSelectedDocIndex] = useState(null);
 
16
  const [currentDoc, setCurrentDoc] = useState(null);
17
 
18
  // Page-by-page navigation: track the index into annotatable_pages array
@@ -60,17 +61,19 @@ export default function Home() {
60
  .then(data => {
61
  setDocuments(data);
62
  if (data.length > 0) {
63
- // Restore saved position from sessionStorage
64
  const savedDoc = sessionStorage.getItem('selectedDocIndex');
 
65
  const savedPage = sessionStorage.getItem('pageIdx');
66
  const docIdx = savedDoc ? parseInt(savedDoc, 10) : null;
67
- const restoredDoc = docIdx !== null && data.find(d => d.index === docIdx);
68
 
69
  if (restoredDoc) {
70
  setSelectedDocIndex(docIdx);
 
71
  setPageIdx(savedPage ? parseInt(savedPage, 10) : 0);
72
  } else {
73
  setSelectedDocIndex(data[0].index);
 
74
  setPageIdx(0);
75
  }
76
  }
@@ -101,10 +104,11 @@ export default function Home() {
101
 
102
  // Update currentDoc when selection changes + persist to sessionStorage
103
  useEffect(() => {
104
- if (selectedDocIndex !== null) {
105
- const doc = documents.find(d => d.index === selectedDocIndex);
106
  setCurrentDoc(doc);
107
  sessionStorage.setItem('selectedDocIndex', selectedDocIndex);
 
108
 
109
  // Clamp pageIdx to valid range for this document
110
  if (doc) {
@@ -112,7 +116,7 @@ export default function Home() {
112
  setPageIdx(prev => Math.min(prev, Math.max(0, maxPage)));
113
  }
114
  }
115
- }, [selectedDocIndex, documents]);
116
 
117
  // Persist pageIdx to sessionStorage
118
  useEffect(() => {
@@ -123,7 +127,7 @@ export default function Home() {
123
  const refreshPageData = useCallback(() => {
124
  if (selectedDocIndex !== null && currentPageNumber !== null) {
125
  setLoadingPage(true);
126
- fetch(`/api/document?index=${selectedDocIndex}&page=${currentPageNumber}`)
127
  .then(res => res.json())
128
  .then(data => {
129
  setCurrentPageData(data);
@@ -175,7 +179,8 @@ export default function Home() {
175
  localStorage.setItem('annotator_name', name);
176
  };
177
 
178
- const handleDocChange = (docIdx) => {
 
179
  setSelectedDocIndex(docIdx);
180
  setPageIdx(0);
181
  };
@@ -270,6 +275,7 @@ export default function Home() {
270
  dataset_tag: dataset_tag,
271
  source: 'human',
272
  annotator: annotatorName || "user",
 
273
  document_index: selectedDocIndex,
274
  page_number: currentPageNumber,
275
  timestamp: new Date().toISOString(),
@@ -313,7 +319,7 @@ export default function Home() {
313
  const rawIdx = ds._rawIndex ?? idx;
314
  try {
315
  const res = await fetch(
316
- `/api/validate?doc=${selectedDocIndex}&page=${currentPageNumber}&idx=${rawIdx}`,
317
  { method: 'DELETE' }
318
  );
319
  if (res.ok) {
@@ -373,6 +379,7 @@ export default function Home() {
373
  method: 'PUT',
374
  headers: { 'Content-Type': 'application/json' },
375
  body: JSON.stringify({
 
376
  document_index: selectedDocIndex,
377
  page_number: currentPageNumber,
378
  dataset_index: datasetIdx,
@@ -474,6 +481,7 @@ export default function Home() {
474
  <DocumentSelector
475
  documents={documents}
476
  selectedDocIndex={selectedDocIndex}
 
477
  onDocChange={handleDocChange}
478
  />
479
  </div>
 
13
  export default function Home() {
14
  const [documents, setDocuments] = useState([]);
15
  const [selectedDocIndex, setSelectedDocIndex] = useState(null);
16
+ const [selectedCorpus, setSelectedCorpus] = useState(null);
17
  const [currentDoc, setCurrentDoc] = useState(null);
18
 
19
  // Page-by-page navigation: track the index into annotatable_pages array
 
61
  .then(data => {
62
  setDocuments(data);
63
  if (data.length > 0) {
 
64
  const savedDoc = sessionStorage.getItem('selectedDocIndex');
65
+ const savedCorpus = sessionStorage.getItem('selectedCorpus');
66
  const savedPage = sessionStorage.getItem('pageIdx');
67
  const docIdx = savedDoc ? parseInt(savedDoc, 10) : null;
68
+ const restoredDoc = docIdx !== null && data.find(d => d.index === docIdx && (!savedCorpus || d.corpus === savedCorpus));
69
 
70
  if (restoredDoc) {
71
  setSelectedDocIndex(docIdx);
72
+ setSelectedCorpus(restoredDoc.corpus);
73
  setPageIdx(savedPage ? parseInt(savedPage, 10) : 0);
74
  } else {
75
  setSelectedDocIndex(data[0].index);
76
+ setSelectedCorpus(data[0].corpus);
77
  setPageIdx(0);
78
  }
79
  }
 
104
 
105
  // Update currentDoc when selection changes + persist to sessionStorage
106
  useEffect(() => {
107
+ if (selectedDocIndex !== null && selectedCorpus !== null) {
108
+ const doc = documents.find(d => d.index === selectedDocIndex && d.corpus === selectedCorpus);
109
  setCurrentDoc(doc);
110
  sessionStorage.setItem('selectedDocIndex', selectedDocIndex);
111
+ sessionStorage.setItem('selectedCorpus', selectedCorpus);
112
 
113
  // Clamp pageIdx to valid range for this document
114
  if (doc) {
 
116
  setPageIdx(prev => Math.min(prev, Math.max(0, maxPage)));
117
  }
118
  }
119
+ }, [selectedDocIndex, selectedCorpus, documents]);
120
 
121
  // Persist pageIdx to sessionStorage
122
  useEffect(() => {
 
127
  const refreshPageData = useCallback(() => {
128
  if (selectedDocIndex !== null && currentPageNumber !== null) {
129
  setLoadingPage(true);
130
+ fetch(`/api/document?index=${selectedDocIndex}&page=${currentPageNumber}&corpus=${selectedCorpus || ''}`)
131
  .then(res => res.json())
132
  .then(data => {
133
  setCurrentPageData(data);
 
179
  localStorage.setItem('annotator_name', name);
180
  };
181
 
182
+ const handleDocChange = (corpus, docIdx) => {
183
+ setSelectedCorpus(corpus);
184
  setSelectedDocIndex(docIdx);
185
  setPageIdx(0);
186
  };
 
275
  dataset_tag: dataset_tag,
276
  source: 'human',
277
  annotator: annotatorName || "user",
278
+ corpus: selectedCorpus,
279
  document_index: selectedDocIndex,
280
  page_number: currentPageNumber,
281
  timestamp: new Date().toISOString(),
 
319
  const rawIdx = ds._rawIndex ?? idx;
320
  try {
321
  const res = await fetch(
322
+ `/api/validate?doc=${selectedDocIndex}&page=${currentPageNumber}&idx=${rawIdx}&corpus=${selectedCorpus || ''}`,
323
  { method: 'DELETE' }
324
  );
325
  if (res.ok) {
 
379
  method: 'PUT',
380
  headers: { 'Content-Type': 'application/json' },
381
  body: JSON.stringify({
382
+ corpus: selectedCorpus,
383
  document_index: selectedDocIndex,
384
  page_number: currentPageNumber,
385
  dataset_index: datasetIdx,
 
481
  <DocumentSelector
482
  documents={documents}
483
  selectedDocIndex={selectedDocIndex}
484
+ selectedCorpus={selectedCorpus}
485
  onDocChange={handleDocChange}
486
  />
487
  </div>
generate_assignments.py CHANGED
@@ -2,8 +2,9 @@
2
  """
3
  generate_assignments.py
4
 
5
- Reads annotator_config.yaml, distributes available docs across annotators
6
- with configurable overlap, and writes back the updated config.
 
7
 
8
  Usage:
9
  python3 generate_assignments.py # Generate and save
@@ -26,7 +27,7 @@ except ImportError:
26
  sys.exit(1)
27
 
28
  CONFIG_PATH = Path(__file__).parent / "annotation_data" / "annotator_config.yaml"
29
- LINKS_PATH = Path(__file__).parent / "annotation_data" / "wbg_data" / "wbg_pdf_links.json"
30
 
31
 
32
  def load_config():
@@ -37,17 +38,25 @@ def save_config(config):
37
  CONFIG_PATH.write_text(yaml.dump(config, default_flow_style=False, sort_keys=False))
38
 
39
 
40
- def get_available_docs():
41
- """Get list of active English doc indices."""
42
- links = json.loads(LINKS_PATH.read_text())
 
 
 
 
 
 
 
 
43
  return sorted([
44
  l["index"] for l in links
45
  if l.get("has_revalidation") and l.get("status") == "success"
46
  ])
47
 
48
 
49
- def generate_assignments(config, seed=42):
50
- """Distribute docs across annotators with overlap."""
51
  settings = config.get("settings", {})
52
  overlap_pct = settings.get("overlap_percent", 10)
53
  annotators = config.get("annotators", [])
@@ -56,44 +65,49 @@ def generate_assignments(config, seed=42):
56
  print("❌ No annotators defined in config.")
57
  return config
58
 
59
- all_docs = get_available_docs()
60
- n_docs = len(all_docs)
61
  n_annotators = len(annotators)
 
62
 
63
- # Calculate overlap
64
- n_overlap = max(1, round(n_docs * overlap_pct / 100))
 
 
65
 
66
- # Shuffle docs deterministically
67
- rng = random.Random(seed)
68
- shuffled = all_docs.copy()
69
- rng.shuffle(shuffled)
70
-
71
- # Pick overlap docs (shared by ALL annotators)
72
- overlap_docs = sorted(shuffled[:n_overlap])
73
- remaining = shuffled[n_overlap:]
74
-
75
- # Split remaining docs evenly across annotators
76
- per_annotator = len(remaining) // n_annotators
77
- extra = len(remaining) % n_annotators
78
-
79
- print(f"\nπŸ“Š Assignment Summary:")
80
- print(f" Total docs: {n_docs}")
81
- print(f" Annotators: {n_annotators}")
82
- print(f" Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
83
- print(f" Per annotator: ~{per_annotator + n_overlap} docs each")
84
- print(f" Overlap docs: {overlap_docs}")
85
- print()
86
-
87
- start = 0
88
- for i, ann in enumerate(annotators):
89
- # Distribute remaining: first `extra` annotators get 1 more
90
- count = per_annotator + (1 if i < extra else 0)
91
- exclusive = sorted(remaining[start:start + count])
92
- start += count
93
-
94
- ann["docs"] = sorted(overlap_docs + exclusive)
95
- print(f" {ann['username']}: {len(ann['docs'])} docs "
96
- f"({n_overlap} overlap + {len(exclusive)} exclusive)")
 
 
 
 
97
 
98
  return config
99
 
@@ -130,14 +144,17 @@ def upload_config():
130
 
131
 
132
  def main():
133
- parser = argparse.ArgumentParser(description="Generate document assignments")
134
  parser.add_argument("--dry-run", action="store_true", help="Preview only")
135
  parser.add_argument("--upload", action="store_true", help="Upload config to HF")
136
  parser.add_argument("--seed", type=int, default=42, help="Random seed")
137
  args = parser.parse_args()
138
 
 
139
  config = load_config()
140
- config = generate_assignments(config, seed=args.seed)
 
 
141
 
142
  if args.dry_run:
143
  print("\n[DRY RUN] Would save:")
 
2
  """
3
  generate_assignments.py
4
 
5
+ Reads corpora.json and annotator_config.yaml, distributes available docs
6
+ across annotators with configurable overlap per corpus, and writes back
7
+ the updated config.
8
 
9
  Usage:
10
  python3 generate_assignments.py # Generate and save
 
27
  sys.exit(1)
28
 
29
  CONFIG_PATH = Path(__file__).parent / "annotation_data" / "annotator_config.yaml"
30
+ CORPORA_PATH = Path(__file__).parent / "annotation_data" / "corpora.json"
31
 
32
 
33
  def load_config():
 
38
  CONFIG_PATH.write_text(yaml.dump(config, default_flow_style=False, sort_keys=False))
39
 
40
 
41
+ def load_corpora():
42
+ return json.loads(CORPORA_PATH.read_text())
43
+
44
+
45
+ def get_available_docs(corpus):
46
+ """Get list of active doc indices for a given corpus."""
47
+ links_path = Path(__file__).parent / "annotation_data" / corpus["links_file"]
48
+ if not links_path.exists():
49
+ print(f" ⚠️ No links file for {corpus['id']}: {links_path}")
50
+ return []
51
+ links = json.loads(links_path.read_text())
52
  return sorted([
53
  l["index"] for l in links
54
  if l.get("has_revalidation") and l.get("status") == "success"
55
  ])
56
 
57
 
58
+ def generate_assignments(config, corpora, seed=42):
59
+ """Distribute docs across annotators with overlap, per corpus."""
60
  settings = config.get("settings", {})
61
  overlap_pct = settings.get("overlap_percent", 10)
62
  annotators = config.get("annotators", [])
 
65
  print("❌ No annotators defined in config.")
66
  return config
67
 
 
 
68
  n_annotators = len(annotators)
69
+ rng = random.Random(seed)
70
 
71
+ # Initialize per-corpus doc dicts
72
+ for ann in annotators:
73
+ if not isinstance(ann.get("docs"), dict):
74
+ ann["docs"] = {}
75
 
76
+ for corpus in corpora:
77
+ cid = corpus["id"]
78
+ all_docs = get_available_docs(corpus)
79
+ n_docs = len(all_docs)
80
+
81
+ if n_docs == 0:
82
+ print(f"\nπŸ“‚ {corpus['name']} ({cid}): no docs available")
83
+ continue
84
+
85
+ n_overlap = max(1, round(n_docs * overlap_pct / 100))
86
+
87
+ shuffled = all_docs.copy()
88
+ rng.shuffle(shuffled)
89
+
90
+ overlap_docs = sorted(shuffled[:n_overlap])
91
+ remaining = shuffled[n_overlap:]
92
+
93
+ per_annotator = len(remaining) // n_annotators
94
+ extra = len(remaining) % n_annotators
95
+
96
+ print(f"\nπŸ“‚ {corpus['name']} ({cid}):")
97
+ print(f" Total docs: {n_docs}")
98
+ print(f" Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
99
+ print(f" Per annotator: ~{per_annotator + n_overlap} docs each")
100
+ print(f" Overlap docs: {overlap_docs}")
101
+
102
+ start = 0
103
+ for i, ann in enumerate(annotators):
104
+ count = per_annotator + (1 if i < extra else 0)
105
+ exclusive = sorted(remaining[start:start + count])
106
+ start += count
107
+
108
+ ann["docs"][cid] = sorted(overlap_docs + exclusive)
109
+ print(f" {ann['username']}: {len(ann['docs'][cid])} docs "
110
+ f"({n_overlap} overlap + {len(exclusive)} exclusive)")
111
 
112
  return config
113
 
 
144
 
145
 
146
  def main():
147
+ parser = argparse.ArgumentParser(description="Generate document assignments per corpus")
148
  parser.add_argument("--dry-run", action="store_true", help="Preview only")
149
  parser.add_argument("--upload", action="store_true", help="Upload config to HF")
150
  parser.add_argument("--seed", type=int, default=42, help="Random seed")
151
  args = parser.parse_args()
152
 
153
+ corpora = load_corpora()
154
  config = load_config()
155
+
156
+ print(f"πŸ“‹ Loaded {len(corpora)} corpora, {len(config.get('annotators', []))} annotators")
157
+ config = generate_assignments(config, corpora, seed=args.seed)
158
 
159
  if args.dry_run:
160
  print("\n[DRY RUN] Would save:")
utils/config.js CHANGED
@@ -1,4 +1,57 @@
 
 
 
1
  // Centralized configuration for the annotation app
2
  export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';
3
  export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;
4
  export const MAX_DOCS_TO_SCAN = parseInt(process.env.MAX_DOCS_TO_SCAN || '50', 10);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+
4
  // Centralized configuration for the annotation app
5
  export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';
6
  export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;
7
  export const MAX_DOCS_TO_SCAN = parseInt(process.env.MAX_DOCS_TO_SCAN || '50', 10);
8
+
9
+ // ─── Corpus helpers ────────────────────────────────
10
+
11
+ let _corporaCache = null;
12
+
13
+ /**
14
+ * Returns the list of available corpora from corpora.json.
15
+ * Cached after first load.
16
+ */
17
+ export function getCorpora() {
18
+ if (_corporaCache) return _corporaCache;
19
+ const filePath = path.join(process.cwd(), 'annotation_data', 'corpora.json');
20
+ _corporaCache = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
21
+ return _corporaCache;
22
+ }
23
+
24
+ /**
25
+ * Find a corpus by its ID (e.g. "wbg", "unhcr").
26
+ * Returns the default (first) corpus if corpusId is null/undefined.
27
+ */
28
+ export function getCorpus(corpusId) {
29
+ const corpora = getCorpora();
30
+ if (!corpusId) return corpora[0];
31
+ return corpora.find(c => c.id === corpusId) || corpora[0];
32
+ }
33
+
34
+ /**
35
+ * HF repo path for a corpus's PDF links file.
36
+ */
37
+ export function getLinksRepoPath(corpus) {
38
+ return `annotation_data/${corpus.links_file}`;
39
+ }
40
+
41
+ /**
42
+ * HF repo path for a specific doc's raw JSON.
43
+ */
44
+ export function getDocRepoPath(corpus, docIndex) {
45
+ return `annotation_data/${corpus.extractions_dir}/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
46
+ }
47
+
48
+ /**
49
+ * Local file path for a specific doc's raw JSON.
50
+ */
51
+ export function getDocLocalPath(corpus, docIndex) {
52
+ return path.join(
53
+ process.cwd(),
54
+ 'annotation_data', corpus.extractions_dir,
55
+ `doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
56
+ );
57
+ }
utils/storage.js CHANGED
@@ -1,54 +1,32 @@
1
  import fs from 'fs';
2
  import path from 'path';
3
  import { commit } from '@huggingface/hub';
4
- import { HF_DATASET_ID, HF_DATASET_BASE_URL } from './config.js';
5
-
6
- const getRootPath = () => process.cwd();
7
 
8
  const isHFSpace = () => {
9
  return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
10
  };
11
 
12
- /**
13
- * Returns the local file path for a document's raw JSON
14
- */
15
- function getDocFilePath(docIndex) {
16
- return path.join(
17
- getRootPath(),
18
- 'annotation_data', 'wbg_extractions',
19
- `doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
20
- );
21
- }
22
-
23
- /**
24
- * Returns the HF repo path for a document's raw JSON
25
- */
26
- function getDocRepoPath(docIndex) {
27
- return `annotation_data/wbg_extractions/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
28
- }
29
-
30
  /**
31
  * Reads the full document JSON (all pages) from local file
32
  */
33
- function readDocLocal(docIndex) {
34
- const filePath = getDocFilePath(docIndex);
35
  if (!fs.existsSync(filePath)) return null;
36
- const raw = fs.readFileSync(filePath, 'utf-8');
37
- return JSON.parse(raw);
38
  }
39
 
40
  /**
41
  * Writes the full document JSON (all pages) to local file
42
  */
43
- function writeDocLocal(docIndex, pagesData) {
44
- const filePath = getDocFilePath(docIndex);
45
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
46
- console.log(`Saved doc_${docIndex}_direct_judged.jsonl locally`);
47
  }
48
 
49
  /**
50
  * Finds the page index in the pages array by page_number
51
- * Uses document.pages[0] to match, consistent with the document/route.js API
52
  */
53
  function findPageIndex(pagesData, pageNumber) {
54
  return pagesData.findIndex(p => p.document?.pages?.[0] === pageNumber);
@@ -57,24 +35,25 @@ function findPageIndex(pagesData, pageNumber) {
57
  /**
58
  * Fetches the document JSON from HuggingFace
59
  */
60
- async function fetchDocFromHF(docIndex) {
61
  const token = process.env.HF_TOKEN;
62
- const url = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(docIndex)}`;
 
63
  const res = await fetch(url, {
64
  headers: { 'Authorization': `Bearer ${token}` }
65
  });
66
- if (!res.ok) throw new Error(`Failed to fetch doc_${docIndex} from HF: ${res.status}`);
67
  return res.json();
68
  }
69
 
70
  /**
71
  * Commits the updated document JSON back to HuggingFace
72
  */
73
- async function commitDocToHF(docIndex, pagesData, commitMessage) {
74
  const token = process.env.HF_TOKEN;
75
  if (!token) throw new Error("Missing HF_TOKEN");
76
 
77
- const repoPath = getDocRepoPath(docIndex);
78
  const content = JSON.stringify(pagesData, null, 2);
79
 
80
  await commit({
@@ -93,15 +72,13 @@ async function commitDocToHF(docIndex, pagesData, commitMessage) {
93
  // ─── Public API ────────────────────────────────────
94
 
95
  /**
96
- * Saves an annotation by appending it to the page's datasets array
97
- * in the per-document raw JSON file.
98
- *
99
- * @param {Object} annotation - Must include document_index, page_number, and dataset fields
100
  */
101
  export async function saveAnnotation(annotation) {
 
102
  const { document_index: docIndex, page_number: pageNumber } = annotation;
103
 
104
- // Build the dataset entry (strip routing fields β€” they stay at page/doc level)
105
  const datasetEntry = {
106
  dataset_name: annotation.dataset_name,
107
  dataset_tag: annotation.dataset_tag,
@@ -122,33 +99,33 @@ export async function saveAnnotation(annotation) {
122
  };
123
 
124
  if (isHFSpace()) {
125
- // Production: fetch from HF, modify, commit back
126
- const pagesData = await fetchDocFromHF(docIndex);
127
  const pageIdx = findPageIndex(pagesData, pageNumber);
128
- if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);
129
 
130
  pagesData[pageIdx].datasets.push(datasetEntry);
131
- await commitDocToHF(docIndex, pagesData,
132
- `Add human annotation to doc_${docIndex} page ${pageNumber}`);
133
  } else {
134
- // Local: read, modify, write
135
- const pagesData = readDocLocal(docIndex);
136
- if (!pagesData) throw new Error(`doc_${docIndex}_direct_judged.jsonl not found locally`);
137
 
138
  const pageIdx = findPageIndex(pagesData, pageNumber);
139
- if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);
140
 
141
  pagesData[pageIdx].datasets.push(datasetEntry);
142
- writeDocLocal(docIndex, pagesData);
143
  }
144
  }
145
 
146
  /**
147
- * Deletes an annotation from the page's datasets array by timestamp
148
  */
149
- export async function deleteAnnotation(timestamp, docIndex, pageNumber) {
 
 
150
  if (isHFSpace()) {
151
- const pagesData = await fetchDocFromHF(docIndex);
152
  const pageIdx = findPageIndex(pagesData, pageNumber);
153
  if (pageIdx === -1) return false;
154
 
@@ -158,11 +135,11 @@ export async function deleteAnnotation(timestamp, docIndex, pageNumber) {
158
  );
159
  if (pagesData[pageIdx].datasets.length === before) return false;
160
 
161
- await commitDocToHF(docIndex, pagesData,
162
- `Delete annotation from doc_${docIndex} page ${pageNumber}`);
163
  return true;
164
  } else {
165
- const pagesData = readDocLocal(docIndex);
166
  if (!pagesData) return false;
167
 
168
  const pageIdx = findPageIndex(pagesData, pageNumber);
@@ -174,17 +151,19 @@ export async function deleteAnnotation(timestamp, docIndex, pageNumber) {
174
  );
175
  if (pagesData[pageIdx].datasets.length === before) return false;
176
 
177
- writeDocLocal(docIndex, pagesData);
178
  return true;
179
  }
180
  }
181
 
182
  /**
183
- * Updates an annotation in the page's datasets array by timestamp
184
  */
185
- export async function updateAnnotation(timestamp, docIndex, pageNumber, updates) {
 
 
186
  if (isHFSpace()) {
187
- const pagesData = await fetchDocFromHF(docIndex);
188
  const pageIdx = findPageIndex(pagesData, pageNumber);
189
  if (pageIdx === -1) return null;
190
 
@@ -195,11 +174,11 @@ export async function updateAnnotation(timestamp, docIndex, pageNumber, updates)
195
  ...pagesData[pageIdx].datasets[dsIdx],
196
  ...updates
197
  };
198
- await commitDocToHF(docIndex, pagesData,
199
- `Update annotation in doc_${docIndex} page ${pageNumber}`);
200
  return pagesData[pageIdx].datasets[dsIdx];
201
  } else {
202
- const pagesData = readDocLocal(docIndex);
203
  if (!pagesData) return null;
204
 
205
  const pageIdx = findPageIndex(pagesData, pageNumber);
@@ -212,46 +191,50 @@ export async function updateAnnotation(timestamp, docIndex, pageNumber, updates)
212
  ...pagesData[pageIdx].datasets[dsIdx],
213
  ...updates
214
  };
215
- writeDocLocal(docIndex, pagesData);
216
  return pagesData[pageIdx].datasets[dsIdx];
217
  }
218
  }
219
 
220
  /**
221
- * Retrieves all human annotations (those with annotator field) from local files.
222
- * Scans all doc files and returns entries that have a timestamp (human-added).
223
  */
224
- export async function getAnnotations(docIndex = null) {
225
- const extractionsDir = path.join(getRootPath(), 'annotation_data', 'wbg_extractions');
226
- if (!fs.existsSync(extractionsDir)) return [];
227
-
228
  const results = [];
229
- const dirs = fs.readdirSync(extractionsDir).filter(d => d.startsWith('doc_'));
230
-
231
- for (const dir of dirs) {
232
- const idx = parseInt(dir.replace('doc_', ''), 10);
233
- if (docIndex !== null && idx !== docIndex) continue;
234
-
235
- const filePath = path.join(extractionsDir, dir, 'raw', `${dir}_direct_judged.jsonl`);
236
- if (!fs.existsSync(filePath)) continue;
237
-
238
- try {
239
- const pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
240
- for (const page of pagesData) {
241
- const pageNum = page.document?.pages?.[0];
242
- for (const ds of (page.datasets || [])) {
243
- // Only return human annotations (those with annotator field)
244
- if (ds.annotator) {
245
- results.push({
246
- ...ds,
247
- document_index: idx,
248
- page_number: pageNum,
249
- });
 
 
 
 
 
 
250
  }
251
  }
 
 
252
  }
253
- } catch (e) {
254
- console.error(`Error reading ${filePath}:`, e);
255
  }
256
  }
257
 
 
1
  import fs from 'fs';
2
  import path from 'path';
3
  import { commit } from '@huggingface/hub';
4
+ import { HF_DATASET_ID, HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from './config.js';
 
 
5
 
6
/**
 * Detects whether we are running inside a HuggingFace Space (production),
 * in which case persistence goes through the HF dataset repo instead of
 * local files.
 *
 * Fix: the original `process.env.HF_TOKEN && ...` returned `undefined`
 * (the falsy left operand) when HF_TOKEN was unset; coerce explicitly so
 * the function always returns a strict boolean.
 *
 * @returns {boolean} True when HF_TOKEN is set and NODE_ENV !== 'development'.
 */
const isHFSpace = () => {
  return Boolean(process.env.HF_TOKEN) && process.env.NODE_ENV !== 'development';
};
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
/**
 * Loads the full document JSON (all pages) for a corpus from the local
 * filesystem.
 *
 * @param {Object} corpus - Corpus descriptor resolved via getCorpus().
 * @param {number} docIndex - Document index within the corpus.
 * @returns {Array|null} Parsed pages array, or null when the file is absent.
 */
function readDocLocal(corpus, docIndex) {
  const docPath = getDocLocalPath(corpus, docIndex);
  if (!fs.existsSync(docPath)) {
    return null;
  }
  const raw = fs.readFileSync(docPath, 'utf-8');
  return JSON.parse(raw);
}
18
 
19
/**
 * Persists the full document JSON (all pages) to the corpus's local file.
 * Pretty-prints with 2-space indentation so on-disk diffs stay readable.
 *
 * @param {Object} corpus - Corpus descriptor resolved via getCorpus().
 * @param {number} docIndex - Document index within the corpus.
 * @param {Array} pagesData - Full pages array to serialize and write.
 */
function writeDocLocal(corpus, docIndex, pagesData) {
  const docPath = getDocLocalPath(corpus, docIndex);
  const serialized = JSON.stringify(pagesData, null, 2);
  fs.writeFileSync(docPath, serialized);
  console.log(`Saved doc_${docIndex} locally (${corpus.id})`);
}
27
 
28
  /**
29
  * Finds the page index in the pages array by page_number
 
30
  */
31
  function findPageIndex(pagesData, pageNumber) {
32
  return pagesData.findIndex(p => p.document?.pages?.[0] === pageNumber);
 
35
/**
 * Fetches the full document JSON for a corpus from the HuggingFace dataset
 * repo, authenticating with the Space's HF_TOKEN.
 *
 * @param {Object} corpus - Corpus descriptor resolved via getCorpus().
 * @param {number} docIndex - Document index within the corpus.
 * @returns {Promise<Array>} Parsed pages array from the repo file.
 * @throws {Error} When the HTTP response is not OK.
 */
async function fetchDocFromHF(corpus, docIndex) {
  const repoPath = getDocRepoPath(corpus, docIndex);
  const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
  const response = await fetch(docUrl, {
    headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
  });
  if (!response.ok) {
    throw new Error(`Failed to fetch doc_${docIndex} (${corpus.id}) from HF: ${response.status}`);
  }
  return response.json();
}
48
 
49
  /**
50
  * Commits the updated document JSON back to HuggingFace
51
  */
52
+ async function commitDocToHF(corpus, docIndex, pagesData, commitMessage) {
53
  const token = process.env.HF_TOKEN;
54
  if (!token) throw new Error("Missing HF_TOKEN");
55
 
56
+ const repoPath = getDocRepoPath(corpus, docIndex);
57
  const content = JSON.stringify(pagesData, null, 2);
58
 
59
  await commit({
 
72
  // ─── Public API ────────────────────────────────────
73
 
74
  /**
75
+ * Saves an annotation by appending it to the page's datasets array.
76
+ * @param {Object} annotation - Must include corpus (optional, defaults to first), document_index, page_number
 
 
77
  */
78
  export async function saveAnnotation(annotation) {
79
+ const corpus = getCorpus(annotation.corpus);
80
  const { document_index: docIndex, page_number: pageNumber } = annotation;
81
 
 
82
  const datasetEntry = {
83
  dataset_name: annotation.dataset_name,
84
  dataset_tag: annotation.dataset_tag,
 
99
  };
100
 
101
  if (isHFSpace()) {
102
+ const pagesData = await fetchDocFromHF(corpus, docIndex);
 
103
  const pageIdx = findPageIndex(pagesData, pageNumber);
104
+ if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex} (${corpus.id})`);
105
 
106
  pagesData[pageIdx].datasets.push(datasetEntry);
107
+ await commitDocToHF(corpus, docIndex, pagesData,
108
+ `Add annotation to ${corpus.id}/doc_${docIndex} page ${pageNumber}`);
109
  } else {
110
+ const pagesData = readDocLocal(corpus, docIndex);
111
+ if (!pagesData) throw new Error(`doc_${docIndex} not found locally (${corpus.id})`);
 
112
 
113
  const pageIdx = findPageIndex(pagesData, pageNumber);
114
+ if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex} (${corpus.id})`);
115
 
116
  pagesData[pageIdx].datasets.push(datasetEntry);
117
+ writeDocLocal(corpus, docIndex, pagesData);
118
  }
119
  }
120
 
121
  /**
122
+ * Deletes an annotation by timestamp
123
  */
124
+ export async function deleteAnnotation(timestamp, docIndex, pageNumber, corpusId) {
125
+ const corpus = getCorpus(corpusId);
126
+
127
  if (isHFSpace()) {
128
+ const pagesData = await fetchDocFromHF(corpus, docIndex);
129
  const pageIdx = findPageIndex(pagesData, pageNumber);
130
  if (pageIdx === -1) return false;
131
 
 
135
  );
136
  if (pagesData[pageIdx].datasets.length === before) return false;
137
 
138
+ await commitDocToHF(corpus, docIndex, pagesData,
139
+ `Delete annotation from ${corpus.id}/doc_${docIndex} page ${pageNumber}`);
140
  return true;
141
  } else {
142
+ const pagesData = readDocLocal(corpus, docIndex);
143
  if (!pagesData) return false;
144
 
145
  const pageIdx = findPageIndex(pagesData, pageNumber);
 
151
  );
152
  if (pagesData[pageIdx].datasets.length === before) return false;
153
 
154
+ writeDocLocal(corpus, docIndex, pagesData);
155
  return true;
156
  }
157
  }
158
 
159
  /**
160
+ * Updates an annotation by timestamp
161
  */
162
+ export async function updateAnnotation(timestamp, docIndex, pageNumber, updates, corpusId) {
163
+ const corpus = getCorpus(corpusId);
164
+
165
  if (isHFSpace()) {
166
+ const pagesData = await fetchDocFromHF(corpus, docIndex);
167
  const pageIdx = findPageIndex(pagesData, pageNumber);
168
  if (pageIdx === -1) return null;
169
 
 
174
  ...pagesData[pageIdx].datasets[dsIdx],
175
  ...updates
176
  };
177
+ await commitDocToHF(corpus, docIndex, pagesData,
178
+ `Update annotation in ${corpus.id}/doc_${docIndex} page ${pageNumber}`);
179
  return pagesData[pageIdx].datasets[dsIdx];
180
  } else {
181
+ const pagesData = readDocLocal(corpus, docIndex);
182
  if (!pagesData) return null;
183
 
184
  const pageIdx = findPageIndex(pagesData, pageNumber);
 
191
  ...pagesData[pageIdx].datasets[dsIdx],
192
  ...updates
193
  };
194
+ writeDocLocal(corpus, docIndex, pagesData);
195
  return pagesData[pageIdx].datasets[dsIdx];
196
  }
197
  }
198
 
199
  /**
200
+ * Retrieves all human annotations from local files.
 
201
  */
202
+ export async function getAnnotations(docIndex = null, corpusId = null) {
203
+ const { getCorpora } = await import('./config.js');
204
+ const corporaList = corpusId ? [getCorpus(corpusId)] : getCorpora();
 
205
  const results = [];
206
+
207
+ for (const corpus of corporaList) {
208
+ const extractionsDir = path.join(process.cwd(), 'annotation_data', corpus.extractions_dir);
209
+ if (!fs.existsSync(extractionsDir)) continue;
210
+
211
+ const dirs = fs.readdirSync(extractionsDir).filter(d => d.startsWith('doc_'));
212
+
213
+ for (const dir of dirs) {
214
+ const idx = parseInt(dir.replace('doc_', ''), 10);
215
+ if (docIndex !== null && idx !== docIndex) continue;
216
+
217
+ const filePath = path.join(extractionsDir, dir, 'raw', `${dir}_direct_judged.jsonl`);
218
+ if (!fs.existsSync(filePath)) continue;
219
+
220
+ try {
221
+ const pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
222
+ for (const page of pagesData) {
223
+ const pageNum = page.document?.pages?.[0];
224
+ for (const ds of (page.datasets || [])) {
225
+ if (ds.annotator) {
226
+ results.push({
227
+ ...ds,
228
+ corpus: corpus.id,
229
+ document_index: idx,
230
+ page_number: pageNum,
231
+ });
232
+ }
233
  }
234
  }
235
+ } catch (e) {
236
+ console.error(`Error reading ${filePath}:`, e);
237
  }
 
 
238
  }
239
  }
240