rafmacalaba committed on
Commit
a2c885c
·
1 Parent(s): aeca117

feat: multi-corpus support

Browse files

- corpora.json registry: add new corpora by adding entries
- All APIs/utils resolve paths via config.js helpers
- Per-corpus doc assignments: docs: { wbg: [...], unhcr: [...] }
- Document selector shows [World Bank] Doc 3 labels
- Leaderboard/progress scan all corpora
- generate_assignments.py handles per-corpus distribution

app/api/document/route.js CHANGED
@@ -1,6 +1,5 @@
1
- import { HF_DATASET_BASE_URL } from '../../../utils/config.js';
2
  import fs from 'fs';
3
- import path from 'path';
4
 
5
  const isHFSpace = () => {
6
  return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
@@ -10,8 +9,8 @@ export async function GET(request) {
10
  const { searchParams } = new URL(request.url);
11
  const index = searchParams.get('index');
12
  const page = searchParams.get('page');
 
13
 
14
- // Validate required params
15
  if (index === null || page === null) {
16
  return new Response(
17
  JSON.stringify({ error: "Missing index or page parameter" }),
@@ -19,7 +18,6 @@ export async function GET(request) {
19
  );
20
  }
21
 
22
- // Validate numeric values
23
  const indexNum = parseInt(index, 10);
24
  const pageNum = parseInt(page, 10);
25
 
@@ -30,46 +28,42 @@ export async function GET(request) {
30
  );
31
  }
32
 
 
 
33
  try {
34
  let pagesData;
35
 
36
  if (isHFSpace()) {
37
- // Production: fetch from HuggingFace
38
- const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${indexNum}/raw/doc_${indexNum}_direct_judged.jsonl`;
39
  const res = await fetch(docUrl, {
40
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
41
  });
42
 
43
  if (!res.ok) {
44
  return new Response(
45
- JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found on HF Datasets` }),
46
  { status: res.status, headers: { 'Content-Type': 'application/json' } }
47
  );
48
  }
49
  pagesData = await res.json();
50
  } else {
51
- // Local dev: read from local file (reflects saved annotations immediately)
52
- const filePath = path.join(
53
- process.cwd(),
54
- 'annotation_data', 'wbg_extractions',
55
- `doc_${indexNum}`, 'raw', `doc_${indexNum}_direct_judged.jsonl`
56
- );
57
 
58
  if (!fs.existsSync(filePath)) {
59
  return new Response(
60
- JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found locally` }),
61
  { status: 404, headers: { 'Content-Type': 'application/json' } }
62
  );
63
  }
64
- const raw = fs.readFileSync(filePath, 'utf-8');
65
- pagesData = JSON.parse(raw);
66
  }
67
 
68
  const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
69
 
70
  if (!pageData) {
71
  return new Response(
72
- JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum}` }),
73
  { status: 404, headers: { 'Content-Type': 'application/json' } }
74
  );
75
  }
 
1
+ import { HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from '../../../utils/config.js';
2
  import fs from 'fs';
 
3
 
4
  const isHFSpace = () => {
5
  return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
 
9
  const { searchParams } = new URL(request.url);
10
  const index = searchParams.get('index');
11
  const page = searchParams.get('page');
12
+ const corpusId = searchParams.get('corpus');
13
 
 
14
  if (index === null || page === null) {
15
  return new Response(
16
  JSON.stringify({ error: "Missing index or page parameter" }),
 
18
  );
19
  }
20
 
 
21
  const indexNum = parseInt(index, 10);
22
  const pageNum = parseInt(page, 10);
23
 
 
28
  );
29
  }
30
 
31
+ const corpus = getCorpus(corpusId);
32
+
33
  try {
34
  let pagesData;
35
 
36
  if (isHFSpace()) {
37
+ const docRepoPath = getDocRepoPath(corpus, indexNum);
38
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
39
  const res = await fetch(docUrl, {
40
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
41
  });
42
 
43
  if (!res.ok) {
44
  return new Response(
45
+ JSON.stringify({ error: `doc_${indexNum} not found on HF (${corpus.id})` }),
46
  { status: res.status, headers: { 'Content-Type': 'application/json' } }
47
  );
48
  }
49
  pagesData = await res.json();
50
  } else {
51
+ const filePath = getDocLocalPath(corpus, indexNum);
 
 
 
 
 
52
 
53
  if (!fs.existsSync(filePath)) {
54
  return new Response(
55
+ JSON.stringify({ error: `doc_${indexNum} not found locally (${corpus.id})` }),
56
  { status: 404, headers: { 'Content-Type': 'application/json' } }
57
  );
58
  }
59
+ pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
 
60
  }
61
 
62
  const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
63
 
64
  if (!pageData) {
65
  return new Response(
66
+ JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum} (${corpus.id})` }),
67
  { status: 404, headers: { 'Content-Type': 'application/json' } }
68
  );
69
  }
app/api/documents/route.js CHANGED
@@ -1,9 +1,10 @@
1
- import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
2
  import yaml from 'js-yaml';
3
 
4
  /**
5
  * Fetch annotator_config.yaml and return the doc list for a given user.
6
  * Returns null if no config or user not found (show all docs).
 
7
  */
8
  async function getUserAssignedDocs(username) {
9
  if (!username) return null;
@@ -12,7 +13,7 @@ async function getUserAssignedDocs(username) {
12
  const configUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;
13
  const res = await fetch(configUrl, {
14
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
15
- next: { revalidate: 300 } // cache 5 min
16
  });
17
  if (!res.ok) return null;
18
 
@@ -20,9 +21,22 @@ async function getUserAssignedDocs(username) {
20
  const config = yaml.load(text);
21
 
22
  const annotator = (config.annotators || []).find(a => a.username === username);
23
- if (!annotator || !annotator.docs || annotator.docs.length === 0) return null;
24
 
25
- return new Set(annotator.docs);
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  } catch (e) {
27
  console.warn('Could not load annotator_config.yaml:', e.message);
28
  return null;
@@ -31,76 +45,94 @@ async function getUserAssignedDocs(username) {
31
 
32
  export async function GET(request) {
33
  try {
34
- // Get username from query param
35
  const { searchParams } = new URL(request.url);
36
  const username = searchParams.get('user');
37
 
38
- // Fetch user's assigned docs (if configured)
39
  const assignedDocs = await getUserAssignedDocs(username);
40
 
41
- // Fetch the index file from HF Datasets
42
- const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
43
- const linksRes = await fetch(linksUrl, {
44
- headers: {
45
- 'Authorization': `Bearer ${process.env.HF_TOKEN}`
46
- },
47
- next: { revalidate: 3600 }
48
- });
49
-
50
- if (!linksRes.ok) {
51
- console.error("Failed to fetch links JSON", await linksRes.text());
52
- return new Response(
53
- JSON.stringify({ error: "Missing wbg_pdf_links.json on HF Datasets" }),
54
- { status: 404, headers: { 'Content-Type': 'application/json' } }
55
- );
56
- }
57
-
58
- const links = await linksRes.json();
59
-
60
- // Filter to docs with revalidation data, then by user assignment if available
61
- let successLinks = links
62
- .filter(l => l.status === 'success' && l.has_revalidation === true);
63
-
64
- if (assignedDocs) {
65
- successLinks = successLinks.filter(l => assignedDocs.has(l.index));
66
- }
67
-
68
- successLinks = successLinks.slice(0, MAX_DOCS_TO_SCAN);
69
 
70
- // Parallel fetch
71
- const results = await Promise.allSettled(
72
- successLinks.map(async (link) => {
73
- const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
74
- const docRes = await fetch(docUrl, {
75
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
76
- });
 
 
 
 
 
77
 
78
- if (!docRes.ok) return null;
79
 
80
- const pagesData = await docRes.json();
81
- const annotatablePages = pagesData
82
- .filter(page => page.datasets && page.datasets.length > 0)
83
- .map(page => page.document.pages[0]);
84
 
85
- if (annotatablePages.length === 0) return null;
 
 
86
 
87
- const pdfUrl = link.direct_pdf_url;
88
- if (!pdfUrl) return null;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
- return {
91
- index: link.index,
92
- pdf_url: pdfUrl,
93
- landing_page: link.landing_page_url,
94
- annotatable_pages: annotatablePages
95
- };
96
- })
97
- );
98
 
99
- const documents = results
100
- .filter(r => r.status === 'fulfilled' && r.value !== null)
101
- .map(r => r.value);
102
 
103
- return new Response(JSON.stringify(documents), {
104
  status: 200,
105
  headers: {
106
  'Content-Type': 'application/json',
@@ -110,7 +142,7 @@ export async function GET(request) {
110
  } catch (error) {
111
  console.error(error);
112
  return new Response(
113
- JSON.stringify({ error: "Failed to fetch documents from HF" }),
114
  { status: 500, headers: { 'Content-Type': 'application/json' } }
115
  );
116
  }
 
1
+ import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpus, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
2
  import yaml from 'js-yaml';
3
 
4
  /**
5
  * Fetch annotator_config.yaml and return the doc list for a given user.
6
  * Returns null if no config or user not found (show all docs).
7
+ * Now returns per-corpus assignments: { wbg: Set([1,2]), unhcr: Set([3,4]) }
8
  */
9
  async function getUserAssignedDocs(username) {
10
  if (!username) return null;
 
13
  const configUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;
14
  const res = await fetch(configUrl, {
15
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
16
+ cache: 'no-store'
17
  });
18
  if (!res.ok) return null;
19
 
 
21
  const config = yaml.load(text);
22
 
23
  const annotator = (config.annotators || []).find(a => a.username === username);
24
+ if (!annotator || !annotator.docs) return null;
25
 
26
+ // Support both old format (flat array) and new format (per-corpus object)
27
+ if (Array.isArray(annotator.docs)) {
28
+ // Legacy: flat array β€” treat as default corpus
29
+ return { _flat: new Set(annotator.docs) };
30
+ }
31
+
32
+ // New format: { wbg: [1,2], unhcr: [3,4] }
33
+ const result = {};
34
+ for (const [corpusId, docList] of Object.entries(annotator.docs)) {
35
+ if (Array.isArray(docList)) {
36
+ result[corpusId] = new Set(docList);
37
+ }
38
+ }
39
+ return Object.keys(result).length > 0 ? result : null;
40
  } catch (e) {
41
  console.warn('Could not load annotator_config.yaml:', e.message);
42
  return null;
 
45
 
46
  export async function GET(request) {
47
  try {
 
48
  const { searchParams } = new URL(request.url);
49
  const username = searchParams.get('user');
50
 
 
51
  const assignedDocs = await getUserAssignedDocs(username);
52
 
53
+ // Import corpora list
54
+ const { getCorpora } = await import('../../../utils/config.js');
55
+ const corpora = getCorpora();
56
+
57
+ const allDocuments = [];
58
+
59
+ for (const corpus of corpora) {
60
+ // Determine which doc indices this user has for this corpus
61
+ let userDocSet = null;
62
+ if (assignedDocs) {
63
+ if (assignedDocs._flat) {
64
+ // Legacy flat format β€” only applies to first/default corpus
65
+ userDocSet = corpus === corpora[0] ? assignedDocs._flat : new Set();
66
+ } else {
67
+ userDocSet = assignedDocs[corpus.id] || new Set();
68
+ }
69
+ if (userDocSet.size === 0) continue; // no docs for this corpus
70
+ }
 
 
 
 
 
 
 
 
 
 
71
 
72
+ // Fetch the links file for this corpus
73
+ const linksPath = getLinksRepoPath(corpus);
74
+ const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
75
+ const linksRes = await fetch(linksUrl, {
76
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
77
+ cache: 'no-store'
78
+ });
79
+
80
+ if (!linksRes.ok) {
81
+ console.warn(`No links file for corpus ${corpus.id}: ${linksRes.status}`);
82
+ continue;
83
+ }
84
 
85
+ const links = await linksRes.json();
86
 
87
+ let successLinks = links
88
+ .filter(l => l.status === 'success' && l.has_revalidation === true);
 
 
89
 
90
+ if (userDocSet) {
91
+ successLinks = successLinks.filter(l => userDocSet.has(l.index));
92
+ }
93
 
94
+ successLinks = successLinks.slice(0, MAX_DOCS_TO_SCAN);
95
+
96
+ // Parallel fetch docs
97
+ const results = await Promise.allSettled(
98
+ successLinks.map(async (link) => {
99
+ const docRepoPath = getDocRepoPath(corpus, link.index);
100
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
101
+ const docRes = await fetch(docUrl, {
102
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
103
+ });
104
+
105
+ if (!docRes.ok) return null;
106
+
107
+ const pagesData = await docRes.json();
108
+ const annotatablePages = pagesData
109
+ .filter(page => page.datasets && page.datasets.length > 0)
110
+ .map(page => page.document.pages[0]);
111
+
112
+ if (annotatablePages.length === 0) return null;
113
+
114
+ const pdfUrl = link.direct_pdf_url;
115
+ if (!pdfUrl) return null;
116
+
117
+ return {
118
+ corpus: corpus.id,
119
+ corpus_name: corpus.name,
120
+ index: link.index,
121
+ pdf_url: pdfUrl,
122
+ landing_page: link.landing_page_url,
123
+ annotatable_pages: annotatablePages
124
+ };
125
+ })
126
+ );
127
 
128
+ const docs = results
129
+ .filter(r => r.status === 'fulfilled' && r.value !== null)
130
+ .map(r => r.value);
 
 
 
 
 
131
 
132
+ allDocuments.push(...docs);
133
+ }
 
134
 
135
+ return new Response(JSON.stringify(allDocuments), {
136
  status: 200,
137
  headers: {
138
  'Content-Type': 'application/json',
 
142
  } catch (error) {
143
  console.error(error);
144
  return new Response(
145
+ JSON.stringify({ error: "Failed to fetch documents" }),
146
  { status: 500, headers: { 'Content-Type': 'application/json' } }
147
  );
148
  }
app/api/leaderboard/route.js CHANGED
@@ -1,71 +1,69 @@
1
- import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
2
 
3
- export const dynamic = 'force-dynamic'; // disable Next.js route caching
4
 
5
  /**
6
  * GET /api/leaderboard
7
- * Returns annotator rankings based on validation counts.
8
  */
9
  export async function GET() {
10
  try {
11
- const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
12
- const linksRes = await fetch(linksUrl, {
13
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
14
- cache: 'no-store'
15
- });
16
 
17
- if (!linksRes.ok) {
18
- return new Response(JSON.stringify({ error: 'Failed to fetch links' }), { status: 500 });
19
- }
 
 
 
 
20
 
21
- const links = await linksRes.json();
22
- const activeLinks = links
23
- .filter(l => l.status === 'success' && l.has_revalidation === true)
24
- .slice(0, MAX_DOCS_TO_SCAN);
25
 
26
- // Tally per-annotator stats
27
- const stats = {}; // annotator -> { verified, correct, incorrect, docs, humanAdded }
 
 
28
 
29
- const results = await Promise.allSettled(
30
- activeLinks.map(async (link) => {
31
- const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
32
- const docRes = await fetch(docUrl, {
33
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
34
- cache: 'no-store'
35
- });
36
- if (!docRes.ok) return;
 
37
 
38
- const pagesData = await docRes.json();
39
- const docAnnotators = new Set();
40
 
41
- for (const page of pagesData) {
42
- for (const ds of (page.datasets || [])) {
43
- // Count human-added annotations
44
- if (ds.source === 'human' && ds.annotator) {
45
- if (!stats[ds.annotator]) {
46
- stats[ds.annotator] = { verified: 0, correct: 0, incorrect: 0, docs: new Set(), humanAdded: 0 };
 
 
47
  }
48
- stats[ds.annotator].humanAdded++;
49
- stats[ds.annotator].docs.add(link.index);
50
- }
51
 
52
- // Count validations
53
- for (const v of (ds.validations || [])) {
54
- if (!v.annotator || !v.human_validated) continue;
55
- if (!stats[v.annotator]) {
56
- stats[v.annotator] = { verified: 0, correct: 0, incorrect: 0, docs: new Set(), humanAdded: 0 };
 
 
 
 
57
  }
58
- stats[v.annotator].verified++;
59
- if (v.human_verdict === true) stats[v.annotator].correct++;
60
- else stats[v.annotator].incorrect++;
61
- stats[v.annotator].docs.add(link.index);
62
  }
63
  }
64
- }
65
- })
66
- );
67
 
68
- // Build ranked list
69
  const leaderboard = Object.entries(stats)
70
  .map(([annotator, s]) => ({
71
  annotator,
@@ -74,7 +72,7 @@ export async function GET() {
74
  incorrect: s.incorrect,
75
  humanAdded: s.humanAdded,
76
  docsWorked: s.docs.size,
77
- score: s.verified + s.humanAdded, // total contributions
78
  }))
79
  .sort((a, b) => b.score - a.score);
80
 
 
1
+ import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpora, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
2
 
3
+ export const dynamic = 'force-dynamic';
4
 
5
  /**
6
  * GET /api/leaderboard
7
+ * Scans ALL corpora and returns annotator rankings.
8
  */
9
  export async function GET() {
10
  try {
11
+ const corpora = getCorpora();
12
+ const stats = {}; // annotator -> { verified, correct, incorrect, docs, humanAdded }
 
 
 
13
 
14
+ for (const corpus of corpora) {
15
+ const linksPath = getLinksRepoPath(corpus);
16
+ const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
17
+ const linksRes = await fetch(linksUrl, {
18
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
19
+ cache: 'no-store'
20
+ });
21
 
22
+ if (!linksRes.ok) continue;
 
 
 
23
 
24
+ const links = await linksRes.json();
25
+ const activeLinks = links
26
+ .filter(l => l.status === 'success' && l.has_revalidation === true)
27
+ .slice(0, MAX_DOCS_TO_SCAN);
28
 
29
+ await Promise.allSettled(
30
+ activeLinks.map(async (link) => {
31
+ const docRepoPath = getDocRepoPath(corpus, link.index);
32
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
33
+ const docRes = await fetch(docUrl, {
34
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
35
+ cache: 'no-store'
36
+ });
37
+ if (!docRes.ok) return;
38
 
39
+ const pagesData = await docRes.json();
 
40
 
41
+ for (const page of pagesData) {
42
+ for (const ds of (page.datasets || [])) {
43
+ if (ds.source === 'human' && ds.annotator) {
44
+ if (!stats[ds.annotator]) {
45
+ stats[ds.annotator] = { verified: 0, correct: 0, incorrect: 0, docs: new Set(), humanAdded: 0 };
46
+ }
47
+ stats[ds.annotator].humanAdded++;
48
+ stats[ds.annotator].docs.add(`${corpus.id}:${link.index}`);
49
  }
 
 
 
50
 
51
+ for (const v of (ds.validations || [])) {
52
+ if (!v.annotator || !v.human_validated) continue;
53
+ if (!stats[v.annotator]) {
54
+ stats[v.annotator] = { verified: 0, correct: 0, incorrect: 0, docs: new Set(), humanAdded: 0 };
55
+ }
56
+ stats[v.annotator].verified++;
57
+ if (v.human_verdict === true) stats[v.annotator].correct++;
58
+ else stats[v.annotator].incorrect++;
59
+ stats[v.annotator].docs.add(`${corpus.id}:${link.index}`);
60
  }
 
 
 
 
61
  }
62
  }
63
+ })
64
+ );
65
+ }
66
 
 
67
  const leaderboard = Object.entries(stats)
68
  .map(([annotator, s]) => ({
69
  annotator,
 
72
  incorrect: s.incorrect,
73
  humanAdded: s.humanAdded,
74
  docsWorked: s.docs.size,
75
+ score: s.verified + s.humanAdded,
76
  }))
77
  .sort((a, b) => b.score - a.score);
78
 
app/api/progress/route.js CHANGED
@@ -1,101 +1,104 @@
1
- import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
2
 
3
  /**
4
  * GET /api/progress
5
- * Returns progress stats: total docs, pages, mentions, and how many are verified.
6
  */
7
  export async function GET() {
8
  try {
9
- const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
10
- const linksRes = await fetch(linksUrl, {
11
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
12
- next: { revalidate: 300 } // cache 5 min
13
- });
14
-
15
- if (!linksRes.ok) {
16
- return new Response(JSON.stringify({ error: 'Failed to fetch links' }), { status: 500 });
17
- }
18
-
19
- const links = await linksRes.json();
20
- const activeLinks = links
21
- .filter(l => l.status === 'success' && l.has_revalidation === true)
22
- .slice(0, MAX_DOCS_TO_SCAN);
23
-
24
- // Fetch all docs in parallel
25
- const results = await Promise.allSettled(
26
- activeLinks.map(async (link) => {
27
- const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
28
- const docRes = await fetch(docUrl, {
29
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
30
- });
31
- if (!docRes.ok) return null;
32
-
33
- const pagesData = await docRes.json();
34
-
35
- let totalMentions = 0;
36
- let verifiedMentions = 0;
37
- let totalPages = 0;
38
- let completedPages = 0;
39
- let humanAnnotations = 0;
40
-
41
- for (const page of pagesData) {
42
- const datasets = (page.datasets || []).filter(ds => {
43
- // Exclude consensus non-datasets
44
- if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) {
45
- return false;
46
- }
47
- return true;
48
  });
49
-
50
- if (datasets.length === 0) continue;
51
-
52
- totalPages++;
53
- totalMentions += datasets.length;
54
-
55
- let pageVerified = 0;
56
- for (const ds of datasets) {
57
- if (ds.human_validated === true) {
58
- verifiedMentions++;
59
- pageVerified++;
60
- }
61
- if (ds.source === 'human') {
62
- humanAnnotations++;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  }
64
- }
65
 
66
- // A page is "completed" if all its mentions are verified
67
- if (pageVerified === datasets.length) {
68
- completedPages++;
69
  }
70
- }
71
-
72
- const docComplete = totalPages > 0 && completedPages === totalPages;
73
-
74
- return {
75
- index: link.index,
76
- totalPages,
77
- completedPages,
78
- totalMentions,
79
- verifiedMentions,
80
- humanAnnotations,
81
- complete: docComplete,
82
- };
83
- })
84
- );
85
 
86
- const docs = results
87
- .filter(r => r.status === 'fulfilled' && r.value !== null)
88
- .map(r => r.value);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
 
90
  const summary = {
91
- totalDocs: docs.length,
92
- completedDocs: docs.filter(d => d.complete).length,
93
- totalPages: docs.reduce((s, d) => s + d.totalPages, 0),
94
- completedPages: docs.reduce((s, d) => s + d.completedPages, 0),
95
- totalMentions: docs.reduce((s, d) => s + d.totalMentions, 0),
96
- verifiedMentions: docs.reduce((s, d) => s + d.verifiedMentions, 0),
97
- humanAnnotations: docs.reduce((s, d) => s + d.humanAnnotations, 0),
98
- docs,
99
  };
100
 
101
  return new Response(JSON.stringify(summary), {
 
1
+ import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpora, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
2
 
3
  /**
4
  * GET /api/progress
5
+ * Returns progress stats across ALL corpora.
6
  */
7
  export async function GET() {
8
  try {
9
+ const corpora = getCorpora();
10
+ const allDocs = [];
11
+
12
+ for (const corpus of corpora) {
13
+ const linksPath = getLinksRepoPath(corpus);
14
+ const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
15
+ const linksRes = await fetch(linksUrl, {
16
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
17
+ next: { revalidate: 300 }
18
+ });
19
+
20
+ if (!linksRes.ok) continue;
21
+
22
+ const links = await linksRes.json();
23
+ const activeLinks = links
24
+ .filter(l => l.status === 'success' && l.has_revalidation === true)
25
+ .slice(0, MAX_DOCS_TO_SCAN);
26
+
27
+ const results = await Promise.allSettled(
28
+ activeLinks.map(async (link) => {
29
+ const docRepoPath = getDocRepoPath(corpus, link.index);
30
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
31
+ const docRes = await fetch(docUrl, {
32
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  });
34
+ if (!docRes.ok) return null;
35
+
36
+ const pagesData = await docRes.json();
37
+
38
+ let totalMentions = 0;
39
+ let verifiedMentions = 0;
40
+ let totalPages = 0;
41
+ let completedPages = 0;
42
+ let humanAnnotations = 0;
43
+
44
+ for (const page of pagesData) {
45
+ const datasets = (page.datasets || []).filter(ds => {
46
+ if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) {
47
+ return false;
48
+ }
49
+ return true;
50
+ });
51
+
52
+ if (datasets.length === 0) continue;
53
+
54
+ totalPages++;
55
+ totalMentions += datasets.length;
56
+
57
+ let pageVerified = 0;
58
+ for (const ds of datasets) {
59
+ if (ds.human_validated === true) {
60
+ verifiedMentions++;
61
+ pageVerified++;
62
+ }
63
+ if (ds.source === 'human') {
64
+ humanAnnotations++;
65
+ }
66
  }
 
67
 
68
+ if (pageVerified === datasets.length) {
69
+ completedPages++;
70
+ }
71
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
+ return {
74
+ corpus: corpus.id,
75
+ index: link.index,
76
+ totalPages,
77
+ completedPages,
78
+ totalMentions,
79
+ verifiedMentions,
80
+ humanAnnotations,
81
+ complete: totalPages > 0 && completedPages === totalPages,
82
+ };
83
+ })
84
+ );
85
+
86
+ const docs = results
87
+ .filter(r => r.status === 'fulfilled' && r.value !== null)
88
+ .map(r => r.value);
89
+
90
+ allDocs.push(...docs);
91
+ }
92
 
93
  const summary = {
94
+ totalDocs: allDocs.length,
95
+ completedDocs: allDocs.filter(d => d.complete).length,
96
+ totalPages: allDocs.reduce((s, d) => s + d.totalPages, 0),
97
+ completedPages: allDocs.reduce((s, d) => s + d.completedPages, 0),
98
+ totalMentions: allDocs.reduce((s, d) => s + d.totalMentions, 0),
99
+ verifiedMentions: allDocs.reduce((s, d) => s + d.verifiedMentions, 0),
100
+ humanAnnotations: allDocs.reduce((s, d) => s + d.humanAnnotations, 0),
101
+ docs: allDocs,
102
  };
103
 
104
  return new Response(JSON.stringify(summary), {
app/api/validate/route.js CHANGED
@@ -1,31 +1,18 @@
1
  import { NextResponse } from 'next/server';
2
  import fs from 'fs';
3
- import path from 'path';
4
  import { commit } from '@huggingface/hub';
5
- import { HF_DATASET_ID, HF_DATASET_BASE_URL } from '../../../utils/config.js';
6
 
7
  const isHFSpace = () => process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
8
 
9
- function getDocFilePath(docIndex) {
10
- return path.join(
11
- process.cwd(),
12
- 'annotation_data', 'wbg_extractions',
13
- `doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
14
- );
15
- }
16
-
17
- function getDocRepoPath(docIndex) {
18
- return `annotation_data/wbg_extractions/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
19
- }
20
-
21
  /**
22
  * PUT /api/validate
23
- * Updates a specific dataset entry within a page by its array index.
24
- * Body: { document_index, page_number, dataset_index, updates }
25
  */
26
  export async function PUT(request) {
27
  try {
28
- const { document_index, page_number, dataset_index, updates } = await request.json();
 
29
 
30
  if (document_index == null || page_number == null || dataset_index == null || !updates) {
31
  return NextResponse.json(
@@ -37,23 +24,23 @@ export async function PUT(request) {
37
  let pagesData;
38
 
39
  if (isHFSpace()) {
40
- const url = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(document_index)}`;
 
41
  const res = await fetch(url, {
42
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
43
  });
44
  if (!res.ok) {
45
- return NextResponse.json({ error: 'Document not found on HF' }, { status: 404 });
46
  }
47
  pagesData = await res.json();
48
  } else {
49
- const filePath = getDocFilePath(document_index);
50
  if (!fs.existsSync(filePath)) {
51
- return NextResponse.json({ error: 'Document not found locally' }, { status: 404 });
52
  }
53
  pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
54
  }
55
 
56
- // Find the page
57
  const pageIdx = pagesData.findIndex(p => p.document?.pages?.[0] === page_number);
58
  if (pageIdx === -1) {
59
  return NextResponse.json({ error: `Page ${page_number} not found` }, { status: 404 });
@@ -64,12 +51,9 @@ export async function PUT(request) {
64
  return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
65
  }
66
 
67
- // Per-annotator validation: store in a `validations` array.
68
- // Each annotator gets their own entry; re-validating updates in-place.
69
  const currentEntry = pagesData[pageIdx].datasets[dataset_index];
70
  const annotator = updates.annotator || 'unknown';
71
-
72
- // Separate validation fields from other updates (like dataset_tag edits)
73
  const validationFields = ['human_validated', 'human_verdict', 'human_notes', 'annotator', 'validated_at'];
74
  const isValidation = validationFields.some(f => f in updates);
75
 
@@ -90,27 +74,19 @@ export async function PUT(request) {
90
  validations.push(validationEntry);
91
  }
92
 
93
- pagesData[pageIdx].datasets[dataset_index] = {
94
- ...currentEntry,
95
- validations,
96
- };
97
  } else {
98
- // Non-validation updates (e.g. dataset_tag edit) go at top level
99
- pagesData[pageIdx].datasets[dataset_index] = {
100
- ...currentEntry,
101
- ...updates,
102
- };
103
  }
104
 
105
  // Save back
106
  if (isHFSpace()) {
107
- const token = process.env.HF_TOKEN;
108
- const repoPath = getDocRepoPath(document_index);
109
  const content = JSON.stringify(pagesData, null, 2);
110
  await commit({
111
  repo: { type: 'dataset', name: HF_DATASET_ID },
112
- credentials: { accessToken: token },
113
- title: `Validate dataset in doc_${document_index} page ${page_number}`,
114
  operations: [{
115
  operation: 'addOrUpdate',
116
  path: repoPath,
@@ -118,7 +94,7 @@ export async function PUT(request) {
118
  }],
119
  });
120
  } else {
121
- const filePath = getDocFilePath(document_index);
122
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
123
  }
124
 
@@ -133,16 +109,18 @@ export async function PUT(request) {
133
  }
134
 
135
  /**
136
- * DELETE /api/validate?doc=X&page=Y&idx=Z
137
- * Removes a dataset entry by its array index.
138
  */
139
  export async function DELETE(request) {
140
  try {
141
  const { searchParams } = new URL(request.url);
 
142
  const document_index = parseInt(searchParams.get('doc'), 10);
143
  const page_number = parseInt(searchParams.get('page'), 10);
144
  const dataset_index = parseInt(searchParams.get('idx'), 10);
145
 
 
 
146
  if (isNaN(document_index) || isNaN(page_number) || isNaN(dataset_index)) {
147
  return NextResponse.json(
148
  { error: 'Missing doc, page, or idx parameter' },
@@ -153,18 +131,19 @@ export async function DELETE(request) {
153
  let pagesData;
154
 
155
  if (isHFSpace()) {
156
- const url = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(document_index)}`;
 
157
  const res = await fetch(url, {
158
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
159
  });
160
  if (!res.ok) {
161
- return NextResponse.json({ error: 'Document not found on HF' }, { status: 404 });
162
  }
163
  pagesData = await res.json();
164
  } else {
165
- const filePath = getDocFilePath(document_index);
166
  if (!fs.existsSync(filePath)) {
167
- return NextResponse.json({ error: 'Document not found locally' }, { status: 404 });
168
  }
169
  pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
170
  }
@@ -179,18 +158,15 @@ export async function DELETE(request) {
179
  return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
180
  }
181
 
182
- // Remove the entry
183
  pagesData[pageIdx].datasets.splice(dataset_index, 1);
184
 
185
- // Save back
186
  if (isHFSpace()) {
187
- const token = process.env.HF_TOKEN;
188
- const repoPath = getDocRepoPath(document_index);
189
  const content = JSON.stringify(pagesData, null, 2);
190
  await commit({
191
  repo: { type: 'dataset', name: HF_DATASET_ID },
192
- credentials: { accessToken: token },
193
- title: `Delete dataset from doc_${document_index} page ${page_number}`,
194
  operations: [{
195
  operation: 'addOrUpdate',
196
  path: repoPath,
@@ -198,7 +174,7 @@ export async function DELETE(request) {
198
  }],
199
  });
200
  } else {
201
- const filePath = getDocFilePath(document_index);
202
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
203
  }
204
 
 
1
  import { NextResponse } from 'next/server';
2
  import fs from 'fs';
 
3
  import { commit } from '@huggingface/hub';
4
+ import { HF_DATASET_ID, HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from '../../../utils/config.js';
5
 
6
  const isHFSpace = () => process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
7
 
 
 
 
 
 
 
 
 
 
 
 
 
8
  /**
9
  * PUT /api/validate
10
+ * Body: { corpus, document_index, page_number, dataset_index, updates }
 
11
  */
12
  export async function PUT(request) {
13
  try {
14
+ const { corpus: corpusId, document_index, page_number, dataset_index, updates } = await request.json();
15
+ const corpus = getCorpus(corpusId);
16
 
17
  if (document_index == null || page_number == null || dataset_index == null || !updates) {
18
  return NextResponse.json(
 
24
  let pagesData;
25
 
26
  if (isHFSpace()) {
27
+ const repoPath = getDocRepoPath(corpus, document_index);
28
+ const url = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
29
  const res = await fetch(url, {
30
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
31
  });
32
  if (!res.ok) {
33
+ return NextResponse.json({ error: `Document not found on HF (${corpus.id})` }, { status: 404 });
34
  }
35
  pagesData = await res.json();
36
  } else {
37
+ const filePath = getDocLocalPath(corpus, document_index);
38
  if (!fs.existsSync(filePath)) {
39
+ return NextResponse.json({ error: `Document not found locally (${corpus.id})` }, { status: 404 });
40
  }
41
  pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
42
  }
43
 
 
44
  const pageIdx = pagesData.findIndex(p => p.document?.pages?.[0] === page_number);
45
  if (pageIdx === -1) {
46
  return NextResponse.json({ error: `Page ${page_number} not found` }, { status: 404 });
 
51
  return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
52
  }
53
 
54
+ // Per-annotator validation
 
55
  const currentEntry = pagesData[pageIdx].datasets[dataset_index];
56
  const annotator = updates.annotator || 'unknown';
 
 
57
  const validationFields = ['human_validated', 'human_verdict', 'human_notes', 'annotator', 'validated_at'];
58
  const isValidation = validationFields.some(f => f in updates);
59
 
 
74
  validations.push(validationEntry);
75
  }
76
 
77
+ pagesData[pageIdx].datasets[dataset_index] = { ...currentEntry, validations };
 
 
 
78
  } else {
79
+ pagesData[pageIdx].datasets[dataset_index] = { ...currentEntry, ...updates };
 
 
 
 
80
  }
81
 
82
  // Save back
83
  if (isHFSpace()) {
84
+ const repoPath = getDocRepoPath(corpus, document_index);
 
85
  const content = JSON.stringify(pagesData, null, 2);
86
  await commit({
87
  repo: { type: 'dataset', name: HF_DATASET_ID },
88
+ credentials: { accessToken: process.env.HF_TOKEN },
89
+ title: `Validate ${corpus.id}/doc_${document_index} page ${page_number}`,
90
  operations: [{
91
  operation: 'addOrUpdate',
92
  path: repoPath,
 
94
  }],
95
  });
96
  } else {
97
+ const filePath = getDocLocalPath(corpus, document_index);
98
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
99
  }
100
 
 
109
  }
110
 
111
  /**
112
+ * DELETE /api/validate?corpus=X&doc=X&page=Y&idx=Z
 
113
  */
114
  export async function DELETE(request) {
115
  try {
116
  const { searchParams } = new URL(request.url);
117
+ const corpusId = searchParams.get('corpus');
118
  const document_index = parseInt(searchParams.get('doc'), 10);
119
  const page_number = parseInt(searchParams.get('page'), 10);
120
  const dataset_index = parseInt(searchParams.get('idx'), 10);
121
 
122
+ const corpus = getCorpus(corpusId);
123
+
124
  if (isNaN(document_index) || isNaN(page_number) || isNaN(dataset_index)) {
125
  return NextResponse.json(
126
  { error: 'Missing doc, page, or idx parameter' },
 
131
  let pagesData;
132
 
133
  if (isHFSpace()) {
134
+ const repoPath = getDocRepoPath(corpus, document_index);
135
+ const url = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
136
  const res = await fetch(url, {
137
  headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
138
  });
139
  if (!res.ok) {
140
+ return NextResponse.json({ error: `Document not found on HF (${corpus.id})` }, { status: 404 });
141
  }
142
  pagesData = await res.json();
143
  } else {
144
+ const filePath = getDocLocalPath(corpus, document_index);
145
  if (!fs.existsSync(filePath)) {
146
+ return NextResponse.json({ error: `Document not found locally (${corpus.id})` }, { status: 404 });
147
  }
148
  pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
149
  }
 
158
  return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
159
  }
160
 
 
161
  pagesData[pageIdx].datasets.splice(dataset_index, 1);
162
 
 
163
  if (isHFSpace()) {
164
+ const repoPath = getDocRepoPath(corpus, document_index);
 
165
  const content = JSON.stringify(pagesData, null, 2);
166
  await commit({
167
  repo: { type: 'dataset', name: HF_DATASET_ID },
168
+ credentials: { accessToken: process.env.HF_TOKEN },
169
+ title: `Delete from ${corpus.id}/doc_${document_index} page ${page_number}`,
170
  operations: [{
171
  operation: 'addOrUpdate',
172
  path: repoPath,
 
174
  }],
175
  });
176
  } else {
177
+ const filePath = getDocLocalPath(corpus, document_index);
178
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
179
  }
180
 
app/components/DocumentSelector.js CHANGED
@@ -3,23 +3,28 @@
3
  export default function DocumentSelector({
4
  documents,
5
  selectedDocIndex,
 
6
  onDocChange,
7
  }) {
 
 
 
 
8
  return (
9
  <div className="navigation-controls">
10
  <div className="select-group">
11
  <label htmlFor="doc-select">Document</label>
12
  <select
13
  id="doc-select"
14
- value={selectedDocIndex ?? ''}
15
  onChange={(e) => {
16
- const docIdx = parseInt(e.target.value, 10);
17
- onDocChange(docIdx);
18
  }}
19
  >
20
  {documents.map(doc => (
21
- <option key={doc.index} value={doc.index}>
22
- Document {doc.index} ({doc.annotatable_pages.length} pages)
23
  </option>
24
  ))}
25
  </select>
 
3
  export default function DocumentSelector({
4
  documents,
5
  selectedDocIndex,
6
+ selectedCorpus,
7
  onDocChange,
8
  }) {
9
+ const currentValue = selectedCorpus && selectedDocIndex != null
10
+ ? `${selectedCorpus}:${selectedDocIndex}`
11
+ : '';
12
+
13
  return (
14
  <div className="navigation-controls">
15
  <div className="select-group">
16
  <label htmlFor="doc-select">Document</label>
17
  <select
18
  id="doc-select"
19
+ value={currentValue}
20
  onChange={(e) => {
21
+ const [corpus, idx] = e.target.value.split(':');
22
+ onDocChange(corpus, parseInt(idx, 10));
23
  }}
24
  >
25
  {documents.map(doc => (
26
+ <option key={`${doc.corpus}:${doc.index}`} value={`${doc.corpus}:${doc.index}`}>
27
+ [{doc.corpus_name}] Doc {doc.index} ({doc.annotatable_pages.length} pages)
28
  </option>
29
  ))}
30
  </select>
app/page.js CHANGED
@@ -13,6 +13,7 @@ import Leaderboard from './components/Leaderboard';
13
  export default function Home() {
14
  const [documents, setDocuments] = useState([]);
15
  const [selectedDocIndex, setSelectedDocIndex] = useState(null);
 
16
  const [currentDoc, setCurrentDoc] = useState(null);
17
 
18
  // Page-by-page navigation: track the index into annotatable_pages array
@@ -60,17 +61,19 @@ export default function Home() {
60
  .then(data => {
61
  setDocuments(data);
62
  if (data.length > 0) {
63
- // Restore saved position from sessionStorage
64
  const savedDoc = sessionStorage.getItem('selectedDocIndex');
 
65
  const savedPage = sessionStorage.getItem('pageIdx');
66
  const docIdx = savedDoc ? parseInt(savedDoc, 10) : null;
67
- const restoredDoc = docIdx !== null && data.find(d => d.index === docIdx);
68
 
69
  if (restoredDoc) {
70
  setSelectedDocIndex(docIdx);
 
71
  setPageIdx(savedPage ? parseInt(savedPage, 10) : 0);
72
  } else {
73
  setSelectedDocIndex(data[0].index);
 
74
  setPageIdx(0);
75
  }
76
  }
@@ -101,10 +104,11 @@ export default function Home() {
101
 
102
  // Update currentDoc when selection changes + persist to sessionStorage
103
  useEffect(() => {
104
- if (selectedDocIndex !== null) {
105
- const doc = documents.find(d => d.index === selectedDocIndex);
106
  setCurrentDoc(doc);
107
  sessionStorage.setItem('selectedDocIndex', selectedDocIndex);
 
108
 
109
  // Clamp pageIdx to valid range for this document
110
  if (doc) {
@@ -112,7 +116,7 @@ export default function Home() {
112
  setPageIdx(prev => Math.min(prev, Math.max(0, maxPage)));
113
  }
114
  }
115
- }, [selectedDocIndex, documents]);
116
 
117
  // Persist pageIdx to sessionStorage
118
  useEffect(() => {
@@ -123,7 +127,7 @@ export default function Home() {
123
  const refreshPageData = useCallback(() => {
124
  if (selectedDocIndex !== null && currentPageNumber !== null) {
125
  setLoadingPage(true);
126
- fetch(`/api/document?index=${selectedDocIndex}&page=${currentPageNumber}`)
127
  .then(res => res.json())
128
  .then(data => {
129
  setCurrentPageData(data);
@@ -175,7 +179,8 @@ export default function Home() {
175
  localStorage.setItem('annotator_name', name);
176
  };
177
 
178
- const handleDocChange = (docIdx) => {
 
179
  setSelectedDocIndex(docIdx);
180
  setPageIdx(0);
181
  };
@@ -270,6 +275,7 @@ export default function Home() {
270
  dataset_tag: dataset_tag,
271
  source: 'human',
272
  annotator: annotatorName || "user",
 
273
  document_index: selectedDocIndex,
274
  page_number: currentPageNumber,
275
  timestamp: new Date().toISOString(),
@@ -313,7 +319,7 @@ export default function Home() {
313
  const rawIdx = ds._rawIndex ?? idx;
314
  try {
315
  const res = await fetch(
316
- `/api/validate?doc=${selectedDocIndex}&page=${currentPageNumber}&idx=${rawIdx}`,
317
  { method: 'DELETE' }
318
  );
319
  if (res.ok) {
@@ -373,6 +379,7 @@ export default function Home() {
373
  method: 'PUT',
374
  headers: { 'Content-Type': 'application/json' },
375
  body: JSON.stringify({
 
376
  document_index: selectedDocIndex,
377
  page_number: currentPageNumber,
378
  dataset_index: datasetIdx,
@@ -474,6 +481,7 @@ export default function Home() {
474
  <DocumentSelector
475
  documents={documents}
476
  selectedDocIndex={selectedDocIndex}
 
477
  onDocChange={handleDocChange}
478
  />
479
  </div>
 
13
  export default function Home() {
14
  const [documents, setDocuments] = useState([]);
15
  const [selectedDocIndex, setSelectedDocIndex] = useState(null);
16
+ const [selectedCorpus, setSelectedCorpus] = useState(null);
17
  const [currentDoc, setCurrentDoc] = useState(null);
18
 
19
  // Page-by-page navigation: track the index into annotatable_pages array
 
61
  .then(data => {
62
  setDocuments(data);
63
  if (data.length > 0) {
 
64
  const savedDoc = sessionStorage.getItem('selectedDocIndex');
65
+ const savedCorpus = sessionStorage.getItem('selectedCorpus');
66
  const savedPage = sessionStorage.getItem('pageIdx');
67
  const docIdx = savedDoc ? parseInt(savedDoc, 10) : null;
68
+ const restoredDoc = docIdx !== null && data.find(d => d.index === docIdx && (!savedCorpus || d.corpus === savedCorpus));
69
 
70
  if (restoredDoc) {
71
  setSelectedDocIndex(docIdx);
72
+ setSelectedCorpus(restoredDoc.corpus);
73
  setPageIdx(savedPage ? parseInt(savedPage, 10) : 0);
74
  } else {
75
  setSelectedDocIndex(data[0].index);
76
+ setSelectedCorpus(data[0].corpus);
77
  setPageIdx(0);
78
  }
79
  }
 
104
 
105
  // Update currentDoc when selection changes + persist to sessionStorage
106
  useEffect(() => {
107
+ if (selectedDocIndex !== null && selectedCorpus !== null) {
108
+ const doc = documents.find(d => d.index === selectedDocIndex && d.corpus === selectedCorpus);
109
  setCurrentDoc(doc);
110
  sessionStorage.setItem('selectedDocIndex', selectedDocIndex);
111
+ sessionStorage.setItem('selectedCorpus', selectedCorpus);
112
 
113
  // Clamp pageIdx to valid range for this document
114
  if (doc) {
 
116
  setPageIdx(prev => Math.min(prev, Math.max(0, maxPage)));
117
  }
118
  }
119
+ }, [selectedDocIndex, selectedCorpus, documents]);
120
 
121
  // Persist pageIdx to sessionStorage
122
  useEffect(() => {
 
127
  const refreshPageData = useCallback(() => {
128
  if (selectedDocIndex !== null && currentPageNumber !== null) {
129
  setLoadingPage(true);
130
+ fetch(`/api/document?index=${selectedDocIndex}&page=${currentPageNumber}&corpus=${selectedCorpus || ''}`)
131
  .then(res => res.json())
132
  .then(data => {
133
  setCurrentPageData(data);
 
179
  localStorage.setItem('annotator_name', name);
180
  };
181
 
182
+ const handleDocChange = (corpus, docIdx) => {
183
+ setSelectedCorpus(corpus);
184
  setSelectedDocIndex(docIdx);
185
  setPageIdx(0);
186
  };
 
275
  dataset_tag: dataset_tag,
276
  source: 'human',
277
  annotator: annotatorName || "user",
278
+ corpus: selectedCorpus,
279
  document_index: selectedDocIndex,
280
  page_number: currentPageNumber,
281
  timestamp: new Date().toISOString(),
 
319
  const rawIdx = ds._rawIndex ?? idx;
320
  try {
321
  const res = await fetch(
322
+ `/api/validate?doc=${selectedDocIndex}&page=${currentPageNumber}&idx=${rawIdx}&corpus=${selectedCorpus || ''}`,
323
  { method: 'DELETE' }
324
  );
325
  if (res.ok) {
 
379
  method: 'PUT',
380
  headers: { 'Content-Type': 'application/json' },
381
  body: JSON.stringify({
382
+ corpus: selectedCorpus,
383
  document_index: selectedDocIndex,
384
  page_number: currentPageNumber,
385
  dataset_index: datasetIdx,
 
481
  <DocumentSelector
482
  documents={documents}
483
  selectedDocIndex={selectedDocIndex}
484
+ selectedCorpus={selectedCorpus}
485
  onDocChange={handleDocChange}
486
  />
487
  </div>
generate_assignments.py CHANGED
@@ -2,8 +2,9 @@
2
  """
3
  generate_assignments.py
4
 
5
- Reads annotator_config.yaml, distributes available docs across annotators
6
- with configurable overlap, and writes back the updated config.
 
7
 
8
  Usage:
9
  python3 generate_assignments.py # Generate and save
@@ -26,7 +27,7 @@ except ImportError:
26
  sys.exit(1)
27
 
28
  CONFIG_PATH = Path(__file__).parent / "annotation_data" / "annotator_config.yaml"
29
- LINKS_PATH = Path(__file__).parent / "annotation_data" / "wbg_data" / "wbg_pdf_links.json"
30
 
31
 
32
  def load_config():
@@ -37,17 +38,25 @@ def save_config(config):
37
  CONFIG_PATH.write_text(yaml.dump(config, default_flow_style=False, sort_keys=False))
38
 
39
 
40
- def get_available_docs():
41
- """Get list of active English doc indices."""
42
- links = json.loads(LINKS_PATH.read_text())
 
 
 
 
 
 
 
 
43
  return sorted([
44
  l["index"] for l in links
45
  if l.get("has_revalidation") and l.get("status") == "success"
46
  ])
47
 
48
 
49
- def generate_assignments(config, seed=42):
50
- """Distribute docs across annotators with overlap."""
51
  settings = config.get("settings", {})
52
  overlap_pct = settings.get("overlap_percent", 10)
53
  annotators = config.get("annotators", [])
@@ -56,44 +65,49 @@ def generate_assignments(config, seed=42):
56
  print("❌ No annotators defined in config.")
57
  return config
58
 
59
- all_docs = get_available_docs()
60
- n_docs = len(all_docs)
61
  n_annotators = len(annotators)
 
62
 
63
- # Calculate overlap
64
- n_overlap = max(1, round(n_docs * overlap_pct / 100))
 
 
65
 
66
- # Shuffle docs deterministically
67
- rng = random.Random(seed)
68
- shuffled = all_docs.copy()
69
- rng.shuffle(shuffled)
70
-
71
- # Pick overlap docs (shared by ALL annotators)
72
- overlap_docs = sorted(shuffled[:n_overlap])
73
- remaining = shuffled[n_overlap:]
74
-
75
- # Split remaining docs evenly across annotators
76
- per_annotator = len(remaining) // n_annotators
77
- extra = len(remaining) % n_annotators
78
-
79
- print(f"\nπŸ“Š Assignment Summary:")
80
- print(f" Total docs: {n_docs}")
81
- print(f" Annotators: {n_annotators}")
82
- print(f" Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
83
- print(f" Per annotator: ~{per_annotator + n_overlap} docs each")
84
- print(f" Overlap docs: {overlap_docs}")
85
- print()
86
-
87
- start = 0
88
- for i, ann in enumerate(annotators):
89
- # Distribute remaining: first `extra` annotators get 1 more
90
- count = per_annotator + (1 if i < extra else 0)
91
- exclusive = sorted(remaining[start:start + count])
92
- start += count
93
-
94
- ann["docs"] = sorted(overlap_docs + exclusive)
95
- print(f" {ann['username']}: {len(ann['docs'])} docs "
96
- f"({n_overlap} overlap + {len(exclusive)} exclusive)")
 
 
 
 
97
 
98
  return config
99
 
@@ -130,14 +144,17 @@ def upload_config():
130
 
131
 
132
  def main():
133
- parser = argparse.ArgumentParser(description="Generate document assignments")
134
  parser.add_argument("--dry-run", action="store_true", help="Preview only")
135
  parser.add_argument("--upload", action="store_true", help="Upload config to HF")
136
  parser.add_argument("--seed", type=int, default=42, help="Random seed")
137
  args = parser.parse_args()
138
 
 
139
  config = load_config()
140
- config = generate_assignments(config, seed=args.seed)
 
 
141
 
142
  if args.dry_run:
143
  print("\n[DRY RUN] Would save:")
 
2
  """
3
  generate_assignments.py
4
 
5
+ Reads corpora.json and annotator_config.yaml, distributes available docs
6
+ across annotators with configurable overlap per corpus, and writes back
7
+ the updated config.
8
 
9
  Usage:
10
  python3 generate_assignments.py # Generate and save
 
27
  sys.exit(1)
28
 
29
  CONFIG_PATH = Path(__file__).parent / "annotation_data" / "annotator_config.yaml"
30
+ CORPORA_PATH = Path(__file__).parent / "annotation_data" / "corpora.json"
31
 
32
 
33
  def load_config():
 
38
  CONFIG_PATH.write_text(yaml.dump(config, default_flow_style=False, sort_keys=False))
39
 
40
 
41
+ def load_corpora():
42
+ return json.loads(CORPORA_PATH.read_text())
43
+
44
+
45
+ def get_available_docs(corpus):
46
+ """Get list of active doc indices for a given corpus."""
47
+ links_path = Path(__file__).parent / "annotation_data" / corpus["links_file"]
48
+ if not links_path.exists():
49
+ print(f" ⚠️ No links file for {corpus['id']}: {links_path}")
50
+ return []
51
+ links = json.loads(links_path.read_text())
52
  return sorted([
53
  l["index"] for l in links
54
  if l.get("has_revalidation") and l.get("status") == "success"
55
  ])
56
 
57
 
58
+ def generate_assignments(config, corpora, seed=42):
59
+ """Distribute docs across annotators with overlap, per corpus."""
60
  settings = config.get("settings", {})
61
  overlap_pct = settings.get("overlap_percent", 10)
62
  annotators = config.get("annotators", [])
 
65
  print("❌ No annotators defined in config.")
66
  return config
67
 
 
 
68
  n_annotators = len(annotators)
69
+ rng = random.Random(seed)
70
 
71
+ # Initialize per-corpus doc dicts
72
+ for ann in annotators:
73
+ if not isinstance(ann.get("docs"), dict):
74
+ ann["docs"] = {}
75
 
76
+ for corpus in corpora:
77
+ cid = corpus["id"]
78
+ all_docs = get_available_docs(corpus)
79
+ n_docs = len(all_docs)
80
+
81
+ if n_docs == 0:
82
+ print(f"\nπŸ“‚ {corpus['name']} ({cid}): no docs available")
83
+ continue
84
+
85
+ n_overlap = max(1, round(n_docs * overlap_pct / 100))
86
+
87
+ shuffled = all_docs.copy()
88
+ rng.shuffle(shuffled)
89
+
90
+ overlap_docs = sorted(shuffled[:n_overlap])
91
+ remaining = shuffled[n_overlap:]
92
+
93
+ per_annotator = len(remaining) // n_annotators
94
+ extra = len(remaining) % n_annotators
95
+
96
+ print(f"\nπŸ“‚ {corpus['name']} ({cid}):")
97
+ print(f" Total docs: {n_docs}")
98
+ print(f" Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
99
+ print(f" Per annotator: ~{per_annotator + n_overlap} docs each")
100
+ print(f" Overlap docs: {overlap_docs}")
101
+
102
+ start = 0
103
+ for i, ann in enumerate(annotators):
104
+ count = per_annotator + (1 if i < extra else 0)
105
+ exclusive = sorted(remaining[start:start + count])
106
+ start += count
107
+
108
+ ann["docs"][cid] = sorted(overlap_docs + exclusive)
109
+ print(f" {ann['username']}: {len(ann['docs'][cid])} docs "
110
+ f"({n_overlap} overlap + {len(exclusive)} exclusive)")
111
 
112
  return config
113
 
 
144
 
145
 
146
  def main():
147
+ parser = argparse.ArgumentParser(description="Generate document assignments per corpus")
148
  parser.add_argument("--dry-run", action="store_true", help="Preview only")
149
  parser.add_argument("--upload", action="store_true", help="Upload config to HF")
150
  parser.add_argument("--seed", type=int, default=42, help="Random seed")
151
  args = parser.parse_args()
152
 
153
+ corpora = load_corpora()
154
  config = load_config()
155
+
156
+ print(f"πŸ“‹ Loaded {len(corpora)} corpora, {len(config.get('annotators', []))} annotators")
157
+ config = generate_assignments(config, corpora, seed=args.seed)
158
 
159
  if args.dry_run:
160
  print("\n[DRY RUN] Would save:")
utils/config.js CHANGED
@@ -1,4 +1,57 @@
 
 
 
1
  // Centralized configuration for the annotation app
2
  export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';
3
  export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;
4
  export const MAX_DOCS_TO_SCAN = parseInt(process.env.MAX_DOCS_TO_SCAN || '50', 10);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import fs from 'fs';
2
+ import path from 'path';
3
+
4
  // Centralized configuration for the annotation app
5
  export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';
6
  export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;
7
  export const MAX_DOCS_TO_SCAN = parseInt(process.env.MAX_DOCS_TO_SCAN || '50', 10);
8
+
9
+ // ─── Corpus helpers ────────────────────────────────
10
+
11
+ let _corporaCache = null;
12
+
13
+ /**
14
+ * Returns the list of available corpora from corpora.json.
15
+ * Cached after first load.
16
+ */
17
+ export function getCorpora() {
18
+ if (_corporaCache) return _corporaCache;
19
+ const filePath = path.join(process.cwd(), 'annotation_data', 'corpora.json');
20
+ _corporaCache = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
21
+ return _corporaCache;
22
+ }
23
+
24
+ /**
25
+ * Find a corpus by its ID (e.g. "wbg", "unhcr").
26
+ * Returns the default (first) corpus if corpusId is null/undefined.
27
+ */
28
+ export function getCorpus(corpusId) {
29
+ const corpora = getCorpora();
30
+ if (!corpusId) return corpora[0];
31
+ return corpora.find(c => c.id === corpusId) || corpora[0];
32
+ }
33
+
34
+ /**
35
+ * HF repo path for a corpus's PDF links file.
36
+ */
37
+ export function getLinksRepoPath(corpus) {
38
+ return `annotation_data/${corpus.links_file}`;
39
+ }
40
+
41
+ /**
42
+ * HF repo path for a specific doc's raw JSON.
43
+ */
44
+ export function getDocRepoPath(corpus, docIndex) {
45
+ return `annotation_data/${corpus.extractions_dir}/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
46
+ }
47
+
48
+ /**
49
+ * Local file path for a specific doc's raw JSON.
50
+ */
51
+ export function getDocLocalPath(corpus, docIndex) {
52
+ return path.join(
53
+ process.cwd(),
54
+ 'annotation_data', corpus.extractions_dir,
55
+ `doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
56
+ );
57
+ }
utils/storage.js CHANGED
@@ -1,54 +1,32 @@
1
  import fs from 'fs';
2
  import path from 'path';
3
  import { commit } from '@huggingface/hub';
4
- import { HF_DATASET_ID, HF_DATASET_BASE_URL } from './config.js';
5
-
6
- const getRootPath = () => process.cwd();
7
 
8
  const isHFSpace = () => {
9
  return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
10
  };
11
 
12
- /**
13
- * Returns the local file path for a document's raw JSON
14
- */
15
- function getDocFilePath(docIndex) {
16
- return path.join(
17
- getRootPath(),
18
- 'annotation_data', 'wbg_extractions',
19
- `doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
20
- );
21
- }
22
-
23
- /**
24
- * Returns the HF repo path for a document's raw JSON
25
- */
26
- function getDocRepoPath(docIndex) {
27
- return `annotation_data/wbg_extractions/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
28
- }
29
-
30
  /**
31
  * Reads the full document JSON (all pages) from local file
32
  */
33
- function readDocLocal(docIndex) {
34
- const filePath = getDocFilePath(docIndex);
35
  if (!fs.existsSync(filePath)) return null;
36
- const raw = fs.readFileSync(filePath, 'utf-8');
37
- return JSON.parse(raw);
38
  }
39
 
40
  /**
41
  * Writes the full document JSON (all pages) to local file
42
  */
43
- function writeDocLocal(docIndex, pagesData) {
44
- const filePath = getDocFilePath(docIndex);
45
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
46
- console.log(`Saved doc_${docIndex}_direct_judged.jsonl locally`);
47
  }
48
 
49
  /**
50
  * Finds the page index in the pages array by page_number
51
- * Uses document.pages[0] to match, consistent with the document/route.js API
52
  */
53
  function findPageIndex(pagesData, pageNumber) {
54
  return pagesData.findIndex(p => p.document?.pages?.[0] === pageNumber);
@@ -57,24 +35,25 @@ function findPageIndex(pagesData, pageNumber) {
57
  /**
58
  * Fetches the document JSON from HuggingFace
59
  */
60
- async function fetchDocFromHF(docIndex) {
61
  const token = process.env.HF_TOKEN;
62
- const url = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(docIndex)}`;
 
63
  const res = await fetch(url, {
64
  headers: { 'Authorization': `Bearer ${token}` }
65
  });
66
- if (!res.ok) throw new Error(`Failed to fetch doc_${docIndex} from HF: ${res.status}`);
67
  return res.json();
68
  }
69
 
70
  /**
71
  * Commits the updated document JSON back to HuggingFace
72
  */
73
- async function commitDocToHF(docIndex, pagesData, commitMessage) {
74
  const token = process.env.HF_TOKEN;
75
  if (!token) throw new Error("Missing HF_TOKEN");
76
 
77
- const repoPath = getDocRepoPath(docIndex);
78
  const content = JSON.stringify(pagesData, null, 2);
79
 
80
  await commit({
@@ -93,15 +72,13 @@ async function commitDocToHF(docIndex, pagesData, commitMessage) {
93
  // ─── Public API ────────────────────────────────────
94
 
95
  /**
96
- * Saves an annotation by appending it to the page's datasets array
97
- * in the per-document raw JSON file.
98
- *
99
- * @param {Object} annotation - Must include document_index, page_number, and dataset fields
100
  */
101
  export async function saveAnnotation(annotation) {
 
102
  const { document_index: docIndex, page_number: pageNumber } = annotation;
103
 
104
- // Build the dataset entry (strip routing fields β€” they stay at page/doc level)
105
  const datasetEntry = {
106
  dataset_name: annotation.dataset_name,
107
  dataset_tag: annotation.dataset_tag,
@@ -122,33 +99,33 @@ export async function saveAnnotation(annotation) {
122
  };
123
 
124
  if (isHFSpace()) {
125
- // Production: fetch from HF, modify, commit back
126
- const pagesData = await fetchDocFromHF(docIndex);
127
  const pageIdx = findPageIndex(pagesData, pageNumber);
128
- if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);
129
 
130
  pagesData[pageIdx].datasets.push(datasetEntry);
131
- await commitDocToHF(docIndex, pagesData,
132
- `Add human annotation to doc_${docIndex} page ${pageNumber}`);
133
  } else {
134
- // Local: read, modify, write
135
- const pagesData = readDocLocal(docIndex);
136
- if (!pagesData) throw new Error(`doc_${docIndex}_direct_judged.jsonl not found locally`);
137
 
138
  const pageIdx = findPageIndex(pagesData, pageNumber);
139
- if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);
140
 
141
  pagesData[pageIdx].datasets.push(datasetEntry);
142
- writeDocLocal(docIndex, pagesData);
143
  }
144
  }
145
 
146
  /**
147
- * Deletes an annotation from the page's datasets array by timestamp
148
  */
149
- export async function deleteAnnotation(timestamp, docIndex, pageNumber) {
 
 
150
  if (isHFSpace()) {
151
- const pagesData = await fetchDocFromHF(docIndex);
152
  const pageIdx = findPageIndex(pagesData, pageNumber);
153
  if (pageIdx === -1) return false;
154
 
@@ -158,11 +135,11 @@ export async function deleteAnnotation(timestamp, docIndex, pageNumber) {
158
  );
159
  if (pagesData[pageIdx].datasets.length === before) return false;
160
 
161
- await commitDocToHF(docIndex, pagesData,
162
- `Delete annotation from doc_${docIndex} page ${pageNumber}`);
163
  return true;
164
  } else {
165
- const pagesData = readDocLocal(docIndex);
166
  if (!pagesData) return false;
167
 
168
  const pageIdx = findPageIndex(pagesData, pageNumber);
@@ -174,17 +151,19 @@ export async function deleteAnnotation(timestamp, docIndex, pageNumber) {
174
  );
175
  if (pagesData[pageIdx].datasets.length === before) return false;
176
 
177
- writeDocLocal(docIndex, pagesData);
178
  return true;
179
  }
180
  }
181
 
182
  /**
183
- * Updates an annotation in the page's datasets array by timestamp
184
  */
185
- export async function updateAnnotation(timestamp, docIndex, pageNumber, updates) {
 
 
186
  if (isHFSpace()) {
187
- const pagesData = await fetchDocFromHF(docIndex);
188
  const pageIdx = findPageIndex(pagesData, pageNumber);
189
  if (pageIdx === -1) return null;
190
 
@@ -195,11 +174,11 @@ export async function updateAnnotation(timestamp, docIndex, pageNumber, updates)
195
  ...pagesData[pageIdx].datasets[dsIdx],
196
  ...updates
197
  };
198
- await commitDocToHF(docIndex, pagesData,
199
- `Update annotation in doc_${docIndex} page ${pageNumber}`);
200
  return pagesData[pageIdx].datasets[dsIdx];
201
  } else {
202
- const pagesData = readDocLocal(docIndex);
203
  if (!pagesData) return null;
204
 
205
  const pageIdx = findPageIndex(pagesData, pageNumber);
@@ -212,46 +191,50 @@ export async function updateAnnotation(timestamp, docIndex, pageNumber, updates)
212
  ...pagesData[pageIdx].datasets[dsIdx],
213
  ...updates
214
  };
215
- writeDocLocal(docIndex, pagesData);
216
  return pagesData[pageIdx].datasets[dsIdx];
217
  }
218
  }
219
 
220
  /**
221
- * Retrieves all human annotations (those with annotator field) from local files.
222
- * Scans all doc files and returns entries that have a timestamp (human-added).
223
  */
224
- export async function getAnnotations(docIndex = null) {
225
- const extractionsDir = path.join(getRootPath(), 'annotation_data', 'wbg_extractions');
226
- if (!fs.existsSync(extractionsDir)) return [];
227
-
228
  const results = [];
229
- const dirs = fs.readdirSync(extractionsDir).filter(d => d.startsWith('doc_'));
230
-
231
- for (const dir of dirs) {
232
- const idx = parseInt(dir.replace('doc_', ''), 10);
233
- if (docIndex !== null && idx !== docIndex) continue;
234
-
235
- const filePath = path.join(extractionsDir, dir, 'raw', `${dir}_direct_judged.jsonl`);
236
- if (!fs.existsSync(filePath)) continue;
237
-
238
- try {
239
- const pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
240
- for (const page of pagesData) {
241
- const pageNum = page.document?.pages?.[0];
242
- for (const ds of (page.datasets || [])) {
243
- // Only return human annotations (those with annotator field)
244
- if (ds.annotator) {
245
- results.push({
246
- ...ds,
247
- document_index: idx,
248
- page_number: pageNum,
249
- });
 
 
 
 
 
 
250
  }
251
  }
 
 
252
  }
253
- } catch (e) {
254
- console.error(`Error reading ${filePath}:`, e);
255
  }
256
  }
257
 
 
1
  import fs from 'fs';
2
  import path from 'path';
3
  import { commit } from '@huggingface/hub';
4
+ import { HF_DATASET_ID, HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from './config.js';
 
 
5
 
6
/**
 * Detects whether we are running inside a HuggingFace Space (production),
 * in which case persistence goes through the HF dataset repo instead of
 * local files.
 *
 * Fix: the original `process.env.HF_TOKEN && ...` returned `undefined`
 * (the falsy left operand) when HF_TOKEN was unset; coerce explicitly so
 * the function always returns a strict boolean.
 *
 * @returns {boolean} True when HF_TOKEN is set and NODE_ENV !== 'development'.
 */
const isHFSpace = () => {
  return Boolean(process.env.HF_TOKEN) && process.env.NODE_ENV !== 'development';
};
9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
/**
 * Loads the full document JSON (all pages) for a corpus from the local
 * filesystem.
 *
 * @param {Object} corpus - Corpus descriptor resolved via getCorpus().
 * @param {number} docIndex - Document index within the corpus.
 * @returns {Array|null} Parsed pages array, or null when the file is absent.
 */
function readDocLocal(corpus, docIndex) {
  const docPath = getDocLocalPath(corpus, docIndex);
  if (!fs.existsSync(docPath)) {
    return null;
  }
  const raw = fs.readFileSync(docPath, 'utf-8');
  return JSON.parse(raw);
}
18
 
19
/**
 * Persists the full document JSON (all pages) to the corpus's local file.
 * Pretty-prints with 2-space indentation so on-disk diffs stay readable.
 *
 * @param {Object} corpus - Corpus descriptor resolved via getCorpus().
 * @param {number} docIndex - Document index within the corpus.
 * @param {Array} pagesData - Full pages array to serialize and write.
 */
function writeDocLocal(corpus, docIndex, pagesData) {
  const docPath = getDocLocalPath(corpus, docIndex);
  const serialized = JSON.stringify(pagesData, null, 2);
  fs.writeFileSync(docPath, serialized);
  console.log(`Saved doc_${docIndex} locally (${corpus.id})`);
}
27
 
28
  /**
29
  * Finds the page index in the pages array by page_number
 
30
  */
31
  function findPageIndex(pagesData, pageNumber) {
32
  return pagesData.findIndex(p => p.document?.pages?.[0] === pageNumber);
 
35
/**
 * Fetches the full document JSON for a corpus from the HuggingFace dataset
 * repo, authenticating with the Space's HF_TOKEN.
 *
 * @param {Object} corpus - Corpus descriptor resolved via getCorpus().
 * @param {number} docIndex - Document index within the corpus.
 * @returns {Promise<Array>} Parsed pages array from the repo file.
 * @throws {Error} When the HTTP response is not OK.
 */
async function fetchDocFromHF(corpus, docIndex) {
  const repoPath = getDocRepoPath(corpus, docIndex);
  const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
  const response = await fetch(docUrl, {
    headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
  });
  if (!response.ok) {
    throw new Error(`Failed to fetch doc_${docIndex} (${corpus.id}) from HF: ${response.status}`);
  }
  return response.json();
}
48
 
49
  /**
50
  * Commits the updated document JSON back to HuggingFace
51
  */
52
+ async function commitDocToHF(corpus, docIndex, pagesData, commitMessage) {
53
  const token = process.env.HF_TOKEN;
54
  if (!token) throw new Error("Missing HF_TOKEN");
55
 
56
+ const repoPath = getDocRepoPath(corpus, docIndex);
57
  const content = JSON.stringify(pagesData, null, 2);
58
 
59
  await commit({
 
72
  // ─── Public API ────────────────────────────────────
73
 
74
  /**
75
+ * Saves an annotation by appending it to the page's datasets array.
76
+ * @param {Object} annotation - Must include corpus (optional, defaults to first), document_index, page_number
 
 
77
  */
78
  export async function saveAnnotation(annotation) {
79
+ const corpus = getCorpus(annotation.corpus);
80
  const { document_index: docIndex, page_number: pageNumber } = annotation;
81
 
 
82
  const datasetEntry = {
83
  dataset_name: annotation.dataset_name,
84
  dataset_tag: annotation.dataset_tag,
 
99
  };
100
 
101
  if (isHFSpace()) {
102
+ const pagesData = await fetchDocFromHF(corpus, docIndex);
 
103
  const pageIdx = findPageIndex(pagesData, pageNumber);
104
+ if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex} (${corpus.id})`);
105
 
106
  pagesData[pageIdx].datasets.push(datasetEntry);
107
+ await commitDocToHF(corpus, docIndex, pagesData,
108
+ `Add annotation to ${corpus.id}/doc_${docIndex} page ${pageNumber}`);
109
  } else {
110
+ const pagesData = readDocLocal(corpus, docIndex);
111
+ if (!pagesData) throw new Error(`doc_${docIndex} not found locally (${corpus.id})`);
 
112
 
113
  const pageIdx = findPageIndex(pagesData, pageNumber);
114
+ if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex} (${corpus.id})`);
115
 
116
  pagesData[pageIdx].datasets.push(datasetEntry);
117
+ writeDocLocal(corpus, docIndex, pagesData);
118
  }
119
  }
120
 
121
  /**
122
+ * Deletes an annotation by timestamp
123
  */
124
+ export async function deleteAnnotation(timestamp, docIndex, pageNumber, corpusId) {
125
+ const corpus = getCorpus(corpusId);
126
+
127
  if (isHFSpace()) {
128
+ const pagesData = await fetchDocFromHF(corpus, docIndex);
129
  const pageIdx = findPageIndex(pagesData, pageNumber);
130
  if (pageIdx === -1) return false;
131
 
 
135
  );
136
  if (pagesData[pageIdx].datasets.length === before) return false;
137
 
138
+ await commitDocToHF(corpus, docIndex, pagesData,
139
+ `Delete annotation from ${corpus.id}/doc_${docIndex} page ${pageNumber}`);
140
  return true;
141
  } else {
142
+ const pagesData = readDocLocal(corpus, docIndex);
143
  if (!pagesData) return false;
144
 
145
  const pageIdx = findPageIndex(pagesData, pageNumber);
 
151
  );
152
  if (pagesData[pageIdx].datasets.length === before) return false;
153
 
154
+ writeDocLocal(corpus, docIndex, pagesData);
155
  return true;
156
  }
157
  }
158
 
159
  /**
160
+ * Updates an annotation by timestamp
161
  */
162
+ export async function updateAnnotation(timestamp, docIndex, pageNumber, updates, corpusId) {
163
+ const corpus = getCorpus(corpusId);
164
+
165
  if (isHFSpace()) {
166
+ const pagesData = await fetchDocFromHF(corpus, docIndex);
167
  const pageIdx = findPageIndex(pagesData, pageNumber);
168
  if (pageIdx === -1) return null;
169
 
 
174
  ...pagesData[pageIdx].datasets[dsIdx],
175
  ...updates
176
  };
177
+ await commitDocToHF(corpus, docIndex, pagesData,
178
+ `Update annotation in ${corpus.id}/doc_${docIndex} page ${pageNumber}`);
179
  return pagesData[pageIdx].datasets[dsIdx];
180
  } else {
181
+ const pagesData = readDocLocal(corpus, docIndex);
182
  if (!pagesData) return null;
183
 
184
  const pageIdx = findPageIndex(pagesData, pageNumber);
 
191
  ...pagesData[pageIdx].datasets[dsIdx],
192
  ...updates
193
  };
194
+ writeDocLocal(corpus, docIndex, pagesData);
195
  return pagesData[pageIdx].datasets[dsIdx];
196
  }
197
  }
198
 
199
  /**
200
+ * Retrieves all human annotations from local files.
 
201
  */
202
+ export async function getAnnotations(docIndex = null, corpusId = null) {
203
+ const { getCorpora } = await import('./config.js');
204
+ const corporaList = corpusId ? [getCorpus(corpusId)] : getCorpora();
 
205
  const results = [];
206
+
207
+ for (const corpus of corporaList) {
208
+ const extractionsDir = path.join(process.cwd(), 'annotation_data', corpus.extractions_dir);
209
+ if (!fs.existsSync(extractionsDir)) continue;
210
+
211
+ const dirs = fs.readdirSync(extractionsDir).filter(d => d.startsWith('doc_'));
212
+
213
+ for (const dir of dirs) {
214
+ const idx = parseInt(dir.replace('doc_', ''), 10);
215
+ if (docIndex !== null && idx !== docIndex) continue;
216
+
217
+ const filePath = path.join(extractionsDir, dir, 'raw', `${dir}_direct_judged.jsonl`);
218
+ if (!fs.existsSync(filePath)) continue;
219
+
220
+ try {
221
+ const pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
222
+ for (const page of pagesData) {
223
+ const pageNum = page.document?.pages?.[0];
224
+ for (const ds of (page.datasets || [])) {
225
+ if (ds.annotator) {
226
+ results.push({
227
+ ...ds,
228
+ corpus: corpus.id,
229
+ document_index: idx,
230
+ page_number: pageNum,
231
+ });
232
+ }
233
  }
234
  }
235
+ } catch (e) {
236
+ console.error(`Error reading ${filePath}:`, e);
237
  }
 
 
238
  }
239
  }
240