Spaces:
Sleeping
Sleeping
Commit ·
a2c885c
1
Parent(s): aeca117
feat: multi-corpus support
Browse files
- corpora.json registry: add new corpora by adding entries
- All APIs/utils resolve paths via config.js helpers
- Per-corpus doc assignments: docs: { wbg: [...], unhcr: [...] }
- Document selector shows [World Bank] Doc 3 labels
- Leaderboard/progress scan all corpora
- generate_assignments.py handles per-corpus distribution
- app/api/document/route.js +11 -17
- app/api/documents/route.js +94 -62
- app/api/leaderboard/route.js +48 -50
- app/api/progress/route.js +88 -85
- app/api/validate/route.js +29 -53
- app/components/DocumentSelector.js +10 -5
- app/page.js +16 -8
- generate_assignments.py +62 -45
- utils/config.js +53 -0
- utils/storage.js +75 -92
app/api/document/route.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
-
import { HF_DATASET_BASE_URL } from '../../../utils/config.js';
|
| 2 |
import fs from 'fs';
|
| 3 |
-
import path from 'path';
|
| 4 |
|
| 5 |
const isHFSpace = () => {
|
| 6 |
return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
|
|
@@ -10,8 +9,8 @@ export async function GET(request) {
|
|
| 10 |
const { searchParams } = new URL(request.url);
|
| 11 |
const index = searchParams.get('index');
|
| 12 |
const page = searchParams.get('page');
|
|
|
|
| 13 |
|
| 14 |
-
// Validate required params
|
| 15 |
if (index === null || page === null) {
|
| 16 |
return new Response(
|
| 17 |
JSON.stringify({ error: "Missing index or page parameter" }),
|
|
@@ -19,7 +18,6 @@ export async function GET(request) {
|
|
| 19 |
);
|
| 20 |
}
|
| 21 |
|
| 22 |
-
// Validate numeric values
|
| 23 |
const indexNum = parseInt(index, 10);
|
| 24 |
const pageNum = parseInt(page, 10);
|
| 25 |
|
|
@@ -30,46 +28,42 @@ export async function GET(request) {
|
|
| 30 |
);
|
| 31 |
}
|
| 32 |
|
|
|
|
|
|
|
| 33 |
try {
|
| 34 |
let pagesData;
|
| 35 |
|
| 36 |
if (isHFSpace()) {
|
| 37 |
-
|
| 38 |
-
const docUrl = `${HF_DATASET_BASE_URL}/raw/main/
|
| 39 |
const res = await fetch(docUrl, {
|
| 40 |
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
| 41 |
});
|
| 42 |
|
| 43 |
if (!res.ok) {
|
| 44 |
return new Response(
|
| 45 |
-
JSON.stringify({ error: `doc_${indexNum}
|
| 46 |
{ status: res.status, headers: { 'Content-Type': 'application/json' } }
|
| 47 |
);
|
| 48 |
}
|
| 49 |
pagesData = await res.json();
|
| 50 |
} else {
|
| 51 |
-
|
| 52 |
-
const filePath = path.join(
|
| 53 |
-
process.cwd(),
|
| 54 |
-
'annotation_data', 'wbg_extractions',
|
| 55 |
-
`doc_${indexNum}`, 'raw', `doc_${indexNum}_direct_judged.jsonl`
|
| 56 |
-
);
|
| 57 |
|
| 58 |
if (!fs.existsSync(filePath)) {
|
| 59 |
return new Response(
|
| 60 |
-
JSON.stringify({ error: `doc_${indexNum}
|
| 61 |
{ status: 404, headers: { 'Content-Type': 'application/json' } }
|
| 62 |
);
|
| 63 |
}
|
| 64 |
-
|
| 65 |
-
pagesData = JSON.parse(raw);
|
| 66 |
}
|
| 67 |
|
| 68 |
const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
|
| 69 |
|
| 70 |
if (!pageData) {
|
| 71 |
return new Response(
|
| 72 |
-
JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum}` }),
|
| 73 |
{ status: 404, headers: { 'Content-Type': 'application/json' } }
|
| 74 |
);
|
| 75 |
}
|
|
|
|
| 1 |
+
import { HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from '../../../utils/config.js';
|
| 2 |
import fs from 'fs';
|
|
|
|
| 3 |
|
| 4 |
const isHFSpace = () => {
|
| 5 |
return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
|
|
|
|
| 9 |
const { searchParams } = new URL(request.url);
|
| 10 |
const index = searchParams.get('index');
|
| 11 |
const page = searchParams.get('page');
|
| 12 |
+
const corpusId = searchParams.get('corpus');
|
| 13 |
|
|
|
|
| 14 |
if (index === null || page === null) {
|
| 15 |
return new Response(
|
| 16 |
JSON.stringify({ error: "Missing index or page parameter" }),
|
|
|
|
| 18 |
);
|
| 19 |
}
|
| 20 |
|
|
|
|
| 21 |
const indexNum = parseInt(index, 10);
|
| 22 |
const pageNum = parseInt(page, 10);
|
| 23 |
|
|
|
|
| 28 |
);
|
| 29 |
}
|
| 30 |
|
| 31 |
+
const corpus = getCorpus(corpusId);
|
| 32 |
+
|
| 33 |
try {
|
| 34 |
let pagesData;
|
| 35 |
|
| 36 |
if (isHFSpace()) {
|
| 37 |
+
const docRepoPath = getDocRepoPath(corpus, indexNum);
|
| 38 |
+
const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
|
| 39 |
const res = await fetch(docUrl, {
|
| 40 |
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
| 41 |
});
|
| 42 |
|
| 43 |
if (!res.ok) {
|
| 44 |
return new Response(
|
| 45 |
+
JSON.stringify({ error: `doc_${indexNum} not found on HF (${corpus.id})` }),
|
| 46 |
{ status: res.status, headers: { 'Content-Type': 'application/json' } }
|
| 47 |
);
|
| 48 |
}
|
| 49 |
pagesData = await res.json();
|
| 50 |
} else {
|
| 51 |
+
const filePath = getDocLocalPath(corpus, indexNum);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
|
| 53 |
if (!fs.existsSync(filePath)) {
|
| 54 |
return new Response(
|
| 55 |
+
JSON.stringify({ error: `doc_${indexNum} not found locally (${corpus.id})` }),
|
| 56 |
{ status: 404, headers: { 'Content-Type': 'application/json' } }
|
| 57 |
);
|
| 58 |
}
|
| 59 |
+
pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
|
|
|
|
| 60 |
}
|
| 61 |
|
| 62 |
const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
|
| 63 |
|
| 64 |
if (!pageData) {
|
| 65 |
return new Response(
|
| 66 |
+
JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum} (${corpus.id})` }),
|
| 67 |
{ status: 404, headers: { 'Content-Type': 'application/json' } }
|
| 68 |
);
|
| 69 |
}
|
app/api/documents/route.js
CHANGED
|
@@ -1,9 +1,10 @@
|
|
| 1 |
-
import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
|
| 2 |
import yaml from 'js-yaml';
|
| 3 |
|
| 4 |
/**
|
| 5 |
* Fetch annotator_config.yaml and return the doc list for a given user.
|
| 6 |
* Returns null if no config or user not found (show all docs).
|
|
|
|
| 7 |
*/
|
| 8 |
async function getUserAssignedDocs(username) {
|
| 9 |
if (!username) return null;
|
|
@@ -12,7 +13,7 @@ async function getUserAssignedDocs(username) {
|
|
| 12 |
const configUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;
|
| 13 |
const res = await fetch(configUrl, {
|
| 14 |
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
|
| 15 |
-
|
| 16 |
});
|
| 17 |
if (!res.ok) return null;
|
| 18 |
|
|
@@ -20,9 +21,22 @@ async function getUserAssignedDocs(username) {
|
|
| 20 |
const config = yaml.load(text);
|
| 21 |
|
| 22 |
const annotator = (config.annotators || []).find(a => a.username === username);
|
| 23 |
-
if (!annotator || !annotator.docs
|
| 24 |
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
} catch (e) {
|
| 27 |
console.warn('Could not load annotator_config.yaml:', e.message);
|
| 28 |
return null;
|
|
@@ -31,76 +45,94 @@ async function getUserAssignedDocs(username) {
|
|
| 31 |
|
| 32 |
export async function GET(request) {
|
| 33 |
try {
|
| 34 |
-
// Get username from query param
|
| 35 |
const { searchParams } = new URL(request.url);
|
| 36 |
const username = searchParams.get('user');
|
| 37 |
|
| 38 |
-
// Fetch user's assigned docs (if configured)
|
| 39 |
const assignedDocs = await getUserAssignedDocs(username);
|
| 40 |
|
| 41 |
-
//
|
| 42 |
-
const
|
| 43 |
-
const
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
// Filter to docs with revalidation data, then by user assignment if available
|
| 61 |
-
let successLinks = links
|
| 62 |
-
.filter(l => l.status === 'success' && l.has_revalidation === true);
|
| 63 |
-
|
| 64 |
-
if (assignedDocs) {
|
| 65 |
-
successLinks = successLinks.filter(l => assignedDocs.has(l.index));
|
| 66 |
-
}
|
| 67 |
-
|
| 68 |
-
successLinks = successLinks.slice(0, MAX_DOCS_TO_SCAN);
|
| 69 |
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
.filter(page => page.datasets && page.datasets.length > 0)
|
| 83 |
-
.map(page => page.document.pages[0]);
|
| 84 |
|
| 85 |
-
|
|
|
|
|
|
|
| 86 |
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
landing_page: link.landing_page_url,
|
| 94 |
-
annotatable_pages: annotatablePages
|
| 95 |
-
};
|
| 96 |
-
})
|
| 97 |
-
);
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
.map(r => r.value);
|
| 102 |
|
| 103 |
-
return new Response(JSON.stringify(
|
| 104 |
status: 200,
|
| 105 |
headers: {
|
| 106 |
'Content-Type': 'application/json',
|
|
@@ -110,7 +142,7 @@ export async function GET(request) {
|
|
| 110 |
} catch (error) {
|
| 111 |
console.error(error);
|
| 112 |
return new Response(
|
| 113 |
-
JSON.stringify({ error: "Failed to fetch documents
|
| 114 |
{ status: 500, headers: { 'Content-Type': 'application/json' } }
|
| 115 |
);
|
| 116 |
}
|
|
|
|
| 1 |
+
import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpus, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
|
| 2 |
import yaml from 'js-yaml';
|
| 3 |
|
| 4 |
/**
|
| 5 |
* Fetch annotator_config.yaml and return the doc list for a given user.
|
| 6 |
* Returns null if no config or user not found (show all docs).
|
| 7 |
+
* Now returns per-corpus assignments: { wbg: Set([1,2]), unhcr: Set([3,4]) }
|
| 8 |
*/
|
| 9 |
async function getUserAssignedDocs(username) {
|
| 10 |
if (!username) return null;
|
|
|
|
| 13 |
const configUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;
|
| 14 |
const res = await fetch(configUrl, {
|
| 15 |
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
|
| 16 |
+
cache: 'no-store'
|
| 17 |
});
|
| 18 |
if (!res.ok) return null;
|
| 19 |
|
|
|
|
| 21 |
const config = yaml.load(text);
|
| 22 |
|
| 23 |
const annotator = (config.annotators || []).find(a => a.username === username);
|
| 24 |
+
if (!annotator || !annotator.docs) return null;
|
| 25 |
|
| 26 |
+
// Support both old format (flat array) and new format (per-corpus object)
|
| 27 |
+
if (Array.isArray(annotator.docs)) {
|
| 28 |
+
// Legacy: flat array — treat as default corpus
|
| 29 |
+
return { _flat: new Set(annotator.docs) };
|
| 30 |
+
}
|
| 31 |
+
|
| 32 |
+
// New format: { wbg: [1,2], unhcr: [3,4] }
|
| 33 |
+
const result = {};
|
| 34 |
+
for (const [corpusId, docList] of Object.entries(annotator.docs)) {
|
| 35 |
+
if (Array.isArray(docList)) {
|
| 36 |
+
result[corpusId] = new Set(docList);
|
| 37 |
+
}
|
| 38 |
+
}
|
| 39 |
+
return Object.keys(result).length > 0 ? result : null;
|
| 40 |
} catch (e) {
|
| 41 |
console.warn('Could not load annotator_config.yaml:', e.message);
|
| 42 |
return null;
|
|
|
|
| 45 |
|
| 46 |
export async function GET(request) {
|
| 47 |
try {
|
|
|
|
| 48 |
const { searchParams } = new URL(request.url);
|
| 49 |
const username = searchParams.get('user');
|
| 50 |
|
|
|
|
| 51 |
const assignedDocs = await getUserAssignedDocs(username);
|
| 52 |
|
| 53 |
+
// Import corpora list
|
| 54 |
+
const { getCorpora } = await import('../../../utils/config.js');
|
| 55 |
+
const corpora = getCorpora();
|
| 56 |
+
|
| 57 |
+
const allDocuments = [];
|
| 58 |
+
|
| 59 |
+
for (const corpus of corpora) {
|
| 60 |
+
// Determine which doc indices this user has for this corpus
|
| 61 |
+
let userDocSet = null;
|
| 62 |
+
if (assignedDocs) {
|
| 63 |
+
if (assignedDocs._flat) {
|
| 64 |
+
// Legacy flat format — only applies to first/default corpus
|
| 65 |
+
userDocSet = corpus === corpora[0] ? assignedDocs._flat : new Set();
|
| 66 |
+
} else {
|
| 67 |
+
userDocSet = assignedDocs[corpus.id] || new Set();
|
| 68 |
+
}
|
| 69 |
+
if (userDocSet.size === 0) continue; // no docs for this corpus
|
| 70 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 71 |
|
| 72 |
+
// Fetch the links file for this corpus
|
| 73 |
+
const linksPath = getLinksRepoPath(corpus);
|
| 74 |
+
const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
|
| 75 |
+
const linksRes = await fetch(linksUrl, {
|
| 76 |
+
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
|
| 77 |
+
cache: 'no-store'
|
| 78 |
+
});
|
| 79 |
+
|
| 80 |
+
if (!linksRes.ok) {
|
| 81 |
+
console.warn(`No links file for corpus ${corpus.id}: ${linksRes.status}`);
|
| 82 |
+
continue;
|
| 83 |
+
}
|
| 84 |
|
| 85 |
+
const links = await linksRes.json();
|
| 86 |
|
| 87 |
+
let successLinks = links
|
| 88 |
+
.filter(l => l.status === 'success' && l.has_revalidation === true);
|
|
|
|
|
|
|
| 89 |
|
| 90 |
+
if (userDocSet) {
|
| 91 |
+
successLinks = successLinks.filter(l => userDocSet.has(l.index));
|
| 92 |
+
}
|
| 93 |
|
| 94 |
+
successLinks = successLinks.slice(0, MAX_DOCS_TO_SCAN);
|
| 95 |
+
|
| 96 |
+
// Parallel fetch docs
|
| 97 |
+
const results = await Promise.allSettled(
|
| 98 |
+
successLinks.map(async (link) => {
|
| 99 |
+
const docRepoPath = getDocRepoPath(corpus, link.index);
|
| 100 |
+
const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
|
| 101 |
+
const docRes = await fetch(docUrl, {
|
| 102 |
+
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
| 103 |
+
});
|
| 104 |
+
|
| 105 |
+
if (!docRes.ok) return null;
|
| 106 |
+
|
| 107 |
+
const pagesData = await docRes.json();
|
| 108 |
+
const annotatablePages = pagesData
|
| 109 |
+
.filter(page => page.datasets && page.datasets.length > 0)
|
| 110 |
+
.map(page => page.document.pages[0]);
|
| 111 |
+
|
| 112 |
+
if (annotatablePages.length === 0) return null;
|
| 113 |
+
|
| 114 |
+
const pdfUrl = link.direct_pdf_url;
|
| 115 |
+
if (!pdfUrl) return null;
|
| 116 |
+
|
| 117 |
+
return {
|
| 118 |
+
corpus: corpus.id,
|
| 119 |
+
corpus_name: corpus.name,
|
| 120 |
+
index: link.index,
|
| 121 |
+
pdf_url: pdfUrl,
|
| 122 |
+
landing_page: link.landing_page_url,
|
| 123 |
+
annotatable_pages: annotatablePages
|
| 124 |
+
};
|
| 125 |
+
})
|
| 126 |
+
);
|
| 127 |
|
| 128 |
+
const docs = results
|
| 129 |
+
.filter(r => r.status === 'fulfilled' && r.value !== null)
|
| 130 |
+
.map(r => r.value);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
|
| 132 |
+
allDocuments.push(...docs);
|
| 133 |
+
}
|
|
|
|
| 134 |
|
| 135 |
+
return new Response(JSON.stringify(allDocuments), {
|
| 136 |
status: 200,
|
| 137 |
headers: {
|
| 138 |
'Content-Type': 'application/json',
|
|
|
|
| 142 |
} catch (error) {
|
| 143 |
console.error(error);
|
| 144 |
return new Response(
|
| 145 |
+
JSON.stringify({ error: "Failed to fetch documents" }),
|
| 146 |
{ status: 500, headers: { 'Content-Type': 'application/json' } }
|
| 147 |
);
|
| 148 |
}
|
app/api/leaderboard/route.js
CHANGED
|
@@ -1,71 +1,69 @@
|
|
| 1 |
-
import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
|
| 2 |
|
| 3 |
-
export const dynamic = 'force-dynamic';
|
| 4 |
|
| 5 |
/**
|
| 6 |
* GET /api/leaderboard
|
| 7 |
-
*
|
| 8 |
*/
|
| 9 |
export async function GET() {
|
| 10 |
try {
|
| 11 |
-
const
|
| 12 |
-
const
|
| 13 |
-
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
|
| 14 |
-
cache: 'no-store'
|
| 15 |
-
});
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
-
|
| 22 |
-
const activeLinks = links
|
| 23 |
-
.filter(l => l.status === 'success' && l.has_revalidation === true)
|
| 24 |
-
.slice(0, MAX_DOCS_TO_SCAN);
|
| 25 |
|
| 26 |
-
|
| 27 |
-
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
|
|
|
| 37 |
|
| 38 |
-
|
| 39 |
-
const docAnnotators = new Set();
|
| 40 |
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
|
|
|
|
|
|
| 47 |
}
|
| 48 |
-
stats[ds.annotator].humanAdded++;
|
| 49 |
-
stats[ds.annotator].docs.add(link.index);
|
| 50 |
-
}
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
}
|
| 58 |
-
stats[v.annotator].verified++;
|
| 59 |
-
if (v.human_verdict === true) stats[v.annotator].correct++;
|
| 60 |
-
else stats[v.annotator].incorrect++;
|
| 61 |
-
stats[v.annotator].docs.add(link.index);
|
| 62 |
}
|
| 63 |
}
|
| 64 |
-
}
|
| 65 |
-
|
| 66 |
-
|
| 67 |
|
| 68 |
-
// Build ranked list
|
| 69 |
const leaderboard = Object.entries(stats)
|
| 70 |
.map(([annotator, s]) => ({
|
| 71 |
annotator,
|
|
@@ -74,7 +72,7 @@ export async function GET() {
|
|
| 74 |
incorrect: s.incorrect,
|
| 75 |
humanAdded: s.humanAdded,
|
| 76 |
docsWorked: s.docs.size,
|
| 77 |
-
score: s.verified + s.humanAdded,
|
| 78 |
}))
|
| 79 |
.sort((a, b) => b.score - a.score);
|
| 80 |
|
|
|
|
| 1 |
+
import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpora, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
|
| 2 |
|
| 3 |
+
export const dynamic = 'force-dynamic';
|
| 4 |
|
| 5 |
/**
|
| 6 |
* GET /api/leaderboard
|
| 7 |
+
* Scans ALL corpora and returns annotator rankings.
|
| 8 |
*/
|
| 9 |
export async function GET() {
|
| 10 |
try {
|
| 11 |
+
const corpora = getCorpora();
|
| 12 |
+
const stats = {}; // annotator -> { verified, correct, incorrect, docs, humanAdded }
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
for (const corpus of corpora) {
|
| 15 |
+
const linksPath = getLinksRepoPath(corpus);
|
| 16 |
+
const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
|
| 17 |
+
const linksRes = await fetch(linksUrl, {
|
| 18 |
+
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
|
| 19 |
+
cache: 'no-store'
|
| 20 |
+
});
|
| 21 |
|
| 22 |
+
if (!linksRes.ok) continue;
|
|
|
|
|
|
|
|
|
|
| 23 |
|
| 24 |
+
const links = await linksRes.json();
|
| 25 |
+
const activeLinks = links
|
| 26 |
+
.filter(l => l.status === 'success' && l.has_revalidation === true)
|
| 27 |
+
.slice(0, MAX_DOCS_TO_SCAN);
|
| 28 |
|
| 29 |
+
await Promise.allSettled(
|
| 30 |
+
activeLinks.map(async (link) => {
|
| 31 |
+
const docRepoPath = getDocRepoPath(corpus, link.index);
|
| 32 |
+
const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
|
| 33 |
+
const docRes = await fetch(docUrl, {
|
| 34 |
+
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
|
| 35 |
+
cache: 'no-store'
|
| 36 |
+
});
|
| 37 |
+
if (!docRes.ok) return;
|
| 38 |
|
| 39 |
+
const pagesData = await docRes.json();
|
|
|
|
| 40 |
|
| 41 |
+
for (const page of pagesData) {
|
| 42 |
+
for (const ds of (page.datasets || [])) {
|
| 43 |
+
if (ds.source === 'human' && ds.annotator) {
|
| 44 |
+
if (!stats[ds.annotator]) {
|
| 45 |
+
stats[ds.annotator] = { verified: 0, correct: 0, incorrect: 0, docs: new Set(), humanAdded: 0 };
|
| 46 |
+
}
|
| 47 |
+
stats[ds.annotator].humanAdded++;
|
| 48 |
+
stats[ds.annotator].docs.add(`${corpus.id}:${link.index}`);
|
| 49 |
}
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
+
for (const v of (ds.validations || [])) {
|
| 52 |
+
if (!v.annotator || !v.human_validated) continue;
|
| 53 |
+
if (!stats[v.annotator]) {
|
| 54 |
+
stats[v.annotator] = { verified: 0, correct: 0, incorrect: 0, docs: new Set(), humanAdded: 0 };
|
| 55 |
+
}
|
| 56 |
+
stats[v.annotator].verified++;
|
| 57 |
+
if (v.human_verdict === true) stats[v.annotator].correct++;
|
| 58 |
+
else stats[v.annotator].incorrect++;
|
| 59 |
+
stats[v.annotator].docs.add(`${corpus.id}:${link.index}`);
|
| 60 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 61 |
}
|
| 62 |
}
|
| 63 |
+
})
|
| 64 |
+
);
|
| 65 |
+
}
|
| 66 |
|
|
|
|
| 67 |
const leaderboard = Object.entries(stats)
|
| 68 |
.map(([annotator, s]) => ({
|
| 69 |
annotator,
|
|
|
|
| 72 |
incorrect: s.incorrect,
|
| 73 |
humanAdded: s.humanAdded,
|
| 74 |
docsWorked: s.docs.size,
|
| 75 |
+
score: s.verified + s.humanAdded,
|
| 76 |
}))
|
| 77 |
.sort((a, b) => b.score - a.score);
|
| 78 |
|
app/api/progress/route.js
CHANGED
|
@@ -1,101 +1,104 @@
|
|
| 1 |
-
import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
|
| 2 |
|
| 3 |
/**
|
| 4 |
* GET /api/progress
|
| 5 |
-
* Returns progress stats
|
| 6 |
*/
|
| 7 |
export async function GET() {
|
| 8 |
try {
|
| 9 |
-
const
|
| 10 |
-
const
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
.
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
const pagesData = await docRes.json();
|
| 34 |
-
|
| 35 |
-
let totalMentions = 0;
|
| 36 |
-
let verifiedMentions = 0;
|
| 37 |
-
let totalPages = 0;
|
| 38 |
-
let completedPages = 0;
|
| 39 |
-
let humanAnnotations = 0;
|
| 40 |
-
|
| 41 |
-
for (const page of pagesData) {
|
| 42 |
-
const datasets = (page.datasets || []).filter(ds => {
|
| 43 |
-
// Exclude consensus non-datasets
|
| 44 |
-
if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) {
|
| 45 |
-
return false;
|
| 46 |
-
}
|
| 47 |
-
return true;
|
| 48 |
});
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
totalMentions
|
| 54 |
-
|
| 55 |
-
let
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
}
|
| 64 |
-
}
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
}
|
| 70 |
-
}
|
| 71 |
-
|
| 72 |
-
const docComplete = totalPages > 0 && completedPages === totalPages;
|
| 73 |
-
|
| 74 |
-
return {
|
| 75 |
-
index: link.index,
|
| 76 |
-
totalPages,
|
| 77 |
-
completedPages,
|
| 78 |
-
totalMentions,
|
| 79 |
-
verifiedMentions,
|
| 80 |
-
humanAnnotations,
|
| 81 |
-
complete: docComplete,
|
| 82 |
-
};
|
| 83 |
-
})
|
| 84 |
-
);
|
| 85 |
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
const summary = {
|
| 91 |
-
totalDocs:
|
| 92 |
-
completedDocs:
|
| 93 |
-
totalPages:
|
| 94 |
-
completedPages:
|
| 95 |
-
totalMentions:
|
| 96 |
-
verifiedMentions:
|
| 97 |
-
humanAnnotations:
|
| 98 |
-
docs,
|
| 99 |
};
|
| 100 |
|
| 101 |
return new Response(JSON.stringify(summary), {
|
|
|
|
| 1 |
+
import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpora, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
|
| 2 |
|
| 3 |
/**
|
| 4 |
* GET /api/progress
|
| 5 |
+
* Returns progress stats across ALL corpora.
|
| 6 |
*/
|
| 7 |
export async function GET() {
|
| 8 |
try {
|
| 9 |
+
const corpora = getCorpora();
|
| 10 |
+
const allDocs = [];
|
| 11 |
+
|
| 12 |
+
for (const corpus of corpora) {
|
| 13 |
+
const linksPath = getLinksRepoPath(corpus);
|
| 14 |
+
const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
|
| 15 |
+
const linksRes = await fetch(linksUrl, {
|
| 16 |
+
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
|
| 17 |
+
next: { revalidate: 300 }
|
| 18 |
+
});
|
| 19 |
+
|
| 20 |
+
if (!linksRes.ok) continue;
|
| 21 |
+
|
| 22 |
+
const links = await linksRes.json();
|
| 23 |
+
const activeLinks = links
|
| 24 |
+
.filter(l => l.status === 'success' && l.has_revalidation === true)
|
| 25 |
+
.slice(0, MAX_DOCS_TO_SCAN);
|
| 26 |
+
|
| 27 |
+
const results = await Promise.allSettled(
|
| 28 |
+
activeLinks.map(async (link) => {
|
| 29 |
+
const docRepoPath = getDocRepoPath(corpus, link.index);
|
| 30 |
+
const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
|
| 31 |
+
const docRes = await fetch(docUrl, {
|
| 32 |
+
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
});
|
| 34 |
+
if (!docRes.ok) return null;
|
| 35 |
+
|
| 36 |
+
const pagesData = await docRes.json();
|
| 37 |
+
|
| 38 |
+
let totalMentions = 0;
|
| 39 |
+
let verifiedMentions = 0;
|
| 40 |
+
let totalPages = 0;
|
| 41 |
+
let completedPages = 0;
|
| 42 |
+
let humanAnnotations = 0;
|
| 43 |
+
|
| 44 |
+
for (const page of pagesData) {
|
| 45 |
+
const datasets = (page.datasets || []).filter(ds => {
|
| 46 |
+
if (ds.dataset_tag === 'non-dataset' && ds.dataset_name?.judge_agrees === true) {
|
| 47 |
+
return false;
|
| 48 |
+
}
|
| 49 |
+
return true;
|
| 50 |
+
});
|
| 51 |
+
|
| 52 |
+
if (datasets.length === 0) continue;
|
| 53 |
+
|
| 54 |
+
totalPages++;
|
| 55 |
+
totalMentions += datasets.length;
|
| 56 |
+
|
| 57 |
+
let pageVerified = 0;
|
| 58 |
+
for (const ds of datasets) {
|
| 59 |
+
if (ds.human_validated === true) {
|
| 60 |
+
verifiedMentions++;
|
| 61 |
+
pageVerified++;
|
| 62 |
+
}
|
| 63 |
+
if (ds.source === 'human') {
|
| 64 |
+
humanAnnotations++;
|
| 65 |
+
}
|
| 66 |
}
|
|
|
|
| 67 |
|
| 68 |
+
if (pageVerified === datasets.length) {
|
| 69 |
+
completedPages++;
|
| 70 |
+
}
|
| 71 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
+
return {
|
| 74 |
+
corpus: corpus.id,
|
| 75 |
+
index: link.index,
|
| 76 |
+
totalPages,
|
| 77 |
+
completedPages,
|
| 78 |
+
totalMentions,
|
| 79 |
+
verifiedMentions,
|
| 80 |
+
humanAnnotations,
|
| 81 |
+
complete: totalPages > 0 && completedPages === totalPages,
|
| 82 |
+
};
|
| 83 |
+
})
|
| 84 |
+
);
|
| 85 |
+
|
| 86 |
+
const docs = results
|
| 87 |
+
.filter(r => r.status === 'fulfilled' && r.value !== null)
|
| 88 |
+
.map(r => r.value);
|
| 89 |
+
|
| 90 |
+
allDocs.push(...docs);
|
| 91 |
+
}
|
| 92 |
|
| 93 |
const summary = {
|
| 94 |
+
totalDocs: allDocs.length,
|
| 95 |
+
completedDocs: allDocs.filter(d => d.complete).length,
|
| 96 |
+
totalPages: allDocs.reduce((s, d) => s + d.totalPages, 0),
|
| 97 |
+
completedPages: allDocs.reduce((s, d) => s + d.completedPages, 0),
|
| 98 |
+
totalMentions: allDocs.reduce((s, d) => s + d.totalMentions, 0),
|
| 99 |
+
verifiedMentions: allDocs.reduce((s, d) => s + d.verifiedMentions, 0),
|
| 100 |
+
humanAnnotations: allDocs.reduce((s, d) => s + d.humanAnnotations, 0),
|
| 101 |
+
docs: allDocs,
|
| 102 |
};
|
| 103 |
|
| 104 |
return new Response(JSON.stringify(summary), {
|
app/api/validate/route.js
CHANGED
|
@@ -1,31 +1,18 @@
|
|
| 1 |
import { NextResponse } from 'next/server';
|
| 2 |
import fs from 'fs';
|
| 3 |
-
import path from 'path';
|
| 4 |
import { commit } from '@huggingface/hub';
|
| 5 |
-
import { HF_DATASET_ID, HF_DATASET_BASE_URL } from '../../../utils/config.js';
|
| 6 |
|
| 7 |
const isHFSpace = () => process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
|
| 8 |
|
| 9 |
-
function getDocFilePath(docIndex) {
|
| 10 |
-
return path.join(
|
| 11 |
-
process.cwd(),
|
| 12 |
-
'annotation_data', 'wbg_extractions',
|
| 13 |
-
`doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
|
| 14 |
-
);
|
| 15 |
-
}
|
| 16 |
-
|
| 17 |
-
function getDocRepoPath(docIndex) {
|
| 18 |
-
return `annotation_data/wbg_extractions/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
|
| 19 |
-
}
|
| 20 |
-
|
| 21 |
/**
|
| 22 |
* PUT /api/validate
|
| 23 |
-
*
|
| 24 |
-
* Body: { document_index, page_number, dataset_index, updates }
|
| 25 |
*/
|
| 26 |
export async function PUT(request) {
|
| 27 |
try {
|
| 28 |
-
const { document_index, page_number, dataset_index, updates } = await request.json();
|
|
|
|
| 29 |
|
| 30 |
if (document_index == null || page_number == null || dataset_index == null || !updates) {
|
| 31 |
return NextResponse.json(
|
|
@@ -37,23 +24,23 @@ export async function PUT(request) {
|
|
| 37 |
let pagesData;
|
| 38 |
|
| 39 |
if (isHFSpace()) {
|
| 40 |
-
const
|
|
|
|
| 41 |
const res = await fetch(url, {
|
| 42 |
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
| 43 |
});
|
| 44 |
if (!res.ok) {
|
| 45 |
-
return NextResponse.json({ error:
|
| 46 |
}
|
| 47 |
pagesData = await res.json();
|
| 48 |
} else {
|
| 49 |
-
const filePath =
|
| 50 |
if (!fs.existsSync(filePath)) {
|
| 51 |
-
return NextResponse.json({ error:
|
| 52 |
}
|
| 53 |
pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
|
| 54 |
}
|
| 55 |
|
| 56 |
-
// Find the page
|
| 57 |
const pageIdx = pagesData.findIndex(p => p.document?.pages?.[0] === page_number);
|
| 58 |
if (pageIdx === -1) {
|
| 59 |
return NextResponse.json({ error: `Page ${page_number} not found` }, { status: 404 });
|
|
@@ -64,12 +51,9 @@ export async function PUT(request) {
|
|
| 64 |
return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
|
| 65 |
}
|
| 66 |
|
| 67 |
-
// Per-annotator validation
|
| 68 |
-
// Each annotator gets their own entry; re-validating updates in-place.
|
| 69 |
const currentEntry = pagesData[pageIdx].datasets[dataset_index];
|
| 70 |
const annotator = updates.annotator || 'unknown';
|
| 71 |
-
|
| 72 |
-
// Separate validation fields from other updates (like dataset_tag edits)
|
| 73 |
const validationFields = ['human_validated', 'human_verdict', 'human_notes', 'annotator', 'validated_at'];
|
| 74 |
const isValidation = validationFields.some(f => f in updates);
|
| 75 |
|
|
@@ -90,27 +74,19 @@ export async function PUT(request) {
|
|
| 90 |
validations.push(validationEntry);
|
| 91 |
}
|
| 92 |
|
| 93 |
-
pagesData[pageIdx].datasets[dataset_index] = {
|
| 94 |
-
...currentEntry,
|
| 95 |
-
validations,
|
| 96 |
-
};
|
| 97 |
} else {
|
| 98 |
-
|
| 99 |
-
pagesData[pageIdx].datasets[dataset_index] = {
|
| 100 |
-
...currentEntry,
|
| 101 |
-
...updates,
|
| 102 |
-
};
|
| 103 |
}
|
| 104 |
|
| 105 |
// Save back
|
| 106 |
if (isHFSpace()) {
|
| 107 |
-
const
|
| 108 |
-
const repoPath = getDocRepoPath(document_index);
|
| 109 |
const content = JSON.stringify(pagesData, null, 2);
|
| 110 |
await commit({
|
| 111 |
repo: { type: 'dataset', name: HF_DATASET_ID },
|
| 112 |
-
credentials: { accessToken:
|
| 113 |
-
title: `Validate
|
| 114 |
operations: [{
|
| 115 |
operation: 'addOrUpdate',
|
| 116 |
path: repoPath,
|
|
@@ -118,7 +94,7 @@ export async function PUT(request) {
|
|
| 118 |
}],
|
| 119 |
});
|
| 120 |
} else {
|
| 121 |
-
const filePath =
|
| 122 |
fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
|
| 123 |
}
|
| 124 |
|
|
@@ -133,16 +109,18 @@ export async function PUT(request) {
|
|
| 133 |
}
|
| 134 |
|
| 135 |
/**
|
| 136 |
-
* DELETE /api/validate?doc=X&page=Y&idx=Z
|
| 137 |
-
* Removes a dataset entry by its array index.
|
| 138 |
*/
|
| 139 |
export async function DELETE(request) {
|
| 140 |
try {
|
| 141 |
const { searchParams } = new URL(request.url);
|
|
|
|
| 142 |
const document_index = parseInt(searchParams.get('doc'), 10);
|
| 143 |
const page_number = parseInt(searchParams.get('page'), 10);
|
| 144 |
const dataset_index = parseInt(searchParams.get('idx'), 10);
|
| 145 |
|
|
|
|
|
|
|
| 146 |
if (isNaN(document_index) || isNaN(page_number) || isNaN(dataset_index)) {
|
| 147 |
return NextResponse.json(
|
| 148 |
{ error: 'Missing doc, page, or idx parameter' },
|
|
@@ -153,18 +131,19 @@ export async function DELETE(request) {
|
|
| 153 |
let pagesData;
|
| 154 |
|
| 155 |
if (isHFSpace()) {
|
| 156 |
-
const
|
|
|
|
| 157 |
const res = await fetch(url, {
|
| 158 |
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
| 159 |
});
|
| 160 |
if (!res.ok) {
|
| 161 |
-
return NextResponse.json({ error:
|
| 162 |
}
|
| 163 |
pagesData = await res.json();
|
| 164 |
} else {
|
| 165 |
-
const filePath =
|
| 166 |
if (!fs.existsSync(filePath)) {
|
| 167 |
-
return NextResponse.json({ error:
|
| 168 |
}
|
| 169 |
pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
|
| 170 |
}
|
|
@@ -179,18 +158,15 @@ export async function DELETE(request) {
|
|
| 179 |
return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
|
| 180 |
}
|
| 181 |
|
| 182 |
-
// Remove the entry
|
| 183 |
pagesData[pageIdx].datasets.splice(dataset_index, 1);
|
| 184 |
|
| 185 |
-
// Save back
|
| 186 |
if (isHFSpace()) {
|
| 187 |
-
const
|
| 188 |
-
const repoPath = getDocRepoPath(document_index);
|
| 189 |
const content = JSON.stringify(pagesData, null, 2);
|
| 190 |
await commit({
|
| 191 |
repo: { type: 'dataset', name: HF_DATASET_ID },
|
| 192 |
-
credentials: { accessToken:
|
| 193 |
-
title: `Delete
|
| 194 |
operations: [{
|
| 195 |
operation: 'addOrUpdate',
|
| 196 |
path: repoPath,
|
|
@@ -198,7 +174,7 @@ export async function DELETE(request) {
|
|
| 198 |
}],
|
| 199 |
});
|
| 200 |
} else {
|
| 201 |
-
const filePath =
|
| 202 |
fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
|
| 203 |
}
|
| 204 |
|
|
|
|
| 1 |
import { NextResponse } from 'next/server';
|
| 2 |
import fs from 'fs';
|
|
|
|
| 3 |
import { commit } from '@huggingface/hub';
|
| 4 |
+
import { HF_DATASET_ID, HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from '../../../utils/config.js';
|
| 5 |
|
| 6 |
const isHFSpace = () => process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
/**
|
| 9 |
* PUT /api/validate
|
| 10 |
+
* Body: { corpus, document_index, page_number, dataset_index, updates }
|
|
|
|
| 11 |
*/
|
| 12 |
export async function PUT(request) {
|
| 13 |
try {
|
| 14 |
+
const { corpus: corpusId, document_index, page_number, dataset_index, updates } = await request.json();
|
| 15 |
+
const corpus = getCorpus(corpusId);
|
| 16 |
|
| 17 |
if (document_index == null || page_number == null || dataset_index == null || !updates) {
|
| 18 |
return NextResponse.json(
|
|
|
|
| 24 |
let pagesData;
|
| 25 |
|
| 26 |
if (isHFSpace()) {
|
| 27 |
+
const repoPath = getDocRepoPath(corpus, document_index);
|
| 28 |
+
const url = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
|
| 29 |
const res = await fetch(url, {
|
| 30 |
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
| 31 |
});
|
| 32 |
if (!res.ok) {
|
| 33 |
+
return NextResponse.json({ error: `Document not found on HF (${corpus.id})` }, { status: 404 });
|
| 34 |
}
|
| 35 |
pagesData = await res.json();
|
| 36 |
} else {
|
| 37 |
+
const filePath = getDocLocalPath(corpus, document_index);
|
| 38 |
if (!fs.existsSync(filePath)) {
|
| 39 |
+
return NextResponse.json({ error: `Document not found locally (${corpus.id})` }, { status: 404 });
|
| 40 |
}
|
| 41 |
pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
|
| 42 |
}
|
| 43 |
|
|
|
|
| 44 |
const pageIdx = pagesData.findIndex(p => p.document?.pages?.[0] === page_number);
|
| 45 |
if (pageIdx === -1) {
|
| 46 |
return NextResponse.json({ error: `Page ${page_number} not found` }, { status: 404 });
|
|
|
|
| 51 |
return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
|
| 52 |
}
|
| 53 |
|
| 54 |
+
// Per-annotator validation
|
|
|
|
| 55 |
const currentEntry = pagesData[pageIdx].datasets[dataset_index];
|
| 56 |
const annotator = updates.annotator || 'unknown';
|
|
|
|
|
|
|
| 57 |
const validationFields = ['human_validated', 'human_verdict', 'human_notes', 'annotator', 'validated_at'];
|
| 58 |
const isValidation = validationFields.some(f => f in updates);
|
| 59 |
|
|
|
|
| 74 |
validations.push(validationEntry);
|
| 75 |
}
|
| 76 |
|
| 77 |
+
pagesData[pageIdx].datasets[dataset_index] = { ...currentEntry, validations };
|
|
|
|
|
|
|
|
|
|
| 78 |
} else {
|
| 79 |
+
pagesData[pageIdx].datasets[dataset_index] = { ...currentEntry, ...updates };
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
}
|
| 81 |
|
| 82 |
// Save back
|
| 83 |
if (isHFSpace()) {
|
| 84 |
+
const repoPath = getDocRepoPath(corpus, document_index);
|
|
|
|
| 85 |
const content = JSON.stringify(pagesData, null, 2);
|
| 86 |
await commit({
|
| 87 |
repo: { type: 'dataset', name: HF_DATASET_ID },
|
| 88 |
+
credentials: { accessToken: process.env.HF_TOKEN },
|
| 89 |
+
title: `Validate ${corpus.id}/doc_${document_index} page ${page_number}`,
|
| 90 |
operations: [{
|
| 91 |
operation: 'addOrUpdate',
|
| 92 |
path: repoPath,
|
|
|
|
| 94 |
}],
|
| 95 |
});
|
| 96 |
} else {
|
| 97 |
+
const filePath = getDocLocalPath(corpus, document_index);
|
| 98 |
fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
|
| 99 |
}
|
| 100 |
|
|
|
|
| 109 |
}
|
| 110 |
|
| 111 |
/**
|
| 112 |
+
* DELETE /api/validate?corpus=ID&doc=X&page=Y&idx=Z
|
|
|
|
| 113 |
*/
|
| 114 |
export async function DELETE(request) {
|
| 115 |
try {
|
| 116 |
const { searchParams } = new URL(request.url);
|
| 117 |
+
const corpusId = searchParams.get('corpus');
|
| 118 |
const document_index = parseInt(searchParams.get('doc'), 10);
|
| 119 |
const page_number = parseInt(searchParams.get('page'), 10);
|
| 120 |
const dataset_index = parseInt(searchParams.get('idx'), 10);
|
| 121 |
|
| 122 |
+
const corpus = getCorpus(corpusId);
|
| 123 |
+
|
| 124 |
if (isNaN(document_index) || isNaN(page_number) || isNaN(dataset_index)) {
|
| 125 |
return NextResponse.json(
|
| 126 |
{ error: 'Missing doc, page, or idx parameter' },
|
|
|
|
| 131 |
let pagesData;
|
| 132 |
|
| 133 |
if (isHFSpace()) {
|
| 134 |
+
const repoPath = getDocRepoPath(corpus, document_index);
|
| 135 |
+
const url = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
|
| 136 |
const res = await fetch(url, {
|
| 137 |
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
| 138 |
});
|
| 139 |
if (!res.ok) {
|
| 140 |
+
return NextResponse.json({ error: `Document not found on HF (${corpus.id})` }, { status: 404 });
|
| 141 |
}
|
| 142 |
pagesData = await res.json();
|
| 143 |
} else {
|
| 144 |
+
const filePath = getDocLocalPath(corpus, document_index);
|
| 145 |
if (!fs.existsSync(filePath)) {
|
| 146 |
+
return NextResponse.json({ error: `Document not found locally (${corpus.id})` }, { status: 404 });
|
| 147 |
}
|
| 148 |
pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
|
| 149 |
}
|
|
|
|
| 158 |
return NextResponse.json({ error: `Dataset index ${dataset_index} out of range` }, { status: 400 });
|
| 159 |
}
|
| 160 |
|
|
|
|
| 161 |
pagesData[pageIdx].datasets.splice(dataset_index, 1);
|
| 162 |
|
|
|
|
| 163 |
if (isHFSpace()) {
|
| 164 |
+
const repoPath = getDocRepoPath(corpus, document_index);
|
|
|
|
| 165 |
const content = JSON.stringify(pagesData, null, 2);
|
| 166 |
await commit({
|
| 167 |
repo: { type: 'dataset', name: HF_DATASET_ID },
|
| 168 |
+
credentials: { accessToken: process.env.HF_TOKEN },
|
| 169 |
+
title: `Delete from ${corpus.id}/doc_${document_index} page ${page_number}`,
|
| 170 |
operations: [{
|
| 171 |
operation: 'addOrUpdate',
|
| 172 |
path: repoPath,
|
|
|
|
| 174 |
}],
|
| 175 |
});
|
| 176 |
} else {
|
| 177 |
+
const filePath = getDocLocalPath(corpus, document_index);
|
| 178 |
fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
|
| 179 |
}
|
| 180 |
|
app/components/DocumentSelector.js
CHANGED
|
@@ -3,23 +3,28 @@
|
|
| 3 |
export default function DocumentSelector({
|
| 4 |
documents,
|
| 5 |
selectedDocIndex,
|
|
|
|
| 6 |
onDocChange,
|
| 7 |
}) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
return (
|
| 9 |
<div className="navigation-controls">
|
| 10 |
<div className="select-group">
|
| 11 |
<label htmlFor="doc-select">Document</label>
|
| 12 |
<select
|
| 13 |
id="doc-select"
|
| 14 |
-
value={
|
| 15 |
onChange={(e) => {
|
| 16 |
-
const
|
| 17 |
-
onDocChange(
|
| 18 |
}}
|
| 19 |
>
|
| 20 |
{documents.map(doc => (
|
| 21 |
-
<option key={doc.index} value={doc.index}>
|
| 22 |
-
|
| 23 |
</option>
|
| 24 |
))}
|
| 25 |
</select>
|
|
|
|
| 3 |
export default function DocumentSelector({
|
| 4 |
documents,
|
| 5 |
selectedDocIndex,
|
| 6 |
+
selectedCorpus,
|
| 7 |
onDocChange,
|
| 8 |
}) {
|
| 9 |
+
const currentValue = selectedCorpus && selectedDocIndex != null
|
| 10 |
+
? `${selectedCorpus}:${selectedDocIndex}`
|
| 11 |
+
: '';
|
| 12 |
+
|
| 13 |
return (
|
| 14 |
<div className="navigation-controls">
|
| 15 |
<div className="select-group">
|
| 16 |
<label htmlFor="doc-select">Document</label>
|
| 17 |
<select
|
| 18 |
id="doc-select"
|
| 19 |
+
value={currentValue}
|
| 20 |
onChange={(e) => {
|
| 21 |
+
const [corpus, idx] = e.target.value.split(':');
|
| 22 |
+
onDocChange(corpus, parseInt(idx, 10));
|
| 23 |
}}
|
| 24 |
>
|
| 25 |
{documents.map(doc => (
|
| 26 |
+
<option key={`${doc.corpus}:${doc.index}`} value={`${doc.corpus}:${doc.index}`}>
|
| 27 |
+
[{doc.corpus_name}] Doc {doc.index} ({doc.annotatable_pages.length} pages)
|
| 28 |
</option>
|
| 29 |
))}
|
| 30 |
</select>
|
app/page.js
CHANGED
|
@@ -13,6 +13,7 @@ import Leaderboard from './components/Leaderboard';
|
|
| 13 |
export default function Home() {
|
| 14 |
const [documents, setDocuments] = useState([]);
|
| 15 |
const [selectedDocIndex, setSelectedDocIndex] = useState(null);
|
|
|
|
| 16 |
const [currentDoc, setCurrentDoc] = useState(null);
|
| 17 |
|
| 18 |
// Page-by-page navigation: track the index into annotatable_pages array
|
|
@@ -60,17 +61,19 @@ export default function Home() {
|
|
| 60 |
.then(data => {
|
| 61 |
setDocuments(data);
|
| 62 |
if (data.length > 0) {
|
| 63 |
-
// Restore saved position from sessionStorage
|
| 64 |
const savedDoc = sessionStorage.getItem('selectedDocIndex');
|
|
|
|
| 65 |
const savedPage = sessionStorage.getItem('pageIdx');
|
| 66 |
const docIdx = savedDoc ? parseInt(savedDoc, 10) : null;
|
| 67 |
-
const restoredDoc = docIdx !== null && data.find(d => d.index === docIdx);
|
| 68 |
|
| 69 |
if (restoredDoc) {
|
| 70 |
setSelectedDocIndex(docIdx);
|
|
|
|
| 71 |
setPageIdx(savedPage ? parseInt(savedPage, 10) : 0);
|
| 72 |
} else {
|
| 73 |
setSelectedDocIndex(data[0].index);
|
|
|
|
| 74 |
setPageIdx(0);
|
| 75 |
}
|
| 76 |
}
|
|
@@ -101,10 +104,11 @@ export default function Home() {
|
|
| 101 |
|
| 102 |
// Update currentDoc when selection changes + persist to sessionStorage
|
| 103 |
useEffect(() => {
|
| 104 |
-
if (selectedDocIndex !== null) {
|
| 105 |
-
const doc = documents.find(d => d.index === selectedDocIndex);
|
| 106 |
setCurrentDoc(doc);
|
| 107 |
sessionStorage.setItem('selectedDocIndex', selectedDocIndex);
|
|
|
|
| 108 |
|
| 109 |
// Clamp pageIdx to valid range for this document
|
| 110 |
if (doc) {
|
|
@@ -112,7 +116,7 @@ export default function Home() {
|
|
| 112 |
setPageIdx(prev => Math.min(prev, Math.max(0, maxPage)));
|
| 113 |
}
|
| 114 |
}
|
| 115 |
-
}, [selectedDocIndex, documents]);
|
| 116 |
|
| 117 |
// Persist pageIdx to sessionStorage
|
| 118 |
useEffect(() => {
|
|
@@ -123,7 +127,7 @@ export default function Home() {
|
|
| 123 |
const refreshPageData = useCallback(() => {
|
| 124 |
if (selectedDocIndex !== null && currentPageNumber !== null) {
|
| 125 |
setLoadingPage(true);
|
| 126 |
-
fetch(`/api/document?index=${selectedDocIndex}&page=${currentPageNumber}`)
|
| 127 |
.then(res => res.json())
|
| 128 |
.then(data => {
|
| 129 |
setCurrentPageData(data);
|
|
@@ -175,7 +179,8 @@ export default function Home() {
|
|
| 175 |
localStorage.setItem('annotator_name', name);
|
| 176 |
};
|
| 177 |
|
| 178 |
-
const handleDocChange = (docIdx) => {
|
|
|
|
| 179 |
setSelectedDocIndex(docIdx);
|
| 180 |
setPageIdx(0);
|
| 181 |
};
|
|
@@ -270,6 +275,7 @@ export default function Home() {
|
|
| 270 |
dataset_tag: dataset_tag,
|
| 271 |
source: 'human',
|
| 272 |
annotator: annotatorName || "user",
|
|
|
|
| 273 |
document_index: selectedDocIndex,
|
| 274 |
page_number: currentPageNumber,
|
| 275 |
timestamp: new Date().toISOString(),
|
|
@@ -313,7 +319,7 @@ export default function Home() {
|
|
| 313 |
const rawIdx = ds._rawIndex ?? idx;
|
| 314 |
try {
|
| 315 |
const res = await fetch(
|
| 316 |
-
`/api/validate?doc=${selectedDocIndex}&page=${currentPageNumber}&idx=${rawIdx}`,
|
| 317 |
{ method: 'DELETE' }
|
| 318 |
);
|
| 319 |
if (res.ok) {
|
|
@@ -373,6 +379,7 @@ export default function Home() {
|
|
| 373 |
method: 'PUT',
|
| 374 |
headers: { 'Content-Type': 'application/json' },
|
| 375 |
body: JSON.stringify({
|
|
|
|
| 376 |
document_index: selectedDocIndex,
|
| 377 |
page_number: currentPageNumber,
|
| 378 |
dataset_index: datasetIdx,
|
|
@@ -474,6 +481,7 @@ export default function Home() {
|
|
| 474 |
<DocumentSelector
|
| 475 |
documents={documents}
|
| 476 |
selectedDocIndex={selectedDocIndex}
|
|
|
|
| 477 |
onDocChange={handleDocChange}
|
| 478 |
/>
|
| 479 |
</div>
|
|
|
|
| 13 |
export default function Home() {
|
| 14 |
const [documents, setDocuments] = useState([]);
|
| 15 |
const [selectedDocIndex, setSelectedDocIndex] = useState(null);
|
| 16 |
+
const [selectedCorpus, setSelectedCorpus] = useState(null);
|
| 17 |
const [currentDoc, setCurrentDoc] = useState(null);
|
| 18 |
|
| 19 |
// Page-by-page navigation: track the index into annotatable_pages array
|
|
|
|
| 61 |
.then(data => {
|
| 62 |
setDocuments(data);
|
| 63 |
if (data.length > 0) {
|
|
|
|
| 64 |
const savedDoc = sessionStorage.getItem('selectedDocIndex');
|
| 65 |
+
const savedCorpus = sessionStorage.getItem('selectedCorpus');
|
| 66 |
const savedPage = sessionStorage.getItem('pageIdx');
|
| 67 |
const docIdx = savedDoc ? parseInt(savedDoc, 10) : null;
|
| 68 |
+
const restoredDoc = docIdx !== null && data.find(d => d.index === docIdx && (!savedCorpus || d.corpus === savedCorpus));
|
| 69 |
|
| 70 |
if (restoredDoc) {
|
| 71 |
setSelectedDocIndex(docIdx);
|
| 72 |
+
setSelectedCorpus(restoredDoc.corpus);
|
| 73 |
setPageIdx(savedPage ? parseInt(savedPage, 10) : 0);
|
| 74 |
} else {
|
| 75 |
setSelectedDocIndex(data[0].index);
|
| 76 |
+
setSelectedCorpus(data[0].corpus);
|
| 77 |
setPageIdx(0);
|
| 78 |
}
|
| 79 |
}
|
|
|
|
| 104 |
|
| 105 |
// Update currentDoc when selection changes + persist to sessionStorage
|
| 106 |
useEffect(() => {
|
| 107 |
+
if (selectedDocIndex !== null && selectedCorpus !== null) {
|
| 108 |
+
const doc = documents.find(d => d.index === selectedDocIndex && d.corpus === selectedCorpus);
|
| 109 |
setCurrentDoc(doc);
|
| 110 |
sessionStorage.setItem('selectedDocIndex', selectedDocIndex);
|
| 111 |
+
sessionStorage.setItem('selectedCorpus', selectedCorpus);
|
| 112 |
|
| 113 |
// Clamp pageIdx to valid range for this document
|
| 114 |
if (doc) {
|
|
|
|
| 116 |
setPageIdx(prev => Math.min(prev, Math.max(0, maxPage)));
|
| 117 |
}
|
| 118 |
}
|
| 119 |
+
}, [selectedDocIndex, selectedCorpus, documents]);
|
| 120 |
|
| 121 |
// Persist pageIdx to sessionStorage
|
| 122 |
useEffect(() => {
|
|
|
|
| 127 |
const refreshPageData = useCallback(() => {
|
| 128 |
if (selectedDocIndex !== null && currentPageNumber !== null) {
|
| 129 |
setLoadingPage(true);
|
| 130 |
+
fetch(`/api/document?index=${selectedDocIndex}&page=${currentPageNumber}&corpus=${selectedCorpus || ''}`)
|
| 131 |
.then(res => res.json())
|
| 132 |
.then(data => {
|
| 133 |
setCurrentPageData(data);
|
|
|
|
| 179 |
localStorage.setItem('annotator_name', name);
|
| 180 |
};
|
| 181 |
|
| 182 |
+
const handleDocChange = (corpus, docIdx) => {
|
| 183 |
+
setSelectedCorpus(corpus);
|
| 184 |
setSelectedDocIndex(docIdx);
|
| 185 |
setPageIdx(0);
|
| 186 |
};
|
|
|
|
| 275 |
dataset_tag: dataset_tag,
|
| 276 |
source: 'human',
|
| 277 |
annotator: annotatorName || "user",
|
| 278 |
+
corpus: selectedCorpus,
|
| 279 |
document_index: selectedDocIndex,
|
| 280 |
page_number: currentPageNumber,
|
| 281 |
timestamp: new Date().toISOString(),
|
|
|
|
| 319 |
const rawIdx = ds._rawIndex ?? idx;
|
| 320 |
try {
|
| 321 |
const res = await fetch(
|
| 322 |
+
`/api/validate?doc=${selectedDocIndex}&page=${currentPageNumber}&idx=${rawIdx}&corpus=${selectedCorpus || ''}`,
|
| 323 |
{ method: 'DELETE' }
|
| 324 |
);
|
| 325 |
if (res.ok) {
|
|
|
|
| 379 |
method: 'PUT',
|
| 380 |
headers: { 'Content-Type': 'application/json' },
|
| 381 |
body: JSON.stringify({
|
| 382 |
+
corpus: selectedCorpus,
|
| 383 |
document_index: selectedDocIndex,
|
| 384 |
page_number: currentPageNumber,
|
| 385 |
dataset_index: datasetIdx,
|
|
|
|
| 481 |
<DocumentSelector
|
| 482 |
documents={documents}
|
| 483 |
selectedDocIndex={selectedDocIndex}
|
| 484 |
+
selectedCorpus={selectedCorpus}
|
| 485 |
onDocChange={handleDocChange}
|
| 486 |
/>
|
| 487 |
</div>
|
generate_assignments.py
CHANGED
|
@@ -2,8 +2,9 @@
|
|
| 2 |
"""
|
| 3 |
generate_assignments.py
|
| 4 |
|
| 5 |
-
Reads annotator_config.yaml, distributes available docs
|
| 6 |
-
with configurable overlap, and writes back
|
|
|
|
| 7 |
|
| 8 |
Usage:
|
| 9 |
python3 generate_assignments.py # Generate and save
|
|
@@ -26,7 +27,7 @@ except ImportError:
|
|
| 26 |
sys.exit(1)
|
| 27 |
|
| 28 |
CONFIG_PATH = Path(__file__).parent / "annotation_data" / "annotator_config.yaml"
|
| 29 |
-
|
| 30 |
|
| 31 |
|
| 32 |
def load_config():
|
|
@@ -37,17 +38,25 @@ def save_config(config):
|
|
| 37 |
CONFIG_PATH.write_text(yaml.dump(config, default_flow_style=False, sort_keys=False))
|
| 38 |
|
| 39 |
|
| 40 |
-
def
|
| 41 |
-
|
| 42 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
return sorted([
|
| 44 |
l["index"] for l in links
|
| 45 |
if l.get("has_revalidation") and l.get("status") == "success"
|
| 46 |
])
|
| 47 |
|
| 48 |
|
| 49 |
-
def generate_assignments(config, seed=42):
|
| 50 |
-
"""Distribute docs across annotators with overlap."""
|
| 51 |
settings = config.get("settings", {})
|
| 52 |
overlap_pct = settings.get("overlap_percent", 10)
|
| 53 |
annotators = config.get("annotators", [])
|
|
@@ -56,44 +65,49 @@ def generate_assignments(config, seed=42):
|
|
| 56 |
print("β No annotators defined in config.")
|
| 57 |
return config
|
| 58 |
|
| 59 |
-
all_docs = get_available_docs()
|
| 60 |
-
n_docs = len(all_docs)
|
| 61 |
n_annotators = len(annotators)
|
|
|
|
| 62 |
|
| 63 |
-
#
|
| 64 |
-
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
start
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
|
| 98 |
return config
|
| 99 |
|
|
@@ -130,14 +144,17 @@ def upload_config():
|
|
| 130 |
|
| 131 |
|
| 132 |
def main():
|
| 133 |
-
parser = argparse.ArgumentParser(description="Generate document assignments")
|
| 134 |
parser.add_argument("--dry-run", action="store_true", help="Preview only")
|
| 135 |
parser.add_argument("--upload", action="store_true", help="Upload config to HF")
|
| 136 |
parser.add_argument("--seed", type=int, default=42, help="Random seed")
|
| 137 |
args = parser.parse_args()
|
| 138 |
|
|
|
|
| 139 |
config = load_config()
|
| 140 |
-
|
|
|
|
|
|
|
| 141 |
|
| 142 |
if args.dry_run:
|
| 143 |
print("\n[DRY RUN] Would save:")
|
|
|
|
| 2 |
"""
|
| 3 |
generate_assignments.py
|
| 4 |
|
| 5 |
+
Reads corpora.json and annotator_config.yaml, distributes available docs
|
| 6 |
+
across annotators with configurable overlap per corpus, and writes back
|
| 7 |
+
the updated config.
|
| 8 |
|
| 9 |
Usage:
|
| 10 |
python3 generate_assignments.py # Generate and save
|
|
|
|
| 27 |
sys.exit(1)
|
| 28 |
|
| 29 |
CONFIG_PATH = Path(__file__).parent / "annotation_data" / "annotator_config.yaml"
|
| 30 |
+
CORPORA_PATH = Path(__file__).parent / "annotation_data" / "corpora.json"
|
| 31 |
|
| 32 |
|
| 33 |
def load_config():
|
|
|
|
| 38 |
CONFIG_PATH.write_text(yaml.dump(config, default_flow_style=False, sort_keys=False))
|
| 39 |
|
| 40 |
|
| 41 |
+
def load_corpora():
|
| 42 |
+
return json.loads(CORPORA_PATH.read_text())
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
def get_available_docs(corpus):
|
| 46 |
+
"""Get list of active doc indices for a given corpus."""
|
| 47 |
+
links_path = Path(__file__).parent / "annotation_data" / corpus["links_file"]
|
| 48 |
+
if not links_path.exists():
|
| 49 |
+
print(f" ⚠️ No links file for {corpus['id']}: {links_path}")
|
| 50 |
+
return []
|
| 51 |
+
links = json.loads(links_path.read_text())
|
| 52 |
return sorted([
|
| 53 |
l["index"] for l in links
|
| 54 |
if l.get("has_revalidation") and l.get("status") == "success"
|
| 55 |
])
|
| 56 |
|
| 57 |
|
| 58 |
+
def generate_assignments(config, corpora, seed=42):
|
| 59 |
+
"""Distribute docs across annotators with overlap, per corpus."""
|
| 60 |
settings = config.get("settings", {})
|
| 61 |
overlap_pct = settings.get("overlap_percent", 10)
|
| 62 |
annotators = config.get("annotators", [])
|
|
|
|
| 65 |
print("β No annotators defined in config.")
|
| 66 |
return config
|
| 67 |
|
|
|
|
|
|
|
| 68 |
n_annotators = len(annotators)
|
| 69 |
+
rng = random.Random(seed)
|
| 70 |
|
| 71 |
+
# Initialize per-corpus doc dicts
|
| 72 |
+
for ann in annotators:
|
| 73 |
+
if not isinstance(ann.get("docs"), dict):
|
| 74 |
+
ann["docs"] = {}
|
| 75 |
|
| 76 |
+
for corpus in corpora:
|
| 77 |
+
cid = corpus["id"]
|
| 78 |
+
all_docs = get_available_docs(corpus)
|
| 79 |
+
n_docs = len(all_docs)
|
| 80 |
+
|
| 81 |
+
if n_docs == 0:
|
| 82 |
+
print(f"\nπ {corpus['name']} ({cid}): no docs available")
|
| 83 |
+
continue
|
| 84 |
+
|
| 85 |
+
n_overlap = max(1, round(n_docs * overlap_pct / 100))
|
| 86 |
+
|
| 87 |
+
shuffled = all_docs.copy()
|
| 88 |
+
rng.shuffle(shuffled)
|
| 89 |
+
|
| 90 |
+
overlap_docs = sorted(shuffled[:n_overlap])
|
| 91 |
+
remaining = shuffled[n_overlap:]
|
| 92 |
+
|
| 93 |
+
per_annotator = len(remaining) // n_annotators
|
| 94 |
+
extra = len(remaining) % n_annotators
|
| 95 |
+
|
| 96 |
+
print(f"\nπ {corpus['name']} ({cid}):")
|
| 97 |
+
print(f" Total docs: {n_docs}")
|
| 98 |
+
print(f" Overlap ({overlap_pct}%): {n_overlap} docs shared by all")
|
| 99 |
+
print(f" Per annotator: ~{per_annotator + n_overlap} docs each")
|
| 100 |
+
print(f" Overlap docs: {overlap_docs}")
|
| 101 |
+
|
| 102 |
+
start = 0
|
| 103 |
+
for i, ann in enumerate(annotators):
|
| 104 |
+
count = per_annotator + (1 if i < extra else 0)
|
| 105 |
+
exclusive = sorted(remaining[start:start + count])
|
| 106 |
+
start += count
|
| 107 |
+
|
| 108 |
+
ann["docs"][cid] = sorted(overlap_docs + exclusive)
|
| 109 |
+
print(f" {ann['username']}: {len(ann['docs'][cid])} docs "
|
| 110 |
+
f"({n_overlap} overlap + {len(exclusive)} exclusive)")
|
| 111 |
|
| 112 |
return config
|
| 113 |
|
|
|
|
| 144 |
|
| 145 |
|
| 146 |
def main():
|
| 147 |
+
parser = argparse.ArgumentParser(description="Generate document assignments per corpus")
|
| 148 |
parser.add_argument("--dry-run", action="store_true", help="Preview only")
|
| 149 |
parser.add_argument("--upload", action="store_true", help="Upload config to HF")
|
| 150 |
parser.add_argument("--seed", type=int, default=42, help="Random seed")
|
| 151 |
args = parser.parse_args()
|
| 152 |
|
| 153 |
+
corpora = load_corpora()
|
| 154 |
config = load_config()
|
| 155 |
+
|
| 156 |
+
print(f"π Loaded {len(corpora)} corpora, {len(config.get('annotators', []))} annotators")
|
| 157 |
+
config = generate_assignments(config, corpora, seed=args.seed)
|
| 158 |
|
| 159 |
if args.dry_run:
|
| 160 |
print("\n[DRY RUN] Would save:")
|
utils/config.js
CHANGED
|
@@ -1,4 +1,57 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
// Centralized configuration for the annotation app
|
| 2 |
export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';
|
| 3 |
export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;
|
| 4 |
export const MAX_DOCS_TO_SCAN = parseInt(process.env.MAX_DOCS_TO_SCAN || '50', 10);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import fs from 'fs';
|
| 2 |
+
import path from 'path';
|
| 3 |
+
|
| 4 |
// Centralized configuration for the annotation app
|
| 5 |
export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';
|
| 6 |
export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;
|
| 7 |
export const MAX_DOCS_TO_SCAN = parseInt(process.env.MAX_DOCS_TO_SCAN || '50', 10);
|
| 8 |
+
|
| 9 |
+
// ─── Corpus helpers ────────────────────────────────
|
| 10 |
+
|
| 11 |
+
let _corporaCache = null;
|
| 12 |
+
|
| 13 |
+
/**
|
| 14 |
+
* Returns the list of available corpora from corpora.json.
|
| 15 |
+
* Cached after first load.
|
| 16 |
+
*/
|
| 17 |
+
export function getCorpora() {
|
| 18 |
+
if (_corporaCache) return _corporaCache;
|
| 19 |
+
const filePath = path.join(process.cwd(), 'annotation_data', 'corpora.json');
|
| 20 |
+
_corporaCache = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
|
| 21 |
+
return _corporaCache;
|
| 22 |
+
}
|
| 23 |
+
|
| 24 |
+
/**
|
| 25 |
+
* Find a corpus by its ID (e.g. "wbg", "unhcr").
|
| 26 |
+
* Returns the default (first) corpus if corpusId is null/undefined/empty or does not match any registered corpus.
|
| 27 |
+
*/
|
| 28 |
+
export function getCorpus(corpusId) {
|
| 29 |
+
const corpora = getCorpora();
|
| 30 |
+
if (!corpusId) return corpora[0];
|
| 31 |
+
return corpora.find(c => c.id === corpusId) || corpora[0];
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
/**
|
| 35 |
+
* HF repo path for a corpus's PDF links file.
|
| 36 |
+
*/
|
| 37 |
+
export function getLinksRepoPath(corpus) {
|
| 38 |
+
return `annotation_data/${corpus.links_file}`;
|
| 39 |
+
}
|
| 40 |
+
|
| 41 |
+
/**
|
| 42 |
+
* HF repo path for a specific doc's raw JSON.
|
| 43 |
+
*/
|
| 44 |
+
export function getDocRepoPath(corpus, docIndex) {
|
| 45 |
+
return `annotation_data/${corpus.extractions_dir}/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
|
| 46 |
+
}
|
| 47 |
+
|
| 48 |
+
/**
|
| 49 |
+
* Local file path for a specific doc's raw JSON.
|
| 50 |
+
*/
|
| 51 |
+
export function getDocLocalPath(corpus, docIndex) {
|
| 52 |
+
return path.join(
|
| 53 |
+
process.cwd(),
|
| 54 |
+
'annotation_data', corpus.extractions_dir,
|
| 55 |
+
`doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
|
| 56 |
+
);
|
| 57 |
+
}
|
utils/storage.js
CHANGED
|
@@ -1,54 +1,32 @@
|
|
| 1 |
import fs from 'fs';
|
| 2 |
import path from 'path';
|
| 3 |
import { commit } from '@huggingface/hub';
|
| 4 |
-
import { HF_DATASET_ID, HF_DATASET_BASE_URL } from './config.js';
|
| 5 |
-
|
| 6 |
-
const getRootPath = () => process.cwd();
|
| 7 |
|
| 8 |
const isHFSpace = () => {
|
| 9 |
return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
|
| 10 |
};
|
| 11 |
|
| 12 |
-
/**
|
| 13 |
-
* Returns the local file path for a document's raw JSON
|
| 14 |
-
*/
|
| 15 |
-
function getDocFilePath(docIndex) {
|
| 16 |
-
return path.join(
|
| 17 |
-
getRootPath(),
|
| 18 |
-
'annotation_data', 'wbg_extractions',
|
| 19 |
-
`doc_${docIndex}`, 'raw', `doc_${docIndex}_direct_judged.jsonl`
|
| 20 |
-
);
|
| 21 |
-
}
|
| 22 |
-
|
| 23 |
-
/**
|
| 24 |
-
* Returns the HF repo path for a document's raw JSON
|
| 25 |
-
*/
|
| 26 |
-
function getDocRepoPath(docIndex) {
|
| 27 |
-
return `annotation_data/wbg_extractions/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
|
| 28 |
-
}
|
| 29 |
-
|
| 30 |
/**
|
| 31 |
* Reads the full document JSON (all pages) from local file
|
| 32 |
*/
|
| 33 |
-
function readDocLocal(docIndex) {
|
| 34 |
-
const filePath =
|
| 35 |
if (!fs.existsSync(filePath)) return null;
|
| 36 |
-
|
| 37 |
-
return JSON.parse(raw);
|
| 38 |
}
|
| 39 |
|
| 40 |
/**
|
| 41 |
* Writes the full document JSON (all pages) to local file
|
| 42 |
*/
|
| 43 |
-
function writeDocLocal(docIndex, pagesData) {
|
| 44 |
-
const filePath =
|
| 45 |
fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
|
| 46 |
-
console.log(`Saved doc_${docIndex}
|
| 47 |
}
|
| 48 |
|
| 49 |
/**
|
| 50 |
* Finds the page index in the pages array by page_number
|
| 51 |
-
* Uses document.pages[0] to match, consistent with the document/route.js API
|
| 52 |
*/
|
| 53 |
function findPageIndex(pagesData, pageNumber) {
|
| 54 |
return pagesData.findIndex(p => p.document?.pages?.[0] === pageNumber);
|
|
@@ -57,24 +35,25 @@ function findPageIndex(pagesData, pageNumber) {
|
|
| 57 |
/**
|
| 58 |
* Fetches the document JSON from HuggingFace
|
| 59 |
*/
|
| 60 |
-
async function fetchDocFromHF(docIndex) {
|
| 61 |
const token = process.env.HF_TOKEN;
|
| 62 |
-
const
|
|
|
|
| 63 |
const res = await fetch(url, {
|
| 64 |
headers: { 'Authorization': `Bearer ${token}` }
|
| 65 |
});
|
| 66 |
-
if (!res.ok) throw new Error(`Failed to fetch doc_${docIndex} from HF: ${res.status}`);
|
| 67 |
return res.json();
|
| 68 |
}
|
| 69 |
|
| 70 |
/**
|
| 71 |
* Commits the updated document JSON back to HuggingFace
|
| 72 |
*/
|
| 73 |
-
async function commitDocToHF(docIndex, pagesData, commitMessage) {
|
| 74 |
const token = process.env.HF_TOKEN;
|
| 75 |
if (!token) throw new Error("Missing HF_TOKEN");
|
| 76 |
|
| 77 |
-
const repoPath = getDocRepoPath(docIndex);
|
| 78 |
const content = JSON.stringify(pagesData, null, 2);
|
| 79 |
|
| 80 |
await commit({
|
|
@@ -93,15 +72,13 @@ async function commitDocToHF(docIndex, pagesData, commitMessage) {
|
|
| 93 |
// ─── Public API ────────────────────────────────────
|
| 94 |
|
| 95 |
/**
|
| 96 |
-
* Saves an annotation by appending it to the page's datasets array
|
| 97 |
-
*
|
| 98 |
-
*
|
| 99 |
-
* @param {Object} annotation - Must include document_index, page_number, and dataset fields
|
| 100 |
*/
|
| 101 |
export async function saveAnnotation(annotation) {
|
|
|
|
| 102 |
const { document_index: docIndex, page_number: pageNumber } = annotation;
|
| 103 |
|
| 104 |
-
// Build the dataset entry (strip routing fields β they stay at page/doc level)
|
| 105 |
const datasetEntry = {
|
| 106 |
dataset_name: annotation.dataset_name,
|
| 107 |
dataset_tag: annotation.dataset_tag,
|
|
@@ -122,33 +99,33 @@ export async function saveAnnotation(annotation) {
|
|
| 122 |
};
|
| 123 |
|
| 124 |
if (isHFSpace()) {
|
| 125 |
-
|
| 126 |
-
const pagesData = await fetchDocFromHF(docIndex);
|
| 127 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
| 128 |
-
if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);
|
| 129 |
|
| 130 |
pagesData[pageIdx].datasets.push(datasetEntry);
|
| 131 |
-
await commitDocToHF(docIndex, pagesData,
|
| 132 |
-
`Add
|
| 133 |
} else {
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
if (!pagesData) throw new Error(`doc_${docIndex}_direct_judged.jsonl not found locally`);
|
| 137 |
|
| 138 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
| 139 |
-
if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);
|
| 140 |
|
| 141 |
pagesData[pageIdx].datasets.push(datasetEntry);
|
| 142 |
-
writeDocLocal(docIndex, pagesData);
|
| 143 |
}
|
| 144 |
}
|
| 145 |
|
| 146 |
/**
|
| 147 |
-
* Deletes an annotation
|
| 148 |
*/
|
| 149 |
-
export async function deleteAnnotation(timestamp, docIndex, pageNumber) {
|
|
|
|
|
|
|
| 150 |
if (isHFSpace()) {
|
| 151 |
-
const pagesData = await fetchDocFromHF(docIndex);
|
| 152 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
| 153 |
if (pageIdx === -1) return false;
|
| 154 |
|
|
@@ -158,11 +135,11 @@ export async function deleteAnnotation(timestamp, docIndex, pageNumber) {
|
|
| 158 |
);
|
| 159 |
if (pagesData[pageIdx].datasets.length === before) return false;
|
| 160 |
|
| 161 |
-
await commitDocToHF(docIndex, pagesData,
|
| 162 |
-
`Delete annotation from doc_${docIndex} page ${pageNumber}`);
|
| 163 |
return true;
|
| 164 |
} else {
|
| 165 |
-
const pagesData = readDocLocal(docIndex);
|
| 166 |
if (!pagesData) return false;
|
| 167 |
|
| 168 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
|
@@ -174,17 +151,19 @@ export async function deleteAnnotation(timestamp, docIndex, pageNumber) {
|
|
| 174 |
);
|
| 175 |
if (pagesData[pageIdx].datasets.length === before) return false;
|
| 176 |
|
| 177 |
-
writeDocLocal(docIndex, pagesData);
|
| 178 |
return true;
|
| 179 |
}
|
| 180 |
}
|
| 181 |
|
| 182 |
/**
|
| 183 |
-
* Updates an annotation
|
| 184 |
*/
|
| 185 |
-
export async function updateAnnotation(timestamp, docIndex, pageNumber, updates) {
|
|
|
|
|
|
|
| 186 |
if (isHFSpace()) {
|
| 187 |
-
const pagesData = await fetchDocFromHF(docIndex);
|
| 188 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
| 189 |
if (pageIdx === -1) return null;
|
| 190 |
|
|
@@ -195,11 +174,11 @@ export async function updateAnnotation(timestamp, docIndex, pageNumber, updates)
|
|
| 195 |
...pagesData[pageIdx].datasets[dsIdx],
|
| 196 |
...updates
|
| 197 |
};
|
| 198 |
-
await commitDocToHF(docIndex, pagesData,
|
| 199 |
-
`Update annotation in doc_${docIndex} page ${pageNumber}`);
|
| 200 |
return pagesData[pageIdx].datasets[dsIdx];
|
| 201 |
} else {
|
| 202 |
-
const pagesData = readDocLocal(docIndex);
|
| 203 |
if (!pagesData) return null;
|
| 204 |
|
| 205 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
|
@@ -212,46 +191,50 @@ export async function updateAnnotation(timestamp, docIndex, pageNumber, updates)
|
|
| 212 |
...pagesData[pageIdx].datasets[dsIdx],
|
| 213 |
...updates
|
| 214 |
};
|
| 215 |
-
writeDocLocal(docIndex, pagesData);
|
| 216 |
return pagesData[pageIdx].datasets[dsIdx];
|
| 217 |
}
|
| 218 |
}
|
| 219 |
|
| 220 |
/**
|
| 221 |
-
* Retrieves all human annotations
|
| 222 |
-
* Scans all doc files and returns entries that have a timestamp (human-added).
|
| 223 |
*/
|
| 224 |
-
export async function getAnnotations(docIndex = null) {
|
| 225 |
-
const
|
| 226 |
-
|
| 227 |
-
|
| 228 |
const results = [];
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
| 243 |
-
|
| 244 |
-
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
}
|
| 251 |
}
|
|
|
|
|
|
|
| 252 |
}
|
| 253 |
-
} catch (e) {
|
| 254 |
-
console.error(`Error reading ${filePath}:`, e);
|
| 255 |
}
|
| 256 |
}
|
| 257 |
|
|
|
|
| 1 |
import fs from 'fs';
|
| 2 |
import path from 'path';
|
| 3 |
import { commit } from '@huggingface/hub';
|
| 4 |
+
import { HF_DATASET_ID, HF_DATASET_BASE_URL, getCorpus, getDocRepoPath, getDocLocalPath } from './config.js';
|
|
|
|
|
|
|
| 5 |
|
| 6 |
/**
 * Tells whether storage should go through HuggingFace instead of local files.
 *
 * True only when an HF_TOKEN is configured and NODE_ENV is not 'development'.
 * The token is coerced to a boolean so the secret value itself is never
 * handed back to (and possibly logged by) callers.
 *
 * @returns {boolean}
 */
const isHFSpace = () => {
  const hasToken = Boolean(process.env.HF_TOKEN);
  return hasToken && process.env.NODE_ENV !== 'development';
};
|
| 9 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
/**
 * Reads the full document JSON (all pages) from local file.
 *
 * @param {Object} corpus - Corpus entry, passed through to getDocLocalPath.
 * @param {number|string} docIndex - Document index, passed through to getDocLocalPath.
 * @returns {*} The parsed JSON content, or null when the file does not exist.
 * @throws {SyntaxError} When the file content is not valid JSON.
 * @throws {Error} When the file exists but cannot be read.
 */
function readDocLocal(corpus, docIndex) {
  const filePath = getDocLocalPath(corpus, docIndex);

  let raw;
  try {
    // Read directly instead of existsSync + readFileSync: the file could
    // disappear between the check and the read.
    raw = fs.readFileSync(filePath, 'utf-8');
  } catch (err) {
    // A missing file (or missing parent path) is the expected "no document"
    // case; any other read failure is rethrown untouched.
    if (err.code === 'ENOENT' || err.code === 'ENOTDIR') return null;
    throw err;
  }

  return JSON.parse(raw);
}
|
| 18 |
|
| 19 |
/**
 * Persists a document (all of its pages) to its local file as
 * pretty-printed JSON with 2-space indentation, then logs the save.
 *
 * @param {Object} corpus - Corpus entry; its `id` appears in the log line.
 * @param {number|string} docIndex - Document index used to resolve the path.
 * @param {*} pagesData - Value to serialize and write.
 */
function writeDocLocal(corpus, docIndex, pagesData) {
  const target = getDocLocalPath(corpus, docIndex);
  const serialized = JSON.stringify(pagesData, null, 2);

  fs.writeFileSync(target, serialized);
  console.log(`Saved doc_${docIndex} locally (${corpus.id})`);
}
|
| 27 |
|
| 28 |
/**
|
| 29 |
* Finds the page index in the pages array by page_number
|
|
|
|
| 30 |
*/
|
| 31 |
function findPageIndex(pagesData, pageNumber) {
|
| 32 |
return pagesData.findIndex(p => p.document?.pages?.[0] === pageNumber);
|
|
|
|
| 35 |
/**
 * Fetches the document JSON from HuggingFace.
 *
 * @param {Object} corpus - Corpus entry; passed to getDocRepoPath, and its
 *   `id` is included in the error message.
 * @param {number|string} docIndex - Document index used to resolve the repo path.
 * @returns {Promise<*>} The response body parsed as JSON.
 * @throws {Error} When HF_TOKEN is not set, or when the response is not OK.
 */
async function fetchDocFromHF(corpus, docIndex) {
  const token = process.env.HF_TOKEN;
  // Same guard as commitDocToHF: without it the request would be sent with
  // the literal header "Bearer undefined".
  if (!token) throw new Error("Missing HF_TOKEN");

  const repoPath = getDocRepoPath(corpus, docIndex);
  const url = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
  const res = await fetch(url, {
    headers: { 'Authorization': `Bearer ${token}` }
  });
  if (!res.ok) throw new Error(`Failed to fetch doc_${docIndex} (${corpus.id}) from HF: ${res.status}`);
  return res.json();
}
|
| 48 |
|
| 49 |
/**
|
| 50 |
* Commits the updated document JSON back to HuggingFace
|
| 51 |
*/
|
| 52 |
+
async function commitDocToHF(corpus, docIndex, pagesData, commitMessage) {
|
| 53 |
const token = process.env.HF_TOKEN;
|
| 54 |
if (!token) throw new Error("Missing HF_TOKEN");
|
| 55 |
|
| 56 |
+
const repoPath = getDocRepoPath(corpus, docIndex);
|
| 57 |
const content = JSON.stringify(pagesData, null, 2);
|
| 58 |
|
| 59 |
await commit({
|
|
|
|
| 72 |
// ─── Public API ────────────────────────────────────
|
| 73 |
|
| 74 |
/**
|
| 75 |
+
* Saves an annotation by appending it to the page's datasets array.
|
| 76 |
+
* @param {Object} annotation - Must include document_index and page_number; corpus is optional (defaults to first)
|
|
|
|
|
|
|
| 77 |
*/
|
| 78 |
export async function saveAnnotation(annotation) {
|
| 79 |
+
const corpus = getCorpus(annotation.corpus);
|
| 80 |
const { document_index: docIndex, page_number: pageNumber } = annotation;
|
| 81 |
|
|
|
|
| 82 |
const datasetEntry = {
|
| 83 |
dataset_name: annotation.dataset_name,
|
| 84 |
dataset_tag: annotation.dataset_tag,
|
|
|
|
| 99 |
};
|
| 100 |
|
| 101 |
if (isHFSpace()) {
|
| 102 |
+
const pagesData = await fetchDocFromHF(corpus, docIndex);
|
|
|
|
| 103 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
| 104 |
+
if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex} (${corpus.id})`);
|
| 105 |
|
| 106 |
pagesData[pageIdx].datasets.push(datasetEntry);
|
| 107 |
+
await commitDocToHF(corpus, docIndex, pagesData,
|
| 108 |
+
`Add annotation to ${corpus.id}/doc_${docIndex} page ${pageNumber}`);
|
| 109 |
} else {
|
| 110 |
+
const pagesData = readDocLocal(corpus, docIndex);
|
| 111 |
+
if (!pagesData) throw new Error(`doc_${docIndex} not found locally (${corpus.id})`);
|
|
|
|
| 112 |
|
| 113 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
| 114 |
+
if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex} (${corpus.id})`);
|
| 115 |
|
| 116 |
pagesData[pageIdx].datasets.push(datasetEntry);
|
| 117 |
+
writeDocLocal(corpus, docIndex, pagesData);
|
| 118 |
}
|
| 119 |
}
|
| 120 |
|
| 121 |
/**
|
| 122 |
+
* Deletes an annotation by timestamp
|
| 123 |
*/
|
| 124 |
+
export async function deleteAnnotation(timestamp, docIndex, pageNumber, corpusId) {
|
| 125 |
+
const corpus = getCorpus(corpusId);
|
| 126 |
+
|
| 127 |
if (isHFSpace()) {
|
| 128 |
+
const pagesData = await fetchDocFromHF(corpus, docIndex);
|
| 129 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
| 130 |
if (pageIdx === -1) return false;
|
| 131 |
|
|
|
|
| 135 |
);
|
| 136 |
if (pagesData[pageIdx].datasets.length === before) return false;
|
| 137 |
|
| 138 |
+
await commitDocToHF(corpus, docIndex, pagesData,
|
| 139 |
+
`Delete annotation from ${corpus.id}/doc_${docIndex} page ${pageNumber}`);
|
| 140 |
return true;
|
| 141 |
} else {
|
| 142 |
+
const pagesData = readDocLocal(corpus, docIndex);
|
| 143 |
if (!pagesData) return false;
|
| 144 |
|
| 145 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
|
|
|
| 151 |
);
|
| 152 |
if (pagesData[pageIdx].datasets.length === before) return false;
|
| 153 |
|
| 154 |
+
writeDocLocal(corpus, docIndex, pagesData);
|
| 155 |
return true;
|
| 156 |
}
|
| 157 |
}
|
| 158 |
|
| 159 |
/**
|
| 160 |
+
* Updates an annotation by timestamp
|
| 161 |
*/
|
| 162 |
+
export async function updateAnnotation(timestamp, docIndex, pageNumber, updates, corpusId) {
|
| 163 |
+
const corpus = getCorpus(corpusId);
|
| 164 |
+
|
| 165 |
if (isHFSpace()) {
|
| 166 |
+
const pagesData = await fetchDocFromHF(corpus, docIndex);
|
| 167 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
| 168 |
if (pageIdx === -1) return null;
|
| 169 |
|
|
|
|
| 174 |
...pagesData[pageIdx].datasets[dsIdx],
|
| 175 |
...updates
|
| 176 |
};
|
| 177 |
+
await commitDocToHF(corpus, docIndex, pagesData,
|
| 178 |
+
`Update annotation in ${corpus.id}/doc_${docIndex} page ${pageNumber}`);
|
| 179 |
return pagesData[pageIdx].datasets[dsIdx];
|
| 180 |
} else {
|
| 181 |
+
const pagesData = readDocLocal(corpus, docIndex);
|
| 182 |
if (!pagesData) return null;
|
| 183 |
|
| 184 |
const pageIdx = findPageIndex(pagesData, pageNumber);
|
|
|
|
| 191 |
...pagesData[pageIdx].datasets[dsIdx],
|
| 192 |
...updates
|
| 193 |
};
|
| 194 |
+
writeDocLocal(corpus, docIndex, pagesData);
|
| 195 |
return pagesData[pageIdx].datasets[dsIdx];
|
| 196 |
}
|
| 197 |
}
|
| 198 |
|
| 199 |
/**
|
| 200 |
+
* Retrieves all human annotations from local files.
|
|
|
|
| 201 |
*/
|
| 202 |
+
export async function getAnnotations(docIndex = null, corpusId = null) {
|
| 203 |
+
const { getCorpora } = await import('./config.js');
|
| 204 |
+
const corporaList = corpusId ? [getCorpus(corpusId)] : getCorpora();
|
|
|
|
| 205 |
const results = [];
|
| 206 |
+
|
| 207 |
+
for (const corpus of corporaList) {
|
| 208 |
+
const extractionsDir = path.join(process.cwd(), 'annotation_data', corpus.extractions_dir);
|
| 209 |
+
if (!fs.existsSync(extractionsDir)) continue;
|
| 210 |
+
|
| 211 |
+
const dirs = fs.readdirSync(extractionsDir).filter(d => d.startsWith('doc_'));
|
| 212 |
+
|
| 213 |
+
for (const dir of dirs) {
|
| 214 |
+
const idx = parseInt(dir.replace('doc_', ''), 10);
|
| 215 |
+
if (docIndex !== null && idx !== docIndex) continue;
|
| 216 |
+
|
| 217 |
+
const filePath = path.join(extractionsDir, dir, 'raw', `${dir}_direct_judged.jsonl`);
|
| 218 |
+
if (!fs.existsSync(filePath)) continue;
|
| 219 |
+
|
| 220 |
+
try {
|
| 221 |
+
const pagesData = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
|
| 222 |
+
for (const page of pagesData) {
|
| 223 |
+
const pageNum = page.document?.pages?.[0];
|
| 224 |
+
for (const ds of (page.datasets || [])) {
|
| 225 |
+
if (ds.annotator) {
|
| 226 |
+
results.push({
|
| 227 |
+
...ds,
|
| 228 |
+
corpus: corpus.id,
|
| 229 |
+
document_index: idx,
|
| 230 |
+
page_number: pageNum,
|
| 231 |
+
});
|
| 232 |
+
}
|
| 233 |
}
|
| 234 |
}
|
| 235 |
+
} catch (e) {
|
| 236 |
+
console.error(`Error reading ${filePath}:`, e);
|
| 237 |
}
|
|
|
|
|
|
|
| 238 |
}
|
| 239 |
}
|
| 240 |
|