import fs from 'fs';
import path from 'path';
import { commit } from '@huggingface/hub';
import {
  HF_DATASET_ID,
  HF_DATASET_BASE_URL,
  getCorpus,
  getDocRepoPath,
  getDocLocalPath,
} from './config.js';

/**
 * Reports whether documents should be read from / written to the HuggingFace
 * dataset instead of local files.
 * @returns {boolean} true when HF_TOKEN is set and NODE_ENV is not 'development'
 */
const isHFSpace = () =>
  Boolean(process.env.HF_TOKEN) && process.env.NODE_ENV !== 'development';

/**
 * Reads the full document JSON (all pages) from the local file.
 * @returns {*} the parsed JSON, or null when the file does not exist
 * @throws {SyntaxError} when the file content is not valid JSON
 */
function readDocLocal(corpus, docIndex) {
  const filePath = getDocLocalPath(corpus, docIndex);
  if (!fs.existsSync(filePath)) return null;
  return JSON.parse(fs.readFileSync(filePath, 'utf-8'));
}

/**
 * Writes the full document JSON (all pages) to the local file,
 * pretty-printed with a 2-space indent, and logs the save.
 */
function writeDocLocal(corpus, docIndex, pagesData) {
  const filePath = getDocLocalPath(corpus, docIndex);
  fs.writeFileSync(filePath, JSON.stringify(pagesData, null, 2));
  console.log(`Saved doc_${docIndex} locally (${corpus.id})`);
}

/**
 * Finds the index of the page whose `document.pages[0]` equals pageNumber.
 * @returns {number} the index in pagesData, or -1 when no page matches
 */
function findPageIndex(pagesData, pageNumber) {
  return pagesData.findIndex((p) => p.document?.pages?.[0] === pageNumber);
}

/**
 * Returns the page object matching pageNumber, or null when pagesData is
 * empty/missing or no page matches.
 */
function findPage(pagesData, pageNumber) {
  if (!pagesData) return null;
  const pageIdx = findPageIndex(pagesData, pageNumber);
  return pageIdx === -1 ? null : pagesData[pageIdx];
}

/**
 * Issues an uncached GET for a file under `raw/main/` of the HF dataset,
 * authenticated with HF_TOKEN.
 * @param {string} repoPath - path of the file inside the dataset repo
 * @returns {Promise<Response>} the raw fetch response (status not checked here)
 */
function fetchRawFromHF(repoPath) {
  const url = `${HF_DATASET_BASE_URL}/raw/main/${repoPath}`;
  return fetch(url, {
    headers: { Authorization: `Bearer ${process.env.HF_TOKEN}` },
    cache: 'no-store',
  });
}

/**
 * Fetches the document JSON from HuggingFace.
 * @throws {Error} when the HTTP response is not OK
 */
async function fetchDocFromHF(corpus, docIndex) {
  const res = await fetchRawFromHF(getDocRepoPath(corpus, docIndex));
  if (!res.ok) {
    throw new Error(`Failed to fetch doc_${docIndex} (${corpus.id}) from HF: ${res.status}`);
  }
  return res.json();
}

/**
 * Commits the updated document JSON back to the HuggingFace dataset as a
 * single addOrUpdate operation.
 * @throws {Error} when HF_TOKEN is not set
 */
async function commitDocToHF(corpus, docIndex, pagesData, commitMessage) {
  const token = process.env.HF_TOKEN;
  if (!token) throw new Error('Missing HF_TOKEN');

  const repoPath = getDocRepoPath(corpus, docIndex);
  const content = JSON.stringify(pagesData, null, 2);

  await commit({
    repo: { type: 'dataset', name: HF_DATASET_ID },
    credentials: { accessToken: token },
    title: commitMessage,
    operations: [
      {
        operation: 'addOrUpdate',
        path: repoPath,
        content: new Blob([content], { type: 'application/json' }),
      },
    ],
  });
  console.log(`Committed ${repoPath} to HF dataset ${HF_DATASET_ID}`);
}

/**
 * Picks the load/save pair for the current environment. The environment is
 * inspected once so that a single operation reads and writes the same backend.
 * `save` is always called with (corpus, docIndex, pagesData, commitMessage);
 * the local writer simply ignores the commit message.
 */
function getDocStore() {
  return isHFSpace()
    ? { load: fetchDocFromHF, save: commitDocToHF }
    : { load: readDocLocal, save: writeDocLocal };
}

// ─── Public API ────────────────────────────────────

/**
 * Saves an annotation by appending it to the page's datasets array.
 * @param {Object} annotation - Must include document_index and page_number;
 *   corpus is passed to getCorpus as-is. Optional descriptive fields are
 *   stored as null when falsy.
 * @throws {Error} when the document or the page cannot be found
 */
export async function saveAnnotation(annotation) {
  const corpus = getCorpus(annotation.corpus);
  const { document_index: docIndex, page_number: pageNumber } = annotation;

  const datasetEntry = {
    dataset_name: annotation.dataset_name,
    dataset_tag: annotation.dataset_tag,
    source: annotation.source || 'human',
    annotator: annotation.annotator,
    timestamp: annotation.timestamp,
    description: annotation.description || null,
    data_type: annotation.data_type || null,
    acronym: annotation.acronym || null,
    author: annotation.author || null,
    producer: annotation.producer || null,
    geography: annotation.geography || null,
    publication_year: annotation.publication_year || null,
    reference_year: annotation.reference_year || null,
    reference_population: annotation.reference_population || null,
    // An explicit `false` must survive; `|| null` alone would erase it.
    // NOTE(review): assumes is_used may arrive as a boolean — confirm with callers.
    is_used: annotation.is_used === false ? false : annotation.is_used || null,
    usage_context: annotation.usage_context || null,
  };

  const store = getDocStore();
  const pagesData = await store.load(corpus, docIndex);
  if (!pagesData) throw new Error(`doc_${docIndex} not found locally (${corpus.id})`);

  const page = findPage(pagesData, pageNumber);
  if (!page) {
    throw new Error(`Page ${pageNumber} not found in doc_${docIndex} (${corpus.id})`);
  }

  // A page may have no datasets array yet (getAnnotations tolerates that too).
  page.datasets ??= [];
  page.datasets.push(datasetEntry);

  await store.save(
    corpus,
    docIndex,
    pagesData,
    `Add annotation to ${corpus.id}/doc_${docIndex} page ${pageNumber}`,
  );
}

/**
 * Deletes every dataset entry on the page whose timestamp matches.
 * @returns {Promise<boolean>} true when something was removed and persisted;
 *   false when the document, page, or a matching entry was not found
 */
export async function deleteAnnotation(timestamp, docIndex, pageNumber, corpusId) {
  const corpus = getCorpus(corpusId);
  const store = getDocStore();

  const pagesData = await store.load(corpus, docIndex);
  const page = findPage(pagesData, pageNumber);
  if (!page) return false;

  const existing = page.datasets ?? [];
  const remaining = existing.filter((ds) => ds.timestamp !== timestamp);
  if (remaining.length === existing.length) return false;

  page.datasets = remaining;
  await store.save(
    corpus,
    docIndex,
    pagesData,
    `Delete annotation from ${corpus.id}/doc_${docIndex} page ${pageNumber}`,
  );
  return true;
}

/**
 * Shallow-merges `updates` into the first dataset entry on the page whose
 * timestamp matches, then persists the document.
 * @returns {Promise<Object|null>} the merged entry, or null when the document,
 *   page, or entry was not found
 */
export async function updateAnnotation(timestamp, docIndex, pageNumber, updates, corpusId) {
  const corpus = getCorpus(corpusId);
  const store = getDocStore();

  const pagesData = await store.load(corpus, docIndex);
  const page = findPage(pagesData, pageNumber);
  if (!page) return null;

  const datasets = page.datasets ?? [];
  const dsIdx = datasets.findIndex((ds) => ds.timestamp === timestamp);
  if (dsIdx === -1) return null;

  const updated = { ...datasets[dsIdx], ...updates };
  datasets[dsIdx] = updated;

  await store.save(
    corpus,
    docIndex,
    pagesData,
    `Update annotation in ${corpus.id}/doc_${docIndex} page ${pageNumber}`,
  );
  return updated;
}

/**
 * Retrieves all dataset entries that carry a truthy `annotator` field,
 * walking every link with status 'success' in each corpus' links file.
 * @param {?number} docIndex - restrict to links whose index equals this value; null for all
 * @param {?string} corpusId - restrict to one corpus; falsy for all corpora
 * @returns {Promise<Object[]>} entries augmented with corpus, document_index and page_number
 */
export async function getAnnotations(docIndex = null, corpusId = null) {
  // Only the names not already imported statically are pulled in here.
  const { getCorpora, getLinksRepoPath } = await import('./config.js');
  const corporaList = corpusId ? [getCorpus(corpusId)] : getCorpora();
  const useHF = isHFSpace();
  const results = [];

  for (const corpus of corporaList) {
    let links = [];

    if (useHF) {
      try {
        const res = await fetchRawFromHF(getLinksRepoPath(corpus));
        if (res.ok) {
          links = await res.json();
        } else {
          console.warn(`Failed to fetch links file for ${corpus.id} from HF: ${res.status}`);
        }
      } catch (e) {
        console.warn(`Error fetching links file for ${corpus.id} in getAnnotations:`, e.message);
      }
    } else {
      const linksFile = path.join(process.cwd(), 'annotation_data', corpus.links_file);
      if (fs.existsSync(linksFile)) {
        links = JSON.parse(fs.readFileSync(linksFile, 'utf-8'));
      }
    }

    const targetLinks = links.filter(
      (l) => l.status === 'success' && (docIndex === null || l.index === docIndex),
    );

    for (const link of targetLinks) {
      try {
        let pagesData = null;

        if (useHF) {
          const res = await fetchRawFromHF(getDocRepoPath(corpus, link.index));
          if (res.ok) {
            pagesData = await res.json();
          } else {
            console.warn(`Failed to fetch doc_${link.index} (${corpus.id}) from HF: ${res.status}`);
          }
        } else {
          pagesData = readDocLocal(corpus, link.index);
        }

        if (!pagesData) continue;

        for (const page of pagesData) {
          const pageNum = page.document?.pages?.[0];
          for (const ds of page.datasets || []) {
            if (!ds.annotator) continue;
            results.push({
              ...ds,
              corpus: corpus.id,
              document_index: link.index,
              page_number: pageNum,
            });
          }
        }
      } catch (e) {
        // One unreadable document must not abort the whole listing.
        console.error(`Error loading annotations for doc_${link.index}:`, e);
      }
    }
  }

  return results;
}