import fs from 'fs';
import path from 'path';

// Centralized configuration for the annotation app.

/** Fallback for MAX_DOCS_TO_SCAN when the env var is unset or unusable. */
const DEFAULT_MAX_DOCS_TO_SCAN = 50;

/**
 * Parse a base-10 integer setting, falling back when the value is missing,
 * non-numeric, or negative.
 *
 * A bare `parseInt` returns NaN for input such as "abc"; a NaN limit makes
 * every `i < limit` comparison false, so the failure would be silent.
 *
 * @param {string | undefined} raw - Raw value (typically from process.env).
 * @param {number} fallback - Value returned when `raw` is not usable.
 * @returns {number} A non-negative integer, or `fallback`.
 */
function parseNonNegativeInt(raw, fallback) {
  const parsed = Number.parseInt(raw, 10);
  return Number.isInteger(parsed) && parsed >= 0 ? parsed : fallback;
}

// `||` (not `??`) is deliberate: an env var set to the empty string should
// also fall back to the default repo.
export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';
export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;
export const MAX_DOCS_TO_SCAN = parseNonNegativeInt(
  process.env.MAX_DOCS_TO_SCAN,
  DEFAULT_MAX_DOCS_TO_SCAN
);

// ─── Corpus helpers ────────────────────────────────

// Parsed contents of corpora.json; populated on the first getCorpora() call.
let _corporaCache = null;

/**
 * Parse the text of corpora.json and check that it holds a list.
 *
 * @param {string} text - Raw file contents.
 * @param {string} filePath - Path the text came from (used in error messages only).
 * @returns {Array<object>} The parsed corpus list.
 * @throws {SyntaxError} If `text` is not valid JSON.
 * @throws {TypeError} If the parsed JSON is not an array.
 */
function parseCorporaJson(text, filePath) {
  const parsed = JSON.parse(text);
  if (!Array.isArray(parsed)) {
    throw new TypeError(`Expected ${filePath} to contain a JSON array of corpora`);
  }
  return parsed;
}

/**
 * Returns the list of available corpora from
 * `<cwd>/annotation_data/corpora.json`.
 *
 * The file is read synchronously on the first call and the parsed list is
 * cached for the lifetime of the process. The cached array itself is
 * returned, so callers should not mutate it.
 *
 * @returns {Array<object>} The corpus list.
 * @throws {Error} If the file cannot be read (the fs error propagates as-is).
 * @throws {SyntaxError} If the file is not valid JSON.
 * @throws {TypeError} If the file does not contain a JSON array.
 */
export function getCorpora() {
  if (_corporaCache) return _corporaCache;

  const filePath = path.join(process.cwd(), 'annotation_data', 'corpora.json');
  // Validate before caching so a malformed file is never memoized.
  _corporaCache = parseCorporaJson(fs.readFileSync(filePath, 'utf-8'), filePath);
  return _corporaCache;
}

/**
 * Find a corpus by its ID (e.g. "wbg", "unhcr").
 *
 * Falls back to the default (first) corpus when `corpusId` is falsy
 * (null, undefined, or '') or when no corpus has a matching `id`.
 *
 * @param {string | null | undefined} corpusId - ID to look up.
 * @returns {object | undefined} The matching or default corpus; `undefined`
 *   only when the corpus list is empty.
 */
export function getCorpus(corpusId) {
  const corpora = getCorpora();
  const defaultCorpus = corpora[0];
  if (!corpusId) return defaultCorpus;
  return corpora.find((c) => c.id === corpusId) || defaultCorpus;
}

/**
 * HF repo path for a corpus's PDF links file.
 *
 * @param {{ links_file: string }} corpus - Corpus entry.
 * @returns {string} `annotation_data/<links_file>`.
 */
export function getLinksRepoPath(corpus) {
  return `annotation_data/${corpus.links_file}`;
}

/**
 * HF repo path for a specific doc's raw JSONL file.
 *
 * @param {{ extractions_dir: string }} corpus - Corpus entry.
 * @param {number | string} docIndex - Document index, interpolated verbatim.
 * @returns {string} Forward-slash repo path to `doc_<i>_direct_judged.jsonl`.
 */
export function getDocRepoPath(corpus, docIndex) {
  return `annotation_data/${corpus.extractions_dir}/doc_${docIndex}/raw/doc_${docIndex}_direct_judged.jsonl`;
}

/**
 * Local file path for a specific doc's raw JSONL file, rooted at the
 * current working directory.
 *
 * @param {{ extractions_dir: string }} corpus - Corpus entry.
 * @param {number | string} docIndex - Document index, interpolated verbatim.
 * @returns {string} Path under `<cwd>/annotation_data/` using the platform
 *   separator.
 */
export function getDocLocalPath(corpus, docIndex) {
  return path.join(
    process.cwd(),
    'annotation_data',
    corpus.extractions_dir,
    `doc_${docIndex}`,
    'raw',
    `doc_${docIndex}_direct_judged.jsonl`
  );
}