// Scraped page header (not code): rafmacalaba — "feat: multi-corpus support" — a2c885c
import fs from 'fs';
import path from 'path';
// Centralized configuration for the annotation app.

/**
 * Parse a base-10 integer from an environment-style string.
 *
 * @param {string|undefined} rawValue - Raw value, e.g. read from `process.env`.
 * @param {number} fallback - Value returned when `rawValue` does not parse to a number.
 * @returns {number} The parsed integer, or `fallback` when parsing yields NaN.
 */
function parseIntOrDefault(rawValue, fallback) {
  const parsed = Number.parseInt(rawValue, 10);
  // A missing, empty, or non-numeric value parses to NaN; never let NaN leak
  // into the exported configuration.
  return Number.isNaN(parsed) ? fallback : parsed;
}

/** Hugging Face dataset repo ID; overridable via the HF_DATASET_REPO env var. */
export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';

/** Base URL of the dataset page on huggingface.co, derived from HF_DATASET_ID. */
export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;

/**
 * Value of the MAX_DOCS_TO_SCAN env var parsed as a base-10 integer.
 * Defaults to 50 when the variable is unset, empty, or not a number.
 */
export const MAX_DOCS_TO_SCAN = parseIntOrDefault(process.env.MAX_DOCS_TO_SCAN, 50);
// ─── Corpus helpers ────────────────────────────────

// Parsed contents of corpora.json, kept in memory once loaded.
let _corporaCache = null;

/**
 * Load the corpus definitions from `annotation_data/corpora.json`, resolved
 * against the current working directory.
 *
 * The file is read synchronously and parsed as JSON the first time this is
 * called; while the cached value is truthy, later calls return that same
 * parsed value without touching the disk again.
 *
 * @returns {*} The parsed content of corpora.json (presumably an array of
 *   corpus objects — confirm against the data file).
 * @throws If the file cannot be read or does not contain valid JSON.
 */
export function getCorpora() {
  if (!_corporaCache) {
    const corporaFile = path.join(process.cwd(), 'annotation_data', 'corpora.json');
    const rawJson = fs.readFileSync(corporaFile, 'utf-8');
    _corporaCache = JSON.parse(rawJson);
  }
  return _corporaCache;
}
/**
 * Look up a corpus by its ID (e.g. "wbg", "unhcr").
 *
 * Falls back to the first entry returned by `getCorpora()` when `corpusId`
 * is falsy (null, undefined, empty string) or when no corpus has that ID.
 *
 * @param {string} [corpusId] - ID compared against each corpus's `id` field.
 * @returns {*} The matching corpus, otherwise the first corpus in the list.
 */
export function getCorpus(corpusId) {
  const available = getCorpora();
  if (corpusId) {
    const match = available.find((corpus) => corpus.id === corpusId);
    if (match) {
      return match;
    }
  }
  // No ID given, or nothing matched: use the default (first) corpus.
  return available[0];
}
/**
 * Build the HF repo path of a corpus's PDF links file.
 *
 * @param {{links_file: string}} corpus - Corpus entry; only `links_file` is read.
 * @returns {string} `annotation_data/<links_file>`.
 */
export function getLinksRepoPath(corpus) {
  const { links_file: linksFile } = corpus;
  return `annotation_data/${linksFile}`;
}
/**
 * Build the HF repo path of a specific doc's raw JSONL file.
 *
 * @param {{extractions_dir: string}} corpus - Corpus entry; only `extractions_dir` is read.
 * @param {number|string} docIndex - Index interpolated into the `doc_<index>` names.
 * @returns {string} `annotation_data/<extractions_dir>/doc_<index>/raw/doc_<index>_direct_judged.jsonl`.
 */
export function getDocRepoPath(corpus, docIndex) {
  const docDir = `doc_${docIndex}`;
  const fileName = `${docDir}_direct_judged.jsonl`;
  return `annotation_data/${corpus.extractions_dir}/${docDir}/raw/${fileName}`;
}
/**
 * Build the local filesystem path of a specific doc's raw JSONL file,
 * rooted at the current working directory.
 *
 * @param {{extractions_dir: string}} corpus - Corpus entry; only `extractions_dir` is read.
 * @param {number|string} docIndex - Index interpolated into the `doc_<index>` names.
 * @returns {string} `<cwd>/annotation_data/<extractions_dir>/doc_<index>/raw/doc_<index>_direct_judged.jsonl`,
 *   joined with the platform's path separator.
 */
export function getDocLocalPath(corpus, docIndex) {
  const docDir = `doc_${docIndex}`;
  const segments = [
    process.cwd(),
    'annotation_data',
    corpus.extractions_dir,
    docDir,
    'raw',
    `${docDir}_direct_judged.jsonl`,
  ];
  return path.join(...segments);
}