Spaces:
Running
Running
File size: 1,825 Bytes
a2c885c da957b0 d140e69 a2c885c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 | import fs from 'fs';
import path from 'path';
// Centralized configuration for the annotation app
/**
 * Parse a base-10, non-negative integer from a raw setting value.
 *
 * Guards against `parseInt` returning NaN (e.g. for "abc"), which would
 * otherwise leak into numeric comparisons and make them all false.
 *
 * @param {string | undefined} rawValue - Raw value, typically from process.env.
 * @param {number} fallback - Value returned when rawValue is missing, empty,
 *   non-numeric, negative, or not finite.
 * @returns {number} The parsed integer, or `fallback`.
 */
function parseNonNegativeInt(rawValue, fallback) {
  const parsed = Number.parseInt(rawValue, 10);
  return Number.isInteger(parsed) && parsed >= 0 ? parsed : fallback;
}

/** Dataset repo ID; taken from the HF_DATASET_REPO env var when set and non-empty. */
export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';

/** URL under huggingface.co/datasets built from HF_DATASET_ID. */
export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;

/**
 * Numeric limit read from the MAX_DOCS_TO_SCAN env var (default 50).
 * Invalid or negative values fall back to the default instead of yielding NaN.
 */
export const MAX_DOCS_TO_SCAN = parseNonNegativeInt(process.env.MAX_DOCS_TO_SCAN, 50);
// ─── Corpus helpers ────────────────────────────────
// Module-level memo for the parsed corpora.json contents; null until first load.
let _corporaCache = null;

/**
 * Load the corpus definitions from `annotation_data/corpora.json`, resolved
 * against the current working directory.
 *
 * The file is read and parsed synchronously on the first call; later calls
 * return the same cached value (the same reference, not a copy).
 *
 * @returns {*} Parsed contents of corpora.json (expected to be an array of
 *   corpus descriptors; the shape is not validated here).
 * @throws If the file cannot be read or does not contain valid JSON.
 */
export function getCorpora() {
  if (!_corporaCache) {
    const corporaFile = path.join(process.cwd(), 'annotation_data', 'corpora.json');
    const rawJson = fs.readFileSync(corporaFile, 'utf-8');
    _corporaCache = JSON.parse(rawJson);
  }
  return _corporaCache;
}
/**
 * Look up a corpus by its ID (e.g. "wbg", "unhcr").
 *
 * Falls back to the first entry returned by getCorpora() when `corpusId` is
 * falsy (null, undefined, empty string) or when no corpus has a matching `id`.
 *
 * @param {string} [corpusId] - ID to match against each corpus's `id` field.
 * @returns {*} The matching corpus, otherwise the first corpus (undefined if
 *   the corpus list is empty).
 */
export function getCorpus(corpusId) {
  const available = getCorpora();
  const fallback = available[0];
  const match = corpusId
    ? available.find((entry) => entry.id === corpusId)
    : undefined;
  return match || fallback;
}
/**
 * Build the repo-relative path of a corpus's links file.
 *
 * @param {{ links_file: string }} corpus - Corpus descriptor; only
 *   `links_file` is read.
 * @returns {string} `annotation_data/<links_file>`.
 */
export function getLinksRepoPath(corpus) {
  const { links_file: linksFile } = corpus;
  return `annotation_data/${linksFile}`;
}
/**
 * Build the repo-relative path of a document's `_direct_judged.jsonl` file.
 *
 * @param {{ extractions_dir: string }} corpus - Corpus descriptor; only
 *   `extractions_dir` is read.
 * @param {number|string} docIndex - Document index, interpolated as-is.
 * @returns {string}
 *   `annotation_data/<extractions_dir>/doc_<i>/raw/doc_<i>_direct_judged.jsonl`.
 */
export function getDocRepoPath(corpus, docIndex) {
  const docName = `doc_${docIndex}`;
  return `annotation_data/${corpus.extractions_dir}/${docName}/raw/${docName}_direct_judged.jsonl`;
}
/**
 * Build the local filesystem path of a document's `_direct_judged.jsonl`
 * file, rooted at the current working directory.
 *
 * @param {{ extractions_dir: string }} corpus - Corpus descriptor; only
 *   `extractions_dir` is read.
 * @param {number|string} docIndex - Document index, interpolated as-is.
 * @returns {string} Path of the form
 *   `<cwd>/annotation_data/<extractions_dir>/doc_<i>/raw/doc_<i>_direct_judged.jsonl`,
 *   joined with the platform's separator.
 */
export function getDocLocalPath(corpus, docIndex) {
  const docName = `doc_${docIndex}`;
  const segments = [
    process.cwd(),
    'annotation_data',
    corpus.extractions_dir,
    docName,
    'raw',
    `${docName}_direct_judged.jsonl`,
  ];
  return path.join(...segments);
}
|