Spaces:
Running
Running
| import fs from 'fs'; | |
| import path from 'path'; | |
// Centralized configuration for the annotation app.

/**
 * Parse a base-10 integer setting.
 *
 * @param {string | undefined} raw - Raw setting value (e.g. from process.env).
 * @param {number} fallback - Value returned when `raw` does not parse to an integer.
 * @returns {number} The parsed integer, or `fallback` when parsing yields NaN.
 */
function parseIntSetting(raw, fallback) {
  const parsed = Number.parseInt(raw, 10);
  // A typo such as MAX_DOCS_TO_SCAN=abc must not leak NaN into callers.
  return Number.isNaN(parsed) ? fallback : parsed;
}

// Dataset repo ID; overridable through the HF_DATASET_REPO environment variable.
export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';

// Base URL of the dataset on huggingface.co, derived from HF_DATASET_ID.
export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;

// Read from the MAX_DOCS_TO_SCAN environment variable; 50 when unset, empty, or not numeric.
export const MAX_DOCS_TO_SCAN = parseIntSetting(process.env.MAX_DOCS_TO_SCAN, 50);
| // ─── Corpus helpers ──────────────────────────────── | |
// Module-level memo of the parsed corpora.json contents (null until first load).
let _corporaCache = null;

/**
 * Load the corpus list from `annotation_data/corpora.json`, resolved against
 * the current working directory.
 *
 * The file is read and parsed synchronously on the first call; later calls
 * return the same parsed value without touching the disk again.
 * Read or parse errors propagate to the caller.
 */
export function getCorpora() {
  if (!_corporaCache) {
    const corporaFile = path.join(process.cwd(), 'annotation_data', 'corpora.json');
    const rawJson = fs.readFileSync(corporaFile, 'utf-8');
    _corporaCache = JSON.parse(rawJson);
  }
  return _corporaCache;
}
/**
 * Look up a corpus by its ID (e.g. "wbg", "unhcr").
 *
 * Falls back to the first entry returned by getCorpora() when `corpusId` is
 * falsy (null, undefined, empty string) or when no corpus has a matching `id`.
 */
export function getCorpus(corpusId) {
  const corpora = getCorpora();
  // Only search when an ID was actually supplied.
  const match = corpusId
    ? corpora.find((candidate) => candidate.id === corpusId)
    : undefined;
  return match || corpora[0];
}
/**
 * Build the HF repo path of a corpus's PDF links file.
 *
 * The result is `annotation_data/` followed by the corpus's `links_file` value.
 */
export function getLinksRepoPath(corpus) {
  const { links_file: linksFile } = corpus;
  return `annotation_data/${linksFile}`;
}
/**
 * Build the HF repo path of one document's raw `_direct_judged.jsonl` file.
 *
 * Layout: annotation_data/<extractions_dir>/doc_<i>/raw/doc_<i>_direct_judged.jsonl
 */
export function getDocRepoPath(corpus, docIndex) {
  const { extractions_dir: extractionsDir } = corpus;
  // The same doc_<i> stem names both the folder and the file.
  const docName = `doc_${docIndex}`;
  return `annotation_data/${extractionsDir}/${docName}/raw/${docName}_direct_judged.jsonl`;
}
/**
 * Build the local filesystem path of one document's raw `_direct_judged.jsonl` file.
 *
 * The path is joined from the current working directory:
 * <cwd>/annotation_data/<extractions_dir>/doc_<i>/raw/doc_<i>_direct_judged.jsonl
 */
export function getDocLocalPath(corpus, docIndex) {
  const docFolder = `doc_${docIndex}`;
  const fileName = `${docFolder}_direct_judged.jsonl`;
  const segments = [
    process.cwd(),
    'annotation_data',
    corpus.extractions_dir,
    docFolder,
    'raw',
    fileName,
  ];
  return path.join(...segments);
}