File size: 1,825 Bytes
a2c885c
 
 
da957b0
 
 
d140e69
a2c885c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import fs from 'fs';
import path from 'path';

// Centralized configuration for the annotation app.

// Used for MAX_DOCS_TO_SCAN when the environment variable is unset, empty, or not numeric.
const DEFAULT_MAX_DOCS_TO_SCAN = 50;

/**
 * Parse a base-10 integer from an environment-style string value.
 *
 * @param {string | undefined} rawValue - Raw value (e.g. read from process.env).
 * @param {number} fallback - Value returned when rawValue is missing, empty,
 *   or does not start with a parseable integer.
 * @returns {number} The parsed integer, or `fallback`.
 */
function parseIntegerSetting(rawValue, fallback) {
    const parsed = Number.parseInt(rawValue || '', 10);
    // Guard against NaN so a typo in the environment cannot leak NaN into callers.
    return Number.isNaN(parsed) ? fallback : parsed;
}

// Hugging Face dataset repo ID; overridable through the HF_DATASET_REPO env var.
export const HF_DATASET_ID = process.env.HF_DATASET_REPO || 'ai4data/annotation_data';
// Base URL of the dataset on huggingface.co, derived from HF_DATASET_ID.
export const HF_DATASET_BASE_URL = `https://huggingface.co/datasets/${HF_DATASET_ID}`;
// Integer limit read from the MAX_DOCS_TO_SCAN env var (defaults to 50).
export const MAX_DOCS_TO_SCAN = parseIntegerSetting(
    process.env.MAX_DOCS_TO_SCAN,
    DEFAULT_MAX_DOCS_TO_SCAN
);

// ─── Corpus helpers ────────────────────────────────

// Module-level memo for the parsed corpora.json contents (null until first load).
let _corporaCache = null;

/**
 * Load the corpus definitions from `annotation_data/corpora.json`, resolved
 * against the current working directory.
 *
 * The file is read synchronously and parsed on the first call; later calls
 * return the same cached value without touching the disk again.
 *
 * @returns {*} The parsed JSON content (presumably an array of corpus
 *   objects, since callers index into it — confirm against corpora.json).
 * @throws {Error} If the file cannot be read or does not contain valid JSON.
 */
export function getCorpora() {
    if (!_corporaCache) {
        const corporaFile = path.join(process.cwd(), 'annotation_data', 'corpora.json');
        const rawJson = fs.readFileSync(corporaFile, 'utf-8');
        _corporaCache = JSON.parse(rawJson);
    }
    return _corporaCache;
}

/**
 * Look up a corpus by its ID (e.g. "wbg", "unhcr").
 *
 * Falls back to the default corpus — the first entry returned by
 * getCorpora() — when corpusId is falsy (null, undefined, empty string)
 * or when no corpus has a matching `id`.
 *
 * @param {string} [corpusId] - ID to match against each corpus's `id` field.
 * @returns {*} The matching corpus entry, or the first entry as a fallback.
 */
export function getCorpus(corpusId) {
    const available = getCorpora();
    const fallback = available[0];
    if (!corpusId) {
        return fallback;
    }
    const match = available.find((entry) => entry.id === corpusId);
    return match || fallback;
}

/**
 * Build the HF repo path of a corpus's PDF links file.
 *
 * @param {{ links_file: string }} corpus - Corpus entry; only `links_file` is read.
 * @returns {string} `annotation_data/<links_file>`.
 */
export function getLinksRepoPath(corpus) {
    const { links_file: linksFile } = corpus;
    return `annotation_data/${linksFile}`;
}

/**
 * Build the HF repo path of a document's raw judged-extraction file (JSONL).
 *
 * @param {{ extractions_dir: string }} corpus - Corpus entry; only `extractions_dir` is read.
 * @param {number|string} docIndex - Document index, interpolated into the `doc_<index>` names.
 * @returns {string} `annotation_data/<extractions_dir>/doc_<index>/raw/doc_<index>_direct_judged.jsonl`.
 */
export function getDocRepoPath(corpus, docIndex) {
    // The same `doc_<index>` stem names both the directory and the file.
    const docName = `doc_${docIndex}`;
    const docDir = `annotation_data/${corpus.extractions_dir}/${docName}`;
    return `${docDir}/raw/${docName}_direct_judged.jsonl`;
}

/**
 * Build the local filesystem path of a document's raw judged-extraction file (JSONL).
 *
 * The path is rooted at the current working directory and joined with the
 * platform's path separator.
 *
 * @param {{ extractions_dir: string }} corpus - Corpus entry; only `extractions_dir` is read.
 * @param {number|string} docIndex - Document index, interpolated into the `doc_<index>` names.
 * @returns {string} `<cwd>/annotation_data/<extractions_dir>/doc_<index>/raw/doc_<index>_direct_judged.jsonl`.
 */
export function getDocLocalPath(corpus, docIndex) {
    // The same `doc_<index>` stem names both the directory and the file.
    const docName = `doc_${docIndex}`;
    const segments = [
        process.cwd(),
        'annotation_data',
        corpus.extractions_dir,
        docName,
        'raw',
        `${docName}_direct_judged.jsonl`,
    ];
    return path.join(...segments);
}