File size: 5,997 Bytes
a2c885c
79ba9a0
da957b0
79ba9a0
 
0880d65
 
 
79ba9a0
 
 
 
 
 
 
 
a2c885c
79ba9a0
0880d65
79ba9a0
 
 
 
 
0880d65
79ba9a0
a2c885c
 
 
 
 
 
 
 
 
 
 
 
 
0880d65
79ba9a0
 
0880d65
79ba9a0
 
 
 
42df2a9
79ba9a0
 
 
 
 
a2c885c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9cc2491
a2c885c
 
 
 
 
 
 
 
 
 
 
 
9cc2491
a2c885c
9cc2491
a2c885c
 
9cc2491
a2c885c
 
 
9cc2491
a2c885c
 
 
 
 
 
 
 
 
 
 
 
 
 
e715c5d
 
 
a2c885c
 
 
e715c5d
a2c885c
 
 
 
 
 
 
 
 
 
e715c5d
 
a2c885c
 
 
7c5c449
a2c885c
 
 
9cc2491
a2c885c
 
42df2a9
a2c885c
5b87eae
 
 
79ba9a0
5b87eae
 
42df2a9
 
da957b0
a2c885c
da957b0
 
42df2a9
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
import yaml from 'js-yaml';
import {
    HF_DATASET_BASE_URL,
    MAX_DOCS_TO_SCAN,
    getCorpora,
    getCorpus,
    getLinksRepoPath,
    getDocRepoPath
} from '../../../utils/config.js';

/**
 * Load annotator_config.yaml from the dataset repo and work out which
 * documents the given annotator has been assigned.
 *
 * Possible results:
 *   - null  : no username was supplied (caller is unauthenticated).
 *   - {}    : the config could not be fetched/parsed, or the user has no
 *             entry / no docs in it. Failures deliberately resolve to "no
 *             documents" rather than "all documents".
 *   - { _flat: Set }            : legacy config where `docs` is a flat array.
 *   - { <corpusId>: Set, ... }  : per-corpus config, e.g.
 *                                 { wbg: Set([1,2]), unhcr: Set([3,4]) }.
 *
 * @param {string} username - annotator username to look up.
 * @returns {Promise<Object<string, Set>|null>} assignments as described above.
 */
async function getUserAssignedDocs(username) {
    if (!username) {
        return null;
    }

    try {
        const yamlUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;
        const response = await fetch(yamlUrl, {
            headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
            cache: 'no-store'
        });

        // Config file unavailable: the user gets nothing.
        if (!response.ok) {
            return {};
        }

        const rawYaml = await response.text();
        const parsed = yaml.load(rawYaml);

        const annotators = parsed.annotators || [];
        const entry = annotators.find(candidate => candidate.username === username);
        const assigned = entry ? entry.docs : undefined;

        // Unknown user, or a user without a docs section: nothing assigned.
        if (!assigned) {
            return {};
        }

        // Legacy layout: one flat list of indices, kept under the `_flat` key.
        if (Array.isArray(assigned)) {
            return { _flat: new Set(assigned) };
        }

        // Current layout: { corpusId: [indices] }. Non-array values are ignored,
        // so the result can legitimately be an empty object.
        return Object.entries(assigned).reduce((byCorpus, [corpusId, docList]) => {
            if (Array.isArray(docList)) {
                byCorpus[corpusId] = new Set(docList);
            }
            return byCorpus;
        }, {});
    } catch (e) {
        console.warn('Could not load annotator_config.yaml:', e.message);
        // Any failure (network, YAML parse, unexpected shape) blocks access.
        return {};
    }
}

/**
 * Decide which document indices a user may see inside one corpus.
 *
 * @param {Object<string, Set>|null} assignedDocs - value returned by getUserAssignedDocs().
 * @param {{id: string}} corpus - corpus currently being processed.
 * @param {Array} corpora - full corpus list; its first entry is the default corpus.
 * @returns {Set|null} null when no per-user filter applies (no username given),
 *   otherwise the (possibly empty) set of allowed indices.
 */
function resolveUserDocSet(assignedDocs, corpus, corpora) {
    if (!assignedDocs) return null;

    if (assignedDocs._flat) {
        // Legacy flat format — only applies to the first/default corpus.
        return corpus === corpora[0] ? assignedDocs._flat : new Set();
    }

    // Only accept real Sets so inherited Object.prototype members (e.g. a corpus
    // whose id is "constructor") are never mistaken for an assignment.
    const assigned = assignedDocs[corpus.id];
    return assigned instanceof Set ? assigned : new Set();
}

/**
 * Download one document's page data and shape it for the API response.
 *
 * @param {{id: string, name: string}} corpus - corpus the link belongs to.
 * @param {Object} link - entry from the corpus links file.
 * @returns {Promise<Object|null>} the document record, or null when the link has
 *   no direct PDF URL, the doc file cannot be fetched, or it has no pages.
 */
async function fetchDocument(corpus, link) {
    // Checked first so we never download a document we would discard anyway.
    const pdfUrl = link.direct_pdf_url;
    if (!pdfUrl) return null;

    const docRepoPath = getDocRepoPath(corpus, link.index);
    const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
    const docRes = await fetch(docUrl, {
        headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
    });
    if (!docRes.ok) return null;

    const pagesData = await docRes.json();
    const allPages = pagesData.map(page => page.document.pages[0]);
    if (allPages.length === 0) return null;

    const pagesWithMentions = pagesData
        .filter(page => page.datasets && page.datasets.length > 0)
        .map(page => page.document.pages[0]);

    return {
        corpus: corpus.id,
        corpus_name: corpus.name,
        index: link.index,
        pdf_url: pdfUrl,
        landing_page: link.landing_page_url,
        annotatable_pages: allPages,
        pages_with_mentions: pagesWithMentions
    };
}

/**
 * Load the links file of one corpus and fetch every eligible document in parallel.
 *
 * @param {{id: string, name: string}} corpus - corpus to scan.
 * @param {Set|null} userDocSet - allowed indices, or null for no per-user filter.
 * @returns {Promise<Array<Object>>} document records; empty when the links file is
 *   missing or is not an array.
 */
async function fetchCorpusDocuments(corpus, userDocSet) {
    const linksPath = getLinksRepoPath(corpus);
    const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
    const linksRes = await fetch(linksUrl, {
        headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
        cache: 'no-store'
    });

    if (!linksRes.ok) {
        console.warn(`No links file for corpus ${corpus.id}: ${linksRes.status}`);
        return [];
    }

    const links = await linksRes.json();
    if (!Array.isArray(links)) {
        // Treat a malformed links file like a missing one instead of failing
        // the whole request.
        console.warn(`Links file for corpus ${corpus.id} is not an array; skipping corpus`);
        return [];
    }

    // Order matters: the scan cap is applied after the status and user filters.
    const candidates = links
        .filter(l => l && l.status === 'success' && l.has_revalidation === true)
        .filter(l => !userDocSet || userDocSet.has(l.index))
        .slice(0, MAX_DOCS_TO_SCAN);

    const outcomes = await Promise.allSettled(
        candidates.map(link => fetchDocument(corpus, link))
    );

    const docs = [];
    outcomes.forEach((outcome, i) => {
        if (outcome.status === 'rejected') {
            // A single broken document must not hide the others, but it should
            // leave a trace in the logs.
            console.warn(
                `Failed to load doc ${candidates[i].index} for corpus ${corpus.id}:`,
                outcome.reason && outcome.reason.message
            );
        } else if (outcome.value !== null) {
            docs.push(outcome.value);
        }
    });
    return docs;
}

/**
 * GET handler: list the annotatable documents across all corpora, optionally
 * restricted to the documents assigned to the user named in `?user=`.
 *
 * NOTE(review): `user` is a caller-supplied query parameter that is not verified
 * here, and a request without it is not filtered at all (every document is
 * listed). Confirm that authentication is enforced upstream.
 *
 * @param {Request} request - incoming request; only `request.url` is read.
 * @returns {Promise<Response>} 200 with a JSON array of document records, or 500
 *   with `{ error }` when an unexpected failure occurs.
 */
export async function GET(request) {
    try {
        const { searchParams } = new URL(request.url);
        const username = searchParams.get('user');

        const assignedDocs = await getUserAssignedDocs(username);
        const corpora = getCorpora();

        const allDocuments = [];

        // Corpora are processed one after another; documents within a corpus
        // are fetched in parallel.
        for (const corpus of corpora) {
            const userDocSet = resolveUserDocSet(assignedDocs, corpus, corpora);
            if (userDocSet && userDocSet.size === 0) continue; // no docs for this corpus

            const docs = await fetchCorpusDocuments(corpus, userDocSet);
            allDocuments.push(...docs);
        }

        return new Response(JSON.stringify(allDocuments), {
            status: 200,
            headers: {
                'Content-Type': 'application/json',
                'Cache-Control': 'public, s-maxage=300, stale-while-revalidate=59'
            }
        });
    } catch (error) {
        console.error(error);
        return new Response(
            JSON.stringify({ error: "Failed to fetch documents" }),
            { status: 500, headers: { 'Content-Type': 'application/json' } }
        );
    }
}