Spaces:
Running
Running
import yaml from 'js-yaml';

import {
  HF_DATASET_BASE_URL,
  MAX_DOCS_TO_SCAN,
  getCorpora,
  getCorpus,
  getDocRepoPath,
  getLinksRepoPath,
} from '../../../utils/config.js';
/**
 * Fetch annotator_config.yaml and return the doc list for a given user.
 *
 * Access is fail-closed: if the config cannot be fetched or parsed, the user
 * gets an empty assignment (sees no docs) rather than every document.
 *
 * @param {string|null|undefined} username - Annotator username to look up.
 * @returns {Promise<null|Object<string, Set>>}
 *   - `null` only if no username is provided (unauthenticated).
 *   - `{}` if the config is unavailable/invalid or the user is not in it.
 *   - `{ _flat: Set([...]) }` for the legacy flat-array format.
 *   - `{ wbg: Set([1,2]), unhcr: Set([3,4]) }` for per-corpus assignments.
 */
async function getUserAssignedDocs(username) {
  if (!username) return null;

  try {
    const configUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;
    const res = await fetch(configUrl, {
      headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
      cache: 'no-store'
    });
    if (!res.ok) {
      // config missing → block access, but leave a trace for operators
      console.warn(`Could not load annotator_config.yaml: HTTP ${res.status}`);
      return {};
    }

    // NOTE(review): yaml.load only uses a safe schema by default on
    // js-yaml >= 4 — confirm the installed version, since this parses
    // remotely hosted content.
    const config = yaml.load(await res.text());

    // The parsed document may be a non-object (e.g. an empty file), and the
    // list may contain malformed entries; neither should lock out valid users.
    const annotators = Array.isArray(config?.annotators) ? config.annotators : [];
    const annotator = annotators.find(a => a?.username === username);
    const docs = annotator?.docs;
    if (!docs) return {}; // user not in config → no docs

    // Support both old format (flat array) and new format (per-corpus object)
    if (Array.isArray(docs)) {
      // Legacy: flat array → treat as default corpus
      return { _flat: new Set(docs) };
    }

    // New format: { wbg: [1,2], unhcr: [3,4] }; non-array values are ignored
    const result = {};
    for (const [corpusId, docList] of Object.entries(docs)) {
      if (Array.isArray(docList)) {
        result[corpusId] = new Set(docList);
      }
    }
    return result; // may be empty {} if user has no corpus assignments
  } catch (e) {
    console.warn('Could not load annotator_config.yaml:', e.message);
    return {}; // on error, block access rather than show all
  }
}
/**
 * GET handler: list the documents a user may annotate, across all corpora.
 *
 * Query parameters:
 *   user - annotator username. When absent, getUserAssignedDocs yields null
 *          and no per-user filter is applied.
 *
 * For each corpus the links file is fetched, reduced to entries with
 * status 'success' and has_revalidation === true (and, if applicable, to the
 * user's assigned indices), capped at MAX_DOCS_TO_SCAN, and each remaining
 * doc file is fetched in parallel. A corpus whose links file is missing or
 * malformed is skipped with a warning; a doc that fails to load is dropped.
 *
 * @param {Request} request - Incoming request; only `request.url` is read.
 * @returns {Promise<Response>} 200 with a JSON array of
 *   { corpus, corpus_name, index, pdf_url, landing_page, annotatable_pages,
 *   pages_with_mentions }, or 500 with { error } on an unexpected failure.
 */
export async function GET(request) {
  try {
    const { searchParams } = new URL(request.url);
    const username = searchParams.get('user');
    const assignedDocs = await getUserAssignedDocs(username);

    const corpora = getCorpora();
    const authHeaders = { 'Authorization': `Bearer ${process.env.HF_TOKEN}` };
    const allDocuments = [];

    for (const corpus of corpora) {
      // Determine which doc indices this user has for this corpus
      let userDocSet = null;
      if (assignedDocs) {
        if (assignedDocs._flat) {
          // Legacy flat format → only applies to first/default corpus
          userDocSet = corpus === corpora[0] ? assignedDocs._flat : new Set();
        } else {
          // Require a real Set so inherited object keys can never pass as an assignment
          const assigned = assignedDocs[corpus.id];
          userDocSet = assigned instanceof Set ? assigned : new Set();
        }
        if (userDocSet.size === 0) continue; // no docs for this corpus
      }

      // Fetch the links file for this corpus
      const linksPath = getLinksRepoPath(corpus);
      const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
      const linksRes = await fetch(linksUrl, {
        headers: authHeaders,
        cache: 'no-store'
      });
      if (!linksRes.ok) {
        console.warn(`No links file for corpus ${corpus.id}: ${linksRes.status}`);
        continue;
      }

      // A malformed links file is treated like a missing one: skip this
      // corpus instead of failing the whole request.
      let links;
      try {
        links = await linksRes.json();
      } catch (e) {
        console.warn(`Invalid links file for corpus ${corpus.id}: ${e.message}`);
        continue;
      }
      if (!Array.isArray(links)) {
        console.warn(`Invalid links file for corpus ${corpus.id}: expected an array`);
        continue;
      }

      let successLinks = links
        .filter(l => l?.status === 'success' && l.has_revalidation === true);
      if (userDocSet) {
        successLinks = successLinks.filter(l => userDocSet.has(l.index));
      }
      successLinks = successLinks.slice(0, MAX_DOCS_TO_SCAN);

      // Parallel fetch docs
      const results = await Promise.allSettled(
        successLinks.map(async (link) => {
          // A doc without a PDF URL is never listed, so skip its fetch entirely
          const pdfUrl = link.direct_pdf_url;
          if (!pdfUrl) return null;

          const docRepoPath = getDocRepoPath(corpus, link.index);
          const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
          const docRes = await fetch(docUrl, { headers: authHeaders });
          if (!docRes.ok) return null;

          const pagesData = await docRes.json();
          // Each record contributes the first entry of its document.pages
          // (presumably the page number — confirm against the data schema).
          const allPages = pagesData
            .map(page => page.document.pages[0]);
          const pagesWithMentions = pagesData
            .filter(page => page.datasets && page.datasets.length > 0)
            .map(page => page.document.pages[0]);
          if (allPages.length === 0) return null;

          return {
            corpus: corpus.id,
            corpus_name: corpus.name,
            index: link.index,
            pdf_url: pdfUrl,
            landing_page: link.landing_page_url,
            annotatable_pages: allPages,
            pages_with_mentions: pagesWithMentions
          };
        })
      );

      results.forEach((outcome, i) => {
        if (outcome.status === 'rejected') {
          // Best-effort: drop the doc, but do not hide the failure
          console.warn(
            `Skipping doc ${successLinks[i].index} in corpus ${corpus.id}: ${outcome.reason?.message ?? outcome.reason}`
          );
          return;
        }
        if (outcome.value !== null) allDocuments.push(outcome.value);
      });
    }

    // NOTE(review): the payload varies by ?user= yet is marked shared-cacheable;
    // confirm that any cache in front of this route keys on the query string.
    return new Response(JSON.stringify(allDocuments), {
      status: 200,
      headers: {
        'Content-Type': 'application/json',
        'Cache-Control': 'public, s-maxage=300, stale-while-revalidate=59'
      }
    });
  } catch (error) {
    console.error(error);
    return new Response(
      JSON.stringify({ error: "Failed to fetch documents" }),
      { status: 500, headers: { 'Content-Type': 'application/json' } }
    );
  }
}