// Source page header (not code): rafmacalaba's picture
// feat: show all pages with jump-to-mention navigation
// e715c5d
import yaml from 'js-yaml';
import {
  HF_DATASET_BASE_URL,
  MAX_DOCS_TO_SCAN,
  getCorpora,
  getCorpus,
  getDocRepoPath,
  getLinksRepoPath,
} from '../../../utils/config.js';
/**
 * Fetch annotator_config.yaml and return the documents assigned to a user.
 *
 * @param {string|null|undefined} username - Annotator username to look up.
 * @returns {Promise<null|Object<string, Set>>}
 *   - `null` only when no username is provided (unauthenticated).
 *   - `{}` when the config cannot be fetched or parsed, or when the user is
 *     not listed / has no assignments (the user sees no docs).
 *   - `{ _flat: Set }` for the legacy flat-array format.
 *   - `{ [corpusId]: Set }` for the per-corpus format,
 *     e.g. `{ wbg: Set([1,2]), unhcr: Set([3,4]) }`.
 */
async function getUserAssignedDocs(username) {
  if (!username) return null;

  try {
    const configUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`;
    const res = await fetch(configUrl, {
      headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
      cache: 'no-store',
    });
    if (!res.ok) {
      // Config missing or unreadable — block access, but leave a trace in the logs.
      console.warn(`Could not fetch annotator_config.yaml: ${res.status}`);
      return {};
    }

    // NOTE(review): the config is remote data. js-yaml 4.x `load` is documented
    // as safe by default; on 3.x `safeLoad` would be the safe call — confirm
    // the installed version.
    const config = yaml.load(await res.text());

    // An empty or scalar YAML document does not parse to an object, and
    // `annotators` may be absent or malformed. Treat all of these as
    // "nobody is assigned anything" instead of relying on a thrown TypeError.
    const annotators = Array.isArray(config?.annotators) ? config.annotators : [];
    const annotator = annotators.find((entry) => entry?.username === username);
    const docs = annotator?.docs;
    if (!docs) return {}; // user not in config (or no docs key) — no docs

    // Support both old format (flat array) and new format (per-corpus object).
    if (Array.isArray(docs)) {
      // Legacy: flat array — the caller applies it to the default corpus.
      return { _flat: new Set(docs) };
    }

    // New format: { wbg: [1,2], unhcr: [3,4] }. Non-array values are ignored,
    // so the result may be an empty {} if the user has no corpus assignments.
    return Object.fromEntries(
      Object.entries(docs)
        .filter(([, docList]) => Array.isArray(docList))
        .map(([corpusId, docList]) => [corpusId, new Set(docList)]),
    );
  } catch (e) {
    console.warn('Could not load annotator_config.yaml:', e.message);
    return {}; // on error, block access rather than show all
  }
}
/** Build the headers sent with every dataset request. */
function hfAuthHeaders() {
  return { 'Authorization': `Bearer ${process.env.HF_TOKEN}` };
}

/**
 * Decide which document indices the user may see in one corpus.
 *
 * @param {null|Object} assignedDocs - Result of getUserAssignedDocs().
 * @param {Object} corpus - Corpus being processed.
 * @param {Object} defaultCorpus - Corpus that legacy flat assignments apply to.
 * @returns {Set|null} `null` means "no per-user filter"; an empty Set means
 *   the user has nothing in this corpus.
 */
function resolveUserDocSet(assignedDocs, corpus, defaultCorpus) {
  if (!assignedDocs) return null;
  if (assignedDocs._flat) {
    // Legacy flat format — only applies to the first/default corpus.
    return corpus === defaultCorpus ? assignedDocs._flat : new Set();
  }
  const assigned = assignedDocs[corpus.id];
  return assigned instanceof Set ? assigned : new Set();
}

/**
 * Fetch and validate the links file of a corpus.
 *
 * @param {Object} corpus
 * @returns {Promise<Array|null>} The parsed link entries, or `null` (after
 *   logging a warning) when the file is missing or is not a JSON array.
 */
async function fetchCorpusLinks(corpus) {
  const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${getLinksRepoPath(corpus)}`;
  const linksRes = await fetch(linksUrl, {
    headers: hfAuthHeaders(),
    cache: 'no-store',
  });
  if (!linksRes.ok) {
    console.warn(`No links file for corpus ${corpus.id}: ${linksRes.status}`);
    return null;
  }
  const links = await linksRes.json();
  if (!Array.isArray(links)) {
    // Same outcome as a missing file: skip this corpus instead of failing
    // the whole request.
    console.warn(`Links file for corpus ${corpus.id} is not a JSON array`);
    return null;
  }
  return links;
}

/**
 * Fetch one document's page data and shape it for the response.
 *
 * @param {Object} corpus
 * @param {Object} link - Entry from the corpus links file.
 * @returns {Promise<Object|null>} The document summary, or `null` when the
 *   document has no PDF URL, cannot be fetched, or has no pages.
 * @throws If a page entry lacks `document.pages` (the caller logs and skips).
 */
async function fetchDocumentSummary(corpus, link) {
  const pdfUrl = link.direct_pdf_url;
  // Checked before fetching: a document without a PDF URL is discarded
  // anyway, so there is no point downloading its page data.
  if (!pdfUrl) return null;

  const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${getDocRepoPath(corpus, link.index)}`;
  const docRes = await fetch(docUrl, { headers: hfAuthHeaders() });
  if (!docRes.ok) return null;

  const pagesData = await docRes.json();
  if (!Array.isArray(pagesData) || pagesData.length === 0) return null;

  // NOTE(review): `document.pages[0]` is presumably the page number of the
  // entry — confirm against the dataset schema.
  const firstPageOf = (page) => page.document.pages[0];
  const hasMentions = (page) => page.datasets && page.datasets.length > 0;

  return {
    corpus: corpus.id,
    corpus_name: corpus.name,
    index: link.index,
    pdf_url: pdfUrl,
    landing_page: link.landing_page_url,
    annotatable_pages: pagesData.map(firstPageOf),
    pages_with_mentions: pagesData.filter(hasMentions).map(firstPageOf),
  };
}

/**
 * List the annotatable documents of every corpus, optionally restricted to
 * the documents assigned to the user named in the `user` query parameter.
 *
 * NOTE(review): the username comes straight from the query string, and when
 * it is absent no per-user filter is applied (all documents are returned).
 * Confirm that this is the intended access model.
 *
 * @param {Request} request - Incoming request; only `request.url` is read.
 * @returns {Promise<Response>} 200 with a JSON array of document summaries,
 *   or 500 with `{ error }` when an unexpected failure occurs.
 */
export async function GET(request) {
  try {
    const { searchParams } = new URL(request.url);
    const username = searchParams.get('user');
    const assignedDocs = await getUserAssignedDocs(username);

    const corpora = getCorpora();
    const allDocuments = [];

    for (const corpus of corpora) {
      const userDocSet = resolveUserDocSet(assignedDocs, corpus, corpora[0]);
      if (userDocSet && userDocSet.size === 0) continue; // no docs for this corpus

      const links = await fetchCorpusLinks(corpus);
      if (!links) continue;

      const candidateLinks = links
        .filter((l) => l?.status === 'success' && l.has_revalidation === true)
        .filter((l) => !userDocSet || userDocSet.has(l.index))
        .slice(0, MAX_DOCS_TO_SCAN);

      // Fetch all documents of this corpus in parallel; one failing document
      // must not prevent the others from being listed.
      const results = await Promise.allSettled(
        candidateLinks.map((link) => fetchDocumentSummary(corpus, link)),
      );

      results.forEach((result, i) => {
        if (result.status === 'rejected') {
          // Previously dropped silently; log so broken documents are visible.
          const reason = result.reason?.message ?? result.reason;
          console.warn(
            `Failed to load document ${candidateLinks[i].index} in corpus ${corpus.id}: ${reason}`,
          );
        } else if (result.value !== null) {
          allDocuments.push(result.value);
        }
      });
    }

    return new Response(JSON.stringify(allDocuments), {
      status: 200,
      headers: {
        'Content-Type': 'application/json',
        'Cache-Control': 'public, s-maxage=300, stale-while-revalidate=59',
      },
    });
  } catch (error) {
    console.error(error);
    return new Response(
      JSON.stringify({ error: "Failed to fetch documents" }),
      { status: 500, headers: { 'Content-Type': 'application/json' } },
    );
  }
}