Spaces:
Running
Running
File size: 5,997 Bytes
a2c885c 79ba9a0 da957b0 79ba9a0 0880d65 79ba9a0 a2c885c 79ba9a0 0880d65 79ba9a0 0880d65 79ba9a0 a2c885c 0880d65 79ba9a0 0880d65 79ba9a0 42df2a9 79ba9a0 a2c885c 9cc2491 a2c885c 9cc2491 a2c885c 9cc2491 a2c885c 9cc2491 a2c885c 9cc2491 a2c885c e715c5d a2c885c e715c5d a2c885c e715c5d a2c885c 7c5c449 a2c885c 9cc2491 a2c885c 42df2a9 a2c885c 5b87eae 79ba9a0 5b87eae 42df2a9 da957b0 a2c885c da957b0 42df2a9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 | import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN, getCorpus, getLinksRepoPath, getDocRepoPath } from '../../../utils/config.js';
import yaml from 'js-yaml';
/**
 * Look up the documents assigned to an annotator in annotator_config.yaml.
 *
 * Outcomes:
 *  - `null` when `username` is falsy (the caller supplied no user).
 *  - `{}` when the config request is not OK, the user has no entry or no
 *    `docs`, or anything throws while loading/parsing (fail closed).
 *  - `{ _flat: Set }` when the user's `docs` is a flat array (legacy format).
 *  - `{ [corpusId]: Set }` when `docs` is a per-corpus object; entries whose
 *    value is not an array are dropped, so the result may still be `{}`.
 *
 * @param {string} username - Annotator name matched against `annotators[].username`.
 * @returns {Promise<null|Object>} Assignment map as described above.
 */
async function getUserAssignedDocs(username) {
  if (!username) {
    return null;
  }

  try {
    const response = await fetch(
      `${HF_DATASET_BASE_URL}/raw/main/annotation_data/annotator_config.yaml`,
      {
        headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` },
        cache: 'no-store'
      }
    );
    if (!response.ok) {
      // Config could not be fetched: grant nothing.
      return {};
    }

    const parsed = yaml.load(await response.text());
    const entry = (parsed.annotators || []).find((candidate) => candidate.username === username);
    const assigned = entry && entry.docs;
    if (!assigned) {
      // Unknown user, or a user without a docs section: no documents.
      return {};
    }

    if (Array.isArray(assigned)) {
      // Legacy shape: one flat list, reported under the `_flat` key.
      return { _flat: new Set(assigned) };
    }

    // Current shape: one list per corpus id, e.g. { wbg: [1,2], unhcr: [3,4] }.
    const perCorpus = {};
    Object.entries(assigned).forEach(([corpusId, docList]) => {
      if (!Array.isArray(docList)) {
        return;
      }
      perCorpus[corpusId] = new Set(docList);
    });
    return perCorpus;
  } catch (err) {
    console.warn('Could not load annotator_config.yaml:', err.message);
    // Any failure blocks access instead of exposing every document.
    return {};
  }
}
/**
 * GET handler: list the annotatable documents across all corpora, optionally
 * restricted to the documents assigned to the user named in `?user=`.
 *
 * For each corpus the handler fetches the corpus links file, keeps links with
 * `status === 'success'` and `has_revalidation === true`, applies the user's
 * assignment filter (if any), caps the list at MAX_DOCS_TO_SCAN, and fetches
 * each document file in parallel. Documents that fail to load, have no pages,
 * or have no `direct_pdf_url` are left out of the result.
 *
 * NOTE(review): the `user` query parameter is taken as-is; when it is absent,
 * getUserAssignedDocs returns null and no per-user filtering is applied —
 * confirm that this route is protected upstream.
 *
 * @param {Request} request - Incoming request; only `request.url` is read.
 * @returns {Promise<Response>} 200 with a JSON array of
 *   `{ corpus, corpus_name, index, pdf_url, landing_page, annotatable_pages,
 *   pages_with_mentions }`, or 500 with `{ error }` if anything throws.
 */
export async function GET(request) {
  try {
    const { searchParams } = new URL(request.url);
    const username = searchParams.get('user');
    const assignedDocs = await getUserAssignedDocs(username);

    // Import corpora list
    const { getCorpora } = await import('../../../utils/config.js');
    const corpora = getCorpora();

    // Same credentials are sent with every dataset request below.
    const authHeaders = { 'Authorization': `Bearer ${process.env.HF_TOKEN}` };

    const allDocuments = [];
    for (const corpus of corpora) {
      // Determine which doc indices this user has for this corpus.
      // null means "no per-user restriction".
      let userDocSet = null;
      if (assignedDocs) {
        if (assignedDocs._flat) {
          // Legacy flat format — only applies to first/default corpus
          userDocSet = corpus === corpora[0] ? assignedDocs._flat : new Set();
        } else {
          userDocSet = assignedDocs[corpus.id] || new Set();
        }
        if (userDocSet.size === 0) continue; // no docs for this corpus
      }

      // Fetch the links file for this corpus
      const linksPath = getLinksRepoPath(corpus);
      const linksUrl = `${HF_DATASET_BASE_URL}/raw/main/${linksPath}`;
      const linksRes = await fetch(linksUrl, {
        headers: authHeaders,
        cache: 'no-store'
      });
      if (!linksRes.ok) {
        console.warn(`No links file for corpus ${corpus.id}: ${linksRes.status}`);
        continue;
      }

      const links = await linksRes.json();
      if (!Array.isArray(links)) {
        // A malformed links file is treated like a missing one: skip this
        // corpus instead of failing the whole listing.
        console.warn(`Links file for corpus ${corpus.id} is not an array; skipping`);
        continue;
      }

      let successLinks = links
        .filter(l => l.status === 'success' && l.has_revalidation === true);
      if (userDocSet) {
        successLinks = successLinks.filter(l => userDocSet.has(l.index));
      }
      successLinks = successLinks.slice(0, MAX_DOCS_TO_SCAN);

      // Parallel fetch docs; allSettled so one bad document cannot reject
      // the whole batch.
      const results = await Promise.allSettled(
        successLinks.map(async (link) => {
          const docRepoPath = getDocRepoPath(corpus, link.index);
          const docUrl = `${HF_DATASET_BASE_URL}/raw/main/${docRepoPath}`;
          const docRes = await fetch(docUrl, { headers: authHeaders });
          if (!docRes.ok) return null;

          const pagesData = await docRes.json();
          const allPages = pagesData
            .map(page => page.document.pages[0]);
          const pagesWithMentions = pagesData
            .filter(page => page.datasets && page.datasets.length > 0)
            .map(page => page.document.pages[0]);
          if (allPages.length === 0) return null;

          const pdfUrl = link.direct_pdf_url;
          if (!pdfUrl) return null;

          return {
            corpus: corpus.id,
            corpus_name: corpus.name,
            index: link.index,
            pdf_url: pdfUrl,
            landing_page: link.landing_page_url,
            annotatable_pages: allPages,
            pages_with_mentions: pagesWithMentions
          };
        })
      );

      // Surface per-document failures instead of dropping them silently.
      results.forEach((r, i) => {
        if (r.status === 'rejected') {
          const reason = r.reason instanceof Error ? r.reason.message : r.reason;
          console.warn(`Failed to load doc ${successLinks[i].index} for corpus ${corpus.id}:`, reason);
        }
      });

      const docs = results
        .filter(r => r.status === 'fulfilled' && r.value !== null)
        .map(r => r.value);
      allDocuments.push(...docs);
    }

    return new Response(JSON.stringify(allDocuments), {
      status: 200,
      headers: {
        'Content-Type': 'application/json',
        'Cache-Control': 'public, s-maxage=300, stale-while-revalidate=59'
      }
    });
  } catch (error) {
    console.error(error);
    return new Response(
      JSON.stringify({ error: "Failed to fetch documents" }),
      { status: 500, headers: { 'Content-Type': 'application/json' } }
    );
  }
}
|