Spaces:
Sleeping
Sleeping
Commit ·
b584890
1
Parent(s): 559363f
feat: on-demand document loading, no limit, PDF loading indicator
Browse files
- /api/documents now returns just the link list (no pre-fetching)
- /api/document returns annotatable_pages on-demand (no page param)
- page.js fetches + caches annotatable_pages when doc is selected
- PdfViewer shows loading spinner overlay while iframe loads
- Removed MAX_DOCS_TO_SCAN concept entirely
- app/api/document/route.js +50 -37
- app/api/documents/route.js +12 -33
- app/components/PdfViewer.js +20 -8
- app/globals.css +26 -0
- app/page.js +27 -2
app/api/document/route.js
CHANGED
|
@@ -6,67 +6,80 @@ const isHFSpace = () => {
|
|
| 6 |
return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
|
| 7 |
};
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
export async function GET(request) {
|
| 10 |
const { searchParams } = new URL(request.url);
|
| 11 |
const index = searchParams.get('index');
|
| 12 |
const page = searchParams.get('page');
|
| 13 |
|
| 14 |
-
|
| 15 |
-
if (index === null || page === null) {
|
| 16 |
return new Response(
|
| 17 |
-
JSON.stringify({ error: "Missing index
|
| 18 |
{ status: 400, headers: { 'Content-Type': 'application/json' } }
|
| 19 |
);
|
| 20 |
}
|
| 21 |
|
| 22 |
-
// Validate numeric values
|
| 23 |
const indexNum = parseInt(index, 10);
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
if (isNaN(indexNum) || isNaN(pageNum) || indexNum < 0 || pageNum < 0) {
|
| 27 |
return new Response(
|
| 28 |
-
JSON.stringify({ error: "index
|
| 29 |
{ status: 400, headers: { 'Content-Type': 'application/json' } }
|
| 30 |
);
|
| 31 |
}
|
| 32 |
|
| 33 |
try {
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
const res = await fetch(docUrl, {
|
| 40 |
-
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
| 41 |
});
|
|
|
|
| 42 |
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
)
|
| 48 |
-
|
| 49 |
-
pagesData = await res.json();
|
| 50 |
-
} else {
|
| 51 |
-
// Local dev: read from local file (reflects saved annotations immediately)
|
| 52 |
-
const filePath = path.join(
|
| 53 |
-
process.cwd(),
|
| 54 |
-
'annotation_data', 'wbg_extractions',
|
| 55 |
-
`doc_${indexNum}`, 'raw', `doc_${indexNum}_direct_judged.jsonl`
|
| 56 |
);
|
| 57 |
-
|
| 58 |
-
if (!fs.existsSync(filePath)) {
|
| 59 |
-
return new Response(
|
| 60 |
-
JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found locally` }),
|
| 61 |
-
{ status: 404, headers: { 'Content-Type': 'application/json' } }
|
| 62 |
-
);
|
| 63 |
-
}
|
| 64 |
-
const raw = fs.readFileSync(filePath, 'utf-8');
|
| 65 |
-
pagesData = JSON.parse(raw);
|
| 66 |
}
|
| 67 |
|
| 68 |
const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
|
| 69 |
-
|
| 70 |
if (!pageData) {
|
| 71 |
return new Response(
|
| 72 |
JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum}` }),
|
|
|
|
| 6 |
return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
|
| 7 |
};
|
| 8 |
|
| 9 |
+
/**
|
| 10 |
+
* Loads ALL pages data for a given document index.
|
| 11 |
+
*/
|
| 12 |
+
async function loadPagesData(indexNum) {
|
| 13 |
+
if (isHFSpace()) {
|
| 14 |
+
const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${indexNum}/raw/doc_${indexNum}_direct_judged.jsonl`;
|
| 15 |
+
const res = await fetch(docUrl, {
|
| 16 |
+
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
| 17 |
+
});
|
| 18 |
+
if (!res.ok) return null;
|
| 19 |
+
return await res.json();
|
| 20 |
+
} else {
|
| 21 |
+
const filePath = path.join(
|
| 22 |
+
process.cwd(),
|
| 23 |
+
'annotation_data', 'wbg_extractions',
|
| 24 |
+
`doc_${indexNum}`, 'raw', `doc_${indexNum}_direct_judged.jsonl`
|
| 25 |
+
);
|
| 26 |
+
if (!fs.existsSync(filePath)) return null;
|
| 27 |
+
return JSON.parse(fs.readFileSync(filePath, 'utf-8'));
|
| 28 |
+
}
|
| 29 |
+
}
|
| 30 |
+
|
| 31 |
export async function GET(request) {
|
| 32 |
const { searchParams } = new URL(request.url);
|
| 33 |
const index = searchParams.get('index');
|
| 34 |
const page = searchParams.get('page');
|
| 35 |
|
| 36 |
+
if (index === null) {
|
|
|
|
| 37 |
return new Response(
|
| 38 |
+
JSON.stringify({ error: "Missing index parameter" }),
|
| 39 |
{ status: 400, headers: { 'Content-Type': 'application/json' } }
|
| 40 |
);
|
| 41 |
}
|
| 42 |
|
|
|
|
| 43 |
const indexNum = parseInt(index, 10);
|
| 44 |
+
if (isNaN(indexNum) || indexNum < 0) {
|
|
|
|
|
|
|
| 45 |
return new Response(
|
| 46 |
+
JSON.stringify({ error: "index must be a non-negative integer" }),
|
| 47 |
{ status: 400, headers: { 'Content-Type': 'application/json' } }
|
| 48 |
);
|
| 49 |
}
|
| 50 |
|
| 51 |
try {
|
| 52 |
+
const pagesData = await loadPagesData(indexNum);
|
| 53 |
+
|
| 54 |
+
if (!pagesData) {
|
| 55 |
+
return new Response(
|
| 56 |
+
JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found` }),
|
| 57 |
+
{ status: 404, headers: { 'Content-Type': 'application/json' } }
|
| 58 |
+
);
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
+
// If no page specified, return just the annotatable pages list
|
| 62 |
+
if (page === null || page === undefined) {
|
| 63 |
+
const annotatablePages = pagesData
|
| 64 |
+
.filter(p => p.datasets && p.datasets.length > 0)
|
| 65 |
+
.map(p => p.document?.pages?.[0]);
|
| 66 |
|
| 67 |
+
return new Response(JSON.stringify({ annotatable_pages: annotatablePages }), {
|
| 68 |
+
status: 200,
|
| 69 |
+
headers: { 'Content-Type': 'application/json' }
|
|
|
|
|
|
|
| 70 |
});
|
| 71 |
+
}
|
| 72 |
|
| 73 |
+
// Specific page requested
|
| 74 |
+
const pageNum = parseInt(page, 10);
|
| 75 |
+
if (isNaN(pageNum) || pageNum < 0) {
|
| 76 |
+
return new Response(
|
| 77 |
+
JSON.stringify({ error: "page must be a non-negative integer" }),
|
| 78 |
+
{ status: 400, headers: { 'Content-Type': 'application/json' } }
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
}
|
| 81 |
|
| 82 |
const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
|
|
|
|
| 83 |
if (!pageData) {
|
| 84 |
return new Response(
|
| 85 |
JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum}` }),
|
app/api/documents/route.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import { HF_DATASET_BASE_URL
|
| 2 |
|
| 3 |
export async function GET() {
|
| 4 |
try {
|
|
@@ -21,38 +21,17 @@ export async function GET() {
|
|
| 21 |
|
| 22 |
const links = await linksRes.json();
|
| 23 |
|
| 24 |
-
//
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
if (!docRes.ok) return null;
|
| 36 |
-
|
| 37 |
-
const pagesData = await docRes.json();
|
| 38 |
-
const annotatablePages = pagesData
|
| 39 |
-
.filter(page => page.datasets && page.datasets.length > 0)
|
| 40 |
-
.map(page => page.document.pages[0]);
|
| 41 |
-
|
| 42 |
-
if (annotatablePages.length === 0) return null;
|
| 43 |
-
|
| 44 |
-
return {
|
| 45 |
-
index: link.index,
|
| 46 |
-
pdf_url: link.direct_pdf_url,
|
| 47 |
-
landing_page: link.landing_page_url,
|
| 48 |
-
annotatable_pages: annotatablePages
|
| 49 |
-
};
|
| 50 |
-
})
|
| 51 |
-
);
|
| 52 |
-
|
| 53 |
-
const documents = results
|
| 54 |
-
.filter(r => r.status === 'fulfilled' && r.value !== null)
|
| 55 |
-
.map(r => r.value);
|
| 56 |
|
| 57 |
return new Response(JSON.stringify(documents), {
|
| 58 |
status: 200,
|
|
|
|
| 1 |
+
import { HF_DATASET_BASE_URL } from '../../../utils/config.js';
|
| 2 |
|
| 3 |
export async function GET() {
|
| 4 |
try {
|
|
|
|
| 21 |
|
| 22 |
const links = await linksRes.json();
|
| 23 |
|
| 24 |
+
// Return ALL successful links — no limit, no data pre-fetching.
|
| 25 |
+
// Page data is loaded on-demand when the user selects a document.
|
| 26 |
+
const documents = links
|
| 27 |
+
.filter(l => l.status === 'success')
|
| 28 |
+
.map(link => ({
|
| 29 |
+
index: link.index,
|
| 30 |
+
pdf_url: link.direct_pdf_url,
|
| 31 |
+
landing_page: link.landing_page_url,
|
| 32 |
+
// annotatable_pages will be fetched on-demand via /api/document
|
| 33 |
+
annotatable_pages: null,
|
| 34 |
+
}));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
|
| 36 |
return new Response(JSON.stringify(documents), {
|
| 37 |
status: 200,
|
app/components/PdfViewer.js
CHANGED
|
@@ -1,6 +1,10 @@
|
|
| 1 |
"use client";
|
| 2 |
|
|
|
|
|
|
|
| 3 |
export default function PdfViewer({ pdfUrl, pageNumber }) {
|
|
|
|
|
|
|
| 4 |
if (!pdfUrl) {
|
| 5 |
return (
|
| 6 |
<div className="pdf-placeholder">
|
|
@@ -13,16 +17,24 @@ export default function PdfViewer({ pdfUrl, pageNumber }) {
|
|
| 13 |
const viewerPage = (pageNumber ?? 0) + 1;
|
| 14 |
|
| 15 |
// Use Mozilla's hosted PDF.js viewer — supports #page=N for direct page navigation.
|
| 16 |
-
// This avoids X-Frame-Options restrictions from the source server.
|
| 17 |
const pdfJsViewerUrl = `https://mozilla.github.io/pdf.js/web/viewer.html?file=${encodeURIComponent(pdfUrl)}#page=${viewerPage}`;
|
| 18 |
|
| 19 |
return (
|
| 20 |
-
<
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 27 |
);
|
| 28 |
}
|
|
|
|
| 1 |
"use client";
|
| 2 |
|
| 3 |
+
import { useState } from 'react';
|
| 4 |
+
|
| 5 |
export default function PdfViewer({ pdfUrl, pageNumber }) {
|
| 6 |
+
const [pdfLoading, setPdfLoading] = useState(true);
|
| 7 |
+
|
| 8 |
if (!pdfUrl) {
|
| 9 |
return (
|
| 10 |
<div className="pdf-placeholder">
|
|
|
|
| 17 |
const viewerPage = (pageNumber ?? 0) + 1;
|
| 18 |
|
| 19 |
// Use Mozilla's hosted PDF.js viewer — supports #page=N for direct page navigation.
|
|
|
|
| 20 |
const pdfJsViewerUrl = `https://mozilla.github.io/pdf.js/web/viewer.html?file=${encodeURIComponent(pdfUrl)}#page=${viewerPage}`;
|
| 21 |
|
| 22 |
return (
|
| 23 |
+
<div className="pdf-container">
|
| 24 |
+
{pdfLoading && (
|
| 25 |
+
<div className="pdf-loading-overlay">
|
| 26 |
+
<div className="loading-spinner" />
|
| 27 |
+
<p>Loading PDF...</p>
|
| 28 |
+
</div>
|
| 29 |
+
)}
|
| 30 |
+
<iframe
|
| 31 |
+
key={`pdf-${pdfUrl}-page-${viewerPage}`}
|
| 32 |
+
src={pdfJsViewerUrl}
|
| 33 |
+
className="pdf-frame"
|
| 34 |
+
title={`PDF Page ${viewerPage}`}
|
| 35 |
+
allow="fullscreen"
|
| 36 |
+
onLoad={() => setPdfLoading(false)}
|
| 37 |
+
/>
|
| 38 |
+
</div>
|
| 39 |
);
|
| 40 |
}
|
app/globals.css
CHANGED
|
@@ -228,6 +228,32 @@ h4 {
|
|
| 228 |
|
| 229 |
/* ── PDF Viewer ─────────────────────────────────── */
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
.pdf-frame {
|
| 232 |
flex: 1;
|
| 233 |
width: 100%;
|
|
|
|
| 228 |
|
| 229 |
/* ── PDF Viewer ─────────────────────────────────── */
|
| 230 |
|
| 231 |
+
.pdf-container {
|
| 232 |
+
flex: 1;
|
| 233 |
+
display: flex;
|
| 234 |
+
flex-direction: column;
|
| 235 |
+
position: relative;
|
| 236 |
+
min-height: 0;
|
| 237 |
+
}
|
| 238 |
+
|
| 239 |
+
.pdf-loading-overlay {
|
| 240 |
+
position: absolute;
|
| 241 |
+
inset: 0;
|
| 242 |
+
z-index: 5;
|
| 243 |
+
display: flex;
|
| 244 |
+
flex-direction: column;
|
| 245 |
+
align-items: center;
|
| 246 |
+
justify-content: center;
|
| 247 |
+
gap: 12px;
|
| 248 |
+
background: var(--pane-bg);
|
| 249 |
+
border-radius: 12px;
|
| 250 |
+
}
|
| 251 |
+
|
| 252 |
+
.pdf-loading-overlay p {
|
| 253 |
+
font-size: 0.85rem;
|
| 254 |
+
color: #94a3b8;
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
.pdf-frame {
|
| 258 |
flex: 1;
|
| 259 |
width: 100%;
|
app/page.js
CHANGED
|
@@ -78,11 +78,36 @@ export default function Home() {
|
|
| 78 |
}
|
| 79 |
}, []);
|
| 80 |
|
| 81 |
-
//
|
| 82 |
useEffect(() => {
|
| 83 |
if (selectedDocIndex !== null) {
|
| 84 |
const doc = documents.find(d => d.index === selectedDocIndex);
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
}
|
| 87 |
}, [selectedDocIndex, documents]);
|
| 88 |
|
|
|
|
| 78 |
}
|
| 79 |
}, []);
|
| 80 |
|
| 81 |
+
// Fetch annotatable pages on-demand when document selection changes
|
| 82 |
useEffect(() => {
|
| 83 |
if (selectedDocIndex !== null) {
|
| 84 |
const doc = documents.find(d => d.index === selectedDocIndex);
|
| 85 |
+
if (!doc) return;
|
| 86 |
+
|
| 87 |
+
// If annotatable_pages already loaded (cached), use it
|
| 88 |
+
if (doc.annotatable_pages) {
|
| 89 |
+
setCurrentDoc(doc);
|
| 90 |
+
return;
|
| 91 |
+
}
|
| 92 |
+
|
| 93 |
+
// Fetch annotatable pages on-demand
|
| 94 |
+
setLoadingPage(true);
|
| 95 |
+
fetch(`/api/document?index=${selectedDocIndex}`)
|
| 96 |
+
.then(res => res.json())
|
| 97 |
+
.then(data => {
|
| 98 |
+
const updatedDoc = { ...doc, annotatable_pages: data.annotatable_pages || [] };
|
| 99 |
+
setCurrentDoc(updatedDoc);
|
| 100 |
+
// Cache in the documents array
|
| 101 |
+
setDocuments(prev => prev.map(d =>
|
| 102 |
+
d.index === selectedDocIndex ? updatedDoc : d
|
| 103 |
+
));
|
| 104 |
+
setLoadingPage(false);
|
| 105 |
+
})
|
| 106 |
+
.catch(err => {
|
| 107 |
+
console.error("Failed to load document pages", err);
|
| 108 |
+
setCurrentDoc({ ...doc, annotatable_pages: [] });
|
| 109 |
+
setLoadingPage(false);
|
| 110 |
+
});
|
| 111 |
}
|
| 112 |
}, [selectedDocIndex, documents]);
|
| 113 |
|