rafmacalaba committed on
Commit
b584890
·
1 Parent(s): 559363f

feat: on-demand document loading, no limit, PDF loading indicator

Browse files

- /api/documents now returns just the link list (no pre-fetching)
- /api/document returns annotatable_pages on-demand when called without a page param
- page.js fetches + caches annotatable_pages when doc is selected
- PdfViewer shows loading spinner overlay while iframe loads
- Removed MAX_DOCS_TO_SCAN concept entirely

app/api/document/route.js CHANGED
@@ -6,67 +6,80 @@ const isHFSpace = () => {
6
  return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
7
  };
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  export async function GET(request) {
10
  const { searchParams } = new URL(request.url);
11
  const index = searchParams.get('index');
12
  const page = searchParams.get('page');
13
 
14
- // Validate required params
15
- if (index === null || page === null) {
16
  return new Response(
17
- JSON.stringify({ error: "Missing index or page parameter" }),
18
  { status: 400, headers: { 'Content-Type': 'application/json' } }
19
  );
20
  }
21
 
22
- // Validate numeric values
23
  const indexNum = parseInt(index, 10);
24
- const pageNum = parseInt(page, 10);
25
-
26
- if (isNaN(indexNum) || isNaN(pageNum) || indexNum < 0 || pageNum < 0) {
27
  return new Response(
28
- JSON.stringify({ error: "index and page must be non-negative integers" }),
29
  { status: 400, headers: { 'Content-Type': 'application/json' } }
30
  );
31
  }
32
 
33
  try {
34
- let pagesData;
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
- if (isHFSpace()) {
37
- // Production: fetch from HuggingFace
38
- const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${indexNum}/raw/doc_${indexNum}_direct_judged.jsonl`;
39
- const res = await fetch(docUrl, {
40
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
41
  });
 
42
 
43
- if (!res.ok) {
44
- return new Response(
45
- JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found on HF Datasets` }),
46
- { status: res.status, headers: { 'Content-Type': 'application/json' } }
47
- );
48
- }
49
- pagesData = await res.json();
50
- } else {
51
- // Local dev: read from local file (reflects saved annotations immediately)
52
- const filePath = path.join(
53
- process.cwd(),
54
- 'annotation_data', 'wbg_extractions',
55
- `doc_${indexNum}`, 'raw', `doc_${indexNum}_direct_judged.jsonl`
56
  );
57
-
58
- if (!fs.existsSync(filePath)) {
59
- return new Response(
60
- JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found locally` }),
61
- { status: 404, headers: { 'Content-Type': 'application/json' } }
62
- );
63
- }
64
- const raw = fs.readFileSync(filePath, 'utf-8');
65
- pagesData = JSON.parse(raw);
66
  }
67
 
68
  const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
69
-
70
  if (!pageData) {
71
  return new Response(
72
  JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum}` }),
 
6
  return process.env.HF_TOKEN && process.env.NODE_ENV !== 'development';
7
  };
8
 
9
+ /**
10
+ * Loads ALL pages data for a given document index.
11
+ */
12
+ async function loadPagesData(indexNum) {
13
+ if (isHFSpace()) {
14
+ const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${indexNum}/raw/doc_${indexNum}_direct_judged.jsonl`;
15
+ const res = await fetch(docUrl, {
16
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
17
+ });
18
+ if (!res.ok) return null;
19
+ return await res.json();
20
+ } else {
21
+ const filePath = path.join(
22
+ process.cwd(),
23
+ 'annotation_data', 'wbg_extractions',
24
+ `doc_${indexNum}`, 'raw', `doc_${indexNum}_direct_judged.jsonl`
25
+ );
26
+ if (!fs.existsSync(filePath)) return null;
27
+ return JSON.parse(fs.readFileSync(filePath, 'utf-8'));
28
+ }
29
+ }
30
+
31
  export async function GET(request) {
32
  const { searchParams } = new URL(request.url);
33
  const index = searchParams.get('index');
34
  const page = searchParams.get('page');
35
 
36
+ if (index === null) {
 
37
  return new Response(
38
+ JSON.stringify({ error: "Missing index parameter" }),
39
  { status: 400, headers: { 'Content-Type': 'application/json' } }
40
  );
41
  }
42
 
 
43
  const indexNum = parseInt(index, 10);
44
+ if (isNaN(indexNum) || indexNum < 0) {
 
 
45
  return new Response(
46
+ JSON.stringify({ error: "index must be a non-negative integer" }),
47
  { status: 400, headers: { 'Content-Type': 'application/json' } }
48
  );
49
  }
50
 
51
  try {
52
+ const pagesData = await loadPagesData(indexNum);
53
+
54
+ if (!pagesData) {
55
+ return new Response(
56
+ JSON.stringify({ error: `doc_${indexNum}_direct_judged.jsonl not found` }),
57
+ { status: 404, headers: { 'Content-Type': 'application/json' } }
58
+ );
59
+ }
60
+
61
+ // If no page specified, return just the annotatable pages list
62
+ if (page === null || page === undefined) {
63
+ const annotatablePages = pagesData
64
+ .filter(p => p.datasets && p.datasets.length > 0)
65
+ .map(p => p.document?.pages?.[0]);
66
 
67
+ return new Response(JSON.stringify({ annotatable_pages: annotatablePages }), {
68
+ status: 200,
69
+ headers: { 'Content-Type': 'application/json' }
 
 
70
  });
71
+ }
72
 
73
+ // Specific page requested
74
+ const pageNum = parseInt(page, 10);
75
+ if (isNaN(pageNum) || pageNum < 0) {
76
+ return new Response(
77
+ JSON.stringify({ error: "page must be a non-negative integer" }),
78
+ { status: 400, headers: { 'Content-Type': 'application/json' } }
 
 
 
 
 
 
 
79
  );
 
 
 
 
 
 
 
 
 
80
  }
81
 
82
  const pageData = pagesData.find(p => p.document?.pages?.[0] === pageNum);
 
83
  if (!pageData) {
84
  return new Response(
85
  JSON.stringify({ error: `Page ${pageNum} not found in doc ${indexNum}` }),
app/api/documents/route.js CHANGED
@@ -1,4 +1,4 @@
1
- import { HF_DATASET_BASE_URL, MAX_DOCS_TO_SCAN } from '../../../utils/config.js';
2
 
3
  export async function GET() {
4
  try {
@@ -21,38 +21,17 @@ export async function GET() {
21
 
22
  const links = await linksRes.json();
23
 
24
- // Filter to successful links and take the first N
25
- const successLinks = links.filter(l => l.status === 'success').slice(0, MAX_DOCS_TO_SCAN);
26
-
27
- // Parallel fetch — much faster than sequential scanning
28
- const results = await Promise.allSettled(
29
- successLinks.map(async (link) => {
30
- const docUrl = `${HF_DATASET_BASE_URL}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_direct_judged.jsonl`;
31
- const docRes = await fetch(docUrl, {
32
- headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
33
- });
34
-
35
- if (!docRes.ok) return null;
36
-
37
- const pagesData = await docRes.json();
38
- const annotatablePages = pagesData
39
- .filter(page => page.datasets && page.datasets.length > 0)
40
- .map(page => page.document.pages[0]);
41
-
42
- if (annotatablePages.length === 0) return null;
43
-
44
- return {
45
- index: link.index,
46
- pdf_url: link.direct_pdf_url,
47
- landing_page: link.landing_page_url,
48
- annotatable_pages: annotatablePages
49
- };
50
- })
51
- );
52
-
53
- const documents = results
54
- .filter(r => r.status === 'fulfilled' && r.value !== null)
55
- .map(r => r.value);
56
 
57
  return new Response(JSON.stringify(documents), {
58
  status: 200,
 
1
+ import { HF_DATASET_BASE_URL } from '../../../utils/config.js';
2
 
3
  export async function GET() {
4
  try {
 
21
 
22
  const links = await linksRes.json();
23
 
24
+ // Return ALL successful links — no limit, no data pre-fetching.
25
+ // Page data is loaded on-demand when the user selects a document.
26
+ const documents = links
27
+ .filter(l => l.status === 'success')
28
+ .map(link => ({
29
+ index: link.index,
30
+ pdf_url: link.direct_pdf_url,
31
+ landing_page: link.landing_page_url,
32
+ // annotatable_pages will be fetched on-demand via /api/document
33
+ annotatable_pages: null,
34
+ }));
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
  return new Response(JSON.stringify(documents), {
37
  status: 200,
app/components/PdfViewer.js CHANGED
@@ -1,6 +1,10 @@
1
  "use client";
2
 
 
 
3
  export default function PdfViewer({ pdfUrl, pageNumber }) {
 
 
4
  if (!pdfUrl) {
5
  return (
6
  <div className="pdf-placeholder">
@@ -13,16 +17,24 @@ export default function PdfViewer({ pdfUrl, pageNumber }) {
13
  const viewerPage = (pageNumber ?? 0) + 1;
14
 
15
  // Use Mozilla's hosted PDF.js viewer — supports #page=N for direct page navigation.
16
- // This avoids X-Frame-Options restrictions from the source server.
17
  const pdfJsViewerUrl = `https://mozilla.github.io/pdf.js/web/viewer.html?file=${encodeURIComponent(pdfUrl)}#page=${viewerPage}`;
18
 
19
  return (
20
- <iframe
21
- key={`pdf-${pdfUrl}-page-${viewerPage}`}
22
- src={pdfJsViewerUrl}
23
- className="pdf-frame"
24
- title={`PDF Page ${viewerPage}`}
25
- allow="fullscreen"
26
- />
 
 
 
 
 
 
 
 
 
27
  );
28
  }
 
1
  "use client";
2
 
3
+ import { useState } from 'react';
4
+
5
  export default function PdfViewer({ pdfUrl, pageNumber }) {
6
+ const [pdfLoading, setPdfLoading] = useState(true);
7
+
8
  if (!pdfUrl) {
9
  return (
10
  <div className="pdf-placeholder">
 
17
  const viewerPage = (pageNumber ?? 0) + 1;
18
 
19
  // Use Mozilla's hosted PDF.js viewer — supports #page=N for direct page navigation.
 
20
  const pdfJsViewerUrl = `https://mozilla.github.io/pdf.js/web/viewer.html?file=${encodeURIComponent(pdfUrl)}#page=${viewerPage}`;
21
 
22
  return (
23
+ <div className="pdf-container">
24
+ {pdfLoading && (
25
+ <div className="pdf-loading-overlay">
26
+ <div className="loading-spinner" />
27
+ <p>Loading PDF...</p>
28
+ </div>
29
+ )}
30
+ <iframe
31
+ key={`pdf-${pdfUrl}-page-${viewerPage}`}
32
+ src={pdfJsViewerUrl}
33
+ className="pdf-frame"
34
+ title={`PDF Page ${viewerPage}`}
35
+ allow="fullscreen"
36
+ onLoad={() => setPdfLoading(false)}
37
+ />
38
+ </div>
39
  );
40
  }
app/globals.css CHANGED
@@ -228,6 +228,32 @@ h4 {
228
 
229
  /* ── PDF Viewer ─────────────────────────────────── */
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  .pdf-frame {
232
  flex: 1;
233
  width: 100%;
 
228
 
229
  /* ── PDF Viewer ─────────────────────────────────── */
230
 
231
+ .pdf-container {
232
+ flex: 1;
233
+ display: flex;
234
+ flex-direction: column;
235
+ position: relative;
236
+ min-height: 0;
237
+ }
238
+
239
+ .pdf-loading-overlay {
240
+ position: absolute;
241
+ inset: 0;
242
+ z-index: 5;
243
+ display: flex;
244
+ flex-direction: column;
245
+ align-items: center;
246
+ justify-content: center;
247
+ gap: 12px;
248
+ background: var(--pane-bg);
249
+ border-radius: 12px;
250
+ }
251
+
252
+ .pdf-loading-overlay p {
253
+ font-size: 0.85rem;
254
+ color: #94a3b8;
255
+ }
256
+
257
  .pdf-frame {
258
  flex: 1;
259
  width: 100%;
app/page.js CHANGED
@@ -78,11 +78,36 @@ export default function Home() {
78
  }
79
  }, []);
80
 
81
- // Update currentDoc when selection changes
82
  useEffect(() => {
83
  if (selectedDocIndex !== null) {
84
  const doc = documents.find(d => d.index === selectedDocIndex);
85
- setCurrentDoc(doc);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  }
87
  }, [selectedDocIndex, documents]);
88
 
 
78
  }
79
  }, []);
80
 
81
+ // Fetch annotatable pages on-demand when document selection changes
82
  useEffect(() => {
83
  if (selectedDocIndex !== null) {
84
  const doc = documents.find(d => d.index === selectedDocIndex);
85
+ if (!doc) return;
86
+
87
+ // If annotatable_pages already loaded (cached), use it
88
+ if (doc.annotatable_pages) {
89
+ setCurrentDoc(doc);
90
+ return;
91
+ }
92
+
93
+ // Fetch annotatable pages on-demand
94
+ setLoadingPage(true);
95
+ fetch(`/api/document?index=${selectedDocIndex}`)
96
+ .then(res => res.json())
97
+ .then(data => {
98
+ const updatedDoc = { ...doc, annotatable_pages: data.annotatable_pages || [] };
99
+ setCurrentDoc(updatedDoc);
100
+ // Cache in the documents array
101
+ setDocuments(prev => prev.map(d =>
102
+ d.index === selectedDocIndex ? updatedDoc : d
103
+ ));
104
+ setLoadingPage(false);
105
+ })
106
+ .catch(err => {
107
+ console.error("Failed to load document pages", err);
108
+ setCurrentDoc({ ...doc, annotatable_pages: [] });
109
+ setLoadingPage(false);
110
+ });
111
  }
112
  }, [selectedDocIndex, documents]);
113