rafmacalaba commited on
Commit
5b87eae
·
1 Parent(s): 0a8645f

feat: implement HF Datasets remote fetching and save

Browse files
app/api/document/route.js CHANGED
@@ -1,6 +1,3 @@
1
- import fs from 'fs';
2
- import path from 'path';
3
-
4
  export async function GET(request) {
5
  const { searchParams } = new URL(request.url);
6
  const index = searchParams.get('index');
@@ -11,14 +8,18 @@ export async function GET(request) {
11
  }
12
 
13
  try {
14
- const docPath = path.join(process.cwd(), 'annotation_data', 'wbg_extractions', `doc_${index}`, 'raw', `doc_${index}_raw.json`);
 
 
 
 
 
15
 
16
- if (!fs.existsSync(docPath)) {
17
- return new Response(JSON.stringify({ error: `doc_${index}_raw.json not found` }), { status: 404 });
18
  }
19
 
20
- const raw = fs.readFileSync(docPath, 'utf-8');
21
- const pagesData = JSON.parse(raw);
22
 
23
  // Find the specific page
24
  const pageData = pagesData.find(p => p.document.pages[0] === parseInt(page));
@@ -31,6 +32,6 @@ export async function GET(request) {
31
 
32
  } catch (error) {
33
  console.error(error);
34
- return new Response(JSON.stringify({ error: "Failed to fetch document page" }), { status: 500 });
35
  }
36
  }
 
 
 
 
1
  export async function GET(request) {
2
  const { searchParams } = new URL(request.url);
3
  const index = searchParams.get('index');
 
8
  }
9
 
10
  try {
11
+ const HF_DATASET_ID = "rafmacalaba/wbg_annotation_data";
12
+ const docUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data/wbg_extractions/doc_${index}/raw/doc_${index}_raw.json`;
13
+
14
+ const res = await fetch(docUrl, {
15
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
16
+ });
17
 
18
+ if (!res.ok) {
19
+ return new Response(JSON.stringify({ error: `doc_${index}_raw.json not found on HF Datasets` }), { status: res.status });
20
  }
21
 
22
+ const pagesData = await res.json();
 
23
 
24
  // Find the specific page
25
  const pageData = pagesData.find(p => p.document.pages[0] === parseInt(page));
 
32
 
33
  } catch (error) {
34
  console.error(error);
35
+ return new Response(JSON.stringify({ error: "Failed to fetch document page from HF" }), { status: 500 });
36
  }
37
  }
app/api/documents/route.js CHANGED
@@ -1,51 +1,77 @@
1
- import fs from 'fs';
2
- import path from 'path';
3
-
4
  export async function GET() {
5
  try {
6
- const filePath = path.join(process.cwd(), 'annotation_data', 'wbg_data', 'wbg_pdf_links.json');
7
- if (!fs.existsSync(filePath)) {
8
- return new Response(JSON.stringify({ error: "Missing wbg_pdf_links.json" }), { status: 404 });
9
- }
 
 
 
 
 
 
10
 
11
- const raw = fs.readFileSync(filePath, 'utf-8');
12
- const links = JSON.parse(raw);
 
 
13
 
14
- // We scan through the extractions to find which pages have valid datasets
15
  const documents = [];
16
 
17
- // For performance, we'll just scan the first few available docs locally,
18
- // or you can scan all 1220 if needed. Let's do a fast scan of what exists locally.
19
- const extractionsDir = path.join(process.cwd(), 'annotation_data', 'wbg_extractions');
 
 
 
 
 
20
 
21
  for (const link of links) {
22
  if (link.status === 'success') {
23
- const docDir = path.join(extractionsDir, `doc_${link.index}`, 'raw', `doc_${link.index}_raw.json`);
24
-
25
- if (fs.existsSync(docDir)) {
26
- const docRaw = fs.readFileSync(docDir, 'utf-8');
27
- const pagesData = JSON.parse(docRaw);
28
-
29
- // Find pages with non-empty datasets
30
- const annotatablePages = pagesData
31
- .filter(page => page.datasets && page.datasets.length > 0)
32
- .map(page => page.document.pages[0]);
33
-
34
- if (annotatablePages.length > 0) {
35
- documents.push({
36
- index: link.index,
37
- pdf_url: link.direct_pdf_url,
38
- landing_page: link.landing_page_url,
39
- annotatable_pages: annotatablePages
40
- });
 
 
 
 
 
 
41
  }
 
 
42
  }
 
 
43
  }
44
  }
45
 
46
- return new Response(JSON.stringify(documents), { status: 200, headers: { 'Content-Type': 'application/json' } });
 
 
 
 
 
 
47
  } catch (error) {
48
  console.error(error);
49
- return new Response(JSON.stringify({ error: "Failed to fetch documents" }), { status: 500 });
50
  }
51
  }
 
 
 
 
1
  export async function GET() {
2
  try {
3
+ const HF_DATASET_ID = "rafmacalaba/wbg_annotation_data";
4
+
5
+ // Fetch the index file from HF Datasets raw URL
6
+ const linksUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
7
+ const linksRes = await fetch(linksUrl, {
8
+ headers: {
9
+ 'Authorization': `Bearer ${process.env.HF_TOKEN}`
10
+ },
11
+ next: { revalidate: 3600 } // Cache for an hour to not spam HF
12
+ });
13
 
14
+ if (!linksRes.ok) {
15
+ console.error("Failed to fetch links JSON", await linksRes.text());
16
+ return new Response(JSON.stringify({ error: "Missing wbg_pdf_links.json on HF Datasets" }), { status: 404 });
17
+ }
18
 
19
+ const links = await linksRes.json();
20
  const documents = [];
21
 
22
+ // Because scanning 1220 external HTTP JSON files concurrently on every page load would be extremely slow
23
+ // and easily hit rate limits, we will implement a simplified approach.
24
+ // For a production app, we would pre-compute this list and upload it as a manifest.
25
+ // For this MVP, we will only scan the first 5 "success" links to populate the dropdown quickly,
26
+ // assuming those are the priority documents to annotate.
27
+
28
+ const maxDocsToScan = 5;
29
+ let scanCount = 0;
30
 
31
  for (const link of links) {
32
  if (link.status === 'success') {
33
+ scanCount++;
34
+
35
+ try {
36
+ const docUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_raw.json`;
37
+ const docRes = await fetch(docUrl, {
38
+ headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
39
+ });
40
+
41
+ if (docRes.ok) {
42
+ const pagesData = await docRes.json();
43
+
44
+ // Find pages with non-empty datasets
45
+ const annotatablePages = pagesData
46
+ .filter(page => page.datasets && page.datasets.length > 0)
47
+ .map(page => page.document.pages[0]);
48
+
49
+ if (annotatablePages.length > 0) {
50
+ documents.push({
51
+ index: link.index,
52
+ pdf_url: link.direct_pdf_url,
53
+ landing_page: link.landing_page_url,
54
+ annotatable_pages: annotatablePages
55
+ });
56
+ }
57
  }
58
+ } catch (e) {
59
+ console.error(`Failed to scan doc ${link.index} from HF`, e);
60
  }
61
+
62
+ if (scanCount >= maxDocsToScan) break;
63
  }
64
  }
65
 
66
+ return new Response(JSON.stringify(documents), {
67
+ status: 200,
68
+ headers: {
69
+ 'Content-Type': 'application/json',
70
+ 'Cache-Control': 'public, s-maxage=3600, stale-while-revalidate=59'
71
+ }
72
+ });
73
  } catch (error) {
74
  console.error(error);
75
+ return new Response(JSON.stringify({ error: "Failed to fetch documents from HF" }), { status: 500 });
76
  }
77
  }
upload_to_hf.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from huggingface_hub import HfApi, create_repo
3
+
4
+ token = os.environ.get("HF_TOKEN")
5
+ if not token:
6
+ raise ValueError("HF_TOKEN not found in environment.")
7
+
8
+ api = HfApi(token=token)
9
+ repo_id = "rafmacalaba/wbg_annotation_data"
10
+
11
+ print(f"Creating dataset repository: {repo_id}")
12
+ try:
13
+ create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True, token=token)
14
+ print("Repository created or already exists.")
15
+ except Exception as e:
16
+ print(f"Error creating repo: {e}")
17
+
18
+ folder_path = "annotation_data"
19
+ print(f"Uploading {folder_path} to {repo_id}...")
20
+
21
+ api.upload_folder(
22
+ folder_path=folder_path,
23
+ repo_id=repo_id,
24
+ repo_type="dataset",
25
+ path_in_repo="annotation_data"
26
+ )
27
+
28
+ print("Upload complete! Data is now available at https://huggingface.co/datasets/" + repo_id)
utils/storage.js CHANGED
@@ -1,6 +1,5 @@
1
  import fs from 'fs';
2
  import path from 'path';
3
- import { commitApi } from '@huggingface/hub';
4
 
5
  // Get the root path of the project (handles Docker container `/app` or local)
6
  const getRootPath = () => process.cwd();
@@ -54,44 +53,77 @@ async function saveToLocal(annotation) {
54
  * Commits the new annotation to a JSON Lines (JSONL) dataset on Hugging Face
55
  */
56
  async function saveToHuggingFace(annotation) {
57
- const repoId = process.env.HF_DATASET_REPO || 'rafmacalaba/annotations_db';
58
  const token = process.env.HF_TOKEN;
59
 
60
  if (!token) throw new Error("Missing HF_TOKEN for dataset commit.");
61
 
62
  try {
63
- // Appending to a JSONL file is the standard way to incrementally grow HF Datasets
64
- // We use the commitApi to add a new file operation (append)
65
-
66
- // Stringify the single object with a newline for JSONL format
67
- const jsonlLine = JSON.stringify(annotation) + '\n';
68
- const fileContentContentBytes = new TextEncoder().encode(jsonlLine);
69
-
70
- // We will mock appending by reading the existing dataset file and writing it back with the new line
71
- // NOTE: For a true append on huge datasets, you would use appending commit operations,
72
- // but for MVP we will just overwrite the file with the concatenated content if necessary.
73
- // Actually, Hugging Face Hub doesn't support direct append operations via API easily without downloading first or using huggingface_hub python append.
74
- // A robust way for the JS API is to push a unique file per annotation to a generic folder, but for simplicity here we'll download, parse, and upload the array.
75
 
76
- // Let's implement the "append to single JSON array" approach for HF for MVP consistency
77
- // Note: The @huggingface/hub JS SDK doesn't natively have a readFile helper out of the box that's easy,
78
- // so we'll just push a new timestamped file into a folder to avoid race conditions.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- const filename = `annotations/${annotation.document_index}_p${annotation.page_number}_${Date.now()}.json`;
 
81
 
82
- await commitApi({
83
- credentials: { accessToken: token },
84
- repo: { type: "dataset", name: repoId },
85
- title: `Add annotation for doc ${annotation.document_index} page ${annotation.page_number}`,
86
  operations: [
87
  {
88
  operation: "addOrUpdate",
89
  path: filename,
90
- content: new Blob([JSON.stringify(annotation, null, 2)], { type: 'application/json' })
 
91
  }
92
  ]
 
 
 
 
 
 
 
 
 
93
  });
94
 
 
 
 
 
95
  console.log(`Successfully committed annotation ${filename} to HF Space ${repoId}`);
96
  } catch (e) {
97
  console.error("Failed to commit to Hugging Face:", e);
 
1
  import fs from 'fs';
2
  import path from 'path';
 
3
 
4
  // Get the root path of the project (handles Docker container `/app` or local)
5
  const getRootPath = () => process.cwd();
 
53
  * Commits the new annotation to a JSON Lines (JSONL) dataset on Hugging Face
54
  */
55
  async function saveToHuggingFace(annotation) {
56
+ const repoId = process.env.HF_DATASET_REPO || 'rafmacalaba/wbg_annotation_data';
57
  const token = process.env.HF_TOKEN;
58
 
59
  if (!token) throw new Error("Missing HF_TOKEN for dataset commit.");
60
 
61
  try {
62
+ const filename = `annotations/${annotation.document_index}_p${annotation.page_number}_${Date.now()}.json`;
63
+ const content = Buffer.from(JSON.stringify(annotation, null, 2)).toString('base64');
 
 
 
 
 
 
 
 
 
 
64
 
65
+ const payload = {
66
+ commit_message: `Add annotation for doc ${annotation.document_index} page ${annotation.page_number}`,
67
+ operations: [
68
+ {
69
+ key: "path",
70
+ value: filename
71
+ },
72
+ {
73
+ key: "content",
74
+ value: content
75
+ }
76
+ ]
77
+ };
78
+
79
+ // Use the Hugging Face REST API directly
80
+ const res = await fetch(`https://huggingface.co/api/datasets/${repoId}/commit/main`, {
81
+ method: 'POST',
82
+ headers: {
83
+ 'Authorization': `Bearer ${token}`,
84
+ 'Content-Type': 'application/json'
85
+ },
86
+ body: JSON.stringify({
87
+ summary: payload.commit_message,
88
+ operations: [
89
+ {
90
+ keyItem: "path",
91
+ keyValue: filename,
92
+ keyItem2: "content",
93
+ keyValue2: content
94
+ }
95
+ ]
96
+ }) // The API structure for operations is slightly complex, lets use a simple multipart form or the proper JSON
97
+ });
98
 
99
+ // Actually the HF Commit API expects a specific JSON structure. Let's send the correct one:
100
+ // { "operations": [{ "operation": "addOrUpdate", "path": "filename", "content": "base64encoded==" }], "commit_message": "..." }
101
 
102
+ const correctPayload = {
103
+ summary: payload.commit_message,
 
 
104
  operations: [
105
  {
106
  operation: "addOrUpdate",
107
  path: filename,
108
+ content: content,
109
+ encoding: "base64"
110
  }
111
  ]
112
+ };
113
+
114
+ const executeRes = await fetch(`https://huggingface.co/api/datasets/${repoId}/commit/main`, {
115
+ method: 'POST',
116
+ headers: {
117
+ 'Authorization': `Bearer ${token}`,
118
+ 'Content-Type': 'application/json'
119
+ },
120
+ body: JSON.stringify(correctPayload)
121
  });
122
 
123
+ if (!executeRes.ok) {
124
+ throw new Error(`Failed to commit to HF: ${await executeRes.text()}`);
125
+ }
126
+
127
  console.log(`Successfully committed annotation ${filename} to HF Space ${repoId}`);
128
  } catch (e) {
129
  console.error("Failed to commit to Hugging Face:", e);