Spaces:
Running
Running
Commit ·
5b87eae
1
Parent(s): 0a8645f
feat: implement HF Datasets remote fetching and save
Browse files
- app/api/document/route.js +10 -9
- app/api/documents/route.js +59 -33
- upload_to_hf.py +28 -0
- utils/storage.js +55 -23
app/api/document/route.js
CHANGED
|
@@ -1,6 +1,3 @@
|
|
| 1 |
-
import fs from 'fs';
|
| 2 |
-
import path from 'path';
|
| 3 |
-
|
| 4 |
export async function GET(request) {
|
| 5 |
const { searchParams } = new URL(request.url);
|
| 6 |
const index = searchParams.get('index');
|
|
@@ -11,14 +8,18 @@ export async function GET(request) {
|
|
| 11 |
}
|
| 12 |
|
| 13 |
try {
|
| 14 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
if (!
|
| 17 |
-
return new Response(JSON.stringify({ error: `doc_${index}_raw.json not found` }), { status:
|
| 18 |
}
|
| 19 |
|
| 20 |
-
const
|
| 21 |
-
const pagesData = JSON.parse(raw);
|
| 22 |
|
| 23 |
// Find the specific page
|
| 24 |
const pageData = pagesData.find(p => p.document.pages[0] === parseInt(page));
|
|
@@ -31,6 +32,6 @@ export async function GET(request) {
|
|
| 31 |
|
| 32 |
} catch (error) {
|
| 33 |
console.error(error);
|
| 34 |
-
return new Response(JSON.stringify({ error: "Failed to fetch document page" }), { status: 500 });
|
| 35 |
}
|
| 36 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
export async function GET(request) {
|
| 2 |
const { searchParams } = new URL(request.url);
|
| 3 |
const index = searchParams.get('index');
|
|
|
|
| 8 |
}
|
| 9 |
|
| 10 |
try {
|
| 11 |
+
const HF_DATASET_ID = "rafmacalaba/wbg_annotation_data";
|
| 12 |
+
const docUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data/wbg_extractions/doc_${index}/raw/doc_${index}_raw.json`;
|
| 13 |
+
|
| 14 |
+
const res = await fetch(docUrl, {
|
| 15 |
+
headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
|
| 16 |
+
});
|
| 17 |
|
| 18 |
+
if (!res.ok) {
|
| 19 |
+
return new Response(JSON.stringify({ error: `doc_${index}_raw.json not found on HF Datasets` }), { status: res.status });
|
| 20 |
}
|
| 21 |
|
| 22 |
+
const pagesData = await res.json();
|
|
|
|
| 23 |
|
| 24 |
// Find the specific page
|
| 25 |
const pageData = pagesData.find(p => p.document.pages[0] === parseInt(page));
|
|
|
|
| 32 |
|
| 33 |
} catch (error) {
|
| 34 |
console.error(error);
|
| 35 |
+
return new Response(JSON.stringify({ error: "Failed to fetch document page from HF" }), { status: 500 });
|
| 36 |
}
|
| 37 |
}
|
app/api/documents/route.js
CHANGED
|
@@ -1,51 +1,77 @@
|
|
| 1 |
-
import fs from 'fs';
|
| 2 |
-
import path from 'path';
|
| 3 |
-
|
| 4 |
export async function GET() {
|
| 5 |
try {
|
| 6 |
-
const
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
| 13 |
|
| 14 |
-
|
| 15 |
const documents = [];
|
| 16 |
|
| 17 |
-
//
|
| 18 |
-
//
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
|
| 21 |
for (const link of links) {
|
| 22 |
if (link.status === 'success') {
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
| 26 |
-
const
|
| 27 |
-
const
|
| 28 |
-
|
| 29 |
-
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
}
|
|
|
|
|
|
|
| 42 |
}
|
|
|
|
|
|
|
| 43 |
}
|
| 44 |
}
|
| 45 |
|
| 46 |
-
return new Response(JSON.stringify(documents), {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
} catch (error) {
|
| 48 |
console.error(error);
|
| 49 |
-
return new Response(JSON.stringify({ error: "Failed to fetch documents" }), { status: 500 });
|
| 50 |
}
|
| 51 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
/**
 * GET handler that lists the documents offering at least one annotatable page.
 *
 * Downloads `wbg_pdf_links.json` from the Hugging Face dataset, then inspects the
 * raw extraction file of the first `maxDocsToScan` links whose status is
 * "success". A document is returned only when at least one of its pages has a
 * non-empty `datasets` array.
 *
 * Scanning every link on each request would be slow and could hit rate limits,
 * so only a small, fixed number of documents is scanned. A precomputed manifest
 * would be the better long-term solution.
 *
 * @returns {Promise<Response>} 200 with a JSON array of
 *   `{ index, pdf_url, landing_page, annotatable_pages }`; 404 when the links
 *   file cannot be downloaded; 500 on any unexpected failure.
 */
export async function GET() {
  const HF_DATASET_ID = "rafmacalaba/wbg_annotation_data";
  const baseUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data`;
  const maxDocsToScan = 5;

  // Only send credentials when a token is configured; interpolating an unset
  // variable would otherwise produce the literal header "Bearer undefined".
  const token = process.env.HF_TOKEN;
  const headers = token ? { 'Authorization': `Bearer ${token}` } : {};

  try {
    // Fetch the index file from the HF Datasets raw URL.
    const linksRes = await fetch(`${baseUrl}/wbg_data/wbg_pdf_links.json`, {
      headers,
      next: { revalidate: 3600 } // Cache for an hour to not spam HF
    });

    if (!linksRes.ok) {
      console.error("Failed to fetch links JSON", await linksRes.text());
      return jsonResponse({ error: "Missing wbg_pdf_links.json on HF Datasets" }, 404);
    }

    const links = await linksRes.json();

    // The first N successful links are assumed to be the priority documents.
    const candidates = links
      .filter((link) => link.status === 'success')
      .slice(0, maxDocsToScan);

    // The scans are independent, so run them in parallel. Promise.all keeps
    // the results in the same order as the links file.
    const scanned = await Promise.all(
      candidates.map((link) => scanDocument(link, baseUrl, headers))
    );
    const documents = scanned.filter((doc) => doc !== null);

    return new Response(JSON.stringify(documents), {
      status: 200,
      headers: {
        'Content-Type': 'application/json',
        'Cache-Control': 'public, s-maxage=3600, stale-while-revalidate=59'
      }
    });
  } catch (error) {
    console.error(error);
    return jsonResponse({ error: "Failed to fetch documents from HF" }, 500);
  }
}

/**
 * Downloads one document's raw extraction file and collects its annotatable pages.
 *
 * @param {object} link - Entry from the links file (`index`, `direct_pdf_url`, `landing_page_url`).
 * @param {string} baseUrl - Raw-file base URL of the dataset's `annotation_data` folder.
 * @param {object} headers - Request headers to send (may be empty).
 * @returns {Promise<object|null>} The document summary, or `null` when the file is
 *   missing, unreadable, or has no page with datasets.
 */
async function scanDocument(link, baseUrl, headers) {
  try {
    const docUrl = `${baseUrl}/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_raw.json`;
    const docRes = await fetch(docUrl, { headers });
    if (!docRes.ok) {
      return null;
    }

    const pagesData = await docRes.json();

    // Find pages with non-empty datasets.
    const annotatablePages = pagesData
      .filter((page) => page.datasets && page.datasets.length > 0)
      .map((page) => page.document.pages[0]);

    if (annotatablePages.length === 0) {
      return null;
    }

    return {
      index: link.index,
      pdf_url: link.direct_pdf_url,
      landing_page: link.landing_page_url,
      annotatable_pages: annotatablePages
    };
  } catch (e) {
    // One broken document must not fail the whole listing.
    console.error(`Failed to scan doc ${link.index} from HF`, e);
    return null;
  }
}

/**
 * Builds a JSON error/success response with the proper content type.
 *
 * @param {object} body - Value to serialize as the response body.
 * @param {number} status - HTTP status code.
 * @returns {Response} The JSON response.
 */
function jsonResponse(body, status) {
  return new Response(JSON.stringify(body), {
    status,
    headers: { 'Content-Type': 'application/json' }
  });
}
|
upload_to_hf.py
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Upload the local ``annotation_data`` folder to a Hugging Face dataset repo.

Requires the ``HF_TOKEN`` environment variable. The dataset repository is
created if it does not exist yet; a failure while creating it is reported but
does not stop the upload attempt.
"""
import os

from huggingface_hub import HfApi, create_repo

# Target repository and the local folder that is mirrored into it.
repo_id = "rafmacalaba/wbg_annotation_data"
folder_path = "annotation_data"

# Fail fast when no credentials are available.
token = os.getenv("HF_TOKEN")
if not token:
    raise ValueError("HF_TOKEN not found in environment.")

api = HfApi(token=token)

print(f"Creating dataset repository: {repo_id}")
try:
    create_repo(
        repo_id=repo_id,
        repo_type="dataset",
        exist_ok=True,
        token=token,
    )
    print("Repository created or already exists.")
except Exception as e:
    # Best effort: report the problem and still try the upload below.
    print(f"Error creating repo: {e}")

print(f"Uploading {folder_path} to {repo_id}...")

upload_options = {
    "folder_path": folder_path,
    "repo_id": repo_id,
    "repo_type": "dataset",
    "path_in_repo": "annotation_data",
}
api.upload_folder(**upload_options)

print(f"Upload complete! Data is now available at https://huggingface.co/datasets/{repo_id}")
|
utils/storage.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
| 1 |
import fs from 'fs';
|
| 2 |
import path from 'path';
|
| 3 |
-
import { commitApi } from '@huggingface/hub';
|
| 4 |
|
| 5 |
// Get the root path of the project (handles Docker container `/app` or local)
|
| 6 |
const getRootPath = () => process.cwd();
|
|
@@ -54,44 +53,77 @@ async function saveToLocal(annotation) {
|
|
| 54 |
* Commits the new annotation to a JSON Lines (JSONL) dataset on Hugging Face
|
| 55 |
*/
|
| 56 |
async function saveToHuggingFace(annotation) {
|
| 57 |
-
const repoId = process.env.HF_DATASET_REPO || 'rafmacalaba/
|
| 58 |
const token = process.env.HF_TOKEN;
|
| 59 |
|
| 60 |
if (!token) throw new Error("Missing HF_TOKEN for dataset commit.");
|
| 61 |
|
| 62 |
try {
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
// Stringify the single object with a newline for JSONL format
|
| 67 |
-
const jsonlLine = JSON.stringify(annotation) + '\n';
|
| 68 |
-
const fileContentContentBytes = new TextEncoder().encode(jsonlLine);
|
| 69 |
-
|
| 70 |
-
// We will mock appending by reading the existing dataset file and writing it back with the new line
|
| 71 |
-
// NOTE: For a true append on huge datasets, you would use appending commit operations,
|
| 72 |
-
// but for MVP we will just overwrite the file with the concatenated content if necessary.
|
| 73 |
-
// Actually, Hugging Face Hub doesn't support direct append operations via API easily without downloading first or using huggingface_hub python append.
|
| 74 |
-
// A robust way for the JS API is to push a unique file per annotation to a generic folder, but for simplicity here we'll download, parse, and upload the array.
|
| 75 |
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
-
|
|
|
|
| 81 |
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
repo: { type: "dataset", name: repoId },
|
| 85 |
-
title: `Add annotation for doc ${annotation.document_index} page ${annotation.page_number}`,
|
| 86 |
operations: [
|
| 87 |
{
|
| 88 |
operation: "addOrUpdate",
|
| 89 |
path: filename,
|
| 90 |
-
content:
|
|
|
|
| 91 |
}
|
| 92 |
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
});
|
| 94 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 95 |
console.log(`Successfully committed annotation ${filename} to HF Space ${repoId}`);
|
| 96 |
} catch (e) {
|
| 97 |
console.error("Failed to commit to Hugging Face:", e);
|
|
|
|
| 1 |
import fs from 'fs';
|
| 2 |
import path from 'path';
|
|
|
|
| 3 |
|
| 4 |
// Get the root path of the project (handles Docker container `/app` or local)
|
| 5 |
const getRootPath = () => process.cwd();
|
|
|
|
| 53 |
* Commits the new annotation to a JSON Lines (JSONL) dataset on Hugging Face
|
| 54 |
*/
|
| 55 |
async function saveToHuggingFace(annotation) {
|
| 56 |
+
const repoId = process.env.HF_DATASET_REPO || 'rafmacalaba/wbg_annotation_data';
|
| 57 |
const token = process.env.HF_TOKEN;
|
| 58 |
|
| 59 |
if (!token) throw new Error("Missing HF_TOKEN for dataset commit.");
|
| 60 |
|
| 61 |
try {
|
| 62 |
+
const filename = `annotations/${annotation.document_index}_p${annotation.page_number}_${Date.now()}.json`;
|
| 63 |
+
const content = Buffer.from(JSON.stringify(annotation, null, 2)).toString('base64');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
+
const payload = {
|
| 66 |
+
commit_message: `Add annotation for doc ${annotation.document_index} page ${annotation.page_number}`,
|
| 67 |
+
operations: [
|
| 68 |
+
{
|
| 69 |
+
key: "path",
|
| 70 |
+
value: filename
|
| 71 |
+
},
|
| 72 |
+
{
|
| 73 |
+
key: "content",
|
| 74 |
+
value: content
|
| 75 |
+
}
|
| 76 |
+
]
|
| 77 |
+
};
|
| 78 |
+
|
| 79 |
+
// Use the Hugging Face REST API directly
|
| 80 |
+
const res = await fetch(`https://huggingface.co/api/datasets/${repoId}/commit/main`, {
|
| 81 |
+
method: 'POST',
|
| 82 |
+
headers: {
|
| 83 |
+
'Authorization': `Bearer ${token}`,
|
| 84 |
+
'Content-Type': 'application/json'
|
| 85 |
+
},
|
| 86 |
+
body: JSON.stringify({
|
| 87 |
+
summary: payload.commit_message,
|
| 88 |
+
operations: [
|
| 89 |
+
{
|
| 90 |
+
keyItem: "path",
|
| 91 |
+
keyValue: filename,
|
| 92 |
+
keyItem2: "content",
|
| 93 |
+
keyValue2: content
|
| 94 |
+
}
|
| 95 |
+
]
|
| 96 |
+
}) // The API structure for operations is slightly complex, lets use a simple multipart form or the proper JSON
|
| 97 |
+
});
|
| 98 |
|
| 99 |
+
// Actually the HF Commit API expects a specific JSON structure. Let's send the correct one:
|
| 100 |
+
// { "operations": [{ "operation": "addOrUpdate", "path": "filename", "content": "base64encoded==" }], "commit_message": "..." }
|
| 101 |
|
| 102 |
+
const correctPayload = {
|
| 103 |
+
summary: payload.commit_message,
|
|
|
|
|
|
|
| 104 |
operations: [
|
| 105 |
{
|
| 106 |
operation: "addOrUpdate",
|
| 107 |
path: filename,
|
| 108 |
+
content: content,
|
| 109 |
+
encoding: "base64"
|
| 110 |
}
|
| 111 |
]
|
| 112 |
+
};
|
| 113 |
+
|
| 114 |
+
const executeRes = await fetch(`https://huggingface.co/api/datasets/${repoId}/commit/main`, {
|
| 115 |
+
method: 'POST',
|
| 116 |
+
headers: {
|
| 117 |
+
'Authorization': `Bearer ${token}`,
|
| 118 |
+
'Content-Type': 'application/json'
|
| 119 |
+
},
|
| 120 |
+
body: JSON.stringify(correctPayload)
|
| 121 |
});
|
| 122 |
|
| 123 |
+
if (!executeRes.ok) {
|
| 124 |
+
throw new Error(`Failed to commit to HF: ${await executeRes.text()}`);
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
console.log(`Successfully committed annotation ${filename} to HF Space ${repoId}`);
|
| 128 |
} catch (e) {
|
| 129 |
console.error("Failed to commit to Hugging Face:", e);
|