Spaces:

ai4data
/

data-use-annotation

Running

App Files Files Community

rafmacalaba commited on Feb 23

Commit

5b87eae

1 Parent(s): 0a8645f

feat: implement HF Datasets remote fetching and save

Browse files

Files changed (4) hide show

app/api/document/route.js +10 -9
app/api/documents/route.js +59 -33
upload_to_hf.py +28 -0
utils/storage.js +55 -23

app/api/document/route.js CHANGED Viewed

@@ -1,6 +1,3 @@
-import fs from 'fs';
-import path from 'path';
 export async function GET(request) {
     const { searchParams } = new URL(request.url);
     const index = searchParams.get('index');
@@ -11,14 +8,18 @@ export async function GET(request) {
     }
     try {
-        const docPath = path.join(process.cwd(), 'annotation_data', 'wbg_extractions', `doc_${index}`, 'raw', `doc_${index}_raw.json`);
-        if (!fs.existsSync(docPath)) {
-            return new Response(JSON.stringify({ error: `doc_${index}_raw.json not found` }), { status: 404 });
         }
-        const raw = fs.readFileSync(docPath, 'utf-8');
-        const pagesData = JSON.parse(raw);
         // Find the specific page
         const pageData = pagesData.find(p => p.document.pages[0] === parseInt(page));
@@ -31,6 +32,6 @@ export async function GET(request) {
     } catch (error) {
         console.error(error);
-        return new Response(JSON.stringify({ error: "Failed to fetch document page" }), { status: 500 });
     }
 }

 export async function GET(request) {
     const { searchParams } = new URL(request.url);
     const index = searchParams.get('index');
     }
     try {
+        const HF_DATASET_ID = "rafmacalaba/wbg_annotation_data";
+        const docUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data/wbg_extractions/doc_${index}/raw/doc_${index}_raw.json`;
+        const res = await fetch(docUrl, {
+            headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
+        });
+        if (!res.ok) {
+            return new Response(JSON.stringify({ error: `doc_${index}_raw.json not found on HF Datasets` }), { status: res.status });
         }
+        const pagesData = await res.json();
         // Find the specific page
         const pageData = pagesData.find(p => p.document.pages[0] === parseInt(page));
     } catch (error) {
         console.error(error);
+        return new Response(JSON.stringify({ error: "Failed to fetch document page from HF" }), { status: 500 });
     }
 }

app/api/documents/route.js CHANGED Viewed

@@ -1,51 +1,77 @@
-import fs from 'fs';
-import path from 'path';
 export async function GET() {
     try {
-        const filePath = path.join(process.cwd(), 'annotation_data', 'wbg_data', 'wbg_pdf_links.json');
-        if (!fs.existsSync(filePath)) {
-            return new Response(JSON.stringify({ error: "Missing wbg_pdf_links.json" }), { status: 404 });
-        }
-        const raw = fs.readFileSync(filePath, 'utf-8');
-        const links = JSON.parse(raw);
-        // We scan through the extractions to find which pages have valid datasets
         const documents = [];
-        // For performance, we'll just scan the first few available docs locally,
-        // or you can scan all 1220 if needed. Let's do a fast scan of what exists locally.
-        const extractionsDir = path.join(process.cwd(), 'annotation_data', 'wbg_extractions');
         for (const link of links) {
             if (link.status === 'success') {
-                const docDir = path.join(extractionsDir, `doc_${link.index}`, 'raw', `doc_${link.index}_raw.json`);
-                if (fs.existsSync(docDir)) {
-                    const docRaw = fs.readFileSync(docDir, 'utf-8');
-                    const pagesData = JSON.parse(docRaw);
-                    // Find pages with non-empty datasets
-                    const annotatablePages = pagesData
-                        .filter(page => page.datasets && page.datasets.length > 0)
-                        .map(page => page.document.pages[0]);
-                    if (annotatablePages.length > 0) {
-                        documents.push({
-                            index: link.index,
-                            pdf_url: link.direct_pdf_url,
-                            landing_page: link.landing_page_url,
-                            annotatable_pages: annotatablePages
-                        });
                     }
                 }
             }
         }
-        return new Response(JSON.stringify(documents), { status: 200, headers: { 'Content-Type': 'application/json' } });
     } catch (error) {
         console.error(error);
-        return new Response(JSON.stringify({ error: "Failed to fetch documents" }), { status: 500 });
     }
 }

 export async function GET() {
     try {
+        const HF_DATASET_ID = "rafmacalaba/wbg_annotation_data";
+        // Fetch the index file from HF Datasets raw URL
+        const linksUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data/wbg_data/wbg_pdf_links.json`;
+        const linksRes = await fetch(linksUrl, {
+            headers: {
+                'Authorization': `Bearer ${process.env.HF_TOKEN}`
+            },
+            next: { revalidate: 3600 } // Cache for an hour to not spam HF
+        });
+        if (!linksRes.ok) {
+            console.error("Failed to fetch links JSON", await linksRes.text());
+            return new Response(JSON.stringify({ error: "Missing wbg_pdf_links.json on HF Datasets" }), { status: 404 });
+        }
+        const links = await linksRes.json();
         const documents = [];
+        // Because scanning 1220 external HTTP JSON files concurrently on every page load would be extremely slow
+        // and easily hit rate limits, we will implement a simplified approach.
+        // For a production app, we would pre-compute this list and upload it as a manifest.
+        // For this MVP, we will only scan the first 5 "success" links to populate the dropdown quickly,
+        // assuming those are the priority documents to annotate.
+        const maxDocsToScan = 5;
+        let scanCount = 0;
         for (const link of links) {
             if (link.status === 'success') {
+                scanCount++;
+                try {
+                    const docUrl = `https://huggingface.co/datasets/${HF_DATASET_ID}/raw/main/annotation_data/wbg_extractions/doc_${link.index}/raw/doc_${link.index}_raw.json`;
+                    const docRes = await fetch(docUrl, {
+                        headers: { 'Authorization': `Bearer ${process.env.HF_TOKEN}` }
+                    });
+                    if (docRes.ok) {
+                        const pagesData = await docRes.json();
+                        // Find pages with non-empty datasets
+                        const annotatablePages = pagesData
+                            .filter(page => page.datasets && page.datasets.length > 0)
+                            .map(page => page.document.pages[0]);
+                        if (annotatablePages.length > 0) {
+                            documents.push({
+                                index: link.index,
+                                pdf_url: link.direct_pdf_url,
+                                landing_page: link.landing_page_url,
+                                annotatable_pages: annotatablePages
+                            });
+                        }
                     }
+                } catch (e) {
+                    console.error(`Failed to scan doc ${link.index} from HF`, e);
                 }
+                if (scanCount >= maxDocsToScan) break;
             }
         }
+        return new Response(JSON.stringify(documents), {
+            status: 200,
+            headers: {
+                'Content-Type': 'application/json',
+                'Cache-Control': 'public, s-maxage=3600, stale-while-revalidate=59'
+            }
+        });
     } catch (error) {
         console.error(error);
+        return new Response(JSON.stringify({ error: "Failed to fetch documents from HF" }), { status: 500 });
     }
 }

upload_to_hf.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import os
+from huggingface_hub import HfApi, create_repo
+token = os.environ.get("HF_TOKEN")
+if not token:
+    raise ValueError("HF_TOKEN not found in environment.")
+api = HfApi(token=token)
+repo_id = "rafmacalaba/wbg_annotation_data"
+print(f"Creating dataset repository: {repo_id}")
+try:
+    create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True, token=token)
+    print("Repository created or already exists.")
+except Exception as e:
+    print(f"Error creating repo: {e}")
+folder_path = "annotation_data"
+print(f"Uploading {folder_path} to {repo_id}...")
+api.upload_folder(
+    folder_path=folder_path,
+    repo_id=repo_id,
+    repo_type="dataset",
+    path_in_repo="annotation_data"
+)
+print("Upload complete! Data is now available at https://huggingface.co/datasets/" + repo_id)

utils/storage.js CHANGED Viewed

@@ -1,6 +1,5 @@
 import fs from 'fs';
 import path from 'path';
-import { commitApi } from '@huggingface/hub';
 // Get the root path of the project (handles Docker container `/app` or local)
 const getRootPath = () => process.cwd();
@@ -54,44 +53,77 @@ async function saveToLocal(annotation) {
  * Commits the new annotation to a JSON Lines (JSONL) dataset on Hugging Face
  */
 async function saveToHuggingFace(annotation) {
-    const repoId = process.env.HF_DATASET_REPO || 'rafmacalaba/annotations_db';
     const token = process.env.HF_TOKEN;
     if (!token) throw new Error("Missing HF_TOKEN for dataset commit.");
     try {
-        // Appending to a JSONL file is the standard way to incrementally grow HF Datasets
-        // We use the commitApi to add a new file operation (append)
-        // Stringify the single object with a newline for JSONL format
-        const jsonlLine = JSON.stringify(annotation) + '\n';
-        const fileContentContentBytes = new TextEncoder().encode(jsonlLine);
-        // We will mock appending by reading the existing dataset file and writing it back with the new line
-        // NOTE: For a true append on huge datasets, you would use appending commit operations,
-        // but for MVP we will just overwrite the file with the concatenated content if necessary.
-        // Actually, Hugging Face Hub doesn't support direct append operations via API easily without downloading first or using huggingface_hub python append.
-        // A robust way for the JS API is to push a unique file per annotation to a generic folder, but for simplicity here we'll download, parse, and upload the array.
-        // Let's implement the "append to single JSON array" approach for HF for MVP consistency
-        // Note: The @huggingface/hub JS SDK doesn't natively have a readFile helper out of the box that's easy,
-        // so we'll just push a new timestamped file into a folder to avoid race conditions.
-        const filename = `annotations/${annotation.document_index}_p${annotation.page_number}_${Date.now()}.json`;
-        await commitApi({
-            credentials: { accessToken: token },
-            repo: { type: "dataset", name: repoId },
-            title: `Add annotation for doc ${annotation.document_index} page ${annotation.page_number}`,
             operations: [
                 {
                     operation: "addOrUpdate",
                     path: filename,
-                    content: new Blob([JSON.stringify(annotation, null, 2)], { type: 'application/json' })
                 }
             ]
         });
         console.log(`Successfully committed annotation ${filename} to HF Space ${repoId}`);
     } catch (e) {
         console.error("Failed to commit to Hugging Face:", e);

 import fs from 'fs';
 import path from 'path';
 // Get the root path of the project (handles Docker container `/app` or local)
 const getRootPath = () => process.cwd();
  * Commits the new annotation to a JSON Lines (JSONL) dataset on Hugging Face
  */
 async function saveToHuggingFace(annotation) {
+    const repoId = process.env.HF_DATASET_REPO || 'rafmacalaba/wbg_annotation_data';
     const token = process.env.HF_TOKEN;
     if (!token) throw new Error("Missing HF_TOKEN for dataset commit.");
     try {
+        const filename = `annotations/${annotation.document_index}_p${annotation.page_number}_${Date.now()}.json`;
+        const content = Buffer.from(JSON.stringify(annotation, null, 2)).toString('base64');
+        const payload = {
+            commit_message: `Add annotation for doc ${annotation.document_index} page ${annotation.page_number}`,
+            operations: [
+                {
+                    key: "path",
+                    value: filename
+                },
+                {
+                    key: "content",
+                    value: content
+                }
+            ]
+        };
+        // Use the Hugging Face REST API directly
+        const res = await fetch(`https://huggingface.co/api/datasets/${repoId}/commit/main`, {
+            method: 'POST',
+            headers: {
+                'Authorization': `Bearer ${token}`,
+                'Content-Type': 'application/json'
+            },
+            body: JSON.stringify({
+                summary: payload.commit_message,
+                operations: [
+                    {
+                        keyItem: "path",
+                        keyValue: filename,
+                        keyItem2: "content",
+                        keyValue2: content
+                    }
+                ]
+            }) // The API structure for operations is slightly complex, lets use a simple multipart form or the proper JSON
+        });
+        // Actually the HF Commit API expects a specific JSON structure. Let's send the correct one:
+        // { "operations": [{ "operation": "addOrUpdate", "path": "filename", "content": "base64encoded==" }], "commit_message": "..." }
+        const correctPayload = {
+            summary: payload.commit_message,
             operations: [
                 {
                     operation: "addOrUpdate",
                     path: filename,
+                    content: content,
+                    encoding: "base64"
                 }
             ]
+        };
+        const executeRes = await fetch(`https://huggingface.co/api/datasets/${repoId}/commit/main`, {
+            method: 'POST',
+            headers: {
+                'Authorization': `Bearer ${token}`,
+                'Content-Type': 'application/json'
+            },
+            body: JSON.stringify(correctPayload)
         });
+        if (!executeRes.ok) {
+            throw new Error(`Failed to commit to HF: ${await executeRes.text()}`);
+        }
         console.log(`Successfully committed annotation ${filename} to HF Space ${repoId}`);
     } catch (e) {
         console.error("Failed to commit to Hugging Face:", e);