rafmacalaba committed on
Commit
fb404c5
Β·
1 Parent(s): 7c5c449

feat: prepare_data.py script + use has_revalidation filter in documents API

Browse files

- prepare_data.py: scans local docs, uploads missing _direct_judged.jsonl
to HF, generates updated wbg_pdf_links.json with src_docname and
has_revalidation fields
- documents API: only lists docs with has_revalidation=true
- Fixed misleading error message in storage.js

Files changed (3) hide show
  1. app/api/documents/route.js +4 -2
  2. prepare_data.py +242 -0
  3. utils/storage.js +1 -1
app/api/documents/route.js CHANGED
@@ -21,8 +21,10 @@ export async function GET() {
21
 
22
  const links = await linksRes.json();
23
 
24
- // Filter to successful links and take the first N
25
- const successLinks = links.filter(l => l.status === 'success').slice(0, MAX_DOCS_TO_SCAN);
 
 
26
 
27
  // Parallel fetch β€” much faster than sequential scanning
28
  const results = await Promise.allSettled(
 
21
 
22
  const links = await linksRes.json();
23
 
24
+ // Filter to docs with revalidation data and take the first N
25
+ const successLinks = links
26
+ .filter(l => l.status === 'success' && l.has_revalidation === true)
27
+ .slice(0, MAX_DOCS_TO_SCAN);
28
 
29
  // Parallel fetch β€” much faster than sequential scanning
30
  const results = await Promise.allSettled(
prepare_data.py ADDED
@@ -0,0 +1,242 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ prepare_data.py
4
+
5
+ Scans local wbg_extractions, identifies docs with real _direct_judged.jsonl
6
+ (not dummy), uploads them to HF, and generates/uploads an updated wbg_pdf_links.json.
7
+
8
+ Usage:
9
+ # Dry run (scan only, no uploads):
10
+ python prepare_data.py --dry-run
11
+
12
+ # Upload missing docs + generate new pdf_links:
13
+ python prepare_data.py
14
+
15
+ # Only generate pdf_links without uploading docs:
16
+ python prepare_data.py --links-only
17
+
18
+ Requires: pip install huggingface_hub requests
19
+ """
20
+
21
+ import argparse
22
+ import json
23
+ import os
24
+ import sys
25
+ import requests
26
+ from pathlib import Path
27
+ from huggingface_hub import HfApi
28
+
29
# ─── Configuration ───────────────────────────────
# Token read once at import time; get_hf_token() adds .env / cached-token fallbacks.
HF_TOKEN = os.environ.get("HF_TOKEN")
# Target Hugging Face dataset repo for all uploads.
REPO_ID = "ai4data/annotation_data"
# Local root that contains the doc_* extraction folders.
LOCAL_BASE = Path(__file__).parent / "annotation_data" / "wbg_extractions"
# Path of the links JSON inside the HF repo.
LINKS_REPO_PATH = "annotation_data/wbg_data/wbg_pdf_links.json"
34
+
35
+
36
def get_hf_token():
    """Resolve a Hugging Face token: env var, then .env file, then cached token.

    Returns the token string, or None when no source provides one.
    """
    if HF_TOKEN:
        return HF_TOKEN

    # Fall back to a .env file sitting next to this script.
    env_file = Path(__file__).parent / ".env"
    if env_file.exists():
        for raw_line in env_file.read_text().splitlines():
            if raw_line.startswith("HF_TOKEN="):
                return raw_line.split("=", 1)[1].strip()

    # Finally, the token cached by `huggingface-cli login`.
    token_cache = Path.home() / ".cache" / "huggingface" / "token"
    if token_cache.exists():
        return token_cache.read_text().strip()

    return None
51
+
52
+
53
def scan_local_docs():
    """Walk LOCAL_BASE and bucket each doc_* folder by which judged file it holds.

    Returns a dict with three lists:
      - "real":    folders with a genuine <doc>_direct_judged.jsonl (entry includes path)
      - "dummy":   folders with only the dummy variant
      - "no_file": folders with neither file
    """
    doc_names = [name for name in os.listdir(LOCAL_BASE) if name.startswith("doc_")]
    # Numeric sort on the index embedded in the folder name (doc_<N>).
    doc_names.sort(key=lambda name: int(name.split("_")[1]))

    buckets = {"real": [], "dummy": [], "no_file": []}

    for name in doc_names:
        index = int(name.split("_")[1])
        raw_dir = LOCAL_BASE / name / "raw"

        judged = raw_dir / f"{name}_direct_judged.jsonl"
        dummy = raw_dir / f"{name}_dummy_direct_judged.jsonl"

        if judged.exists():
            buckets["real"].append({"name": name, "index": index, "path": str(judged)})
        elif dummy.exists():
            buckets["dummy"].append({"name": name, "index": index})
        else:
            buckets["no_file"].append({"name": name, "index": index})

    return buckets
77
+
78
+
79
def get_existing_hf_docs(api):
    """Return the set of doc folder names already present on the HF dataset repo.

    Best-effort: any failure (network, auth, missing path) prints a warning and
    yields an empty set, so the caller simply treats every doc as missing.
    """
    try:
        entries = api.list_repo_tree(
            REPO_ID,
            repo_type="dataset",
            path_in_repo="annotation_data/wbg_extractions",
        )
        return {entry.path.split("/")[-1] for entry in entries if hasattr(entry, "path")}
    except Exception as e:
        print(f" Warning: Could not list HF repo: {e}")
        return set()
91
+
92
+
93
def upload_docs(api, docs_to_upload, dry_run=False):
    """Push each doc's _direct_judged.jsonl to the HF repo.

    In dry-run mode only prints what would happen. Returns a tuple
    (uploaded, skipped) counting successes and failures.
    """
    uploaded = 0
    skipped = 0

    for doc in docs_to_upload:
        repo_path = f"annotation_data/wbg_extractions/{doc['name']}/raw/{doc['name']}_direct_judged.jsonl"

        if dry_run:
            print(f" [DRY RUN] Would upload: {doc['name']}")
            continue

        try:
            api.upload_file(
                path_or_fileobj=doc["path"],
                path_in_repo=repo_path,
                repo_id=REPO_ID,
                repo_type="dataset",
                commit_message=f"Upload {doc['name']}_direct_judged.jsonl",
            )
        except Exception as e:
            # Per-doc failures are logged and counted, not fatal.
            print(f" ❌ Failed {doc['name']}: {e}")
            skipped += 1
        else:
            print(f" ✅ Uploaded: {doc['name']}")
            uploaded += 1

    return uploaded, skipped
120
+
121
+
122
def fetch_current_links(api, token):
    """Fetch the current wbg_pdf_links.json from the HF dataset repo.

    ``api`` is unused here (kept for signature symmetry with the other
    helpers); the file is fetched over plain HTTPS with a bearer token.
    Returns the parsed JSON list, or [] when the file cannot be retrieved.
    """
    url = f"https://huggingface.co/datasets/{REPO_ID}/raw/main/{LINKS_REPO_PATH}"
    # Explicit timeout: requests has none by default, so a stalled
    # connection would otherwise hang the whole script indefinitely.
    resp = requests.get(url, headers={"Authorization": f"Bearer {token}"}, timeout=30)
    if resp.status_code == 200:
        return resp.json()
    print(f" Warning: Could not fetch existing links (HTTP {resp.status_code})")
    return []
130
+
131
+
132
def generate_updated_links(current_links, local_docs, token):
    """
    Generate updated wbg_pdf_links.json entries with:
    - src_docname: doc_{index}
    - has_revalidation: True iff a real _direct_judged.jsonl exists locally
      (dummy files do not count)

    ``token`` is accepted for signature compatibility but is not used.
    Fix: dropped the dead locals the original built but never read
    (``links_by_index`` lookup dict and ``dummy_indices`` set).
    """
    real_indices = {d["index"] for d in local_docs["real"]}

    updated_links = []
    for link in current_links:
        idx = link["index"]
        updated_links.append({
            "index": idx,
            "src_docname": f"doc_{idx}",
            "landing_page_url": link.get("landing_page_url", ""),
            "direct_pdf_url": link.get("direct_pdf_url", ""),
            "status": link.get("status", "unknown"),
            "has_revalidation": idx in real_indices,
        })

    return updated_links
158
+
159
+
160
def upload_links(api, links, dry_run=False):
    """Serialize the links list, keep a local copy, then push it to the HF repo.

    In dry-run mode only prints the entry count and returns without writing.
    """
    content = json.dumps(links, indent=2)

    if dry_run:
        print(f" [DRY RUN] Would upload updated wbg_pdf_links.json ({len(links)} entries)")
        return

    # Write a local copy first so the generated file can be inspected/diffed.
    out_dir = Path(__file__).parent / "annotation_data" / "wbg_data"
    out_dir.mkdir(parents=True, exist_ok=True)
    out_file = out_dir / "wbg_pdf_links.json"
    out_file.write_text(content)
    print(f" 💾 Saved locally: {out_file}")

    api.upload_file(
        path_or_fileobj=str(out_file),
        path_in_repo=LINKS_REPO_PATH,
        repo_id=REPO_ID,
        repo_type="dataset",
        commit_message="Update wbg_pdf_links.json with src_docname and has_revalidation",
    )
    print(f" ✅ Uploaded wbg_pdf_links.json to HF")
183
+
184
+
185
def main():
    """CLI entry point: scan local docs, sync missing ones to HF, refresh pdf_links.

    Exits with status 1 when no HF token can be resolved. Honors --dry-run
    (no uploads) and --links-only (skip doc uploads, only refresh the links file).
    """
    parser = argparse.ArgumentParser(description="Prepare and upload annotation data")
    parser.add_argument("--dry-run", action="store_true", help="Scan only, don't upload")
    parser.add_argument("--links-only", action="store_true",
                        help="Only generate/upload updated pdf_links, skip doc uploads")
    args = parser.parse_args()

    token = get_hf_token()
    if not token:
        print("❌ No HF_TOKEN found. Set it via environment variable or .env file.")
        sys.exit(1)

    api = HfApi(token=token)

    # 1. Scan local docs and classify them (real / dummy / missing).
    print("\n📂 Scanning local wbg_extractions...")
    local_docs = scan_local_docs()
    print(f" Real _direct_judged.jsonl: {len(local_docs['real'])}")
    print(f" Dummy (skipped): {len(local_docs['dummy'])}")
    print(f" No file: {len(local_docs['no_file'])}")

    if not args.links_only:
        # 2. Check what's already on HF (best-effort; empty set on failure).
        print("\n🔍 Checking existing docs on HF...")
        existing = get_existing_hf_docs(api)
        print(f" Found {len(existing)} doc folders on HF")

        # 3. Find docs to upload (real but not yet on HF, or need update).
        to_upload = [d for d in local_docs["real"] if d["name"] not in existing]
        already_on_hf = [d for d in local_docs["real"] if d["name"] in existing]
        print(f"\n📤 Docs to upload: {len(to_upload)}")
        print(f" Already on HF: {len(already_on_hf)}")

        if to_upload:
            print("\n🚀 Uploading missing docs...")
            uploaded, skipped = upload_docs(api, to_upload, dry_run=args.dry_run)
            if not args.dry_run:
                print(f" Uploaded: {uploaded}, Skipped: {skipped}")

    # 4. Generate updated pdf_links with src_docname + has_revalidation flags.
    print("\n📋 Generating updated wbg_pdf_links.json...")
    current_links = fetch_current_links(api, token)
    updated_links = generate_updated_links(current_links, local_docs, token)

    with_revalidation = sum(1 for l in updated_links if l["has_revalidation"])
    print(f" Total entries: {len(updated_links)}")
    print(f" With revalidation: {with_revalidation}")
    print(f" Without: {len(updated_links) - with_revalidation}")

    # 5. Upload the regenerated links file (saved locally first).
    print("\n📤 Uploading updated wbg_pdf_links.json...")
    upload_links(api, updated_links, dry_run=args.dry_run)

    print("\n✅ Done!")


if __name__ == "__main__":
    main()
utils/storage.js CHANGED
@@ -132,7 +132,7 @@ export async function saveAnnotation(annotation) {
132
  } else {
133
  // Local: read, modify, write
134
  const pagesData = readDocLocal(docIndex);
135
- if (!pagesData) throw new Error(`doc_${docIndex}_raw.json not found locally`);
136
 
137
  const pageIdx = findPageIndex(pagesData, pageNumber);
138
  if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);
 
132
  } else {
133
  // Local: read, modify, write
134
  const pagesData = readDocLocal(docIndex);
135
+ if (!pagesData) throw new Error(`doc_${docIndex}_direct_judged.jsonl not found locally`);
136
 
137
  const pageIdx = findPageIndex(pagesData, pageNumber);
138
  if (pageIdx === -1) throw new Error(`Page ${pageNumber} not found in doc_${docIndex}`);