rafmacalaba committed on
Commit
85eb22c
·
1 Parent(s): e7f7858

feat: add language detection to prepare_data.py, exclude non-English docs

Browse files

- Uses langdetect to identify document language (samples pages 2-5)
- 12 non-English docs excluded (8 Arabic, 4 French)
- 99 English docs with has_revalidation=true
- Added 'language' field to wbg_pdf_links.json entries

Files changed (1) hide show
  1. prepare_data.py +64 -30
prepare_data.py CHANGED
@@ -3,19 +3,20 @@
3
  prepare_data.py
4
 
5
  Scans local wbg_extractions, identifies docs with real _direct_judged.jsonl
6
- (not dummy), uploads them to HF, and generates/uploads an updated wbg_pdf_links.json.
 
7
 
8
  Usage:
9
  # Dry run (scan only, no uploads):
10
- python prepare_data.py --dry-run
11
 
12
  # Upload missing docs + generate new pdf_links:
13
- python prepare_data.py
14
 
15
  # Only generate pdf_links without uploading docs:
16
- python prepare_data.py --links-only
17
 
18
- Requires: pip install huggingface_hub requests
19
  """
20
 
21
  import argparse
@@ -25,6 +26,7 @@ import sys
25
  import requests
26
  from pathlib import Path
27
  from huggingface_hub import HfApi
 
28
 
29
  # ─── Configuration ───────────────────────────────
30
  HF_TOKEN = os.environ.get("HF_TOKEN")
@@ -37,27 +39,43 @@ def get_hf_token():
37
  """Get HF token from env, .env file, or cached token."""
38
  if HF_TOKEN:
39
  return HF_TOKEN
40
- # Try .env file
41
  env_path = Path(__file__).parent / ".env"
42
  if env_path.exists():
43
  for line in env_path.read_text().splitlines():
44
  if line.startswith("HF_TOKEN="):
45
  return line.split("=", 1)[1].strip()
46
- # Try cached token
47
  cached = Path.home() / ".cache" / "huggingface" / "token"
48
  if cached.exists():
49
  return cached.read_text().strip()
50
  return None
51
 
52
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  def scan_local_docs():
54
- """Scan local wbg_extractions and classify docs."""
55
  docs = sorted(
56
  [d for d in os.listdir(LOCAL_BASE) if d.startswith("doc_")],
57
  key=lambda x: int(x.split("_")[1]),
58
  )
59
 
60
- results = {"real": [], "dummy": [], "no_file": []}
61
 
62
  for doc in docs:
63
  idx = int(doc.split("_")[1])
@@ -67,7 +85,12 @@ def scan_local_docs():
67
  dummy_file = raw_dir / f"{doc}_dummy_direct_judged.jsonl"
68
 
69
  if real_file.exists():
70
- results["real"].append({"name": doc, "index": idx, "path": str(real_file)})
 
 
 
 
 
71
  elif dummy_file.exists():
72
  results["dummy"].append({"name": doc, "index": idx})
73
  else:
@@ -133,13 +156,17 @@ def generate_updated_links(current_links, local_docs, token):
133
  """
134
  Generate updated wbg_pdf_links.json with:
135
  - src_docname: doc_{index}
136
- - has_revalidation: true if _direct_judged.jsonl exists (not dummy)
 
137
  """
138
- real_indices = {d["index"] for d in local_docs["real"]}
139
- dummy_indices = {d["index"] for d in local_docs["dummy"]}
 
 
 
 
140
 
141
- # Build a lookup from current links
142
- links_by_index = {link["index"]: link for link in current_links}
143
 
144
  updated_links = []
145
  for link in current_links:
@@ -150,7 +177,8 @@ def generate_updated_links(current_links, local_docs, token):
150
  "landing_page_url": link.get("landing_page_url", ""),
151
  "direct_pdf_url": link.get("direct_pdf_url", ""),
152
  "status": link.get("status", "unknown"),
153
- "has_revalidation": idx in real_indices,
 
154
  }
155
  updated_links.append(entry)
156
 
@@ -165,7 +193,6 @@ def upload_links(api, links, dry_run=False):
165
  print(f" [DRY RUN] Would upload updated wbg_pdf_links.json ({len(links)} entries)")
166
  return
167
 
168
- # Save locally first
169
  local_path = Path(__file__).parent / "annotation_data" / "wbg_data"
170
  local_path.mkdir(parents=True, exist_ok=True)
171
  local_file = local_path / "wbg_pdf_links.json"
@@ -177,7 +204,7 @@ def upload_links(api, links, dry_run=False):
177
  path_in_repo=LINKS_REPO_PATH,
178
  repo_id=REPO_ID,
179
  repo_type="dataset",
180
- commit_message="Update wbg_pdf_links.json with src_docname and has_revalidation",
181
  )
182
  print(f" βœ… Uploaded wbg_pdf_links.json to HF")
183
 
@@ -196,12 +223,18 @@ def main():
196
 
197
  api = HfApi(token=token)
198
 
199
- # 1. Scan local docs
200
- print("\nπŸ“‚ Scanning local wbg_extractions...")
201
  local_docs = scan_local_docs()
202
- print(f" Real _direct_judged.jsonl: {len(local_docs['real'])}")
203
- print(f" Dummy (skipped): {len(local_docs['dummy'])}")
204
- print(f" No file: {len(local_docs['no_file'])}")
 
 
 
 
 
 
205
 
206
  if not args.links_only:
207
  # 2. Check what's already on HF
@@ -209,14 +242,14 @@ def main():
209
  existing = get_existing_hf_docs(api)
210
  print(f" Found {len(existing)} doc folders on HF")
211
 
212
- # 3. Find docs to upload (real but not yet on HF, or need update)
213
  to_upload = [d for d in local_docs["real"] if d["name"] not in existing]
214
  already_on_hf = [d for d in local_docs["real"] if d["name"] in existing]
215
- print(f"\nπŸ“€ Docs to upload: {len(to_upload)}")
216
- print(f" Already on HF: {len(already_on_hf)}")
217
 
218
  if to_upload:
219
- print("\nπŸš€ Uploading missing docs...")
220
  uploaded, skipped = upload_docs(api, to_upload, dry_run=args.dry_run)
221
  if not args.dry_run:
222
  print(f" Uploaded: {uploaded}, Skipped: {skipped}")
@@ -227,9 +260,10 @@ def main():
227
  updated_links = generate_updated_links(current_links, local_docs, token)
228
 
229
  with_revalidation = sum(1 for l in updated_links if l["has_revalidation"])
230
- print(f" Total entries: {len(updated_links)}")
231
- print(f" With revalidation: {with_revalidation}")
232
- print(f" Without: {len(updated_links) - with_revalidation}")
 
233
 
234
  # 5. Upload
235
  print("\nπŸ“€ Uploading updated wbg_pdf_links.json...")
 
3
  prepare_data.py
4
 
5
  Scans local wbg_extractions, identifies docs with real _direct_judged.jsonl
6
+ (not dummy), detects language, uploads English docs to HF, and generates/uploads
7
+ an updated wbg_pdf_links.json.
8
 
9
  Usage:
10
  # Dry run (scan only, no uploads):
11
+ uv run --with huggingface_hub,requests,langdetect python3 prepare_data.py --dry-run
12
 
13
  # Upload missing docs + generate new pdf_links:
14
+ uv run --with huggingface_hub,requests,langdetect python3 prepare_data.py
15
 
16
  # Only generate pdf_links without uploading docs:
17
+ uv run --with huggingface_hub,requests,langdetect python3 prepare_data.py --links-only
18
 
19
+ Requires: huggingface_hub, requests, langdetect
20
  """
21
 
22
  import argparse
 
26
  import requests
27
  from pathlib import Path
28
  from huggingface_hub import HfApi
29
+ from langdetect import detect, LangDetectException
30
 
31
  # ─── Configuration ───────────────────────────────
32
  HF_TOKEN = os.environ.get("HF_TOKEN")
 
39
  """Get HF token from env, .env file, or cached token."""
40
  if HF_TOKEN:
41
  return HF_TOKEN
 
42
  env_path = Path(__file__).parent / ".env"
43
  if env_path.exists():
44
  for line in env_path.read_text().splitlines():
45
  if line.startswith("HF_TOKEN="):
46
  return line.split("=", 1)[1].strip()
 
47
  cached = Path.home() / ".cache" / "huggingface" / "token"
48
  if cached.exists():
49
  return cached.read_text().strip()
50
  return None
51
 
52
 
53
def detect_language(doc_path):
    """
    Detect the language of an extracted document.

    Samples up to 500 chars of "input_text" from pages 2-5 (skipping the
    first page, which often contains abbreviation tables / currency
    equivalents that confuse detection), falling back to the first 3 pages
    when the sampled text is too short.

    Args:
        doc_path: Path to a *_direct_judged.jsonl file whose per-page
            records carry an "input_text" field.

    Returns:
        ISO 639-1 language code (e.g. 'en', 'fr', 'ar'), or 'unknown' when
        the file is missing, unparseable, or detection fails.
    """
    try:
        raw = Path(doc_path).read_text()
        try:
            # Whole-file parse first: preserves behavior for extractions
            # stored as a single JSON array of pages.
            data = json.loads(raw)
        except json.JSONDecodeError:
            # BUGFIX: the file is .jsonl (one JSON object per line); a
            # single json.loads() on multi-line JSONL raises and the
            # original code silently returned "unknown" for such docs.
            data = [json.loads(line) for line in raw.splitlines() if line.strip()]
        if not isinstance(data, list):
            data = [data]
        # Sample pages 2-5 to avoid abbreviation-heavy first pages.
        texts = " ".join(p.get("input_text", "")[:500] for p in data[1:5])
        if len(texts.strip()) < 50:
            # Fallback to first 3 pages if later pages are empty.
            texts = " ".join(p.get("input_text", "")[:500] for p in data[:3])
        return detect(texts)
    except (LangDetectException, json.JSONDecodeError, FileNotFoundError):
        return "unknown"
70
+
71
  def scan_local_docs():
72
+ """Scan local wbg_extractions and classify docs with language detection."""
73
  docs = sorted(
74
  [d for d in os.listdir(LOCAL_BASE) if d.startswith("doc_")],
75
  key=lambda x: int(x.split("_")[1]),
76
  )
77
 
78
+ results = {"real": [], "real_non_english": [], "dummy": [], "no_file": []}
79
 
80
  for doc in docs:
81
  idx = int(doc.split("_")[1])
 
85
  dummy_file = raw_dir / f"{doc}_dummy_direct_judged.jsonl"
86
 
87
  if real_file.exists():
88
+ lang = detect_language(str(real_file))
89
+ entry = {"name": doc, "index": idx, "path": str(real_file), "language": lang}
90
+ if lang == "en":
91
+ results["real"].append(entry)
92
+ else:
93
+ results["real_non_english"].append(entry)
94
  elif dummy_file.exists():
95
  results["dummy"].append({"name": doc, "index": idx})
96
  else:
 
156
  """
157
  Generate updated wbg_pdf_links.json with:
158
  - src_docname: doc_{index}
159
+ - has_revalidation: true if English _direct_judged.jsonl exists
160
+ - language: detected language code
161
  """
162
+ # Build lookup: index β†’ language
163
+ lang_map = {}
164
+ for d in local_docs["real"]:
165
+ lang_map[d["index"]] = d.get("language", "en")
166
+ for d in local_docs["real_non_english"]:
167
+ lang_map[d["index"]] = d.get("language", "unknown")
168
 
169
+ real_english_indices = {d["index"] for d in local_docs["real"]}
 
170
 
171
  updated_links = []
172
  for link in current_links:
 
177
  "landing_page_url": link.get("landing_page_url", ""),
178
  "direct_pdf_url": link.get("direct_pdf_url", ""),
179
  "status": link.get("status", "unknown"),
180
+ "has_revalidation": idx in real_english_indices,
181
+ "language": lang_map.get(idx, "unknown"),
182
  }
183
  updated_links.append(entry)
184
 
 
193
  print(f" [DRY RUN] Would upload updated wbg_pdf_links.json ({len(links)} entries)")
194
  return
195
 
 
196
  local_path = Path(__file__).parent / "annotation_data" / "wbg_data"
197
  local_path.mkdir(parents=True, exist_ok=True)
198
  local_file = local_path / "wbg_pdf_links.json"
 
204
  path_in_repo=LINKS_REPO_PATH,
205
  repo_id=REPO_ID,
206
  repo_type="dataset",
207
+ commit_message="Update wbg_pdf_links.json with language field, exclude non-English",
208
  )
209
  print(f" βœ… Uploaded wbg_pdf_links.json to HF")
210
 
 
223
 
224
  api = HfApi(token=token)
225
 
226
+ # 1. Scan local docs with language detection
227
+ print("\nπŸ“‚ Scanning local wbg_extractions (with language detection)...")
228
  local_docs = scan_local_docs()
229
+ print(f" Real (English): {len(local_docs['real'])}")
230
+ print(f" Real (non-English): {len(local_docs['real_non_english'])}")
231
+ print(f" Dummy (skipped): {len(local_docs['dummy'])}")
232
+ print(f" No file: {len(local_docs['no_file'])}")
233
+
234
+ if local_docs["real_non_english"]:
235
+ print("\n Non-English docs excluded:")
236
+ for d in local_docs["real_non_english"]:
237
+ print(f" {d['name']}: {d['language']}")
238
 
239
  if not args.links_only:
240
  # 2. Check what's already on HF
 
242
  existing = get_existing_hf_docs(api)
243
  print(f" Found {len(existing)} doc folders on HF")
244
 
245
+ # 3. Upload only English docs not yet on HF
246
  to_upload = [d for d in local_docs["real"] if d["name"] not in existing]
247
  already_on_hf = [d for d in local_docs["real"] if d["name"] in existing]
248
+ print(f"\nπŸ“€ English docs to upload: {len(to_upload)}")
249
+ print(f" Already on HF: {len(already_on_hf)}")
250
 
251
  if to_upload:
252
+ print("\nπŸš€ Uploading missing English docs...")
253
  uploaded, skipped = upload_docs(api, to_upload, dry_run=args.dry_run)
254
  if not args.dry_run:
255
  print(f" Uploaded: {uploaded}, Skipped: {skipped}")
 
260
  updated_links = generate_updated_links(current_links, local_docs, token)
261
 
262
  with_revalidation = sum(1 for l in updated_links if l["has_revalidation"])
263
+ non_english = sum(1 for l in updated_links if l["language"] not in ("en", "unknown"))
264
+ print(f" Total entries: {len(updated_links)}")
265
+ print(f" English with revalidation: {with_revalidation}")
266
+ print(f" Non-English (excluded): {non_english}")
267
 
268
  # 5. Upload
269
  print("\nπŸ“€ Uploading updated wbg_pdf_links.json...")