#!/usr/bin/env python3
"""
prepare_unhcr.py

Converts UNHCR raw_mentions.json files into the doc_N/_direct_judged.jsonl
format and generates unhcr_pdf_links.json.

Structure expected:
    annotation_data/unhcr_extractions/<name>/raw/raw_mentions.json

Output:
    annotation_data/unhcr_extractions/doc_N/raw/doc_N_direct_judged.jsonl
    annotation_data/unhcr_data/unhcr_pdf_links.json

Usage:
    python3 prepare_unhcr.py                     # Dry run
    python3 prepare_unhcr.py --execute           # Actually restructure files
    python3 prepare_unhcr.py --execute --upload  # Restructure + upload to HF
"""

import argparse
import json
import os
from pathlib import Path

# Paths are resolved relative to this script's own directory so the tool
# works regardless of the caller's current working directory.
UNHCR_DIR = Path(__file__).parent / "annotation_data" / "unhcr_extractions"
LINKS_DIR = Path(__file__).parent / "annotation_data" / "unhcr_data"
LINKS_FILE = LINKS_DIR / "unhcr_pdf_links.json"


def scan_raw_dirs():
    """Find all source directories containing raw/raw_mentions.json.

    Returns a sorted list of Path objects. Already-converted ``doc_N``
    directories are skipped so re-running the script is idempotent.
    """
    results = []
    if not UNHCR_DIR.exists():
        print("āŒ unhcr_extractions directory not found")
        return results
    for d in sorted(UNHCR_DIR.iterdir()):
        if not d.is_dir():
            continue
        # Skip already-converted doc_N directories
        if d.name.startswith("doc_"):
            continue
        raw_file = d / "raw" / "raw_mentions.json"
        if raw_file.exists():
            results.append(d)
    return results


def extract_pdf_url(pages_data):
    """Return the first non-empty document.source URL found in pages_data.

    Returns None when no page carries a source URL.
    """
    for page in pages_data:
        source = page.get("document", {}).get("source")
        if source:
            return source
    return None


def has_datasets(pages_data):
    """Return True if any page carries at least one dataset mention."""
    for page in pages_data:
        if page.get("datasets"):
            return True
    return False


def convert_directory(src_dir, doc_index, execute=False):
    """Convert a named source directory into doc_N format.

    - Reads ``src_dir/raw/raw_mentions.json`` (a JSON array of pages)
    - Writes ``doc_N/raw/doc_N_direct_judged.jsonl`` (one JSON object
      per line, i.e. proper JSONL) when ``execute`` is True
    - Returns the link-entry dict for unhcr_pdf_links.json
    """
    raw_file = src_dir / "raw" / "raw_mentions.json"
    pages_data = json.loads(raw_file.read_text(encoding="utf-8"))

    pdf_url = extract_pdf_url(pages_data)
    has_ds = has_datasets(pages_data)

    target_dir = UNHCR_DIR / f"doc_{doc_index}" / "raw"
    target_file = target_dir / f"doc_{doc_index}_direct_judged.jsonl"

    link_entry = {
        "index": doc_index,
        "original_name": src_dir.name,
        "direct_pdf_url": pdf_url,
        "landing_page_url": pdf_url,
        "status": "success",
        "has_revalidation": True,
        "has_datasets": has_ds,
        "num_pages": len(pages_data),
    }

    if execute:
        target_dir.mkdir(parents=True, exist_ok=True)
        # JSONL requires one JSON object per line. (Writing the whole
        # array with indent=2 would produce a .jsonl file that no JSONL
        # reader can parse line-by-line.)
        with target_file.open("w", encoding="utf-8") as f:
            for page in pages_data:
                f.write(json.dumps(page, ensure_ascii=False) + "\n")

    return link_entry


def main():
    parser = argparse.ArgumentParser(description="Prepare UNHCR corpus data")
    parser.add_argument("--execute", action="store_true",
                        help="Actually create files (default: dry run)")
    parser.add_argument("--upload", action="store_true",
                        help="Upload to HF after conversion")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of docs to process (for testing)")
    args = parser.parse_args()

    dirs = scan_raw_dirs()
    print(f"šŸ“‚ Found {len(dirs)} UNHCR documents with raw_mentions.json")
    if not dirs:
        return

    # Compare against None so "--limit 0" is honored rather than ignored.
    if args.limit is not None:
        dirs = dirs[:args.limit]
        print(f"āš ļø Limited to {args.limit} docs")

    links = []
    docs_with_datasets = 0
    for i, d in enumerate(dirs):
        doc_index = i + 1  # doc_N indices are 1-based
        link = convert_directory(d, doc_index, execute=args.execute)
        links.append(link)
        if link["has_datasets"]:
            docs_with_datasets += 1
        if (i + 1) % 100 == 0:
            print(f"  Processed {i + 1}/{len(dirs)}...")

    print(f"\nšŸ“Š Summary:")
    print(f"  Total docs: {len(links)}")
    print(f"  Docs with datasets: {docs_with_datasets}")
    print(f"  Docs without: {len(links) - docs_with_datasets}")

    if args.execute:
        # Write links file
        LINKS_DIR.mkdir(parents=True, exist_ok=True)
        LINKS_FILE.write_text(json.dumps(links, indent=2), encoding="utf-8")
        print(f"\nšŸ’¾ Saved {LINKS_FILE}")
        print(f"šŸ’¾ Created {len(links)} doc_N directories in {UNHCR_DIR}")
        # Clean up original dirs (optional — keep for now)
        print("\nāš ļø Original named directories preserved. Remove manually if desired.")
        if args.upload:
            upload_to_hf(links)
    else:
        print(f"\n[DRY RUN] Would create {len(links)} doc_N dirs and unhcr_pdf_links.json")
        print(f"[DRY RUN] Run with --execute to create files")
        # Show sample
        if links:
            print(f"\nSample link entry:")
            print(json.dumps(links[0], indent=2))


def upload_to_hf(links):
    """Upload the links file and all converted doc files to HF.

    The HF token is read from a sibling .env file (HF_TOKEN=...) first,
    falling back to the HF_TOKEN environment variable.
    """
    # Narrow try: only the third-party import can raise ImportError here;
    # any upload error should propagate as before.
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("āŒ huggingface_hub required: uv pip install huggingface_hub")
        return

    token = None
    env_path = Path(__file__).parent / ".env"
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.startswith("HF_TOKEN="):
                token = line.split("=", 1)[1].strip()
    if not token:
        token = os.environ.get("HF_TOKEN")
    if not token:
        print("āŒ No HF_TOKEN found")
        return

    api = HfApi(token=token)
    repo_id = "ai4data/annotation_data"

    # Upload links file
    api.upload_file(
        path_or_fileobj=str(LINKS_FILE),
        path_in_repo="annotation_data/unhcr_data/unhcr_pdf_links.json",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Add UNHCR PDF links",
    )
    print("āœ… Uploaded unhcr_pdf_links.json")

    # Upload all doc files
    api.upload_folder(
        folder_path=str(UNHCR_DIR),
        path_in_repo="annotation_data/unhcr_extractions",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Add UNHCR extraction data",
        allow_patterns=["doc_*/raw/*_direct_judged.jsonl"],
    )
    print("āœ… Uploaded UNHCR extraction files")


if __name__ == "__main__":
    main()