#!/usr/bin/env python3
"""
prepare_unhcr.py

Converts UNHCR raw_mentions.json files into the doc_N/_direct_judged.jsonl format
and generates unhcr_pdf_links.json.

Structure expected:
    annotation_data/unhcr_extractions/<dir_name>/raw/raw_mentions.json

Output:
    annotation_data/unhcr_extractions/doc_N/raw/doc_N_direct_judged.jsonl
    annotation_data/unhcr_data/unhcr_pdf_links.json

Usage:
    python3 prepare_unhcr.py                      # Dry run
    python3 prepare_unhcr.py --execute            # Actually restructure files
    python3 prepare_unhcr.py --execute --upload   # Restructure + upload to HF
"""
import argparse
import json
import os
import shutil
import sys
from pathlib import Path

# Root of the per-document extraction tree (named source dirs in, doc_N dirs out).
UNHCR_DIR = Path(__file__).parent / "annotation_data" / "unhcr_extractions"
# Directory that receives the generated PDF-link index.
LINKS_DIR = Path(__file__).parent / "annotation_data" / "unhcr_data"
LINKS_FILE = LINKS_DIR / "unhcr_pdf_links.json"
| def scan_raw_dirs(): | |
| """Find all directories with raw_mentions.json.""" | |
| results = [] | |
| if not UNHCR_DIR.exists(): | |
| print("β unhcr_extractions directory not found") | |
| return results | |
| for d in sorted(UNHCR_DIR.iterdir()): | |
| if not d.is_dir(): | |
| continue | |
| # Skip already-converted doc_N directories | |
| if d.name.startswith("doc_"): | |
| continue | |
| raw_file = d / "raw" / "raw_mentions.json" | |
| if raw_file.exists(): | |
| results.append(d) | |
| return results | |
def extract_pdf_url(pages_data):
    """Return the first truthy ``document.source`` URL across pages, else None."""
    sources = (page.get("document", {}).get("source") for page in pages_data)
    return next((src for src in sources if src), None)
def has_datasets(pages_data):
    """Return True if any page carries at least one dataset mention.

    A page counts when its "datasets" value is truthy (i.e. a non-empty
    collection); missing, None, or empty entries are ignored.  Using any()
    replaces the redundant `truthy and len(...) > 0` check and also avoids a
    TypeError on truthy values that lack len().
    """
    return any(page.get("datasets") for page in pages_data)
def convert_directory(src_dir, doc_index, execute=False):
    """
    Convert a named directory into doc_N format.

    Reads <src_dir>/raw/raw_mentions.json, optionally writes
    doc_N/raw/doc_N_direct_judged.jsonl, and returns the link-index entry.

    Args:
        src_dir: Path to the original named source directory.
        doc_index: 1-based index used to name the doc_N directory.
        execute: When False (the default), nothing is written to disk.

    Returns:
        dict entry for unhcr_pdf_links.json describing this document.
    """
    raw_file = src_dir / "raw" / "raw_mentions.json"
    pages_data = json.loads(raw_file.read_text())
    pdf_url = extract_pdf_url(pages_data)
    has_ds = has_datasets(pages_data)

    target_dir = UNHCR_DIR / f"doc_{doc_index}" / "raw"
    target_file = target_dir / f"doc_{doc_index}_direct_judged.jsonl"

    link_entry = {
        "index": doc_index,
        "original_name": src_dir.name,
        "direct_pdf_url": pdf_url,
        # No separate landing page in the raw data; mirror the direct URL.
        "landing_page_url": pdf_url,
        "status": "success",
        "has_revalidation": True,
        "has_datasets": has_ds,
        "num_pages": len(pages_data),
    }

    if execute:
        target_dir.mkdir(parents=True, exist_ok=True)
        # Fix: emit real JSON Lines (one page object per line), as the .jsonl
        # extension requires.  The previous version dumped the whole list as a
        # single pretty-printed JSON array, which is not valid JSONL.
        with target_file.open("w", encoding="utf-8") as fh:
            for page in pages_data:
                fh.write(json.dumps(page, ensure_ascii=False) + "\n")
    return link_entry
def main():
    """CLI entry point: scan, convert, summarize, then persist or dry-run."""
    parser = argparse.ArgumentParser(description="Prepare UNHCR corpus data")
    parser.add_argument("--execute", action="store_true", help="Actually create files (default: dry run)")
    parser.add_argument("--upload", action="store_true", help="Upload to HF after conversion")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of docs to process (for testing)")
    args = parser.parse_args()

    dirs = scan_raw_dirs()
    print(f"π Found {len(dirs)} UNHCR documents with raw_mentions.json")
    if not dirs:
        return
    if args.limit:
        dirs = dirs[:args.limit]
        print(f"β οΈ Limited to {args.limit} docs")

    # Convert each named directory to its doc_N counterpart, 1-indexed.
    links = []
    for doc_index, src in enumerate(dirs, start=1):
        links.append(convert_directory(src, doc_index, execute=args.execute))
        if doc_index % 100 == 0:
            print(f" Processed {doc_index}/{len(dirs)}...")
    docs_with_datasets = sum(1 for entry in links if entry["has_datasets"])

    print("\nπ Summary:")
    print(f" Total docs: {len(links)}")
    print(f" Docs with datasets: {docs_with_datasets}")
    print(f" Docs without: {len(links) - docs_with_datasets}")

    if args.execute:
        # Persist the link index alongside the converted docs.
        LINKS_DIR.mkdir(parents=True, exist_ok=True)
        LINKS_FILE.write_text(json.dumps(links, indent=2))
        print(f"\nπΎ Saved {LINKS_FILE}")
        print(f"πΎ Created {len(links)} doc_N directories in {UNHCR_DIR}")
        # Source directories are intentionally left in place.
        print("\nβ οΈ Original named directories preserved. Remove manually if desired.")
        if args.upload:
            upload_to_hf(links)
    else:
        print(f"\n[DRY RUN] Would create {len(links)} doc_N dirs and unhcr_pdf_links.json")
        print("[DRY RUN] Run with --execute to create files")
        # Show one example entry so the user can sanity-check the schema.
        if links:
            print("\nSample link entry:")
            print(json.dumps(links[0], indent=2))
def upload_to_hf(links):
    """Upload unhcr_pdf_links.json and every doc_N judged file to the HF dataset repo.

    Reads HF_TOKEN from a sibling .env file first, then from the environment.
    Prints an install hint and returns if huggingface_hub is unavailable.
    """
    try:
        from huggingface_hub import HfApi

        # Token resolution: .env takes precedence over the process environment.
        token = None
        env_path = Path(__file__).parent / ".env"
        if env_path.exists():
            for raw_line in env_path.read_text().splitlines():
                if raw_line.startswith("HF_TOKEN="):
                    token = raw_line.split("=", 1)[1].strip()
        token = token or os.environ.get("HF_TOKEN")
        if not token:
            print("β No HF_TOKEN found")
            return

        api = HfApi(token=token)
        repo_id = "ai4data/annotation_data"

        # Links index first, then the extraction files themselves.
        api.upload_file(
            path_or_fileobj=str(LINKS_FILE),
            path_in_repo="annotation_data/unhcr_data/unhcr_pdf_links.json",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message="Add UNHCR PDF links",
        )
        print("β Uploaded unhcr_pdf_links.json")

        api.upload_folder(
            folder_path=str(UNHCR_DIR),
            path_in_repo="annotation_data/unhcr_extractions",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message="Add UNHCR extraction data",
            allow_patterns=["doc_*/raw/*_direct_judged.jsonl"],
        )
        print("β Uploaded UNHCR extraction files")
    except ImportError:
        print("β huggingface_hub required: uv pip install huggingface_hub")
# Script entry point.
if __name__ == "__main__":
    main()