# data-use-annotation / prepare_unhcr.py
# rafmacalaba's picture
# feat: add UNHCR corpus to registry and prepare_unhcr.py script
# 4a9a5a6
#!/usr/bin/env python3
"""
prepare_unhcr.py
Converts UNHCR raw_mentions.json files into the doc_N/_direct_judged.jsonl format
and generates unhcr_pdf_links.json.
Structure expected:
annotation_data/unhcr_extractions/<dir_name>/raw/raw_mentions.json
Output:
annotation_data/unhcr_extractions/doc_N/raw/doc_N_direct_judged.jsonl
annotation_data/unhcr_data/unhcr_pdf_links.json
Usage:
python3 prepare_unhcr.py # Dry run
python3 prepare_unhcr.py --execute # Actually restructure files
python3 prepare_unhcr.py --execute --upload # Restructure + upload to HF
"""
import argparse
import json
import os
import shutil
import sys
from pathlib import Path
# Input tree: annotation_data/unhcr_extractions/<dir_name>/raw/raw_mentions.json
UNHCR_DIR = Path(__file__).parent / "annotation_data" / "unhcr_extractions"
# Output location for the generated PDF-links index file.
LINKS_DIR = Path(__file__).parent / "annotation_data" / "unhcr_data"
LINKS_FILE = LINKS_DIR / "unhcr_pdf_links.json"
def scan_raw_dirs():
    """Return the source directories that still contain raw/raw_mentions.json.

    Directories already converted to the doc_N layout are skipped, as are
    plain files. Results are sorted by directory name. Returns an empty
    list (after printing a notice) when UNHCR_DIR does not exist.
    """
    if not UNHCR_DIR.exists():
        print("❌ unhcr_extractions directory not found")
        return []
    return [
        entry
        for entry in sorted(UNHCR_DIR.iterdir())
        if entry.is_dir()
        and not entry.name.startswith("doc_")
        and (entry / "raw" / "raw_mentions.json").exists()
    ]
def extract_pdf_url(pages_data):
    """Return the first truthy ``document.source`` across *pages_data*, or None."""
    candidates = (page.get("document", {}).get("source") for page in pages_data)
    return next((url for url in candidates if url), None)
def has_datasets(pages_data):
    """Return True when at least one page carries a non-empty "datasets" list."""
    return any(
        page.get("datasets") and len(page["datasets"]) > 0
        for page in pages_data
    )
def convert_directory(src_dir, doc_index, execute=False):
    """Convert one named source directory into the doc_N layout.

    Reads ``src_dir/raw/raw_mentions.json`` and, when *execute* is True,
    writes it to ``UNHCR_DIR/doc_N/raw/doc_N_direct_judged.jsonl``. In a
    dry run nothing is written. Always returns the link-index entry dict
    for this document.

    NOTE(review): despite the .jsonl suffix, the output is one
    pretty-printed JSON array (same as the original script) — confirm
    downstream readers expect that rather than line-delimited JSON.
    """
    pages = json.loads((src_dir / "raw" / "raw_mentions.json").read_text())
    pdf_url = extract_pdf_url(pages)

    out_dir = UNHCR_DIR / f"doc_{doc_index}" / "raw"
    out_file = out_dir / f"doc_{doc_index}_direct_judged.jsonl"

    entry = {
        "index": doc_index,
        "original_name": src_dir.name,
        "direct_pdf_url": pdf_url,
        # No separate landing page is known; mirror the direct URL.
        "landing_page_url": pdf_url,
        "status": "success",
        "has_revalidation": True,
        "has_datasets": has_datasets(pages),
        "num_pages": len(pages),
    }

    if execute:
        out_dir.mkdir(parents=True, exist_ok=True)
        out_file.write_text(json.dumps(pages, indent=2))
    return entry
def main():
    """CLI entry point: scan, convert, summarize, then persist (or dry-run report)."""
    parser = argparse.ArgumentParser(description="Prepare UNHCR corpus data")
    parser.add_argument("--execute", action="store_true", help="Actually create files (default: dry run)")
    parser.add_argument("--upload", action="store_true", help="Upload to HF after conversion")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of docs to process (for testing)")
    args = parser.parse_args()

    source_dirs = scan_raw_dirs()
    print(f"πŸ“‚ Found {len(source_dirs)} UNHCR documents with raw_mentions.json")
    if not source_dirs:
        return
    if args.limit:
        source_dirs = source_dirs[:args.limit]
        print(f"⚠️ Limited to {args.limit} docs")

    links = []
    docs_with_datasets = 0
    for position, src in enumerate(source_dirs, start=1):
        entry = convert_directory(src, position, execute=args.execute)
        links.append(entry)
        if entry["has_datasets"]:
            docs_with_datasets += 1
        # Progress marker every 100 documents.
        if position % 100 == 0:
            print(f" Processed {position}/{len(source_dirs)}...")

    print(f"\nπŸ“Š Summary:")
    print(f" Total docs: {len(links)}")
    print(f" Docs with datasets: {docs_with_datasets}")
    print(f" Docs without: {len(links) - docs_with_datasets}")

    if not args.execute:
        # Dry run: report what would happen and show one sample entry.
        print(f"\n[DRY RUN] Would create {len(links)} doc_N dirs and unhcr_pdf_links.json")
        print(f"[DRY RUN] Run with --execute to create files")
        if links:
            print(f"\nSample link entry:")
            print(json.dumps(links[0], indent=2))
        return

    # Execute mode: write the links index alongside the converted docs.
    LINKS_DIR.mkdir(parents=True, exist_ok=True)
    LINKS_FILE.write_text(json.dumps(links, indent=2))
    print(f"\nπŸ’Ύ Saved {LINKS_FILE}")
    print(f"πŸ’Ύ Created {len(links)} doc_N directories in {UNHCR_DIR}")
    # Clean up original dirs (optional β€” keep for now)
    print("\n⚠️ Original named directories preserved. Remove manually if desired.")
    if args.upload:
        upload_to_hf(links)
def upload_to_hf(links):
    """Upload the links index and all converted doc files to the HF dataset repo.

    Token resolution: an ``HF_TOKEN=`` line in the sibling .env file takes
    precedence, falling back to the HF_TOKEN environment variable. Prints
    a message and returns without uploading when no token is found or
    huggingface_hub is not installed.
    """
    try:
        from huggingface_hub import HfApi

        # Prefer the .env token; fall back to the environment.
        token = None
        env_file = Path(__file__).parent / ".env"
        if env_file.exists():
            for raw_line in env_file.read_text().splitlines():
                if raw_line.startswith("HF_TOKEN="):
                    token = raw_line.split("=", 1)[1].strip()
        token = token or os.environ.get("HF_TOKEN")
        if not token:
            print("❌ No HF_TOKEN found")
            return

        api = HfApi(token=token)
        repo_id = "ai4data/annotation_data"

        # Links index first, then the bulk of the extraction files.
        api.upload_file(
            path_or_fileobj=str(LINKS_FILE),
            path_in_repo="annotation_data/unhcr_data/unhcr_pdf_links.json",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message="Add UNHCR PDF links",
        )
        print("βœ… Uploaded unhcr_pdf_links.json")

        api.upload_folder(
            folder_path=str(UNHCR_DIR),
            path_in_repo="annotation_data/unhcr_extractions",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message="Add UNHCR extraction data",
            allow_patterns=["doc_*/raw/*_direct_judged.jsonl"],
        )
        print("βœ… Uploaded UNHCR extraction files")
    except ImportError:
        print("❌ huggingface_hub required: uv pip install huggingface_hub")
# Script entry point.
if __name__ == "__main__":
    main()