# data-use-annotation / prepare_unhcr.py
# rafmacalaba's picture
# feat: add UNHCR corpus to registry and prepare_unhcr.py script
# 4a9a5a6
#!/usr/bin/env python3
"""
prepare_unhcr.py
Converts UNHCR raw_mentions.json files into the doc_N/_direct_judged.jsonl format
and generates unhcr_pdf_links.json.
Structure expected:
annotation_data/unhcr_extractions/<dir_name>/raw/raw_mentions.json
Output:
annotation_data/unhcr_extractions/doc_N/raw/doc_N_direct_judged.jsonl
annotation_data/unhcr_data/unhcr_pdf_links.json
Usage:
python3 prepare_unhcr.py # Dry run
python3 prepare_unhcr.py --execute # Actually restructure files
python3 prepare_unhcr.py --execute --upload # Restructure + upload to HF
"""
import argparse
import json
import os
import shutil
import sys
from pathlib import Path
# Input tree: annotation_data/unhcr_extractions/<dir_name>/raw/raw_mentions.json
UNHCR_DIR = Path(__file__).parent / "annotation_data" / "unhcr_extractions"
# Output location for the generated PDF-links index file.
LINKS_DIR = Path(__file__).parent / "annotation_data" / "unhcr_data"
LINKS_FILE = LINKS_DIR / "unhcr_pdf_links.json"
def scan_raw_dirs():
    """Return the source directories that still contain raw/raw_mentions.json.

    Directories already converted to the doc_N layout are skipped, as are
    plain files. Results are sorted by directory name. Returns an empty
    list (after printing a notice) when UNHCR_DIR does not exist.
    """
    if not UNHCR_DIR.exists():
        print("❌ unhcr_extractions directory not found")
        return []
    return [
        entry
        for entry in sorted(UNHCR_DIR.iterdir())
        if entry.is_dir()
        and not entry.name.startswith("doc_")
        and (entry / "raw" / "raw_mentions.json").exists()
    ]
def extract_pdf_url(pages_data):
    """Return the first truthy ``document.source`` across *pages_data*, or None."""
    candidates = (page.get("document", {}).get("source") for page in pages_data)
    return next((url for url in candidates if url), None)
def has_datasets(pages_data):
    """Return True when at least one page carries a non-empty "datasets" list."""
    return any(
        page.get("datasets") and len(page["datasets"]) > 0
        for page in pages_data
    )
def convert_directory(src_dir, doc_index, execute=False):
    """Convert one named source directory into the doc_N layout.

    Reads ``src_dir/raw/raw_mentions.json`` and, when *execute* is True,
    writes it to ``UNHCR_DIR/doc_N/raw/doc_N_direct_judged.jsonl``. In a
    dry run nothing is written. Always returns the link-index entry dict
    for this document.

    NOTE(review): despite the .jsonl suffix, the output is one
    pretty-printed JSON array (same as the original script) — confirm
    downstream readers expect that rather than line-delimited JSON.
    """
    pages = json.loads((src_dir / "raw" / "raw_mentions.json").read_text())
    pdf_url = extract_pdf_url(pages)

    out_dir = UNHCR_DIR / f"doc_{doc_index}" / "raw"
    out_file = out_dir / f"doc_{doc_index}_direct_judged.jsonl"

    entry = {
        "index": doc_index,
        "original_name": src_dir.name,
        "direct_pdf_url": pdf_url,
        # No separate landing page is known; mirror the direct URL.
        "landing_page_url": pdf_url,
        "status": "success",
        "has_revalidation": True,
        "has_datasets": has_datasets(pages),
        "num_pages": len(pages),
    }

    if execute:
        out_dir.mkdir(parents=True, exist_ok=True)
        out_file.write_text(json.dumps(pages, indent=2))
    return entry
def main():
    """CLI entry point: scan, convert, summarize, then persist (or dry-run report)."""
    parser = argparse.ArgumentParser(description="Prepare UNHCR corpus data")
    parser.add_argument("--execute", action="store_true", help="Actually create files (default: dry run)")
    parser.add_argument("--upload", action="store_true", help="Upload to HF after conversion")
    parser.add_argument("--limit", type=int, default=None, help="Limit number of docs to process (for testing)")
    args = parser.parse_args()

    source_dirs = scan_raw_dirs()
    print(f"πŸ“‚ Found {len(source_dirs)} UNHCR documents with raw_mentions.json")
    if not source_dirs:
        return
    if args.limit:
        source_dirs = source_dirs[:args.limit]
        print(f"⚠️ Limited to {args.limit} docs")

    links = []
    docs_with_datasets = 0
    for position, src in enumerate(source_dirs, start=1):
        entry = convert_directory(src, position, execute=args.execute)
        links.append(entry)
        if entry["has_datasets"]:
            docs_with_datasets += 1
        # Progress marker every 100 documents.
        if position % 100 == 0:
            print(f" Processed {position}/{len(source_dirs)}...")

    print(f"\nπŸ“Š Summary:")
    print(f" Total docs: {len(links)}")
    print(f" Docs with datasets: {docs_with_datasets}")
    print(f" Docs without: {len(links) - docs_with_datasets}")

    if not args.execute:
        # Dry run: report what would happen and show one sample entry.
        print(f"\n[DRY RUN] Would create {len(links)} doc_N dirs and unhcr_pdf_links.json")
        print(f"[DRY RUN] Run with --execute to create files")
        if links:
            print(f"\nSample link entry:")
            print(json.dumps(links[0], indent=2))
        return

    # Execute mode: write the links index alongside the converted docs.
    LINKS_DIR.mkdir(parents=True, exist_ok=True)
    LINKS_FILE.write_text(json.dumps(links, indent=2))
    print(f"\nπŸ’Ύ Saved {LINKS_FILE}")
    print(f"πŸ’Ύ Created {len(links)} doc_N directories in {UNHCR_DIR}")
    # Clean up original dirs (optional β€” keep for now)
    print("\n⚠️ Original named directories preserved. Remove manually if desired.")
    if args.upload:
        upload_to_hf(links)
def upload_to_hf(links):
    """Upload the links index and all converted doc files to the HF dataset repo.

    Token resolution: an ``HF_TOKEN=`` line in the sibling .env file takes
    precedence, falling back to the HF_TOKEN environment variable. Prints
    a message and returns without uploading when no token is found or
    huggingface_hub is not installed.
    """
    try:
        from huggingface_hub import HfApi

        # Prefer the .env token; fall back to the environment.
        token = None
        env_file = Path(__file__).parent / ".env"
        if env_file.exists():
            for raw_line in env_file.read_text().splitlines():
                if raw_line.startswith("HF_TOKEN="):
                    token = raw_line.split("=", 1)[1].strip()
        token = token or os.environ.get("HF_TOKEN")
        if not token:
            print("❌ No HF_TOKEN found")
            return

        api = HfApi(token=token)
        repo_id = "ai4data/annotation_data"

        # Links index first, then the bulk of the extraction files.
        api.upload_file(
            path_or_fileobj=str(LINKS_FILE),
            path_in_repo="annotation_data/unhcr_data/unhcr_pdf_links.json",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message="Add UNHCR PDF links",
        )
        print("βœ… Uploaded unhcr_pdf_links.json")

        api.upload_folder(
            folder_path=str(UNHCR_DIR),
            path_in_repo="annotation_data/unhcr_extractions",
            repo_id=repo_id,
            repo_type="dataset",
            commit_message="Add UNHCR extraction data",
            allow_patterns=["doc_*/raw/*_direct_judged.jsonl"],
        )
        print("βœ… Uploaded UNHCR extraction files")
    except ImportError:
        print("❌ huggingface_hub required: uv pip install huggingface_hub")
# Script entry point.
if __name__ == "__main__":
    main()