#!/usr/bin/env python3
"""
prepare_unhcr.py

Converts UNHCR raw_mentions.json files into the doc_N/_direct_judged.jsonl
format and generates unhcr_pdf_links.json.

Structure expected:
    annotation_data/unhcr_extractions/<name>/raw/raw_mentions.json

Output:
    annotation_data/unhcr_extractions/doc_N/raw/doc_N_direct_judged.jsonl
    annotation_data/unhcr_data/unhcr_pdf_links.json

Usage:
    python3 prepare_unhcr.py                     # Dry run
    python3 prepare_unhcr.py --execute           # Actually restructure files
    python3 prepare_unhcr.py --execute --upload  # Restructure + upload to HF
"""

import argparse
import json
import os
from pathlib import Path

# Paths are resolved relative to this script's own directory so the tool
# works regardless of the caller's current working directory.
UNHCR_DIR = Path(__file__).parent / "annotation_data" / "unhcr_extractions"
LINKS_DIR = Path(__file__).parent / "annotation_data" / "unhcr_data"
LINKS_FILE = LINKS_DIR / "unhcr_pdf_links.json"


def scan_raw_dirs():
    """Find all source directories containing raw/raw_mentions.json.

    Returns a sorted list of Path objects. Already-converted ``doc_N``
    directories are skipped so re-running the script is idempotent.
    """
    results = []
    if not UNHCR_DIR.exists():
        print("āŒ unhcr_extractions directory not found")
        return results
    for d in sorted(UNHCR_DIR.iterdir()):
        if not d.is_dir():
            continue
        # Skip already-converted doc_N directories
        if d.name.startswith("doc_"):
            continue
        raw_file = d / "raw" / "raw_mentions.json"
        if raw_file.exists():
            results.append(d)
    return results


def extract_pdf_url(pages_data):
    """Return the first non-empty document.source URL found in pages_data.

    Returns None when no page carries a source URL.
    """
    for page in pages_data:
        source = page.get("document", {}).get("source")
        if source:
            return source
    return None


def has_datasets(pages_data):
    """Return True if any page carries at least one dataset mention."""
    for page in pages_data:
        if page.get("datasets"):
            return True
    return False


def convert_directory(src_dir, doc_index, execute=False):
    """Convert a named source directory into doc_N format.

    - Reads ``src_dir/raw/raw_mentions.json`` (a JSON array of pages)
    - Writes ``doc_N/raw/doc_N_direct_judged.jsonl`` (one JSON object
      per line, i.e. proper JSONL) when ``execute`` is True
    - Returns the link-entry dict for unhcr_pdf_links.json
    """
    raw_file = src_dir / "raw" / "raw_mentions.json"
    pages_data = json.loads(raw_file.read_text(encoding="utf-8"))

    pdf_url = extract_pdf_url(pages_data)
    has_ds = has_datasets(pages_data)

    target_dir = UNHCR_DIR / f"doc_{doc_index}" / "raw"
    target_file = target_dir / f"doc_{doc_index}_direct_judged.jsonl"

    link_entry = {
        "index": doc_index,
        "original_name": src_dir.name,
        "direct_pdf_url": pdf_url,
        "landing_page_url": pdf_url,
        "status": "success",
        "has_revalidation": True,
        "has_datasets": has_ds,
        "num_pages": len(pages_data),
    }

    if execute:
        target_dir.mkdir(parents=True, exist_ok=True)
        # JSONL requires one JSON object per line. (Writing the whole
        # array with indent=2 would produce a .jsonl file that no JSONL
        # reader can parse line-by-line.)
        with target_file.open("w", encoding="utf-8") as f:
            for page in pages_data:
                f.write(json.dumps(page, ensure_ascii=False) + "\n")

    return link_entry


def main():
    parser = argparse.ArgumentParser(description="Prepare UNHCR corpus data")
    parser.add_argument("--execute", action="store_true",
                        help="Actually create files (default: dry run)")
    parser.add_argument("--upload", action="store_true",
                        help="Upload to HF after conversion")
    parser.add_argument("--limit", type=int, default=None,
                        help="Limit number of docs to process (for testing)")
    args = parser.parse_args()

    dirs = scan_raw_dirs()
    print(f"šŸ“‚ Found {len(dirs)} UNHCR documents with raw_mentions.json")
    if not dirs:
        return

    # Compare against None so "--limit 0" is honored rather than ignored.
    if args.limit is not None:
        dirs = dirs[:args.limit]
        print(f"āš ļø Limited to {args.limit} docs")

    links = []
    docs_with_datasets = 0
    for i, d in enumerate(dirs):
        doc_index = i + 1  # doc_N indices are 1-based
        link = convert_directory(d, doc_index, execute=args.execute)
        links.append(link)
        if link["has_datasets"]:
            docs_with_datasets += 1
        if (i + 1) % 100 == 0:
            print(f"  Processed {i + 1}/{len(dirs)}...")

    print(f"\nšŸ“Š Summary:")
    print(f"  Total docs: {len(links)}")
    print(f"  Docs with datasets: {docs_with_datasets}")
    print(f"  Docs without: {len(links) - docs_with_datasets}")

    if args.execute:
        # Write links file
        LINKS_DIR.mkdir(parents=True, exist_ok=True)
        LINKS_FILE.write_text(json.dumps(links, indent=2), encoding="utf-8")
        print(f"\nšŸ’¾ Saved {LINKS_FILE}")
        print(f"šŸ’¾ Created {len(links)} doc_N directories in {UNHCR_DIR}")
        # Clean up original dirs (optional — keep for now)
        print("\nāš ļø Original named directories preserved. Remove manually if desired.")
        if args.upload:
            upload_to_hf(links)
    else:
        print(f"\n[DRY RUN] Would create {len(links)} doc_N dirs and unhcr_pdf_links.json")
        print(f"[DRY RUN] Run with --execute to create files")
        # Show sample
        if links:
            print(f"\nSample link entry:")
            print(json.dumps(links[0], indent=2))


def upload_to_hf(links):
    """Upload the links file and all converted doc files to HF.

    The HF token is read from a sibling .env file (HF_TOKEN=...) first,
    falling back to the HF_TOKEN environment variable.
    """
    # Narrow try: only the third-party import can raise ImportError here;
    # any upload error should propagate as before.
    try:
        from huggingface_hub import HfApi
    except ImportError:
        print("āŒ huggingface_hub required: uv pip install huggingface_hub")
        return

    token = None
    env_path = Path(__file__).parent / ".env"
    if env_path.exists():
        for line in env_path.read_text().splitlines():
            if line.startswith("HF_TOKEN="):
                token = line.split("=", 1)[1].strip()
    if not token:
        token = os.environ.get("HF_TOKEN")
    if not token:
        print("āŒ No HF_TOKEN found")
        return

    api = HfApi(token=token)
    repo_id = "ai4data/annotation_data"

    # Upload links file
    api.upload_file(
        path_or_fileobj=str(LINKS_FILE),
        path_in_repo="annotation_data/unhcr_data/unhcr_pdf_links.json",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Add UNHCR PDF links",
    )
    print("āœ… Uploaded unhcr_pdf_links.json")

    # Upload all doc files
    api.upload_folder(
        folder_path=str(UNHCR_DIR),
        path_in_repo="annotation_data/unhcr_extractions",
        repo_id=repo_id,
        repo_type="dataset",
        commit_message="Add UNHCR extraction data",
        allow_patterns=["doc_*/raw/*_direct_judged.jsonl"],
    )
    print("āœ… Uploaded UNHCR extraction files")


if __name__ == "__main__":
    main()