"""
prepare_cardd_hf.py — CarDD HuggingFace mirror (FiftyOne format) → YOLO segmentation format

The HF mirror `harpreetsahota/CarDD` is a FiftyOne dataset (not COCO). This script:
1. Loads the dataset via `fiftyone.utils.huggingface.load_from_hub`
2. Maps original CarDD class names to our 6-class taxonomy
3. Exports to YOLO segmentation format (images + .txt polygons)
4. Creates train/val/test splits (80/10/10)

Usage:
    python scripts/prepare_cardd_hf.py \\
        --output_dir services/ml/data/cardd_yolo \\
        [--max_samples N]
"""
from __future__ import annotations

import argparse
import os
import random
import shutil
import sys
from pathlib import Path

# Class mapping — CarDD original labels → our taxonomy (matches services/ml/cardd.yaml)
CLASS_MAP = {
    "dent": 0,
    "scratch": 1,
    "crack": 2,
    "glass shatter": 3,
    "glass_shatter": 3,
    "broken lamp": 4,
    "lamp broken": 4,
    "lamp_broken": 4,
    "broken_lamp": 4,
    "tire flat": 5,
    "tire_flat": 5,
    "flat tire": 5,
}

CLASS_NAMES = ["dent", "scratch", "crack", "glass_shatter", "lamp_broken", "tire_flat"]


def main():
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--output_dir", required=True, type=Path)
    ap.add_argument("--hf_repo", default="harpreetsahota/CarDD")
    ap.add_argument("--max_samples", type=int, default=None,
                    help="Limit samples (debugging)")
    ap.add_argument("--train_ratio", type=float, default=0.8)
    ap.add_argument("--val_ratio", type=float, default=0.1)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--symlink", action="store_true",
                    help="Symlink images instead of copying (saves disk)")
    args = ap.parse_args()

    try:
        import fiftyone as fo
        from fiftyone.utils.huggingface import load_from_hub
    except ImportError:
        print("FiftyOne yüklü değil. Önce: pip install fiftyone")
        sys.exit(1)

    out = args.output_dir
    out.mkdir(parents=True, exist_ok=True)
    for split in ("train", "val", "test"):
        (out / "images" / split).mkdir(parents=True, exist_ok=True)
        (out / "labels" / split).mkdir(parents=True, exist_ok=True)

    print(f">> Loading dataset from HF: {args.hf_repo}")
    kwargs = {}
    if args.max_samples:
        kwargs["max_samples"] = args.max_samples
    dataset = load_from_hub(args.hf_repo, **kwargs)
    print(f">> Loaded {len(dataset)} samples")

    # Explore label structure once
    sample = dataset.first()
    print(">> First sample fields:")
    for field, value in sample.iter_fields():
        if value is None:
            continue
        print(f"   {field}: {type(value).__name__}")

    # Find the label field — usually 'ground_truth', 'detections', or 'segmentations'
    label_field = None
    for cand in ("ground_truth", "detections", "segmentations", "polylines"):
        if dataset.has_sample_field(cand):
            label_field = cand
            break
    if label_field is None:
        print("HATA: label field bulunamadı. Mevcut field'lar:")
        print(dataset.get_field_schema())
        sys.exit(2)
    print(f">> Using label field: {label_field}")

    # Shuffle + split
    random.seed(args.seed)
    sample_ids = list(dataset.values("id"))
    random.shuffle(sample_ids)
    n = len(sample_ids)
    n_train = int(n * args.train_ratio)
    n_val = int(n * args.val_ratio)
    split_assign: dict[str, str] = {}
    for i, sid in enumerate(sample_ids):
        if i < n_train:
            split_assign[sid] = "train"
        elif i < n_train + n_val:
            split_assign[sid] = "val"
        else:
            split_assign[sid] = "test"

    counts = {"train": 0, "val": 0, "test": 0}
    skipped = 0

    for sample in dataset.iter_samples(progress=True):
        split = split_assign[sample.id]
        img_path = Path(sample.filepath)
        if not img_path.exists():
            skipped += 1
            continue

        # Get image dimensions
        try:
            w = sample.metadata.width
            h = sample.metadata.height
        except (AttributeError, TypeError):
            from PIL import Image
            with Image.open(img_path) as im:
                w, h = im.size

        # Get label field — may be Detections or Polylines
        labels = sample[label_field]
        if labels is None:
            skipped += 1
            continue

        yolo_lines = []
        # Detections has .detections, Polylines has .polylines
        items = (
            getattr(labels, "detections", None)
            or getattr(labels, "polylines", None)
            or getattr(labels, "segmentations", None)
            or []
        )

        for item in items:
            cls_name = (item.label or "").lower().strip().replace(" ", "_")
            if cls_name not in CLASS_MAP:
                continue
            cls_id = CLASS_MAP[cls_name]

            # Try mask first (Detections may have mask), then polylines
            mask = getattr(item, "mask", None)
            polys = getattr(item, "points", None)

            if polys:
                # Polylines.points is list[list[(x,y) tuples]] — list of contours
                for contour in polys:
                    if len(contour) < 3:
                        continue
                    flat = []
                    for (x, y) in contour:
                        # FiftyOne polylines: normalized [0,1] coords
                        flat.append(f"{x:.6f} {y:.6f}")
                    yolo_lines.append(f"{cls_id} " + " ".join(flat))
            elif mask is not None and hasattr(item, "bounding_box"):
                # Convert mask to polygon contour
                import numpy as np
                import cv2
                bbox = item.bounding_box  # [x_min, y_min, w, h] normalized
                # Mask is relative to bbox; convert to image-level polygon
                mask_array = (mask * 255).astype(np.uint8) if mask.dtype != np.uint8 else mask
                contours, _ = cv2.findContours(mask_array, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
                bx, by, bw, bh = bbox
                for c in contours:
                    if len(c) < 3:
                        continue
                    # Normalize: c is (n,1,2) in mask-local pixels
                    mh, mw = mask_array.shape[:2]
                    flat = []
                    for pt in c.squeeze(axis=1):
                        px, py = pt[0] / mw, pt[1] / mh
                        # px,py in [0,1] within bbox; map to image-level
                        ix = bx + px * bw
                        iy = by + py * bh
                        flat.append(f"{ix:.6f} {iy:.6f}")
                    yolo_lines.append(f"{cls_id} " + " ".join(flat))
            elif hasattr(item, "bounding_box"):
                # Fallback: bbox as 4-point polygon
                bx, by, bw, bh = item.bounding_box
                pts = [(bx, by), (bx + bw, by), (bx + bw, by + bh), (bx, by + bh)]
                flat = " ".join(f"{x:.6f} {y:.6f}" for x, y in pts)
                yolo_lines.append(f"{cls_id} {flat}")

        if not yolo_lines:
            skipped += 1
            continue

        # Copy/symlink image
        dst_img = out / "images" / split / img_path.name
        if not dst_img.exists():
            if args.symlink:
                try:
                    os.symlink(img_path, dst_img)
                except OSError:
                    shutil.copy2(img_path, dst_img)
            else:
                shutil.copy2(img_path, dst_img)

        # Write label
        label_path = out / "labels" / split / (img_path.stem + ".txt")
        label_path.write_text("\n".join(yolo_lines), encoding="utf-8")
        counts[split] += 1

    print(f">> Done. Counts: train={counts['train']}, val={counts['val']}, test={counts['test']}, skipped={skipped}")

    # Write YAML config
    yaml_path = out / "cardd.yaml"
    yaml_path.write_text(
        "# Auto-generated by prepare_cardd_hf.py\n"
        f"path: {out.resolve().as_posix()}\n"
        "train: images/train\n"
        "val: images/val\n"
        "test: images/test\n"
        "names:\n" + "\n".join(f"  {i}: {n}" for i, n in enumerate(CLASS_NAMES)) + "\n",
        encoding="utf-8",
    )
    print(f">> Wrote dataset config: {yaml_path}")


if __name__ == "__main__":
    main()