hasari-api / scripts /prepare_cardd_hf.py
erdoganpeker's picture
v0.3.0 — multimodal vehicle damage MVP
e327f0d
"""
prepare_cardd_hf.py — CarDD HuggingFace mirror (FiftyOne format) → YOLO segmentation format
The HF mirror `harpreetsahota/CarDD` is a FiftyOne dataset (not COCO). This script:
1. Loads the dataset via `fiftyone.utils.huggingface.load_from_hub`
2. Maps original CarDD class names to our 6-class taxonomy
3. Exports to YOLO segmentation format (images + .txt polygons)
4. Creates train/val/test splits (80/10/10 by default; adjustable via --train_ratio / --val_ratio)
Usage:
python scripts/prepare_cardd_hf.py \\
--output_dir services/ml/data/cardd_yolo \\
[--max_samples N]
"""
from __future__ import annotations
import argparse
import os
import random
import shutil
import sys
from pathlib import Path
# Class mapping — CarDD original labels → our taxonomy (matches services/ml/cardd.yaml).
# Every multi-word alias is listed in both its space-separated and its
# underscore-separated spelling: the label lookup in this script replaces
# spaces with underscores first, so an alias that exists only in the spaced
# form (as "flat tire" previously did) would never match.
CLASS_MAP = {
    "dent": 0,
    "scratch": 1,
    "crack": 2,
    "glass shatter": 3,
    "glass_shatter": 3,
    "broken lamp": 4,
    "lamp broken": 4,
    "lamp_broken": 4,
    "broken_lamp": 4,
    "tire flat": 5,
    "tire_flat": 5,
    "flat tire": 5,
    "flat_tire": 5,
}
# Canonical class names; the list index is the class id used in CLASS_MAP
# and in the generated dataset YAML.
CLASS_NAMES = ["dent", "scratch", "crack", "glass_shatter", "lamp_broken", "tire_flat"]
def main():
    """Convert the CarDD FiftyOne dataset from the HF hub to YOLO segmentation format.

    Steps:
      1. Parse CLI arguments and validate the split ratios.
      2. Load the dataset with ``load_from_hub`` and pick the label field.
      3. Shuffle sample ids with ``--seed`` and assign train/val/test splits.
      4. For every sample, write one YOLO polygon line per annotation and
         copy (or symlink) the image into ``images/<split>``.
      5. Write ``cardd.yaml`` describing the exported dataset.

    Exits with status 1 when FiftyOne cannot be imported and with status 2
    when no known label field exists (``ArgumentParser.error`` also exits
    with status 2 on invalid ratios).
    """
    ap = argparse.ArgumentParser(description=__doc__)
    ap.add_argument("--output_dir", required=True, type=Path)
    ap.add_argument("--hf_repo", default="harpreetsahota/CarDD")
    ap.add_argument("--max_samples", type=int, default=None,
                    help="Limit samples (debugging)")
    ap.add_argument("--train_ratio", type=float, default=0.8)
    ap.add_argument("--val_ratio", type=float, default=0.1)
    ap.add_argument("--seed", type=int, default=42)
    ap.add_argument("--symlink", action="store_true",
                    help="Symlink images instead of copying (saves disk)")
    args = ap.parse_args()

    # Reject impossible ratios up front instead of silently producing an
    # empty or mis-sized split (small tolerance for float rounding).
    if (
        args.train_ratio < 0
        or args.val_ratio < 0
        or args.train_ratio + args.val_ratio > 1.0 + 1e-9
    ):
        ap.error("--train_ratio and --val_ratio must be non-negative and sum to at most 1")

    try:
        import fiftyone as fo  # noqa: F401  (imported to verify FiftyOne is installed)
        from fiftyone.utils.huggingface import load_from_hub
    except ImportError:
        print("FiftyOne yüklü değil. Önce: pip install fiftyone")
        sys.exit(1)

    out = args.output_dir
    out.mkdir(parents=True, exist_ok=True)
    for split in ("train", "val", "test"):
        (out / "images" / split).mkdir(parents=True, exist_ok=True)
        (out / "labels" / split).mkdir(parents=True, exist_ok=True)

    print(f">> Loading dataset from HF: {args.hf_repo}")
    kwargs = {}
    if args.max_samples:
        kwargs["max_samples"] = args.max_samples
    dataset = load_from_hub(args.hf_repo, **kwargs)
    n_loaded = len(dataset)
    print(f">> Loaded {n_loaded} samples")

    # Explore label structure once. Skip the peek when nothing was loaded;
    # there is no first sample to inspect.
    if n_loaded > 0:
        first_sample = dataset.first()
        print(">> First sample fields:")
        for field, value in first_sample.iter_fields():
            if value is None:
                continue
            print(f" {field}: {type(value).__name__}")

    # Find the label field — usually 'ground_truth', 'detections', or 'segmentations'
    label_field = None
    for cand in ("ground_truth", "detections", "segmentations", "polylines"):
        if dataset.has_sample_field(cand):
            label_field = cand
            break
    if label_field is None:
        print("HATA: label field bulunamadı. Mevcut field'lar:")
        print(dataset.get_field_schema())
        sys.exit(2)
    print(f">> Using label field: {label_field}")

    # Shuffle + split (seeded so the assignment is reproducible)
    random.seed(args.seed)
    sample_ids = list(dataset.values("id"))
    random.shuffle(sample_ids)
    n = len(sample_ids)
    n_train = int(n * args.train_ratio)
    n_val = int(n * args.val_ratio)
    split_assign: dict[str, str] = {}
    for i, sid in enumerate(sample_ids):
        if i < n_train:
            split_assign[sid] = "train"
        elif i < n_train + n_val:
            split_assign[sid] = "val"
        else:
            split_assign[sid] = "test"

    counts = {"train": 0, "val": 0, "test": 0}
    skipped = 0
    for sample in dataset.iter_samples(progress=True):
        split = split_assign[sample.id]
        img_path = Path(sample.filepath)
        if not img_path.exists():
            skipped += 1
            continue

        # Image width/height are not needed: every coordinate written below
        # is already normalized to [0, 1].
        labels = sample[label_field]
        if labels is None:
            skipped += 1
            continue

        # Detections has .detections, Polylines has .polylines
        items = (
            getattr(labels, "detections", None)
            or getattr(labels, "polylines", None)
            or getattr(labels, "segmentations", None)
            or []
        )
        yolo_lines = []
        for item in items:
            cls_id = _resolve_class_id(item.label)
            if cls_id is None:
                continue
            yolo_lines.extend(_item_to_yolo_lines(item, cls_id))

        if not yolo_lines:
            skipped += 1
            continue

        # NOTE(review): image and label outputs are keyed by the image's
        # basename, so two samples sharing a basename within one split would
        # overwrite each other's label file — confirm names are unique.
        dst_img = out / "images" / split / img_path.name
        _place_image(img_path, dst_img, args.symlink)

        # Write label
        label_path = out / "labels" / split / (img_path.stem + ".txt")
        label_path.write_text("\n".join(yolo_lines), encoding="utf-8")
        counts[split] += 1

    print(f">> Done. Counts: train={counts['train']}, val={counts['val']}, test={counts['test']}, skipped={skipped}")

    # Write YAML config
    yaml_path = out / "cardd.yaml"
    names_block = "\n".join(f" {i}: {name}" for i, name in enumerate(CLASS_NAMES))
    yaml_path.write_text(
        "# Auto-generated by prepare_cardd_hf.py\n"
        f"path: {out.resolve().as_posix()}\n"
        "train: images/train\n"
        "val: images/val\n"
        "test: images/test\n"
        "names:\n" + names_block + "\n",
        encoding="utf-8",
    )
    print(f">> Wrote dataset config: {yaml_path}")


def _resolve_class_id(label):
    """Return the taxonomy class id for a raw label string, or None if unmapped.

    The label is lower-cased and stripped, then looked up as-is, with spaces
    replaced by underscores, and with underscores replaced by spaces, so that
    an alias present in CLASS_MAP under only one spelling still matches.
    """
    name = (label or "").lower().strip()
    for candidate in (name, name.replace(" ", "_"), name.replace("_", " ")):
        if candidate in CLASS_MAP:
            return CLASS_MAP[candidate]
    return None


def _item_to_yolo_lines(item, cls_id):
    """Return the YOLO segmentation lines (``"<cls> x1 y1 x2 y2 ..."``) for one label item.

    Geometry source, in priority order: the item's ``points`` (list of
    contours), its ``mask`` combined with ``bounding_box``, or the bare
    ``bounding_box`` as a 4-point polygon. Contours with fewer than three
    points are dropped.
    """
    lines = []
    mask = getattr(item, "mask", None)
    polys = getattr(item, "points", None)

    if polys:
        # Polylines.points is list[list[(x,y) tuples]] — list of contours,
        # with FiftyOne polyline coordinates normalized to [0,1].
        for contour in polys:
            if len(contour) < 3:
                continue
            flat = " ".join(f"{x:.6f} {y:.6f}" for (x, y) in contour)
            lines.append(f"{cls_id} " + flat)
    elif mask is not None and hasattr(item, "bounding_box"):
        # Imported lazily: only needed when the dataset stores instance masks.
        import numpy as np
        import cv2

        # bounding_box is [x_min, y_min, w, h] normalized; the mask is
        # relative to that box and must be mapped back to image-level coords.
        bx, by, bw, bh = item.bounding_box
        mask_array = (mask * 255).astype(np.uint8) if mask.dtype != np.uint8 else mask
        contours, _ = cv2.findContours(mask_array, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        mh, mw = mask_array.shape[:2]
        for c in contours:
            if len(c) < 3:
                continue
            flat = []
            # c is (n,1,2) in mask-local pixels; squeeze only the middle axis.
            for pt in c.squeeze(axis=1):
                px, py = pt[0] / mw, pt[1] / mh
                # px,py in [0,1] within bbox; map to image-level
                ix = bx + px * bw
                iy = by + py * bh
                flat.append(f"{ix:.6f} {iy:.6f}")
            lines.append(f"{cls_id} " + " ".join(flat))
    elif hasattr(item, "bounding_box"):
        # Fallback: bbox as 4-point polygon
        bx, by, bw, bh = item.bounding_box
        pts = [(bx, by), (bx + bw, by), (bx + bw, by + bh), (bx, by + bh)]
        flat = " ".join(f"{x:.6f} {y:.6f}" for x, y in pts)
        lines.append(f"{cls_id} {flat}")
    return lines


def _place_image(src, dst, use_symlink):
    """Copy ``src`` to ``dst`` (or symlink it), leaving an existing ``dst`` untouched."""
    if dst.exists():
        return
    if use_symlink:
        try:
            os.symlink(src, dst)
            return
        except OSError:
            # Symlink creation failed; fall back to a real copy.
            pass
    shutil.copy2(src, dst)
# Run the conversion only when this file is executed as a script.
if __name__ == "__main__":
    main()