"""
export.py — Convert COCO annotations to YOLO training format.

The final step before training: takes the pipeline's COCO JSON output
and produces a YOLO-ready dataset with data.yaml.

Usage:
    # Convert a pipeline experiment to YOLO format
    data_label_factory export --experiment experiments/latest/ --output yolo_dataset/

    # Or specify a COCO file directly
    data_label_factory export --coco path/to/stop-signs.coco.json --images ~/data-label-factory/stop-signs --output yolo_dataset/
"""

from __future__ import annotations

import json
import os
import shutil
import random
from pathlib import Path


def _bbox_to_yolo(bbox, img_w, img_h):
    """Convert a COCO ``[x, y, w, h]`` pixel box to a normalized YOLO box.

    The box is clipped to the image rectangle *before* the centre and size
    are computed, so a box that pokes out of the image keeps its visible
    part instead of being distorted by clamping centre and size separately.

    Returns:
        ``(cx, cy, w, h)`` normalized to 0-1, or ``None`` when nothing of
        the box is left inside the image (zero/negative area after clipping).
    """
    x, y, w, h = bbox
    x1 = min(max(x, 0), img_w)
    y1 = min(max(y, 0), img_h)
    x2 = min(max(x + w, 0), img_w)
    y2 = min(max(y + h, 0), img_h)
    if x2 <= x1 or y2 <= y1:
        return None
    return (
        (x1 + x2) / 2 / img_w,
        (y1 + y2) / 2 / img_h,
        (x2 - x1) / img_w,
        (y2 - y1) / img_h,
    )


def _dump_simple_yaml(data: dict) -> str:
    """Serialize a flat dict (with at most one level of nested dict) as YAML.

    Only used when PyYAML is not installed. Scalars are emitted through
    ``json.dumps`` because JSON scalars are valid YAML flow scalars.
    """
    lines = []
    for key, value in data.items():
        if isinstance(value, dict):
            if not value:
                lines.append(f"{key}: {{}}")
                continue
            lines.append(f"{key}:")
            lines.extend(
                f"  {k}: {json.dumps(v, ensure_ascii=False)}"
                for k, v in value.items()
            )
        else:
            lines.append(f"{key}: {json.dumps(value, ensure_ascii=False)}")
    return "\n".join(lines) + "\n"


def _write_data_yaml(yaml_path, data: dict) -> None:
    """Write *data* to *yaml_path*, using PyYAML when it is available."""
    try:
        import yaml
    except ImportError:
        # Don't lose a finished export just because PyYAML is missing.
        yaml = None
    with open(yaml_path, "w", encoding="utf-8") as f:
        if yaml is not None:
            yaml.dump(data, f, default_flow_style=False)
        else:
            f.write(_dump_simple_yaml(data))


def _find_coco_file(exp_dir: str) -> str | None:
    """Return the first ``*.coco.json`` found under *exp_dir*, or ``None``.

    The walk is top-down and stops at the first match; file names are
    sorted so the choice is stable within a directory.
    """
    for dirpath, _, filenames in os.walk(exp_dir):
        for fn in sorted(filenames):
            if fn.endswith(".coco.json"):
                return os.path.join(dirpath, fn)
    return None


def coco_to_yolo(
    coco_path: str,
    image_root: str,
    output_dir: str,
    val_split: float = 0.1,
    copy_images: bool = True,
    seed: int | None = None,
) -> dict:
    """Convert COCO annotations to YOLO format.

    Args:
        coco_path: Path to COCO JSON file
        image_root: Root directory where images live (file_name in COCO is
            relative to this)
        output_dir: Output directory for YOLO dataset
        val_split: Fraction of images for validation (default 0.1). A value
            <= 0 puts every image in the training split; any positive value
            yields at least one validation image.
        copy_images: Whether to copy images to output dir (default True).
            When False only label files and data.yaml are written.
        seed: Optional seed for a reproducible train/val split. When None
            the module-level ``random`` state is used.

    Returns:
        Summary dict with paths and stats
    """
    with open(coco_path, encoding="utf-8") as f:
        coco = json.load(f)

    images = {img["id"]: img for img in coco.get("images", [])}
    annotations = coco.get("annotations", [])
    categories = coco.get("categories", [])

    # Build category mapping: COCO cat_id → YOLO class_id (0-indexed)
    cat_id_to_yolo = {cat["id"]: i for i, cat in enumerate(categories)}
    cat_names = {i: cat["name"] for i, cat in enumerate(categories)}

    # Group annotations by image
    anns_by_image = {}
    for ann in annotations:
        anns_by_image.setdefault(ann["image_id"], []).append(ann)

    # Create output dirs
    out = Path(output_dir)
    train_img = out / "images" / "train"
    train_lbl = out / "labels" / "train"
    val_img = out / "images" / "val"
    val_lbl = out / "labels" / "val"
    for d in (train_img, train_lbl, val_img, val_lbl):
        d.mkdir(parents=True, exist_ok=True)

    # Split images. A dedicated RNG is used only when a seed is given, so
    # callers that seed the global ``random`` module keep their behaviour.
    img_ids = list(images)
    rng = random.Random(seed) if seed is not None else random
    rng.shuffle(img_ids)
    if val_split > 0 and img_ids:
        n_val = max(1, int(len(img_ids) * val_split))
    else:
        n_val = 0
    val_ids = set(img_ids[:n_val])

    stats = {"train": 0, "val": 0, "annotations": 0, "skipped": 0, "dropped": 0}

    for img_id, img_info in images.items():
        is_val = img_id in val_ids
        img_dir = val_img if is_val else train_img
        lbl_dir = val_lbl if is_val else train_lbl

        iw = img_info.get("width", 1)
        ih = img_info.get("height", 1)
        fname = img_info.get("file_name", "")
        src_path = os.path.join(image_root, fname)

        # isfile (not exists): an empty file_name resolves to the root
        # directory itself, which must not be treated as an image.
        if not os.path.isfile(src_path):
            stats["skipped"] += 1
            continue
        # Boxes cannot be normalized against a zero/negative/None size.
        if not (iw and ih) or iw < 0 or ih < 0:
            stats["skipped"] += 1
            continue

        # Copy image (sub-directories are flattened into the file name)
        safe_name = fname.replace("/", "_").replace("\\", "_")
        if copy_images:
            shutil.copy2(src_path, img_dir / safe_name)

        # Write YOLO label file
        stem = os.path.splitext(safe_name)[0]
        label_lines = []
        for ann in anns_by_image.get(img_id, []):
            # Unknown category ids fall back to class 0 (best-effort).
            cls_id = cat_id_to_yolo.get(ann.get("category_id"), 0)
            # COCO: [x, y, w, h] in pixels → YOLO: [cx, cy, w, h] in 0-1
            box = _bbox_to_yolo(ann["bbox"], iw, ih)
            if box is None:
                stats["dropped"] += 1
                continue
            cx, cy, nw, nh = box
            label_lines.append(f"{cls_id} {cx:.6f} {cy:.6f} {nw:.6f} {nh:.6f}")
            stats["annotations"] += 1

        with open(lbl_dir / f"{stem}.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(label_lines))

        if is_val:
            stats["val"] += 1
        else:
            stats["train"] += 1

    # Write data.yaml
    data_yaml = {
        "path": str(out.resolve()),
        "train": "images/train",
        "val": "images/val",
        "nc": len(cat_names),
        "names": cat_names,
    }
    yaml_path = out / "data.yaml"
    _write_data_yaml(yaml_path, data_yaml)

    summary = {
        "output_dir": str(out),
        "data_yaml": str(yaml_path),
        "train_images": stats["train"],
        "val_images": stats["val"],
        "total_annotations": stats["annotations"],
        "skipped": stats["skipped"],
        "dropped_annotations": stats["dropped"],
        "classes": cat_names,
        "nc": len(cat_names),
    }

    print(f" YOLO dataset: {out}")
    print(f" Train: {stats['train']} images, Val: {stats['val']} images")
    print(f" Annotations: {stats['annotations']}")
    if stats["skipped"] or stats["dropped"]:
        print(f" Skipped: {stats['skipped']} images, {stats['dropped']} boxes")
    print(f" Classes ({len(cat_names)}): {list(cat_names.values())}")
    print(f" data.yaml: {yaml_path}")

    # Print training command
    print("\n Training command:")
    print(" yolo detect train \\")
    print(" model=yolo11n.pt \\")
    print(f" data={yaml_path} \\")
    print(" epochs=50 imgsz=640 batch=16 \\")
    print(f" project=runs name={out.name}")

    return summary


def main(argv: list[str] | None = None) -> None:
    """CLI entry point for ``data_label_factory export``.

    Resolves the COCO file (from ``--coco`` or by searching ``--experiment``)
    and the image root (from ``--images`` or guessed from the COCO ``info``
    description), then calls :func:`coco_to_yolo`.
    """
    import argparse

    p = argparse.ArgumentParser(
        prog="data_label_factory export",
        description="Convert COCO annotations to YOLO training format.",
    )
    p.add_argument("--coco", help="Path to COCO JSON file")
    p.add_argument("--experiment", help="Experiment directory (auto-finds COCO file)")
    p.add_argument("--images", help="Image root directory")
    p.add_argument("--output", default="yolo_dataset", help="Output directory")
    p.add_argument("--val-split", type=float, default=0.1,
                   help="Validation split (default 0.1)")
    p.add_argument("--no-copy", action="store_true",
                   help="Don't copy images (write labels and data.yaml only)")
    p.add_argument("--seed", type=int, default=None,
                   help="Seed for a reproducible train/val split")
    args = p.parse_args(argv)

    coco_path = args.coco
    image_root = args.images

    if not coco_path and args.experiment:
        # Find COCO file in experiment
        exp_dir = args.experiment
        if exp_dir == "latest":
            from .experiments import list_experiments
            exps = list_experiments()
            if exps:
                exp_dir = exps[0]["path"]
        coco_path = _find_coco_file(exp_dir)
        if not coco_path:
            print(f"No COCO file found in {exp_dir}")
            return

    if not coco_path:
        p.error("--coco or --experiment required")

    if not image_root:
        # Try to guess from COCO info
        with open(coco_path, encoding="utf-8") as f:
            coco = json.load(f)
        description = coco.get("info", {}).get("description", "")
        project = description.split("for ")[-1].split(" via")[0]
        image_root = os.path.expanduser(f"~/data-label-factory/{project}")

    # Checked for guessed and explicit roots alike: a missing root would
    # otherwise just produce an empty dataset with every image skipped.
    if not os.path.exists(image_root):
        print(f" Image root not found: {image_root}")
        print(f" Specify with --images")
        return

    print(f"Converting COCO → YOLO")
    print(f" COCO: {coco_path}")
    print(f" Images: {image_root}")
    print(f" Output: {args.output}")

    coco_to_yolo(coco_path, image_root, args.output,
                 val_split=args.val_split, copy_images=not args.no_copy,
                 seed=args.seed)