waltgrace's picture
feat: COCO→YOLO export, 5-stage pipeline, Falcon MLX server
094d5f3 verified
"""
export.py — Convert COCO annotations to YOLO training format.
The final step before training: takes the pipeline's COCO JSON output
and produces a YOLO-ready dataset with data.yaml.
Usage:
# Convert a pipeline experiment to YOLO format
data_label_factory export --experiment experiments/latest/ --output yolo_dataset/
# Or specify a COCO file directly
data_label_factory export --coco path/to/stop-signs.coco.json --images ~/data-label-factory/stop-signs --output yolo_dataset/
"""
from __future__ import annotations
import json
import os
import shutil
import random
from pathlib import Path
def _bbox_to_yolo(bbox, img_w, img_h):
    """Convert a COCO pixel box ``[x, y, w, h]`` to normalized YOLO ``(cx, cy, w, h)``.

    The box corners are clipped to the image first, so a box that overhangs
    the image edge keeps its visible part instead of being distorted by
    clamping centre and size independently. Returns ``None`` when nothing of
    the box is left inside the image.
    """
    x, y, w, h = bbox
    x1 = min(max(x, 0), img_w)
    y1 = min(max(y, 0), img_h)
    x2 = min(max(x + w, 0), img_w)
    y2 = min(max(y + h, 0), img_h)
    if x2 <= x1 or y2 <= y1:
        return None
    return (
        (x1 + x2) / 2 / img_w,
        (y1 + y2) / 2 / img_h,
        (x2 - x1) / img_w,
        (y2 - y1) / img_h,
    )


def _place_image(src_path, dst_path, copy):
    """Copy (or symlink) one image into the dataset; return False if it could not be placed."""
    if copy:
        shutil.copy2(src_path, dst_path)
        return True
    try:
        # Absolute target so the link stays valid regardless of the cwd.
        os.symlink(os.path.abspath(src_path), dst_path)
    except FileExistsError:
        # Left over from a previous export into the same directory: keep it.
        return True
    except OSError:
        # Symlinks unsupported or not permitted (e.g. unprivileged Windows).
        return False
    return True


def _write_data_yaml(yaml_path, data_yaml):
    """Write ``data.yaml``, using PyYAML when available and a minimal emitter otherwise."""
    try:
        import yaml  # PyYAML is third-party; the exporter must not hard-require it.
    except ImportError:
        yaml = None

    with open(yaml_path, "w", encoding="utf-8") as f:
        if yaml is not None:
            yaml.dump(data_yaml, f, default_flow_style=False)
            return
        # Fallback: the structure is one level of scalars plus one nested
        # mapping. JSON scalars are valid YAML scalars, which takes care of
        # quoting and escaping.
        for key, value in data_yaml.items():
            if isinstance(value, dict):
                f.write(f"{key}:" + ("\n" if value else " {}\n"))
                for sub_key, sub_value in value.items():
                    f.write(f"  {sub_key}: {json.dumps(sub_value, ensure_ascii=False)}\n")
            else:
                f.write(f"{key}: {json.dumps(value, ensure_ascii=False)}\n")


def coco_to_yolo(
    coco_path: str,
    image_root: str,
    output_dir: str,
    val_split: float = 0.1,
    copy_images: bool = True,
    seed: "int | None" = None,
) -> dict:
    """Convert COCO annotations to a YOLO dataset (images, labels, data.yaml).

    Args:
        coco_path: Path to COCO JSON file
        image_root: Root directory where images live (file_name in COCO is relative to this)
        output_dir: Output directory for YOLO dataset
        val_split: Fraction of images for validation (default 0.1). Values are
            clamped to [0, 1]; 0 produces no validation images.
        copy_images: Copy images into the output dir (default True). When
            False, images are symlinked instead.
        seed: Optional seed for a reproducible train/val split. When None the
            module-level ``random`` state is used.

    Returns:
        Summary dict with paths and stats. ``skipped`` counts images that were
        left out (file missing, or annotated but without a usable
        width/height); ``skipped_annotations`` counts annotations that were
        dropped (unknown category, missing/malformed bbox, or box entirely
        outside the image).
    """
    with open(coco_path, encoding="utf-8") as f:
        coco = json.load(f)

    images = {img["id"]: img for img in coco.get("images", [])}
    annotations = coco.get("annotations", [])
    categories = coco.get("categories", [])

    # Build category mapping: COCO cat_id → YOLO class_id (0-indexed)
    cat_id_to_yolo = {cat["id"]: i for i, cat in enumerate(categories)}
    cat_names = {i: cat["name"] for i, cat in enumerate(categories)}

    # Group annotations by image
    anns_by_image = {}
    for ann in annotations:
        anns_by_image.setdefault(ann["image_id"], []).append(ann)

    # Create output dirs
    out = Path(output_dir)
    train_img = out / "images" / "train"
    train_lbl = out / "labels" / "train"
    val_img = out / "images" / "val"
    val_lbl = out / "labels" / "val"
    for d in (train_img, train_lbl, val_img, val_lbl):
        d.mkdir(parents=True, exist_ok=True)

    # Split images
    img_ids = list(images)
    if seed is None:
        # Keep using the global RNG so callers who seeded it stay reproducible.
        random.shuffle(img_ids)
    else:
        random.Random(seed).shuffle(img_ids)
    val_split = min(1.0, max(0.0, val_split))
    n_val = int(len(img_ids) * val_split)
    if val_split > 0 and len(img_ids) > 1:
        # Guarantee a non-empty val set only when one was requested and there
        # is more than one image to share between the splits.
        n_val = max(1, n_val)
    val_ids = set(img_ids[:n_val])

    stats = {"train": 0, "val": 0, "annotations": 0, "skipped": 0, "skipped_annotations": 0}
    link_failures = 0

    for img_id, img_info in images.items():
        is_val = img_id in val_ids
        img_dir, lbl_dir = (val_img, val_lbl) if is_val else (train_img, train_lbl)

        fname = img_info.get("file_name", "")
        src_path = os.path.join(image_root, fname)
        # isfile (not exists): an empty file_name resolves to the root directory.
        if not os.path.isfile(src_path):
            stats["skipped"] += 1
            continue

        img_anns = anns_by_image.get(img_id, [])
        iw = img_info.get("width") or 0
        ih = img_info.get("height") or 0
        if img_anns and (iw <= 0 or ih <= 0):
            # Boxes cannot be normalized without the image size; emitting the
            # image with wrong or empty labels would poison training.
            stats["skipped"] += 1
            continue

        # Place image (flatten sub-directories into the file name)
        safe_name = fname.replace("/", "_").replace("\\", "_")
        if not _place_image(src_path, img_dir / safe_name, copy_images):
            link_failures += 1

        # Write YOLO label file
        label_lines = []
        for ann in img_anns:
            cls_id = cat_id_to_yolo.get(ann.get("category_id"))
            bbox = ann.get("bbox")  # COCO: [x, y, w, h] in pixels
            box = None
            if cls_id is not None and bbox is not None and len(bbox) == 4:
                box = _bbox_to_yolo(bbox, iw, ih)
            if box is None:
                stats["skipped_annotations"] += 1
                continue
            cx, cy, nw, nh = box
            label_lines.append(f"{cls_id} {cx:.6f} {cy:.6f} {nw:.6f} {nh:.6f}")
        stats["annotations"] += len(label_lines)

        stem = os.path.splitext(safe_name)[0]
        with open(lbl_dir / f"{stem}.txt", "w", encoding="utf-8") as f:
            f.write("\n".join(label_lines))

        stats["val" if is_val else "train"] += 1

    # Write data.yaml
    data_yaml = {
        "path": str(out.resolve()),
        "train": "images/train",
        "val": "images/val",
        "nc": len(cat_names),
        "names": cat_names,
    }
    yaml_path = out / "data.yaml"
    _write_data_yaml(yaml_path, data_yaml)

    summary = {
        "output_dir": str(out),
        "data_yaml": str(yaml_path),
        "train_images": stats["train"],
        "val_images": stats["val"],
        "total_annotations": stats["annotations"],
        "skipped": stats["skipped"],
        "skipped_annotations": stats["skipped_annotations"],
        "classes": cat_names,
        "nc": len(cat_names),
    }

    print(f" YOLO dataset: {out}")
    print(f" Train: {stats['train']} images, Val: {stats['val']} images")
    print(f" Annotations: {stats['annotations']}")
    if stats["skipped"] or stats["skipped_annotations"]:
        print(f" Skipped: {stats['skipped']} images, {stats['skipped_annotations']} annotations")
    if link_failures:
        print(f" Warning: could not symlink {link_failures} images into {out}")
    print(f" Classes ({len(cat_names)}): {list(cat_names.values())}")
    print(f" data.yaml: {yaml_path}")

    # Print training command
    print("\n Training command:")
    print(" yolo detect train \\")
    print(" model=yolo11n.pt \\")
    print(f" data={yaml_path} \\")
    print(" epochs=50 imgsz=640 batch=16 \\")
    print(f" project=runs name={out.name}")
    return summary
def main(argv=None):
    """Command-line entry point for ``data_label_factory export``.

    Resolves the COCO file (given directly via ``--coco`` or discovered inside
    ``--experiment``), resolves the image root (``--images`` or a guess derived
    from the COCO ``info.description``), then runs the conversion.

    Args:
        argv: Argument list to parse; ``None`` makes argparse use ``sys.argv[1:]``.

    Returns:
        0 on success, 1 when the COCO file or the image root cannot be found.
        Missing ``--coco``/``--experiment`` is reported through argparse, which
        exits the process.
    """
    import argparse

    p = argparse.ArgumentParser(
        prog="data_label_factory export",
        description="Convert COCO annotations to YOLO training format.",
    )
    p.add_argument("--coco", help="Path to COCO JSON file")
    p.add_argument("--experiment", help="Experiment directory (auto-finds COCO file)")
    p.add_argument("--images", help="Image root directory")
    p.add_argument("--output", default="yolo_dataset", help="Output directory")
    p.add_argument("--val-split", type=float, default=0.1, help="Validation split (default 0.1)")
    p.add_argument("--no-copy", action="store_true", help="Don't copy images (symlink instead)")
    args = p.parse_args(argv)

    coco_path = args.coco
    image_root = args.images

    if not coco_path and args.experiment:
        # Find COCO file in experiment
        exp_dir = args.experiment
        if exp_dir == "latest":
            from .experiments import list_experiments

            exps = list_experiments()
            if exps:
                exp_dir = exps[0]["path"]
        for dirpath, dirnames, filenames in os.walk(exp_dir):
            # Sorting in place makes the traversal (and therefore the chosen
            # file) deterministic across platforms.
            dirnames.sort()
            matches = sorted(fn for fn in filenames if fn.endswith(".coco.json"))
            if matches:
                coco_path = os.path.join(dirpath, matches[0])
                # Stop the whole walk: the first match wins, deeper files
                # must not override it.
                break
        if not coco_path:
            print(f"No COCO file found in {exp_dir}")
            return 1

    if not coco_path:
        p.error("--coco or --experiment required")

    if not image_root:
        # Try to guess from COCO info: take the text between "for " and " via"
        # in the description as the project name.
        with open(coco_path, encoding="utf-8") as f:
            coco = json.load(f)
        description = coco.get("info", {}).get("description", "")
        project = description.split("for ")[-1].split(" via")[0]
        image_root = os.path.expanduser(f"~/data-label-factory/{project}")
        if not os.path.exists(image_root):
            print(f" Image root not found: {image_root}")
            print(" Specify with --images")
            return 1

    print("Converting COCO → YOLO")
    print(f" COCO: {coco_path}")
    print(f" Images: {image_root}")
    print(f" Output: {args.output}")
    coco_to_yolo(coco_path, image_root, args.output,
                 val_split=args.val_split, copy_images=not args.no_copy)
    return 0