"""Merge image datasets under ``raw_data`` into one folder plus a CSV index.

Every sub-directory of ``RAW_ROOT`` is treated as one dataset.  Each image
found below it is copied into ``OUT_IMG`` under a unique name, and one row
per image is written to ``OUT_CSV``.
"""

import csv
import os
import shutil
from pathlib import Path

RAW_ROOT = Path("raw_data")
OUT_IMG = Path("data_merged/images")
OUT_CSV = Path("data_merged/metadata_raw.csv")

# File extensions (compared case-insensitively) that count as images.
VALID_EXT = (".png", ".jpg", ".jpeg")

# Accumulated across every merge_any_dataset() call.
rows = []    # one metadata dict per copied image
img_id = 0   # next unused image id; unique across all datasets


def merge_any_dataset(dataset_name, base_path):
    """Copy every image below *base_path* into ``OUT_IMG`` and record it.

    The label of an image is the name of the directory that directly
    contains it; an image lying directly in *base_path* is therefore
    labelled with the name of *base_path* itself.

    Args:
        dataset_name: Value stored in the ``source`` column and used as the
            prefix of the copied file's name.
        base_path: Root directory of the dataset; walked recursively.

    Side effects:
        Appends one dict per image to the module-level ``rows`` list and
        advances the module-level ``img_id`` counter.
    """
    global img_id

    # Ensure the destination exists so the function also works when it is
    # called on its own rather than through main().
    OUT_IMG.mkdir(parents=True, exist_ok=True)

    for root, dirs, files in os.walk(base_path):
        # os.walk yields entries in arbitrary order.  Sorting the
        # sub-directories in place (honoured because the walk is top-down)
        # and the files makes the id assignment reproducible.
        dirs.sort()
        folder = Path(root)
        class_name = folder.name

        for f in sorted(files):
            if not f.lower().endswith(VALID_EXT):
                continue

            src = folder / f
            # The running id keeps names unique even when two datasets
            # contain identically named files.
            new_name = f"{dataset_name}__{class_name}__{img_id}{src.suffix}"
            shutil.copy(src, OUT_IMG / new_name)

            rows.append({
                "image_id": img_id,
                "filename": new_name,
                "label": class_name,
                "source": dataset_name,
            })
            img_id += 1


def main():
    """Merge all datasets found in ``RAW_ROOT`` and write the metadata CSV.

    Raises:
        FileNotFoundError: If ``RAW_ROOT`` is not an existing directory.
    """
    # Fail early, before creating any output, if there is nothing to merge.
    if not RAW_ROOT.is_dir():
        raise FileNotFoundError(f"Input directory not found: {RAW_ROOT}")

    OUT_IMG.mkdir(parents=True, exist_ok=True)

    # Merging all datasets found in RAW_ROOT (sorted: iterdir order is
    # arbitrary, and the order decides which ids each dataset receives).
    for item in sorted(RAW_ROOT.iterdir()):
        if item.is_dir():
            merge_any_dataset(item.name, item)

    # Writing out the CSV.
    OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
    with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(
            f,
            fieldnames=["image_id", "filename", "label", "source"],
        )
        writer.writeheader()
        writer.writerows(rows)

    print("Merged dataset created")
    print("Images:", len(rows))


if __name__ == "__main__":
    main()