Spaces:
Sleeping
Sleeping
| import os | |
| import shutil | |
| import csv | |
| from pathlib import Path | |
# --- Configuration -----------------------------------------------------------
# Directory whose immediate sub-directories are each treated as one dataset.
RAW_ROOT = Path("raw_data")
# Flat directory that receives a copy of every merged image.
OUT_IMG = Path("data_merged/images")
# CSV file that receives one metadata row per merged image.
OUT_CSV = Path("data_merged/metadata_raw.csv")
# Lower-case filename suffixes accepted as images.
VALID_EXT = (".png", ".jpg", ".jpeg")

# --- Shared state mutated by merge_any_dataset -------------------------------
# Running counter; the next image copied gets this value as its id.
img_id = 0
# Metadata rows accumulated in copy order.
rows = []

# The image directory must exist before any file is copied into it.
OUT_IMG.mkdir(parents=True, exist_ok=True)
def merge_any_dataset(dataset_name: str, base_path: "str | os.PathLike[str]") -> None:
    """Copy every image found under *base_path* into OUT_IMG and record it.

    The tree rooted at *base_path* is walked recursively. A file is treated
    as an image when its lower-cased name ends with one of ``VALID_EXT``.
    Each image is copied to ``OUT_IMG`` as
    ``{dataset_name}__{class_name}__{img_id}{suffix}``, where ``class_name``
    is the name of the directory that directly contains the file (for files
    sitting directly in *base_path* that is the name of *base_path* itself)
    and ``suffix`` keeps the original file's extension, including its case.

    For every copied image one dict with the keys ``image_id``,
    ``filename``, ``label`` and ``source`` is appended to the module-level
    ``rows`` list, and the module-level ``img_id`` counter is incremented,
    so ids keep increasing across successive calls.

    Directories and files are visited in sorted order so that the id given
    to each image is the same on every run for the same input tree.

    Args:
        dataset_name: Label stored in the ``source`` column and used as the
            filename prefix.
        base_path: Root directory of the dataset to merge.
    """
    global img_id

    for root, dirs, files in os.walk(base_path):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # directory list in place makes the (top-down) walk descend in a
        # stable order, which keeps id assignment reproducible.
        dirs.sort()

        folder = Path(root)
        class_name = folder.name

        for f in sorted(files):
            if not f.lower().endswith(VALID_EXT):
                continue

            src = folder / f
            new_name = f"{dataset_name}__{class_name}__{img_id}{src.suffix}"
            shutil.copy(src, OUT_IMG / new_name)

            rows.append({
                "image_id": img_id,
                "filename": new_name,
                "label": class_name,
                "source": dataset_name,
            })
            img_id += 1
# Merge every dataset directory found directly under RAW_ROOT.
# Path.iterdir() yields entries in arbitrary order; sorting them fixes the
# order in which datasets are processed, so the image ids assigned by
# merge_any_dataset are the same on every run.
for item in sorted(RAW_ROOT.iterdir()):
    if item.is_dir():
        merge_any_dataset(item.name, item)

# Write the accumulated metadata, one CSV row per copied image.
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
# newline="" lets the csv module control line endings itself.
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f,
        fieldnames=["image_id", "filename", "label", "source"],
    )
    writer.writeheader()
    writer.writerows(rows)

print("Merged dataset created")
print("Images:", len(rows))