# File: xray-classification/scripts/01_merge_datasets.py
# Hosting-page residue kept for provenance: "Flamekizer11's picture",
# "Upload 27 files", commit 64d0ccc (verified).
import os
import shutil
import csv
from pathlib import Path
# --- Locations --------------------------------------------------------------
# Folder whose immediate sub-directories are each merged as one dataset.
RAW_ROOT: Path = Path("raw_data")
# Folder that receives the copied, renamed images.
OUT_IMG: Path = Path("data_merged/images")
# Metadata table written at the end of the run (one row per copied image).
OUT_CSV: Path = Path("data_merged/metadata_raw.csv")

# File-name endings accepted as images; matched against the lower-cased
# file name, so the check is case-insensitive.
VALID_EXT: tuple = (".png", ".jpg", ".jpeg")

# --- Mutable merge state ----------------------------------------------------
# Metadata records accumulated while images are copied.
rows: list = []
# Running counter; gives every copied image a unique id across all datasets.
img_id: int = 0

# Make sure the image destination exists before anything is copied into it.
OUT_IMG.mkdir(parents=True, exist_ok=True)
def merge_any_dataset(dataset_name, base_path):
    """Copy every image below *base_path* into ``OUT_IMG`` and log it in ``rows``.

    The tree under *base_path* is walked recursively. Each file whose
    lower-cased name ends with one of ``VALID_EXT`` is copied to ``OUT_IMG``
    as ``<dataset_name>__<label>__<image_id><original suffix>``, where
    ``<label>`` is the name of the folder that directly contains the file.
    An image lying directly in *base_path* therefore gets the name of
    *base_path* itself as its label.

    Directories and files are visited in sorted name order so that the same
    input tree always produces the same image ids and output file names.

    Args:
        dataset_name: Value stored in the ``source`` column and used as the
            prefix of every output file name.
        base_path: Root folder of the dataset to merge.

    Side effects:
        Copies files into ``OUT_IMG``, appends one dict per image to the
        module-level ``rows`` list and advances the module-level ``img_id``.
    """
    global img_id
    for root, dirs, files in os.walk(base_path):
        # os.walk yields entries in arbitrary filesystem order. Sorting the
        # sub-directory list in place fixes the order of descent (top-down
        # walk), and sorting the file names fixes the order inside a folder.
        dirs.sort()
        folder = Path(root)
        class_name = folder.name
        for file_name in sorted(files):
            if not file_name.lower().endswith(VALID_EXT):
                continue
            src = folder / file_name
            # The running id keeps names unique even when two datasets
            # contain files with the same original name.
            new_name = f"{dataset_name}__{class_name}__{img_id}{src.suffix}"
            dst = OUT_IMG / new_name
            shutil.copy(src, dst)
            rows.append({
                "image_id": img_id,
                "filename": new_name,
                "label": class_name,
                "source": dataset_name,
            })
            img_id += 1
# Merge every dataset folder found directly under RAW_ROOT.
# iterdir() yields entries in arbitrary order; sorting keeps the dataset
# order, and therefore the running image ids, identical on every run.
for dataset_dir in sorted(RAW_ROOT.iterdir()):
    if dataset_dir.is_dir():
        merge_any_dataset(dataset_dir.name, dataset_dir)

# Write the collected metadata, one row per copied image.
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
# newline="" leaves line-ending handling to the csv module.
with open(OUT_CSV, "w", newline="", encoding="utf-8") as csv_file:
    writer = csv.DictWriter(
        csv_file,
        fieldnames=["image_id", "filename", "label", "source"],
    )
    writer.writeheader()
    writer.writerows(rows)

print("Merged dataset created")
print("Images:", len(rows))