import os
import shutil
import csv
from pathlib import Path

# Directory whose immediate subdirectories are each merged as one source dataset.
RAW_ROOT = Path("raw_data")
# Destination directory that receives the copied, renamed images.
OUT_IMG = Path("data_merged/images")
# CSV file that receives one metadata row per copied image.
OUT_CSV = Path("data_merged/metadata_raw.csv")

# Create the image output directory (and any missing parents) as soon as the
# module runs, so copies made later never fail on a missing destination.
OUT_IMG.mkdir(parents=True, exist_ok=True)

# Metadata rows accumulated by merge_any_dataset (one dict per copied image).
rows = []
# Running counter shared by all datasets; used as image_id and in the new filename.
img_id = 0
# File extensions accepted as images; matched against the lower-cased filename.
VALID_EXT = (".png", ".jpg", ".jpeg")

def merge_any_dataset(dataset_name, base_path):
    """Copy every image found under ``base_path`` into ``OUT_IMG`` and record it.

    ``base_path`` is walked recursively. Each file whose name ends
    (case-insensitively) with one of ``VALID_EXT`` is copied to ``OUT_IMG`` as
    ``<dataset_name>__<class_name>__<img_id><suffix>``, where ``class_name``
    is the name of the directory that directly contains the file. For every
    copied image one metadata dict is appended to the module-level ``rows``
    list and the module-level ``img_id`` counter is incremented.

    Directories and files are visited in sorted name order so that the ids
    and filenames produced for a given input tree are the same on every run.

    Args:
        dataset_name: Name of the source dataset; used as the filename prefix
            and stored in the ``source`` field of each row.
        base_path: Root directory of the dataset to walk.
    """
    global img_id
    for root, dirs, files in os.walk(base_path):
        # os.walk lists entries in arbitrary filesystem order. Sorting
        # ``dirs`` in place fixes the order in which subdirectories are
        # visited (the walk is top-down), keeping id assignment reproducible.
        dirs.sort()

        # The containing directory's name serves as the label for all of
        # its files, so compute it once per directory.
        class_name = Path(root).name

        for f in sorted(files):
            if not f.lower().endswith(VALID_EXT):
                continue

            src = Path(root) / f

            # The running id in the name keeps files from different
            # directories from overwriting each other in the flat output dir.
            new_name = f"{dataset_name}__{class_name}__{img_id}{src.suffix}"
            dst = OUT_IMG / new_name

            shutil.copy(src, dst)

            rows.append({
                "image_id": img_id,
                "filename": new_name,
                "label": class_name,
                "source": dataset_name,
            })

            img_id += 1

# Merge every dataset directory found directly under RAW_ROOT.
# Path.iterdir() yields entries in arbitrary order, so sort them to process
# the datasets in the same order on every run and every filesystem.
for item in sorted(RAW_ROOT.iterdir()):
    if item.is_dir():
        merge_any_dataset(item.name, item)

# Write the collected metadata, one CSV row per merged image.
OUT_CSV.parent.mkdir(parents=True, exist_ok=True)
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    writer = csv.DictWriter(
        f,
        fieldnames=["image_id", "filename", "label", "source"],
    )
    writer.writeheader()
    writer.writerows(rows)

print("Merged dataset created")
print("Images:", len(rows))