import os
import tarfile
import urllib.request
from pathlib import Path
from sklearn.model_selection import train_test_split
import shutil
from collections import defaultdict

# Remote archive locations (images and annotations)
images_url = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz"
annotations_url = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz"

# Local layout: archives and their extracted folders all live under root_dir
root_dir = Path("oxford_pet_dataset")
images_tar = root_dir.joinpath("images.tar.gz")
annotations_tar = root_dir.joinpath("annotations.tar.gz")
images_dir = root_dir.joinpath("images")
annotations_dir = root_dir.joinpath("annotations")

# Ensure the dataset root exists before anything is written into it
root_dir.mkdir(exist_ok=True)

# Download function
def download(url, path):
    """Download *url* to *path* unless *path* already exists.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to *path* once the transfer has finished. Because the existence
    of *path* is what marks the download as done, an interrupted transfer
    must never leave a truncated file under the final name.

    Args:
        url: Location to fetch (anything ``urllib.request.urlretrieve`` accepts).
        path: ``pathlib.Path`` of the destination file.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises (e.g. ``URLError``,
        ``ContentTooShortError``); the partial file is removed first.
    """
    if path.exists():
        print(f"{path.name} already exists.")
        return

    print(f"Downloading {url}...")
    partial = path.with_name(path.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
        # Rename only after a complete transfer.
        partial.replace(path)
    except BaseException:
        # BaseException so Ctrl-C also cleans up the incomplete file.
        if partial.exists():
            partial.unlink()
        raise
    print(f"Downloaded to {path}")

# Extract function
def extract(tar_path, extract_to):
    """Unpack *tar_path* next to *extract_to* unless *extract_to* exists.

    The archive is extracted into ``extract_to.parent``; the archive itself
    is expected to contain a top-level entry named like *extract_to*.

    Args:
        tar_path: ``pathlib.Path`` of the tar archive (compression is
            auto-detected by ``tarfile.open``).
        extract_to: ``pathlib.Path`` whose existence marks the archive as
            already extracted.
    """
    if extract_to.exists():
        print(f"{extract_to.name} already extracted.")
        return

    print(f"Extracting {tar_path.name}...")
    with tarfile.open(tar_path) as tar:
        # The archive comes from the network: when the running Python
        # supports extraction filters, use the "data" filter so members
        # cannot escape the destination (absolute paths, "..", links
        # pointing outside). Feature-detect to stay compatible with
        # older interpreters.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_to.parent, filter="data")
        else:
            tar.extractall(path=extract_to.parent)
    print(f"Extracted to {extract_to}")

# Fetch both archives first, then unpack each one under root_dir
download(url=images_url, path=images_tar)
download(url=annotations_url, path=annotations_tar)
extract(tar_path=images_tar, extract_to=images_dir)
extract(tar_path=annotations_tar, extract_to=annotations_dir)

# Function to extract class name from filename
def get_class_name(filename):
    """Return the lowercase class name encoded in an image filename.

    Filenames have the form ``<class>_<index>.<ext>`` where ``<class>`` may
    itself contain underscores, so only the trailing ``_<index>`` is removed:

        'Abyssinian_123.jpg'               -> 'abyssinian'
        'american_pit_bull_terrier_12.jpg' -> 'american_pit_bull_terrier'

    Splitting on the first underscore instead would merge every multi-word
    class sharing a first word (e.g. all ``american_*`` breeds) into one.

    Args:
        filename: ``pathlib.Path`` (or path string) of the image file.

    Returns:
        The class name in lowercase. A name without any underscore yields
        the lowercased stem.
    """
    stem = Path(filename).stem
    return stem.rsplit("_", 1)[0].lower()

# Group image files by class.
# The glob result is sorted because directory listing order depends on the
# filesystem; without a stable input order the fixed random_state below would
# not reproduce the same split on every machine or run, and re-running could
# copy the same image into different splits.
class_to_files = defaultdict(list)
for img_path in sorted(images_dir.glob("*.jpg")):
    cls = get_class_name(img_path)
    class_to_files[cls].append(img_path)

# Split each class into train/val/test (roughly 80/10/10) and copy the files
# into <root_dir>/<split>/<class>/, one sub-folder per class.
for cls, files in class_to_files.items():
    # First hold out 20% of the class, then halve that hold-out into val and test.
    train_cls, testval_cls = train_test_split(files, test_size=0.2, random_state=42)
    val_cls, test_cls = train_test_split(testval_cls, test_size=0.5, random_state=42)

    for split_name, split_data in zip(["train", "val", "test"], [train_cls, val_cls, test_cls]):
        split_cls_dir = root_dir / split_name / cls
        split_cls_dir.mkdir(parents=True, exist_ok=True)
        for file in split_data:
            shutil.copy(file, split_cls_dir / file.name)

print("✅ Dataset is now organized by class for ImageFolder.")