"""Download the Oxford-IIIT Pet dataset and arrange it for ``ImageFolder``.

Running this file as a script downloads the image and annotation archives,
extracts them, and copies every ``.jpg`` into
``oxford_pet_dataset/{train,val,test}/<class>/`` using an 80/10/10 split
per class.
"""

import os
import shutil
import tarfile
import urllib.request
from collections import defaultdict
from pathlib import Path

# URLs
images_url = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz"
annotations_url = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz"

# Paths
root_dir = Path("oxford_pet_dataset")
images_tar = root_dir / "images.tar.gz"
annotations_tar = root_dir / "annotations.tar.gz"
images_dir = root_dir / "images"
annotations_dir = root_dir / "annotations"


def download(url, path):
    """Download *url* to *path* unless *path* already exists.

    The data is first written to a sibling ``<name>.part`` file and renamed
    to *path* only after the transfer finished, so an interrupted download
    is never mistaken for a complete archive on the next run.

    Args:
        url: Address of the file to fetch.
        path: Destination ``Path``; its parent directory is created if needed.
    """
    if path.exists():
        print(f"{path.name} already exists.")
        return

    path.parent.mkdir(parents=True, exist_ok=True)
    partial = path.with_name(path.name + ".part")
    print(f"Downloading {url}...")
    try:
        urllib.request.urlretrieve(url, partial)
        partial.replace(path)
    finally:
        # Only still present when the download or the rename failed.
        if partial.exists():
            partial.unlink()
    print(f"Downloaded to {path}")


def extract(tar_path, extract_to):
    """Unpack *tar_path* next to *extract_to* unless *extract_to* exists.

    The archive is extracted into ``extract_to.parent``; it is assumed to
    contain a top-level directory named ``extract_to.name``.

    Args:
        tar_path: ``Path`` of the tar archive to unpack.
        extract_to: ``Path`` of the directory the archive is expected to
            create; its existence is used as the "already done" marker.
    """
    if extract_to.exists():
        print(f"{extract_to.name} already extracted.")
        return

    print(f"Extracting {tar_path.name}...")
    with tarfile.open(tar_path) as tar:
        if hasattr(tarfile, "data_filter"):
            # Reject absolute paths, "../" escapes and special files; also
            # avoids the DeprecationWarning raised by Python 3.12+.
            tar.extractall(path=extract_to.parent, filter="data")
        else:
            # Older interpreters without extraction filters.
            tar.extractall(path=extract_to.parent)
    print(f"Extracted to {extract_to}")


def get_class_name(filename):
    """Return the lower-cased class (breed) encoded in an image file name.

    File names look like ``<breed>_<index>.jpg`` and the breed itself may
    contain underscores, so only the trailing ``_<index>`` is removed.

    Args:
        filename: ``Path`` or string naming the image file.

    Returns:
        The breed in lower case, e.g. ``'abyssinian'`` or
        ``'american_bulldog'``.
    """
    # Format: 'Abyssinian_123.jpg' → 'abyssinian'
    #         'american_bulldog_12.jpg' → 'american_bulldog'
    return Path(filename).stem.rsplit("_", 1)[0].lower()


def _group_images_by_class(image_dir):
    """Map each class name to the sorted list of its ``.jpg`` paths."""
    class_to_files = defaultdict(list)
    # glob() order depends on the file system; sorting makes the seeded
    # split below reproducible across machines.
    for img_path in sorted(image_dir.glob("*.jpg")):
        class_to_files[get_class_name(img_path)].append(img_path)
    return class_to_files


def _split_and_copy(class_to_files, output_dir):
    """Copy each class's files into train/val/test folders (80/10/10)."""
    # Imported here rather than at module top so the helpers above can be
    # imported without scikit-learn installed.
    from sklearn.model_selection import train_test_split

    for cls, files in class_to_files.items():
        train_cls, testval_cls = train_test_split(
            files, test_size=0.2, random_state=42
        )
        val_cls, test_cls = train_test_split(
            testval_cls, test_size=0.5, random_state=42
        )
        splits = (("train", train_cls), ("val", val_cls), ("test", test_cls))
        for split_name, split_data in splits:
            split_cls_dir = output_dir / split_name / cls
            split_cls_dir.mkdir(parents=True, exist_ok=True)
            for file in split_data:
                shutil.copy(file, split_cls_dir / file.name)


def main():
    """Download, extract and organize the dataset under ``root_dir``."""
    # Create directory
    root_dir.mkdir(exist_ok=True)

    # Download and extract
    download(images_url, images_tar)
    download(annotations_url, annotations_tar)
    extract(images_tar, images_dir)
    extract(annotations_tar, annotations_dir)

    # Split each class into train/val/test and copy
    _split_and_copy(_group_images_by_class(images_dir), root_dir)
    print("✅ Dataset is now organized by class for ImageFolder.")


if __name__ == "__main__":
    main()