# File size: 2,428 bytes
# Revision: 81e78bd
import os
import tarfile
import urllib.request
from pathlib import Path
from sklearn.model_selection import train_test_split
import shutil
from collections import defaultdict
# Remote archives (images and annotations) of the Oxford pet dataset
images_url = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz"
annotations_url = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz"

# Local layout: everything lives under a single dataset root
root_dir = Path("oxford_pet_dataset")
images_tar = root_dir.joinpath("images.tar.gz")
annotations_tar = root_dir.joinpath("annotations.tar.gz")
images_dir = root_dir.joinpath("images")
annotations_dir = root_dir.joinpath("annotations")

# The root must exist before any archive is written into it
root_dir.mkdir(exist_ok=True)
# Download function
def download(url, path):
    """Download ``url`` to ``path`` unless ``path`` already exists.

    The payload is first written to a sibling ``<name>.part`` file and only
    renamed to ``path`` once the transfer has finished.  An interrupted or
    failed download therefore never leaves a truncated file at ``path`` that
    a later run would mistake for a complete one.

    Args:
        url: Address of the resource to fetch.
        path: ``pathlib.Path`` of the destination file.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises on failure (for
        example ``urllib.error.URLError``); the partial file is removed
        before the exception propagates.
    """
    if path.exists():
        print(f"{path.name} already exists.")
        return

    print(f"Downloading {url}...")
    partial = path.with_name(path.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
        # Same directory, so the rename cannot leave a half-written target.
        os.replace(partial, path)
    finally:
        # After a successful rename the partial file is gone and this is a
        # no-op; after a failure it removes the incomplete download.
        if partial.exists():
            partial.unlink()
    print(f"Downloaded to {path}")
# Extract function
def extract(tar_path, extract_to):
    """Unpack the archive ``tar_path`` unless ``extract_to`` already exists.

    The archive is extracted into ``extract_to.parent``; it is assumed that
    the archive itself contains the top-level directory ``extract_to.name``.

    Args:
        tar_path: ``pathlib.Path`` of the tar archive to unpack.
        extract_to: ``pathlib.Path`` of the directory the archive is expected
            to create; its existence is used as the "already done" marker.

    Raises:
        tarfile.TarError: If the archive cannot be read, or (on interpreters
            with extraction filters) a member is rejected as unsafe.
    """
    if extract_to.exists():
        print(f"{extract_to.name} already extracted.")
        return

    print(f"Extracting {tar_path.name}...")
    with tarfile.open(tar_path) as tar:
        # The archive comes from the network, so refuse members that would
        # escape the destination (absolute paths, "..", unsafe links).
        # Extraction filters exist on 3.12+ and recent security releases of
        # older versions; fall back to the legacy call elsewhere.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_to.parent, filter="data")
        else:
            tar.extractall(path=extract_to.parent)
    print(f"Extracted to {extract_to}")
# Fetch both archives first, then unpack them (images before annotations)
for url, archive in ((images_url, images_tar), (annotations_url, annotations_tar)):
    download(url, archive)
for archive, target in ((images_tar, images_dir), (annotations_tar, annotations_dir)):
    extract(archive, target)
# Function to extract class name from filename
def get_class_name(filename):
    """Return the lower-cased class (breed) name encoded in an image filename.

    Files are named ``<class>_<index>.<ext>`` and the class part may itself
    contain underscores, so only the trailing ``_<index>`` is stripped:

        'Abyssinian_123.jpg'               -> 'abyssinian'
        'american_pit_bull_terrier_12.jpg' -> 'american_pit_bull_terrier'

    Splitting on the first underscore instead would merge distinct classes
    such as ``american_bulldog`` and ``american_pit_bull_terrier``.

    Args:
        filename: ``pathlib.Path`` (or path string) of the image file.

    Returns:
        The class name in lower case.  A name without any underscore is
        returned whole (minus its extension).
    """
    return Path(filename).stem.rsplit("_", 1)[0].lower()
# Group image files by class.
# Path.glob yields entries in an arbitrary, filesystem-dependent order, so the
# paths are sorted first.  Without this the seeded train/val/test split that
# follows would differ between machines and between re-runs, which can copy
# the same image into more than one split.
class_to_files = defaultdict(list)
for img_path in sorted(images_dir.glob("*.jpg")):
    cls = get_class_name(img_path)
    class_to_files[cls].append(img_path)
# For every class: hold out 20% of the files, halve that hold-out into
# val/test, then copy each subset into <root>/<split>/<class>/
for cls, files in class_to_files.items():
    train_cls, testval_cls = train_test_split(files, test_size=0.2, random_state=42)
    val_cls, test_cls = train_test_split(testval_cls, test_size=0.5, random_state=42)

    subsets = (("train", train_cls), ("val", val_cls), ("test", test_cls))
    for split_name, split_data in subsets:
        split_cls_dir = root_dir / split_name / cls
        split_cls_dir.mkdir(parents=True, exist_ok=True)
        for file in split_data:
            destination = split_cls_dir / file.name
            shutil.copy(file, destination)

print("✅ Dataset is now organized by class for ImageFolder.")
|