|
|
import os
|
|
|
import tarfile
|
|
|
import urllib.request
|
|
|
from pathlib import Path
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
import shutil
|
|
|
from collections import defaultdict
|
|
|
|
|
|
|
|
|
# Remote archives to fetch.
images_url = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz"
annotations_url = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz"

# Local layout: everything lives underneath a single dataset folder.
root_dir = Path("oxford_pet_dataset")
images_tar = root_dir.joinpath("images.tar.gz")
annotations_tar = root_dir.joinpath("annotations.tar.gz")
images_dir = root_dir.joinpath("images")
annotations_dir = root_dir.joinpath("annotations")

# Create the dataset folder up front; an already existing one is reused.
root_dir.mkdir(exist_ok=True)
|
|
|
|
|
|
|
|
|
def download(url, path):
    """Download *url* to *path* unless *path* already exists.

    The payload is written to a sibling ``<name>.part`` file first and only
    renamed onto *path* once the transfer has finished. Because a later run
    decides whether to download purely via ``path.exists()``, this keeps an
    interrupted transfer from being mistaken for a complete file.

    Args:
        url: Address of the resource to fetch.
        path: ``pathlib.Path`` where the downloaded file should end up.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises; no file is left at
        *path* (or at the ``.part`` location) in that case.
    """
    if path.exists():
        print(f"{path.name} already exists.")
        return

    print(f"Downloading {url}...")
    partial = path.with_name(path.name + ".part")
    try:
        urllib.request.urlretrieve(url, partial)
        partial.replace(path)
    finally:
        # After a successful rename the temp file is gone; anything still
        # present here is an incomplete transfer and must not be kept.
        if partial.exists():
            partial.unlink()
    print(f"Downloaded to {path}")
|
|
|
|
|
|
|
|
|
def extract(tar_path, extract_to):
    """Unpack the archive *tar_path* unless *extract_to* already exists.

    Members are extracted into ``extract_to.parent``; *extract_to* itself is
    only used as the "already done" marker and for the log messages, so the
    archive is presumably expected to contain a top-level folder with that
    name.

    Args:
        tar_path: ``pathlib.Path`` of the tar archive to unpack.
        extract_to: ``pathlib.Path`` of the directory the archive produces.
    """
    if extract_to.exists():
        print(f"{extract_to.name} already extracted.")
        return

    print(f"Extracting {tar_path.name}...")
    with tarfile.open(tar_path) as tar:
        # The archive comes from the network, so do not trust member paths.
        # Interpreters that ship tarfile extraction filters get the "data"
        # filter, which refuses members escaping the destination directory;
        # older interpreters fall back to the previous behaviour.
        if hasattr(tarfile, "data_filter"):
            tar.extractall(path=extract_to.parent, filter="data")
        else:
            tar.extractall(path=extract_to.parent)
    print(f"Extracted to {extract_to}")
|
|
|
|
|
|
|
|
|
# Fetch both archives first, then unpack each of them.
for archive_url, archive_path in ((images_url, images_tar), (annotations_url, annotations_tar)):
    download(archive_url, archive_path)

for archive_path, target_dir in ((images_tar, images_dir), (annotations_tar, annotations_dir)):
    extract(archive_path, target_dir)
|
|
|
|
|
|
|
|
|
def get_class_name(filename):
    """Return the lower-cased class label encoded in an image file name.

    The label is everything before the LAST underscore of the file stem, e.g.
    ``american_pit_bull_terrier_12.jpg`` -> ``american_pit_bull_terrier``.
    Splitting on the first underscore instead would truncate multi-word
    breed names and merge distinct classes such as ``american_bulldog`` and
    ``american_pit_bull_terrier`` into a single ``american`` class.

    Args:
        filename: ``pathlib.Path`` of an image named ``<label>_<index>.<ext>``.

    Returns:
        The label in lower case. A name without any underscore yields its
        whole stem.
    """
    return filename.stem.rsplit("_", 1)[0].lower()
|
|
|
|
|
|
|
|
|
# Bucket every JPEG in images_dir under the class derived from its file name.
class_to_files = defaultdict(list)
for img_path in images_dir.glob("*.jpg"):
    class_to_files[get_class_name(img_path)].append(img_path)
|
|
|
|
|
|
|
|
|
# Split every class 80/10/10 into train/val/test and copy its images to
# root_dir/<split>/<class>/.
for cls, files in class_to_files.items():
    # The per-class lists were filled from glob(), whose order depends on the
    # filesystem. Sort first so the fixed random_state yields the same split
    # on every machine and on every run.
    ordered_files = sorted(files)
    train_cls, testval_cls = train_test_split(ordered_files, test_size=0.2, random_state=42)
    # Halve the held-out 20% into validation and test.
    val_cls, test_cls = train_test_split(testval_cls, test_size=0.5, random_state=42)

    for split_name, split_data in (("train", train_cls), ("val", val_cls), ("test", test_cls)):
        split_cls_dir = root_dir / split_name / cls
        split_cls_dir.mkdir(parents=True, exist_ok=True)
        for file in split_data:
            shutil.copy(file, split_cls_dir / file.name)

print("✅ Dataset is now organized by class for ImageFolder.")
|
|
|
|