Spaces:

Tzetha
/

IS_Finals

Sleeping

App Files Files Community

IS_Finals / models /python /dataset_dl.py

Tzetha

Uploaded Complete App

81e78bd verified 8 months ago

raw

history blame contribute delete

2.43 kB

	import os
	import tarfile
	import urllib.request
	from pathlib import Path
	from sklearn.model_selection import train_test_split
	import shutil
	from collections import defaultdict

	# Local layout: everything lives under one dataset root folder.
	root_dir = Path("oxford_pet_dataset")
	images_dir = root_dir.joinpath("images")
	annotations_dir = root_dir.joinpath("annotations")
	images_tar = root_dir.joinpath("images.tar.gz")
	annotations_tar = root_dir.joinpath("annotations.tar.gz")

	# Remote archives of the Oxford-IIIT Pet dataset.
	images_url = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz"
	annotations_url = "https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz"

	# Make sure the dataset root exists before anything is written into it.
	root_dir.mkdir(parents=False, exist_ok=True)

	# Download function
	def download(url, path):
	    """Fetch ``url`` into ``path`` unless ``path`` already exists.

	    The payload is first written to a sibling ``<name>.part`` file and only
	    moved onto ``path`` once the transfer has finished. An interrupted or
	    failed download therefore never leaves a truncated file behind that a
	    later run would mistake for a complete one.

	    Args:
	        url: Address of the resource to retrieve.
	        path: ``pathlib.Path`` of the destination file.

	    Raises:
	        Whatever ``urllib.request.urlretrieve`` raises on a failed transfer
	        (e.g. ``urllib.error.URLError``); the partial file is removed first.
	    """
	    if path.exists():
	        print(f"{path.name} already exists.")
	        return

	    # The destination folder may not exist yet when called on a fresh tree.
	    path.parent.mkdir(parents=True, exist_ok=True)
	    partial = path.with_name(path.name + ".part")

	    print(f"Downloading {url}...")
	    try:
	        urllib.request.urlretrieve(url, partial)
	        # Same-directory rename, so the final file appears all at once.
	        os.replace(partial, path)
	    finally:
	        # Only still present when the transfer or the rename failed.
	        if partial.exists():
	            partial.unlink()
	    print(f"Downloaded to {path}")

	# Extract function
	def extract(tar_path, extract_to):
	    """Unpack ``tar_path`` into ``extract_to.parent`` unless ``extract_to`` exists.

	    The archive comes from the network, so every member is checked before it
	    is written: a member whose path resolves outside the destination folder
	    (``../`` components or an absolute name) aborts the extraction. When the
	    running interpreter offers tarfile extraction filters, the ``data``
	    filter is applied as well, which also rejects unsafe links.

	    Args:
	        tar_path: ``pathlib.Path`` of the archive to unpack.
	        extract_to: ``pathlib.Path`` of the folder the archive is expected to
	            create; its parent is the actual extraction root.

	    Raises:
	        tarfile.ExtractError: If a member would be written outside the
	            extraction root.
	    """
	    if extract_to.exists():
	        print(f"{extract_to.name} already extracted.")
	        return

	    print(f"Extracting {tar_path.name}...")
	    dest_root = os.path.realpath(extract_to.parent)
	    # Joining with "" appends exactly one trailing separator.
	    dest_prefix = os.path.join(dest_root, "")

	    def checked_members(archive):
	        # Yield members one by one, refusing any that escape dest_root.
	        for member in archive.getmembers():
	            target = os.path.realpath(os.path.join(dest_root, member.name))
	            if target != dest_root and not target.startswith(dest_prefix):
	                raise tarfile.ExtractError(
	                    f"Refusing to extract {member.name!r} outside {dest_root}"
	                )
	            yield member

	    # Extraction filters only exist on newer / patched Python releases.
	    extra = {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
	    with tarfile.open(tar_path) as tar:
	        tar.extractall(
	            path=extract_to.parent, members=checked_members(tar), **extra
	        )
	    print(f"Extracted to {extract_to}")

	# Fetch both archives first, then unpack them (images before annotations).
	for archive_url, archive_file in (
	    (images_url, images_tar),
	    (annotations_url, annotations_tar),
	):
	    download(archive_url, archive_file)

	for archive_file, unpacked_dir in (
	    (images_tar, images_dir),
	    (annotations_tar, annotations_dir),
	):
	    extract(archive_file, unpacked_dir)

	# Function to extract class name from filename
	def get_class_name(filename):
	    """Return the lower-cased breed label encoded in an image file name.

	    File names have the form ``<breed>_<index>.jpg`` where the breed itself
	    may contain underscores, e.g. ``american_pit_bull_terrier_12.jpg``.
	    Only the trailing ``_<index>`` is removed, so multi-word breeds stay
	    distinct instead of collapsing onto their first word.

	    Args:
	        filename: ``pathlib.Path`` of the image.

	    Returns:
	        The breed label in lower case, e.g. ``'abyssinian'`` for
	        ``Abyssinian_123.jpg``.
	    """
	    # Split from the right: everything before the last underscore is the breed.
	    return filename.stem.rsplit("_", 1)[0].lower()

	# Group image files by class.
	# Path.glob yields entries in filesystem order, which differs between
	# machines and runs; iterating in sorted order keeps the per-class lists
	# stable, so the seeded train/val/test split is reproducible.
	class_to_files = defaultdict(list)
	for img_path in sorted(images_dir.glob("*.jpg")):
	    class_to_files[get_class_name(img_path)].append(img_path)

	# Carve every class into train (80%), val (10%) and test (10%) subsets and
	# copy the images into <root>/<split>/<class>/ as expected by ImageFolder.
	for cls, files in class_to_files.items():
	    # First cut: 80% train, 20% held out; second cut halves the held-out part.
	    train_cls, holdout = train_test_split(files, test_size=0.2, random_state=42)
	    val_cls, test_cls = train_test_split(holdout, test_size=0.5, random_state=42)

	    subsets = (("train", train_cls), ("val", val_cls), ("test", test_cls))
	    for split_name, members in subsets:
	        target_dir = root_dir / split_name / cls
	        target_dir.mkdir(parents=True, exist_ok=True)
	        for src in members:
	            shutil.copy(src, target_dir / src.name)

	print("✅ Dataset is now organized by class for ImageFolder.")