# GAP_SMALL_PROJECT2 / data_prep.py
# fatimaxa's picture
# Upload 46 files
# 83be575 verified
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchvision import transforms
from datasets import load_dataset
import torch.nn.functional as F
from utils.config import load_config
# Project configuration, read once at import time.
config = load_config()

# DataLoader settings.
batch_size, num_workers = config["batch_size"], config["num_workers"]

# Statistics handed to transforms.Normalize in the preprocessing pipeline.
mean_nm, std_nm = config["normalize_mean"], config["normalize_std"]
# Prefer the first CUDA GPU when one is available; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
# Load the PlantVillage dataset; only its "train" split is used below.
dataset = load_dataset("DScomp380/plant_village")

# First cut: 70% for training, 30% held out. The fixed seed keeps the
# partition reproducible between runs.
splits = dataset["train"].train_test_split(seed=42, test_size=0.30)
train_split, remaining = splits["train"], splits["test"]

# Second cut: halve the held-out 30% into validation (15%) and test (15%).
val_test = remaining.train_test_split(seed=42, test_size=0.5)
val_split, test_split = val_test["train"], val_test["test"]
# Preprocessing pipeline applied to every image.
transform = transforms.Compose(
    [
        # Bring every image to a fixed 224x224 size.
        transforms.Resize((224, 224)),
        # Convert the image to a tensor.
        transforms.ToTensor(),
        # Normalize with the mean/std taken from the project config.
        transforms.Normalize(mean=mean_nm, std=std_nm),
    ]
)
def transform_batch(batch):
    """Add a ``"pixel_values"`` column of preprocessed images to ``batch``.

    Args:
        batch: Mapping with an ``"image"`` entry holding an iterable of
            images (assumed to be PIL images, as ``.convert`` is called on
            each one).

    Returns:
        The same ``batch`` object, mutated in place, with
        ``batch["pixel_values"]`` set to a list containing the result of
        the module-level ``transform`` pipeline for each image.
    """
    # Force three-channel RGB before the pipeline: grayscale, palette or
    # RGBA images would otherwise yield a channel count that does not match
    # the configured normalization statistics and could not be stacked with
    # the rest of the batch.
    batch["pixel_values"] = [
        transform(img.convert("RGB")) for img in batch["image"]
    ]
    return batch
# Attach the preprocessing function to each split; the splits are rebound
# to the objects returned by with_transform.
train_split, val_split, test_split = (
    split.with_transform(transform_batch)
    for split in (train_split, val_split, test_split)
)
def collate_fn(batch):
    """Merge a list of samples into a single batch dictionary.

    Args:
        batch: Iterable of samples, each providing ``"pixel_values"`` (a
            tensor) and ``"label"``. It is iterated twice.

    Returns:
        A dict with ``"pixel_values"`` (the sample tensors stacked along a
        new leading dimension) and ``"labels"`` (a tensor built from the
        sample labels).
    """
    # Images first: stacking requires every tensor to share one shape.
    images = [sample["pixel_values"] for sample in batch]
    stacked_images = torch.stack(images)

    # Then the labels, collected into one tensor.
    targets = [sample["label"] for sample in batch]
    label_tensor = torch.tensor(targets)

    return {"pixel_values": stacked_images, "labels": label_tensor}
# Arguments shared by all three DataLoaders.
_loader_kwargs = {
    "batch_size": batch_size,
    "num_workers": num_workers,
    "collate_fn": collate_fn,
}

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(train_split, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_split, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_split, shuffle=False, **_loader_kwargs)
if __name__ == "__main__":
    # Quick diagnostic when run as a script: report the selected device and
    # the loaded dataset, one per line.
    print(
        device,
        f"Loaded PlantVillage dataset with splits: {dataset}",
        sep="\n",
    )