Spaces:
Sleeping
Sleeping
| import torch | |
| from torch.utils.data import DataLoader | |
| from torchvision import transforms | |
| from datasets import load_dataset | |
| from utils.config import load_config | |
# Load the project configuration once and expose the data-pipeline settings
# as module-level names.
config = load_config()

# Mandatory settings: read with subscript access, so there is no fallback.
batch_size, num_workers = config["batch_size"], config["num_workers"]
mean_nm, std_nm = config["normalize_mean"], config["normalize_std"]

# Optional setting: falls back to False when the key is absent.
execute_remotely = config.get("execute_remotely", False)

# Use the first CUDA device when one is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
# Pick the dataset source: a ClearML dataset when executing remotely, the
# Hugging Face Hub otherwise.
#
# Branch on the module-level `execute_remotely` flag (read with a False
# default) instead of indexing `config` again: the subscript form raised
# KeyError whenever the optional "execute_remotely" key was missing.
if execute_remotely:
    # clearml is imported only on this branch.
    from clearml import Dataset as ClearMLDataset

    clearml_dataset = ClearMLDataset.get(dataset_id="0c3de7af2d98482dacf41633a0587845")
    # Fetch a local copy of the ClearML dataset and load it from that path.
    dataset_path = clearml_dataset.get_local_copy()
    dataset = load_dataset(dataset_path)
else:
    dataset = load_dataset("DScomp380/plant_village", cache_dir="./data_cache")
# First cut: 70% of the "train" data for training, 30% held out.
# A fixed seed keeps the partition reproducible between runs.
splits = dataset["train"].train_test_split(test_size=0.30, seed=42)
train_split, remaining = splits["train"], splits["test"]

# Second cut: halve the held-out 30% into validation (15%) and test (15%).
val_test = remaining.train_test_split(test_size=0.5, seed=42)
val_split, test_split = val_test["train"], val_test["test"]
# Deterministic preprocessing applied to every image, in order:
# resize to 224x224, convert to a tensor, normalize with the configured
# mean/std.
_preprocess_steps = [
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=mean_nm, std=std_nm),
]
preprocess_transform = transforms.Compose(_preprocess_steps)
def preprocess_batch(batch):
    """Add a "pixel_values" entry holding the preprocessed images of a batch.

    Runs ``preprocess_transform`` over every item of ``batch["image"]``,
    stores the results as a list under ``batch["pixel_values"]`` and returns
    the same (mutated) batch object.
    """
    batch["pixel_values"] = list(map(preprocess_transform, batch["image"]))
    return batch
# Attach preprocessing to the three splits.
if execute_remotely:
    # Remote runs: register the preprocessing as a transform on each split
    # instead of materializing preprocessed copies.
    def train_transform_batch(batch):
        """Add preprocessed "pixel_values" to ``batch`` and return it."""
        batch["pixel_values"] = list(map(preprocess_transform, batch["image"]))
        return batch

    train_split, val_split, test_split = (
        split.with_transform(train_transform_batch)
        for split in (train_split, val_split, test_split)
    )
else:
    # Local runs: preprocess every split once, writing the result to an
    # explicit cache file, then expose the needed columns as torch tensors.
    def _preprocess_to_cache(split, cache_file):
        """Map ``preprocess_batch`` over ``split`` and set its torch format."""
        processed = split.map(
            preprocess_batch,
            batched=True,
            batch_size=100,
            remove_columns=["image"],
            cache_file_name=cache_file,
        )
        # set_format works in place, so the formatted object is returned.
        processed.set_format(type="torch", columns=["pixel_values", "label"])
        return processed

    train_split = _preprocess_to_cache(
        train_split, "./data_cache/train_preprocessed.arrow"
    )
    val_split = _preprocess_to_cache(
        val_split, "./data_cache/val_preprocessed.arrow"
    )
    test_split = _preprocess_to_cache(
        test_split, "./data_cache/test_preprocessed.arrow"
    )
# Training-time augmentations. They are applied per image to tensors that
# have already been normalized with mean_nm / std_nm by the preprocessing
# step.
#
# ColorJitter treats float tensors as living in [0, 1] and clamps its output
# to that range, so running it directly on mean/std-normalized data clipped
# every negative value. The jitter is therefore wrapped in an exact inverse
# of the normalization followed by a re-normalization. Normalize modules are
# used rather than lambdas so the pipeline stays picklable.
_norm_mean = torch.as_tensor(mean_nm, dtype=torch.float32)
_norm_std = torch.as_tensor(std_nm, dtype=torch.float32)

# (x - (-mean/std)) / (1/std) == x * std + mean, i.e. the inverse of Normalize.
_denormalize = transforms.Normalize(
    mean=(-_norm_mean / _norm_std).tolist(),
    std=(1.0 / _norm_std).tolist(),
)
_renormalize = transforms.Normalize(mean=mean_nm, std=std_nm)

train_augment = transforms.Compose([
    # Geometric augmentations are value-agnostic and stay in normalized space.
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomVerticalFlip(p=0.3),
    transforms.RandomRotation(degrees=15),
    # Photometric augmentation runs in the original [0, 1] pixel range.
    _denormalize,
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    _renormalize,
    transforms.RandomApply([
        transforms.GaussianBlur(kernel_size=3, sigma=(0.1, 2.0))
    ], p=0.3),
])
def train_collate_fn(batch):
    """Collate training samples into one batch, augmenting every image.

    Each item of ``batch`` must provide "pixel_values" and "label". Every
    image is passed through ``train_augment`` before stacking, so the
    augmentation runs each time a batch is assembled.

    Returns a dict with "pixel_values" (the stacked augmented images) and
    "labels" (a tensor built from the item labels).
    """
    images = []
    targets = []
    for sample in batch:
        images.append(sample["pixel_values"])
        targets.append(sample["label"])

    # Augment only after all fields have been collected.
    augmented = list(map(train_augment, images))
    return {
        "pixel_values": torch.stack(augmented),
        "labels": torch.tensor(targets),
    }
def val_collate_fn(batch):
    """Collate evaluation samples into one batch without augmentation.

    Each item of ``batch`` must provide "pixel_values" and "label".

    Returns a dict with "pixel_values" (the stacked images) and "labels"
    (a tensor built from the item labels).
    """
    images = []
    for sample in batch:
        images.append(sample["pixel_values"])
    stacked = torch.stack(images)

    targets = torch.tensor([sample["label"] for sample in batch])
    return {"pixel_values": stacked, "labels": targets}
# Create DataLoaders for the train, val, and test splits.
# Options shared by all three loaders.
_loader_kwargs = dict(
    batch_size=batch_size,
    num_workers=num_workers,
    # Pinned host memory only benefits host-to-GPU copies, and requesting it
    # on a machine without an accelerator makes DataLoader emit a warning, so
    # enable it only when the selected device is CUDA.
    pin_memory=device.type == "cuda",
    # Persistent workers are only requested when worker processes are used.
    persistent_workers=num_workers > 0,
)

# Only the training loader shuffles and augments (via train_collate_fn).
train_loader = DataLoader(
    train_split,
    shuffle=True,
    collate_fn=train_collate_fn,
    **_loader_kwargs,
)
val_loader = DataLoader(
    val_split,
    shuffle=False,
    collate_fn=val_collate_fn,
    **_loader_kwargs,
)
test_loader = DataLoader(
    test_split,
    shuffle=False,
    collate_fn=val_collate_fn,
    **_loader_kwargs,
)
if __name__ == "__main__":
    # Quick sanity report when the module is run directly as a script.
    print(f"Device: {device}")
    for _split_name, _split in (
        ("Train", train_split),
        ("Val", val_split),
        ("Test", test_split),
    ):
        print(f"{_split_name} samples: {len(_split)}")
    print(f"Batches per epoch: {len(train_loader)}")