Spaces:

k23064919
/

smallGroupProject

Sleeping

App Files Files Community

k23064919 committed on Nov 17, 2025

Commit

deb385a

2 Parent(s): f622232 04cb886

integration between ops/clearml-setup and feature/ui-deplotment branches

Browse files

Files changed (7) hide show

.gitignore +16 -0
dataPrep/data_preparation.py +143 -0
dataPrep/helpers/create_dataset.py +55 -0
dataPrep/helpers/transforms_loaders.py +76 -0
models/modelOne.py +31 -0
requirements.txt +18 -0
trainingModel/Training.py +153 -0

.gitignore CHANGED Viewed

@@ -1,5 +1,21 @@
 .vscode/
 .venv/
 .vscode/
 .models/
 __pycache__/

+# --- Entries kept from HEAD ---
 .vscode/
 .venv/
 .vscode/
 .models/
 __pycache__/
+# --- Entries merged from 04cb88662062ef6b880c627546d067fa0cedfa8b ---
+# Python environment
+venv/
+*.pyc
+__pycache__/
+# Editor files
+.DS_Store
+.vscode/
+.python-version
+# Generated files from data_preparation.py
+class_distribution.png
+# --- End of merged entries ---

dataPrep/data_preparation.py ADDED Viewed

	@@ -0,0 +1,143 @@

+# --- Standard Python Library ---
+import os
+import random
+# --- Data Handling & Analysis ---
+import numpy as np
+import pandas as pd
+from datasets import load_dataset
+from helpers.create_dataset import load_subset_from_dataset
+from helpers.transforms_loaders import make_dataset_loaders
+# --- Visualization ---
+import matplotlib.pyplot as plt
+# import seaborn as sns
+# --- PyTorch (Machine Learning) ---
+import torch
+from torchvision import transforms
+from torch.utils.data import DataLoader
+# --- Experiment Tracking ---
+from clearml import Task, Logger, Dataset
+# Fix every RNG seed up front so the experiment can be repeated
+SEED = 42
+DATASET_SUBSET_RATIO = 0.25
+random.seed(SEED)
+np.random.seed(SEED)
+torch.manual_seed(SEED)
+if torch.cuda.is_available():
+    torch.cuda.manual_seed_all(SEED)
+# ----- ClearML Setup -----
+task = Task.init(project_name= 'Small Group CW', task_name = 'data_prep')
+task.set_random_seed(SEED)
+clearml_logger = task.get_logger()
+# Log subset config to ClearML
+task.connect_configuration(
+    {"subset_ratio": DATASET_SUBSET_RATIO},
+    name="Data subsetting"
+)
+# ----- Load a subset from a given dataset & track with ClearML -----
+data_plants, prototyping_dataset, features, clearml_dataset = load_subset_from_dataset(
+    SEED, DATASET_SUBSET_RATIO, clearml_logger
+)
+# ---- Exploratory data analysis (EDA) ----
+# Count the samples per label id to understand class bias
+labels_list = prototyping_dataset['label']
+df_labels = pd.Series(labels_list)
+# value_counts(sort=False) does not order the result by label id (the subset is
+# shuffled), so sort by id to get a stable, id-ordered distribution
+label_count = df_labels.value_counts(sort = False).sort_index()
+# Checking the amount of samples in each class and logging it to ClearML
+min_count = label_count.min()
+clearml_logger.report_scalar(
+    title="Exploratory data analysis (EDA)",
+    series="Min Class Count",
+    value=min_count,
+    iteration=1
+)
+max_count = label_count.max()
+clearml_logger.report_scalar(
+    title="Exploratory data analysis (EDA)",
+    series="Max Class Count",
+    value=max_count,
+    iteration=1
+)
+mean_count = label_count.mean()
+clearml_logger.report_scalar(
+    title="Exploratory data analysis (EDA)",
+    series="Imbalance Ratio (Max/Min)",
+    value=(max_count / min_count),
+    iteration=1
+)
+print("--- Class imbalance analysis --- ")
+print(f"Max labels in a class: {max_count}")
+print(f"Min labels in a class: {min_count}")
+print(f"Mean labels in a class: {mean_count}")
+print(f"Imbalance ratio: {max_count/min_count:.2f}")
+# Mapping indices to class names
+class_names = features['label'].names
+formatted_class_names = [" ".join(name.replace('_', ' ').split()) for name in class_names]
+# Look every name up by its label id instead of assigning the full name list
+# positionally: a positional assignment mislabels the bars whenever the counts
+# are not in id order and fails when a class is absent from the subset
+label_count.index = [formatted_class_names[label_id] for label_id in label_count.index]
+plt.figure(figsize=(10,6))
+label_count.plot(kind='bar', color='skyblue')
+plt.title("Class Distribution in Prototype Dataset")
+plt.xlabel("Class")
+plt.ylabel("Count")
+plt.tight_layout()
+plt.savefig("class_distribution.png")
+# The figure is on disk now; release it so it does not stay open in memory
+plt.close()
+clearml_logger.report_image(
+    title="EDA Class Distribution",
+    series="Prototype Subset",
+    local_path="class_distribution.png",
+    iteration=1
+)
+# ----------------------------------------------------------------------
+if __name__ == "__main__":
+    # ------------------- Dataset splits ----------------------------------
+    # Handoff check 1: loaders built from the prototyping subset
+    prototype_loaders = make_dataset_loaders(
+        prototyping_dataset,
+        seed=SEED,
+        batch_size=32,
+        test_size=0.3,
+    )
+    proto_train_batches = len(prototype_loaders['train'])
+    proto_val_batches = len(prototype_loaders['val'])
+    proto_test_batches = len(prototype_loaders['test'])
+    print("\n--- Handoff Test Successful ---")
+    print(f"Prototype Train loader batches: {proto_train_batches}")
+    print(f"Prototype Validation loader batches: {proto_val_batches}")
+    print(f"Prototype Test loader batches: {proto_test_batches}")
+    # Handoff check 2: loaders built from the complete train split
+    final_loaders = make_dataset_loaders(
+        data_plants,
+        seed=SEED,
+        batch_size=32,
+        test_size=0.3,
+    )
+    full_train_batches = len(final_loaders['train'])
+    full_val_batches = len(final_loaders['val'])
+    full_test_batches = len(final_loaders['test'])
+    print("\n--- Handoff Test Successful ---")
+    print(f"Train loader batches: {full_train_batches}")
+    print(f"Validation loader batches: {full_val_batches}")
+    print(f"Test loader batches: {full_test_batches}")
+    # Attach the id of the registered ClearML dataset to the task
+    dataset_metadata = {"dataset_id": clearml_dataset.id}
+    task.connect_configuration(dataset_metadata, name="Dataset Metadata")
+    # Close the ClearML task
+    task.close()
+    print("\n--- Script Finished ---")

dataPrep/helpers/create_dataset.py ADDED Viewed

	@@ -0,0 +1,55 @@

+"""
+A collection of dataset (DS) loading and subsetting functions.
+"""
+import random
+import numpy as np
+from datasets import load_dataset
+from clearml import Dataset
+# Load a DS from its HuggingFace link, draw a random subset and register the
+# subset's indices as a ClearML dataset
+def load_subset_from_dataset(seed, subset_ratio, clearml_logger):
+    """Load the Plant Village data, pick a random subset and track it in ClearML.
+    Args:
+        seed: Seed of the private RNG that shuffles the sample indices.
+        subset_ratio: Fraction of the 'train' split to keep in the subset.
+        clearml_logger: ClearML logger that receives the created dataset id.
+    Returns:
+        Tuple ``(data_plants, prototyping_dataset, features, clearml_dataset)``:
+        the full 'train' split, the selected subset, the split's features and
+        the finalized ClearML dataset holding the subset indices.
+    Raises:
+        RuntimeError: If loading the dataset fails; the original error is
+            chained as the cause.
+    """
+    DATASET_LINK = "DScomp380/plant_village"
+    # Load dataset
+    try:
+        ds = load_dataset(DATASET_LINK)
+    except Exception as e:
+        # Chain the cause so the underlying traceback is not lost
+        raise RuntimeError(f"Error loading the dataset: {e}") from e
+    data_plants = ds['train']
+    data_length = len(data_plants)
+    features = data_plants.features
+    # Calculate amount of samples we use
+    subset_size = int(data_length * subset_ratio)
+    # Creating a subset of random data (by their indices).
+    # A private RNG seeded with `seed` makes the selection reproducible no
+    # matter what the caller did to the global `random` state; it yields the
+    # same order as shuffling right after random.seed(seed).
+    indices = list(range(data_length))
+    random.Random(seed).shuffle(indices)
+    subset_indices = indices[:subset_size]
+    prototyping_dataset = data_plants.select(subset_indices)
+    # ---------- Register subset in ClearML ----------
+    clearml_dataset = Dataset.create(
+        dataset_name="Plant Village Prototype",
+        dataset_project="smallGroupProject",
+        dataset_tags=["prototype", "subset"]
+    )
+    # Save indices (written to the current working directory)
+    subset_path = "subset_indices.npy"
+    np.save(subset_path, subset_indices)
+    clearml_dataset.add_files(subset_path)
+    clearml_dataset.set_metadata({
+        "subset_ratio": subset_ratio,
+        "total_samples": len(prototyping_dataset)
+    })
+    clearml_dataset.upload()
+    clearml_dataset.finalize()
+    clearml_logger.report_text(f"Created ClearML Dataset: {clearml_dataset.id}")
+    return data_plants, prototyping_dataset, features, clearml_dataset

dataPrep/helpers/transforms_loaders.py ADDED Viewed

	@@ -0,0 +1,76 @@

+"""
+A collection of data transformation and dataset loading functions.
+"""
+from torchvision import transforms
+from torch.utils.data import DataLoader
+# Builds the evaluation (normalisation) and training (augmentation) pipelines.
+def make_transform_pipelines():
+    """Return the ``(normalisation, augmentation)`` torchvision pipelines."""
+    # Standard ImageNet channel statistics used to standardise the tensors
+    imagenet_mean = [0.485, 0.456, 0.406]
+    imagenet_std = [0.229, 0.224, 0.225]
+    # Val/Test steps: PIL image -> tensor scaled to [0.0, 1.0], then standardise
+    eval_steps = [
+        transforms.ToTensor(),
+        transforms.Normalize(imagenet_mean, imagenet_std),
+    ]
+    # Train steps: random rotation, colour jitter and blur create "new" images
+    # before the same tensor conversion and standardisation are applied
+    train_steps = [
+        transforms.RandomRotation(30),
+        transforms.ColorJitter(brightness=0.2, saturation=0.2),
+        transforms.GaussianBlur(3),
+        transforms.ToTensor(),
+        transforms.Normalize(imagenet_mean, imagenet_std),
+    ]
+    return transforms.Compose(eval_steps), transforms.Compose(train_steps)
+"""
+Creates and returns DataLoaders (train, val, test) for a given dataset.
+Performs a 70/15/15 split
+"""
+def make_dataset_loaders(dataset, seed, batch_size=32, test_size=0.3):
+    """Create train/val/test DataLoaders for a HuggingFace dataset.
+    ``test_size`` is the fraction held out from training. The held-out part is
+    halved into validation and test, so the default 0.3 gives a 70/15/15 split.
+    Args:
+        dataset: Dataset exposing ``train_test_split``.
+        seed: Seed passed to both splits so they are reproducible.
+        batch_size: Batch size of every loader.
+        test_size: Fraction of ``dataset`` held out for validation + test.
+    Returns:
+        Dict with the keys "train", "val" and "test" mapping to DataLoaders;
+        only the train loader shuffles.
+    """
+    # Define transformation pipelines for the dataset
+    normalisation, augmentation = make_transform_pipelines()
+    def as_batch_transform(pipeline):
+        # set_transform calls its argument with a batch dict of columns, not
+        # with a single image, so the pipeline has to be mapped over the image
+        # column. Assumes that column is named "image" (the training loop
+        # reads batch["image"]) - TODO confirm against the dataset schema.
+        def apply_pipeline(batch):
+            batch["image"] = [pipeline(image) for image in batch["image"]]
+            return batch
+        return apply_pipeline
+    # First split: keep (1 - test_size) of the data for training
+    split_1 = dataset.train_test_split(test_size=test_size, seed=seed)
+    train_split = split_1['train']
+    remaining_split = split_1['test']
+    # Second split: this test_size is relative to remaining_split, so 0.5
+    # halves the held-out data into equal validation and test sets
+    split_2 = remaining_split.train_test_split(test_size=0.5, seed=seed)
+    val_split, test_split = split_2['train'], split_2['test']
+    # Augment only the training data; val/test are just normalised
+    train_split.set_transform(as_batch_transform(augmentation))
+    val_split.set_transform(as_batch_transform(normalisation))
+    test_split.set_transform(as_batch_transform(normalisation))
+    # Create dataloader for each
+    train_loader = DataLoader(train_split, batch_size=batch_size, shuffle=True)
+    val_loader = DataLoader(val_split, batch_size=batch_size, shuffle=False)
+    test_loader = DataLoader(test_split, batch_size=batch_size, shuffle=False)
+    dataset_loaders = {
+        "train": train_loader,
+        "val": val_loader,
+        "test": test_loader
+    }
+    return dataset_loaders

models/modelOne.py ADDED Viewed

	@@ -0,0 +1,31 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class modelOne(nn.Module) :
+    """Small CNN classifier: two conv/batch-norm/pool stages and three linear layers.
+    Args:
+        noOfClasses: Number of output logits (defaults to 39).
+    """
+    def __init__(self, noOfClasses=39):
+        super().__init__()
+        self.conv1 = nn.Conv2d(3, 6, 5)
+        self.batchNorm1 = nn.BatchNorm2d(6)
+        self.pool = nn.MaxPool2d(2, 2)
+        self.conv2 = nn.Conv2d(6, 16, 5, padding=2)
+        self.batchNorm2 = nn.BatchNorm2d(16)
+        # fc1 is sized for a 16x64x64 feature map, which the conv stack only
+        # produces for 260x260 inputs. This parameter-free pooling resizes any
+        # other feature map to 64x64 and is the identity when it already is
+        # 64x64, so existing checkpoints keep loading unchanged.
+        self.adaptivePool = nn.AdaptiveAvgPool2d((64, 64))
+        self.fc1 = nn.Linear(16*64*64, 512)
+        self.dropout = nn.Dropout(0.5)
+        self.fc2 = nn.Linear(512, 84)
+        self.fc3 = nn.Linear(84, noOfClasses)
+    def forward(self, x) :
+        """Return the class logits, shape (batch, noOfClasses), for image batch x."""
+        x = self.pool(F.relu(self.batchNorm1(self.conv1(x))))
+        x = self.pool(F.relu(self.batchNorm2(self.conv2(x))))
+        # Bring the feature map to the spatial size fc1 was built for
+        x = self.adaptivePool(x)
+        x = torch.flatten(x, 1)
+        # Dropout acts on the flattened conv features, before the first linear layer
+        x = self.dropout(x)
+        x = F.relu(self.fc1(x))
+        x = F.relu(self.fc2(x))
+        x = self.fc3(x)
+        return x

requirements.txt CHANGED Viewed

@@ -1,3 +1,4 @@
 # Core dependencies
 torch>=2.0.0
 torchvision>=0.15.0
@@ -11,3 +12,20 @@ clearml>=1.14.0
 # Optional: for advanced features
 datasets>=2.14.0  # For loading PlantVillage dataset from HuggingFace

+# --- Requirements kept from HEAD ---
 # Core dependencies
 torch>=2.0.0
 torchvision>=0.15.0
 # Optional: for advanced features
 datasets>=2.14.0  # For loading PlantVillage dataset from HuggingFace
+# --- Requirements merged from 04cb88662062ef6b880c627546d067fa0cedfa8b ---
+# -- Data prep requirements --
+# Data Handling & Analysis
+numpy
+pandas
+datasets
+# Visualization
+matplotlib
+# PyTorch (Machine Learning)
+torch
+torchvision
+# Experiment Tracking
+clearml
+# --- End of merged requirements ---

trainingModel/Training.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import torch
+import torch.nn as nn
+import numpy as np
+from torcheval.metrics import MulticlassAccuracy
+#from torchvision import transforms
+from torch.utils.data import DataLoader
+#from torchvision.datasets import MNIST
+#import torchvision.utils
+# loss, optimizer, training loop, validation, best model saving
+def train_model(
+    model: nn.Module,
+    train_loader: DataLoader,
+    val_loader: DataLoader,
+    device: torch.device,
+    n_epochs: int = 4,
+    lr: float = 1e-3,
+    save_path: str = "best_model.pt",
+    flatten_input: bool = False,
+    num_classes : int = 39,
+):
+    """Train ``model`` with Adam and cross-entropy, saving the best epoch's weights.
+    Args:
+        model: Network to train; it is moved to ``device``.
+        train_loader: Yields dict batches with the keys "image" and "label".
+        val_loader: Same batch format, evaluated after every epoch.
+        device: Device that the model and the batches are moved to.
+        n_epochs: Number of passes over ``train_loader``.
+        lr: Learning rate of the Adam optimizer.
+        save_path: File that receives the best ``state_dict``.
+        flatten_input: Flatten each sample to a vector before the forward pass
+            (for non-convolutional models).
+        num_classes: Number of classes given to the accuracy metrics.
+    Returns:
+        Tuple ``(training_losses, training_accuracies, val_accuracies,
+        best_accuracy)``. The first two arrays hold one entry per training
+        batch, ``val_accuracies`` one entry per epoch.
+    """
+    # Move model to device
+    model.to(device)
+    # Loss and optimizer
+    criterion = nn.CrossEntropyLoss()
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
+    # Metric trackers
+    train_accuracy_fn = MulticlassAccuracy(num_classes=num_classes)
+    val_accuracy_fn = MulticlassAccuracy(num_classes=num_classes)
+    # num_batches is the number of batches for every epoch
+    num_batches = len(train_loader)
+    # Per-batch training metrics across all epochs
+    training_losses = np.zeros(num_batches * n_epochs)
+    training_accuracies = np.zeros(num_batches * n_epochs)
+    # Per-epoch validation accuracy
+    val_accuracies = np.zeros(n_epochs)
+    # Best validation accuracy so far, and whether save_path was written yet
+    best_accuracy = 0.0
+    checkpoint_saved = False
+    def prepare_batch(batch):
+        # Move one dict batch to the device and flatten it when requested
+        inputs = batch["image"].to(device)
+        labels = batch["label"].to(device)
+        if flatten_input:
+            inputs = inputs.view(inputs.size(0), -1)
+        return inputs, labels
+    #----------------------
+    # training loop
+    #----------------------
+    for epoch in range(n_epochs):
+        model.train()
+        train_accuracy_fn.reset()
+        # iterate over all the dataloader's mini-batches
+        for i, batch in enumerate(train_loader):
+            inputs, labels = prepare_batch(batch)
+            optimizer.zero_grad()
+            # Forward pass
+            outputs = model(inputs)
+            loss = criterion(outputs, labels)
+            # Backward pass and parameter update
+            loss.backward()
+            optimizer.step()
+            # log the loss value
+            step = epoch * num_batches + i
+            training_losses[step] = loss.item()
+            # The metric accumulates since the reset at the start of the epoch,
+            # so this logs the running accuracy of the epoch, not of the batch
+            train_accuracy_fn.update(outputs, labels)
+            training_accuracies[step] = train_accuracy_fn.compute().item()
+        print(f'Epoch {epoch + 1} training complete')
+        # Validation after each epoch
+        model.eval()
+        val_accuracy_fn.reset()
+        # No gradients are needed for validation, so skip tracking them
+        with torch.no_grad():
+            for batch in val_loader:
+                inputs, labels = prepare_batch(batch)
+                outputs = model(inputs)
+                val_accuracy_fn.update(outputs, labels)
+        current_accuracy = val_accuracy_fn.compute().item()
+        val_accuracies[epoch] = current_accuracy
+        # Save on every improvement, and unconditionally on the first epoch:
+        # otherwise a run whose validation accuracy stays at 0.0 would never
+        # write save_path although the summary below reports it
+        if current_accuracy > best_accuracy or not checkpoint_saved:
+            best_accuracy = current_accuracy
+            torch.save(model.state_dict(), save_path)
+            checkpoint_saved = True
+            print(f'Epoch {epoch + 1} (validation accuracy: {best_accuracy})')
+        print(f'Epoch {epoch + 1} validation complete')
+    print(f"\nTraining finished. Best val accuracy: {best_accuracy:.4f}")
+    if checkpoint_saved:
+        print(f"Best model weights saved to: {save_path}")
+    else:
+        # Only reachable when no epoch ran (n_epochs <= 0)
+        print("No epoch was run, so no model weights were saved.")
+    # TODO: reload the best weights for evaluation, e.g.
+    # model.load_state_dict(torch.load(save_path, map_location=device))
+    return training_losses, training_accuracies, val_accuracies, best_accuracy