eloise54 committed
Commit
8234a97
·
1 Parent(s): b4c4828

new requirements file

Files changed (2)
  1. PCAM-pipeline.py +0 -973
  2. requirements.txt +250 -5
PCAM-pipeline.py DELETED
@@ -1,973 +0,0 @@
- #!/usr/bin/env python
- # coding: utf-8
-
- # # 🧬 PCam Dataset: Tumor Detection via Binary Image Classification
- #
- # For full dataset details, visit the official repository:
- # 🔗 [github.com/basveeling/pcam](https://github.com/basveeling/pcam)
- #
- #
- # ## 📊 Dataset Overview
- #
- # The **PatchCamelyon (PCam)** benchmark is a challenging image classification dataset designed for breast cancer detection tasks.
- #
- # - 📦 **Total images**: 327,680 color patches
- # - 🖼️ **Image size**: 96 × 96 pixels
- # - 🧪 **Source**: Histopathologic scans of lymph node sections
- # - 🏷️ **Labels**: Binary — a positive (1) label indicates that the center 32 × 32 px region of a patch contains at least one pixel of tumor tissue. Tumor tissue in the outer region of the patch does not influence the label.
- #
- #
- # ## 🧠 Solution to Implement
- #
- # In this notebook, we implement a solution inspired by the following research paper:
- #
- # > 📄 [**Cancer Image Classification Based on DenseNet Model**](https://arxiv.org/abs/2011.11186)
- # > _by Zhong, Ziliang; Zheng, Muhang; Mai, Huafeng; Zhao, Jianan; Liu, Xinyi_
- #
- # This study explores the application of DenseNet architectures to the PCam dataset for accurate cancer classification.
- #
- # ---
- #
- # ## Results
- #
- # The best model trained in this notebook achieves the following scores on the Kaggle leaderboard:
- #
- # ```Public score: 0.9648```
- # ```Private score: 0.9702```
- #
-
- # # 1. Load the dataset
- # Load the training, test and validation datasets for PCam.
- #
- # We are going to use the Kaggle version, which is a cleaned version of the official PCam dataset.
- #
- # In the Kaggle version, duplicates are removed and there is no leakage between the training and test datasets.
-
- # In[1]:
-
-
- import typing as tp
- import numpy as np
- import torch
- import torchvision
- from torch import nn
- from torch.utils.data import Dataset, DataLoader, ConcatDataset
- from torchvision.transforms import ToTensor
- from torchvision import datasets
- from torch.utils.tensorboard import SummaryWriter
-
-
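- # Illustration (added; not from the original notebook) of the label rule above:
- # only the center 32x32 region of a 96x96 patch determines the label.
- # `patch` is a random stand-in for a real PCam tile.
- patch = torch.rand(3, 96, 96)
- center = patch[:, 32:64, 32:64]  # the only region that can make the label positive
- assert center.shape == (3, 32, 32)
-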
- # Use the GPU if available.
-
- # In[2]:
-
-
- from torch.optim import Optimizer, lr_scheduler
- from torch.optim.lr_scheduler import LRScheduler
-
- if torch.cuda.is_available():
-     device = torch.device("cuda")
- else:
-     device = torch.device("cpu")
- print("Using device", device)
-
-
- # Let's download the Kaggle dataset.
- # For this you need your Kaggle credentials.
- # If you have not already set up your ```~/.kaggle/kaggle.json``` key:
- # - Go to your Kaggle account settings and create a new API token if needed.
- # - Then fill in this part with your information: ```creds = '{"username":"xxxxx","key":"xxxxx"}'```
-
- # In[3]:
-
-
- get_ipython().system('pip install kaggle')
- creds = '{"username":"xxxxx","key":"xxxxx"}'
- from pathlib import Path
-
- cred_path = Path('~/.kaggle/kaggle.json').expanduser()
- if not cred_path.exists():
-     cred_path.parent.mkdir(exist_ok=True)
-     cred_path.write_text(creds)
-     cred_path.chmod(0o600)
-
-
- # In[4]:
-
-
- import os
- import zipfile
-
- root = "data/"
- dataset_dir = "data/histopathologic-cancer-detection"
- zip_file = "histopathologic-cancer-detection.zip"
- train_path = os.path.join(dataset_dir, "train")
-
- if not os.path.exists(root):
-     os.mkdir(root)
-
- if not os.path.exists('results'):
-     os.mkdir('results')
-
- if not os.path.exists(train_path):
-     print("Downloading Histopathologic Cancer Detection dataset...")
-     get_ipython().system('kaggle competitions download -c histopathologic-cancer-detection -p {root} --force')
- else:
-     print("Dataset zip already downloaded.")
-
- if not os.path.exists(train_path):
-     print("Unzipping dataset...")
-     with zipfile.ZipFile(os.path.join(root, zip_file), 'r') as zip_ref:
-         zip_ref.extractall(dataset_dir)
- else:
-     print("Dataset already unzipped.")
-
-
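- # Optional sanity check (added for illustration): count the extracted tiles.
- # Assumes the standard Kaggle layout, i.e. train/*.tif plus train_labels.csv.
- n_train_tiles = len([f for f in os.listdir(train_path) if f.endswith(".tif")])
- print(f"Found {n_train_tiles} training tiles")
-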
- # Now let's create our PyTorch dataset class.
- # I used train_test_split from sklearn to get a stratified split (the Kaggle PCam dataset is unbalanced).
-
- # In[5]:
-
-
- from sklearn.model_selection import train_test_split
- from PIL import Image
- import pandas as pd
-
- class PcamDatasetKaggle(torchvision.datasets.VisionDataset):
-     def __init__(self, root, split, transform, target_transform=None):
-         super().__init__(root, transform=transform, target_transform=target_transform)
-         self.root = root
-         self.split = split
-         self.transform = transform
-         self.img_path = os.path.join(self.root, "train")
-
-         self.full_labels = pd.read_csv(self.root + '/train_labels.csv')
-         X_train, X_test, y_train, y_test = train_test_split(self.full_labels['id'],
-                                                             self.full_labels['label'],
-                                                             test_size=0.2,
-                                                             train_size=0.8,
-                                                             random_state=30,
-                                                             shuffle=True,
-                                                             stratify=self.full_labels['label'])
-
-         if split == "train":
-             self.imgs = X_train + ".tif"
-             self.labels = y_train
-         elif split == "val":
-             self.imgs = X_test + ".tif"
-             self.labels = y_test
-         else:
-             # Test split: images are unlabeled, so use -10 as a sentinel label
-             self.img_path = os.path.join(self.root, self.split)
-             self.imgs = pd.Series(list(sorted(os.listdir(self.img_path))))
-             self.labels = pd.Series(torch.full((len(self.imgs),), -10))
-         assert len(self.labels) == len(self.imgs)
-         print("Split", split, "negative/positive samples %", 100.0 * (self.labels.value_counts() / self.labels.shape[0]))
-
-     def __getitem__(self, idx):
-         assert idx < len(self.imgs)
-         img_pil = Image.open(os.path.join(self.img_path, self.imgs.iloc[idx]))
-         img = self.transform(img_pil)
-         label = self.labels.iloc[idx]
-         return img, label
-
-     def __len__(self):
-         return len(self.imgs)
-
- def check_dataset_leakage(dataset1, dataset2):
-     duplicates = set(dataset1.imgs) & set(dataset2.imgs)
-     assert len(duplicates) == 0
-
- def check_same_imgs(dataset1, dataset2):
-     duplicates = set(dataset1.imgs) & set(dataset2.imgs)
-     assert len(duplicates) == len(dataset1.imgs)
-     assert len(duplicates) == len(dataset2.imgs)
-
-
- # Let's define some transforms for data loading and data augmentation.
- #
- # An improvement could be to use [albumentations](https://albumentations.ai/) to define a more refined ```transform_data_augment``` (a rough sketch follows after the next cell).
-
- # In[6]:
-
-
- import torchvision.transforms as transforms
-
- torch.manual_seed(30)
- torch.cuda.manual_seed_all(30)
-
- # Preprocess images with transforms
- transform = transforms.Compose([
-     transforms.Resize((224, 224)),  # match the 224x224 ImageNet input size of the pretrained backbone
-     transforms.ToTensor()
- ])
-
- # For augmenting data
- transform_data_augment = transforms.Compose([
-     transforms.Resize((300, 300)),
-     transforms.RandomHorizontalFlip(),
-     transforms.RandomVerticalFlip(),
-     transforms.GaussianBlur(kernel_size=(5, 5), sigma=(0.1, 0.5)),
-     transforms.RandomRotation(degrees=25),
-     transforms.ColorJitter(
-         brightness=0.1,
-         contrast=0.1,
-         saturation=0.01,
-         hue=0.005
-     ),
-     transforms.CenterCrop((224, 224)),
-     transforms.RandomResizedCrop(size=(224, 224), scale=(0.9, 1.0)),
-     transforms.ToTensor()
- ])
-
-
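- # A rough sketch (added; not in the original notebook) of an albumentations-based
- # alternative to transform_data_augment. It assumes albumentations 2.x is installed;
- # transform names and signatures may differ across versions. albumentations operates
- # on numpy HWC arrays, and ToTensorV2 returns a CHW tensor like ToTensor.
- import albumentations as A
- from albumentations.pytorch import ToTensorV2
-
- alb_transform_sketch = A.Compose([
-     A.Resize(300, 300),
-     A.HorizontalFlip(p=0.5),
-     A.VerticalFlip(p=0.5),
-     A.Rotate(limit=25, p=0.7),
-     A.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.01, hue=0.005, p=0.5),
-     A.CenterCrop(224, 224),
-     ToTensorV2(),
- ])
- # Usage: alb_transform_sketch(image=np.array(img_pil))["image"]
-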
- # In[7]:
-
-
- from copy import deepcopy
-
- """ The torchvision PCAM version; that dataset, however, is not cleaned:
- training_set_original = datasets.PCAM(root="data", split="train", download=True, transform=transform)
- training_set_augment = datasets.PCAM(root="data", split="train", download=True, transform=transform_data_augment)
- val_set = datasets.PCAM(root="data", split="val", download=True, transform=transform)
- test_set = datasets.PCAM(root="data", split="test", download=True, transform=transform)
- """
-
- training_set_original = PcamDatasetKaggle(root=dataset_dir, split="train", transform=deepcopy(transform))
- training_set_augment = PcamDatasetKaggle(root=dataset_dir, split="train", transform=deepcopy(transform_data_augment))
-
- val_set = PcamDatasetKaggle(root=dataset_dir, split="val", transform=deepcopy(transform))
- val_set_augment = PcamDatasetKaggle(root=dataset_dir, split="val", transform=deepcopy(transform_data_augment))
-
- test_set = PcamDatasetKaggle(root=dataset_dir, split="test", transform=deepcopy(transform))
- test_set_augment = PcamDatasetKaggle(root=dataset_dir, split="test", transform=deepcopy(transform_data_augment))  # for TTA
-
- check_dataset_leakage(training_set_original, val_set)
- check_dataset_leakage(training_set_original, test_set)
- check_dataset_leakage(val_set, test_set)
- check_same_imgs(training_set_original, training_set_augment)
- check_same_imgs(val_set, val_set_augment)
- check_same_imgs(test_set, test_set_augment)
-
-
- # # 2. Plot and visualize the original and augmented data
- # Each raw (3, 96, 96) image is associated with a binary label indicating the presence of a tumor.
- #
- # Let's define a function to plot some images with their labels.
- #
- # Let's save the plots in an experiment directory for logging purposes.
-
- # In[8]:
-
-
- import matplotlib.pyplot as plt
-
- def plot_training_set_sample(training_set,
-                              file_name="results/pcam/data.png",
-                              rows=5,
-                              cols=5,
-                              mean_stdev=torch.Tensor([[0.0, 0.0, 0.0], [1.0, 1.0, 1.0]])):
-     mean = mean_stdev[0].numpy()
-     std = mean_stdev[1].numpy()
-     fig = plt.figure(figsize=(2 * cols, 2 * rows))
-     for i in range(1, rows * cols + 1):
-         random_idx = torch.randint(len(training_set), (1,)).item()
-         img, label = training_set[random_idx]  # load each sample only once
-         fig.add_subplot(rows, cols, i)
-         img = img.permute(1, 2, 0).numpy()
-         img_unnormalized = img * std + mean  # undo normalization for display
-         img_unnormalized = np.clip(img_unnormalized, 0, 1)
-         plt.imshow(img_unnormalized)
-         plt.axis("off")
-         plt.title(label)
-     plt.savefig(file_name)
-     plt.show()
-
-
- # In[9]:
-
-
- import os
- from datetime import datetime
- exp_dir = "results/pcam/" + datetime.now().strftime("%d_%m_%Y_%H_%M_%S")
- os.makedirs(exp_dir)  # makedirs: the intermediate "results/pcam" may not exist yet
-
-
- # In[10]:
-
-
- print("Original Training Set")
- plot_training_set_sample(training_set_original, exp_dir + "/training_set_original.png", rows=2, cols=5)
-
-
- # In[11]:
-
-
- print("Augmented Training Set")
- plot_training_set_sample(training_set_augment, exp_dir + "/training_set_augment.png", rows=2, cols=5)
-
-
- # # 3. Normalize and create the augmented dataset
-
- # Let's create a function that computes the mean, standard deviation and class balance of a PyTorch DataLoader.
- #
- # Then normalize the datasets accordingly.
-
- # In[12]:
-
-
- def compute_dataset_mean_stdev_class_balance(dataloader: DataLoader, device: torch.device):
-     mean = 0.0
-     stdev = 0.0
-     y_full = torch.Tensor([]).to(device)
-     for batch, (X, y) in enumerate(dataloader):
-         X = X.to(device)
-         y = y.to(device)
-         batch_samples = X.size(0)
-         # Per-channel statistics, weighted by batch size.
-         # Note: averaging per-batch stds only approximates the true dataset std.
-         mean += torch.mean(X, dim=(0, 2, 3)) * batch_samples
-         stdev += torch.std(X, dim=(0, 2, 3)) * batch_samples
-         y_full = torch.cat([y_full, y])
-     positive_labels = (y_full == torch.Tensor([1]).to(device)).sum()
-     negative_labels = (y_full == torch.Tensor([0]).to(device)).sum()
-     return [mean.detach().cpu() / len(dataloader.dataset), stdev.detach().cpu() / len(dataloader.dataset)], positive_labels.detach().cpu(), negative_labels.detach().cpu()
-
-
- # In[13]:
-
-
- # Create DataLoaders
- batch_size = 128
- training_set_original_dataloader = DataLoader(training_set_original, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=6, persistent_workers=True)
- training_set_augment_dataloader = DataLoader(training_set_augment, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=6, persistent_workers=True)
-
- # Compute the mean and std to normalize images, if not already done
- COMPUTE_NORMALIZATION_AGAIN = False
-
- mean_stdev_original = [torch.Tensor([0.7022, 0.5459, 0.6962]), torch.Tensor([0.2218, 0.2668, 0.1982])]
- mean_stdev_augment = [torch.Tensor([0.6939, 0.5397, 0.6904]), torch.Tensor([0.2225, 0.2661, 0.1988])]
-
- pos = 71294
- neg = 104726
- apos = 71294
- aneg = 104726
-
- if COMPUTE_NORMALIZATION_AGAIN:
-     mean_stdev_original, pos, neg = compute_dataset_mean_stdev_class_balance(training_set_original_dataloader, device)
-     mean_stdev_augment, apos, aneg = compute_dataset_mean_stdev_class_balance(training_set_augment_dataloader, device)
-
-
- def combine_std(mean1_stdev1: torch.Tensor, mean2_stdev2: torch.Tensor):
-     mean1, stdev1 = mean1_stdev1[0], mean1_stdev1[1]
-     mean2, stdev2 = mean2_stdev2[0], mean2_stdev2[1]
-
-     mean3 = (mean1 + mean2) * 0.5
-
-     var1 = stdev1 ** 2
-     var2 = stdev2 ** 2
-     # Pooled variance of two equal-size groups: mean of the variances plus spread of the means
-     var3 = 0.5 * (var1 + (mean1 - mean3) ** 2 + var2 + (mean2 - mean3) ** 2)
-
-     stdev3 = torch.sqrt(var3)
-     return [mean3, stdev3]
-
- new_mean_stdev = combine_std(mean_stdev_original, mean_stdev_augment)
- new_mean_stdev = torch.stack(new_mean_stdev).cpu().detach()
-
- print("Normalization done with")
- print("training_set [mean, stdev]: ", new_mean_stdev)
-
- training_set_original_transform = transforms.Compose([*training_set_original.transforms.transform.transforms,
-                                                       transforms.Normalize(new_mean_stdev[0], new_mean_stdev[1])])
-
- training_set_augment_transform = transforms.Compose([*training_set_augment.transforms.transform.transforms,
-                                                      transforms.Normalize(new_mean_stdev[0], new_mean_stdev[1])])
-
-
- training_set_original = PcamDatasetKaggle(root=dataset_dir, split="train", transform=deepcopy(training_set_original_transform))
- training_set_augment = PcamDatasetKaggle(root=dataset_dir, split="train", transform=deepcopy(training_set_augment_transform))
- val_set = PcamDatasetKaggle(root=dataset_dir, split="val", transform=deepcopy(training_set_original_transform))
- val_set_augment = PcamDatasetKaggle(root=dataset_dir, split="val", transform=deepcopy(training_set_augment_transform))
- test_set = PcamDatasetKaggle(root=dataset_dir, split="test", transform=deepcopy(training_set_original_transform))
-
-
- # Create the augmented training dataset
- training_set = ConcatDataset([training_set_original, training_set_augment])
-
- # Create the final DataLoaders
- training_dataloader = DataLoader(training_set, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=6, persistent_workers=True)
- val_dataloader = DataLoader(val_set, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=6, persistent_workers=True)
- val_dataloader_augment = DataLoader(val_set_augment, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=6, persistent_workers=True)
- test_dataloader = DataLoader(test_set, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=6, persistent_workers=True)
-
-
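- # Quick numeric check (added for illustration) of the pooled-variance identity
- # behind combine_std: for two equal-size groups, the combined variance equals
- # the mean of the variances plus the variance of the means. The comparison is
- # close rather than exact because torch.std applies Bessel's correction.
- _x1, _x2 = torch.rand(10000), torch.rand(10000) + 0.5
- _pooled = combine_std([_x1.mean(), _x1.std()], [_x2.mean(), _x2.std()])
- print(_pooled[1], torch.cat([_x1, _x2]).std())
-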
- # In[14]:
-
-
- print("Full Training Set Normalized")
- plot_training_set_sample(training_set, exp_dir + "/training_set_final.png", rows=2, cols=5, mean_stdev=new_mean_stdev)
-
-
- # # 4. Defining a training loop over one epoch and a metric
- # The dataset is not balanced, so roc_auc_score is a better metric than accuracy.
-
- # In[15]:
-
-
- def compute_metrics(full_y: torch.Tensor,
-                     full_logits: torch.Tensor,
-                     full_pred: torch.Tensor,
-                     sk_learn_metrics_logits: tp.List[tp.Callable],
-                     sk_learn_metrics_pred: tp.List[tp.Callable]) -> tp.Dict:
-     full_y = full_y.detach().cpu().numpy()
-     # Score-based metrics (e.g. roc_auc_score) receive probabilities, not raw logits
-     full_logits = torch.sigmoid(full_logits).detach().cpu().numpy()
-     full_pred = full_pred.detach().cpu().numpy()
-
-     results = {}
-     for metric in sk_learn_metrics_logits:
-         results[metric.__name__] = metric(full_y, full_logits)
-     for metric in sk_learn_metrics_pred:
-         results[metric.__name__] = metric(full_y, full_pred)
-     return results
-
-
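- # Illustrative usage (added): y, logits and hard predictions are flat tensors of
- # equal length; the result maps each metric name to its value.
- from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
- _y = torch.tensor([0., 1., 1., 0.])
- _logits = torch.tensor([-2.0, 1.5, 0.3, -0.7])
- _preds = (torch.sigmoid(_logits) > 0.5).float()
- print(compute_metrics(_y, _logits, _preds, [roc_auc_score], [f1_score, accuracy_score]))
-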
- # In[16]:
-
-
- def run_one_epoch(model: nn.Module,
-                   training_dataloader: DataLoader,
-                   optimizer: Optimizer,
-                   loss_function: nn.Module,
-                   scheduler: LRScheduler,
-                   device: torch.device,
-                   writer: SummaryWriter,
-                   epoch: int,
-                   sk_learn_metrics_logits: tp.List[tp.Callable],
-                   sk_learn_metrics_pred: tp.List[tp.Callable],
-                   threshold: float = 0.5):
-     running_loss = 0.0
-     num_batch = len(training_dataloader)
-     full_y = torch.Tensor([]).to(device)
-     full_logits = torch.Tensor([]).to(device)
-     full_pred = torch.Tensor([]).to(device)
-
-     model.train()
-     scaler = torch.amp.GradScaler("cuda")  # loss scaling for mixed precision
-     for batch, (X, y) in enumerate(training_dataloader):
-         optimizer.zero_grad()
-         X = X.to(device, non_blocking=True)
-         y = y.to(device, non_blocking=True)
-         with torch.amp.autocast("cuda"):  # forward pass in mixed precision
-             logits = model(X).squeeze()
-             loss = loss_function(logits, y.float())
-         scaler.scale(loss).backward()
-         scaler.step(optimizer)
-         scaler.update()
-
-         with torch.no_grad():
-             preds = (torch.sigmoid(logits) > threshold).float()
-             full_y = torch.cat([full_y, y])
-             full_logits = torch.cat([full_logits, logits])
-             full_pred = torch.cat([full_pred, preds])
-
-         running_loss += loss.item()
-         avg_loss = running_loss / (batch + 1.)
-         if batch % 250 == 0:
-             writer.add_scalar('Training Loss (avg)', avg_loss, batch + epoch * num_batch)
-             writer.add_scalar('Training Loss (raw)', loss.item(), batch + epoch * num_batch)
-     scheduler.step()
-     writer.flush()
-     return compute_metrics(full_y, full_logits, full_pred, sk_learn_metrics_logits, sk_learn_metrics_pred)
-
-
- # In[17]:
-
-
- def eval_model(model: nn.Module,
-                dataloader: DataLoader,
-                sk_learn_metrics_logits: tp.List[tp.Callable],
-                sk_learn_metrics_pred: tp.List[tp.Callable],
-                device: torch.device,
-                threshold: float = 0.5) -> tp.Dict:
-
-     model.eval()
-     full_y = torch.Tensor([]).to(device)
-     full_logits = torch.Tensor([]).to(device)
-     full_pred = torch.Tensor([]).to(device)
-
-     with torch.no_grad():
-         for X, y in dataloader:
-             X = X.to(device)
-             y = y.to(device)
-             logits = model(X).squeeze()
-             preds = (torch.sigmoid(logits) > threshold).float()
-
-             full_y = torch.cat([full_y, y])
-             full_logits = torch.cat([full_logits, logits])
-             full_pred = torch.cat([full_pred, preds])
-     return compute_metrics(full_y, full_logits, full_pred, sk_learn_metrics_logits, sk_learn_metrics_pred)
-
-
- # # 5. Set up TensorBoard for monitoring
-
- # In[18]:
-
-
- import threading
- from tensorboard import program
-
- def start_tensorboard(logdir):
-     tb = program.TensorBoard()
-     tb.configure(argv=[None, '--logdir', logdir])
-     url = tb.launch()
-     print(f"TensorBoard is running at {url}")
-
- # Log to the experiment directory
- logdir = exp_dir
- tb_thread = threading.Thread(target=start_tensorboard, args=(logdir,), daemon=True)
- tb_thread.start()
-
-
- # In[19]:
-
-
- from PIL import Image
-
- def load_image(path):
-     img = Image.open(path).convert("RGB")  # drop any alpha channel from the saved PNGs
-     # Convert to a numpy array in (C, H, W) layout for add_image
-     img_array = np.array(img)
-     if len(img_array.shape) == 2:  # grayscale image
-         img_array = np.expand_dims(img_array, axis=0)  # (1, H, W)
-     else:  # color image
-         img_array = img_array.transpose(2, 0, 1)  # (C, H, W)
-     return img_array
-
- writer = SummaryWriter(exp_dir + '/tensorboard')
- writer.add_image('training_set_original', load_image(exp_dir + "/training_set_original.png"), 0)
- writer.flush()
- writer.add_image('training_set_augment', load_image(exp_dir + "/training_set_augment.png"), 0)
- writer.flush()
- writer.add_image('training_set_final', load_image(exp_dir + "/training_set_final.png"), 0)
- writer.flush()
-
-
- # # 6. Find the best learning rate
- #
- # > 📄 [**Cancer Image Classification Based on DenseNet Model**](https://arxiv.org/abs/2011.11186)
- # > _by Zhong, Ziliang; Zheng, Muhang; Mai, Huafeng; Zhao, Jianan; Liu, Xinyi_
- #
- # The paper suggests using a learning rate lr = 1e-4 for densenet201.
- #
- # You can also plot the loss with respect to the lr, evaluated on a few batches.
- #
- # It gives insight into which lr to pick: here, between 1e-4 and 1e-3.
-
- # In[20]:
-
-
- from torchvision.models import densenet201, DenseNet201_Weights
- model = densenet201(weights=DenseNet201_Weights.DEFAULT)
-
- for params in model.parameters():
-     params.requires_grad = False
-
- model.classifier = nn.Sequential(nn.Linear(model.classifier.in_features, 1, bias=True))
-
- for param in model.classifier.parameters():
-     param.requires_grad = True
-
- model = model.to(device)
-
- def custom_lr_find(model: nn.Module,
-                    dataloader: DataLoader,
-                    loss_function: nn.Module,
-                    device: str,
-                    start_lr=1e-7,
-                    end_lr=1.0,
-                    num_iteration=200):
-     rates = []
-     losses = []
-     model = model.to(device)
-     optimizer = torch.optim.Adam(model.parameters(), lr=start_lr)
-
-     def lr_lambda(iteration):
-         # Exponential ramp from start_lr to end_lr over num_iteration steps
-         return (end_lr / start_lr) ** (iteration / num_iteration)
-
-     scheduler = lr_scheduler.LambdaLR(optimizer, lr_lambda)
-     initial_weights = deepcopy(model.state_dict())  # deepcopy: state_dict() only holds references
-     model.train()
-
-     X_full = torch.Tensor([]).to(device)
-     y_full = torch.Tensor([]).to(device)
-
-     # Gather a few batches to evaluate the loss on
-     for _ in range(5):
-         X, y = next(iter(dataloader))
-         X = X.to(device)
-         y = y.to(device)
-         X_full = torch.cat([X_full, X])
-         y_full = torch.cat([y_full, y])
-
-     for i in range(num_iteration):
-         optimizer.zero_grad()
-
-         pred = model(X_full).squeeze()
-         loss = loss_function(pred, y_full.float())
-         losses.append(loss.item())
-         rates.append(scheduler.get_last_lr()[0])
-         loss.backward()
-         optimizer.step()
-         scheduler.step()
-         if scheduler.get_last_lr()[0] > end_lr:
-             break
-     model.load_state_dict(initial_weights)  # restore the weights after the sweep
-     return rates, losses
-
- def plot_lr_find(rates, losses, file_name):
-     plt.figure()
-     plt.plot(rates, losses)
-     plt.xscale('log')
-     plt.xlabel('learning_rate')
-     plt.ylabel('loss')
-     plt.ylim(0.0, 1.0)
-     plt.title('lr_find_results')
-     plt.savefig(file_name)
-     plt.show()
-
- pos_weight = torch.Tensor([float(neg) / float(pos)]).to(device)  # accounts for the class imbalance
- #rates, losses = custom_lr_find(model, training_dataloader, torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight), device)
- rates, losses = custom_lr_find(model, training_dataloader, torch.nn.BCEWithLogitsLoss(), device)
- plot_lr_find(rates, losses, exp_dir + '/lr_find.jpg')
- writer.add_image('lr_find', load_image(exp_dir + "/lr_find.jpg"), 0)
- writer.flush()
-
-
- # # 7. Using already trained networks: train the head only
- #
- # First train the head while keeping all other layers frozen.
-
- # In[21]:
-
-
- from torchvision.models import densenet201, DenseNet201_Weights
- model = densenet201(weights=DenseNet201_Weights.DEFAULT)
-
- for params in model.parameters():
-     params.requires_grad = False
-
- # Replace the last layer (to output a 1-d prediction)
- model.classifier = nn.Sequential(nn.Linear(model.classifier.in_features, 1, bias=True))
-
- for param in model.classifier.parameters():
-     param.requires_grad = True
-
- model = model.to(device)
-
-
- # In[22]:
-
-
- # Optionally load from a checkpoint
- """
- model = torch.load('results/pcam/14_06_2025_10_25_48/model_' + str(19) + '.pt', weights_only=False)
- for params in model.parameters():
-     params.requires_grad = False
- for param in model.classifier.parameters():
-     param.requires_grad = True
- model = model.to(device)
- """
-
-
- # In[23]:
-
-
- lr = 1e-4
-
- optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
- #loss_func = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
- loss_func = torch.nn.BCEWithLogitsLoss()
- # scheduler.step() is called once per epoch, so step_size=1000 keeps the lr effectively constant
- scheduler = lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.01)
-
-
- # In[24]:
-
-
- from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
- import time
- epoch_num = 2
- sk_learn_metrics_logits = [roc_auc_score]
- sk_learn_metrics_pred = [f1_score, accuracy_score]
- for i in range(0, epoch_num):
-     start_time = time.time()
-     train_res = run_one_epoch(model,
-                               training_dataloader,
-                               optimizer,
-                               loss_func,
-                               scheduler,
-                               device,
-                               writer,
-                               i,
-                               sk_learn_metrics_logits,
-                               sk_learn_metrics_pred)
-     end_time = time.time()
-     print("epoch n°:", i, " training time:", end_time - start_time, "sec")
-     start_time = time.time()
-     val_res = eval_model(model, val_dataloader, sk_learn_metrics_logits, sk_learn_metrics_pred, device)
-     for key in train_res.keys():
-         writer.add_scalars(key, {"Train " + key: train_res[key], "Val " + key: val_res[key]}, i * len(training_dataloader))
-     end_time = time.time()
-     print("epoch n°:", i, " evaluation time:", end_time - start_time, "sec")
-     torch.save(model, exp_dir + "/model_" + str(i) + ".pt")
-
-
- # # 8. Using already trained networks: fine-tune a few layers
- # I did not use this in the end; it is optional.
-
- # In[25]:
-
-
- '''
- for name, param in model.features.denseblock4.denselayer32.conv1.named_parameters():
-     param.requires_grad = True
-
- for name, param in model.features.denseblock4.denselayer32.conv2.named_parameters():
-     param.requires_grad = True
- '''
-
-
- # In[26]:
-
-
- # Unfreeze only the convs of the last dense layer, plus the classifier
- '''
- lr = 1e-4
- #optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
- #loss_func = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
- loss_func = torch.nn.BCEWithLogitsLoss()
- # Use a lower LR for fine-tuning
- optimizer = torch.optim.Adam([
-     {"params": model.classifier.parameters(), "lr": 1e-4},
-     {"params": model.features.denseblock4.denselayer32.conv1.parameters(), "lr": 1e-5},
-     {"params": model.features.denseblock4.denselayer32.conv2.parameters(), "lr": 1e-5},
- ])
- '''
-
-
- # In[27]:
-
-
- '''
- from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
- import time
- sk_learn_metrics_logits = [roc_auc_score]
- sk_learn_metrics_pred = [f1_score, accuracy_score]
- epoch_num = 2
- finetune_epoch_num = 6
- for i in range(epoch_num, epoch_num + finetune_epoch_num):
-     start_time = time.time()
-     train_res = run_one_epoch(model,
-                               training_dataloader,
-                               optimizer,
-                               loss_func,
-                               scheduler,
-                               device,
-                               writer,
-                               i,
-                               sk_learn_metrics_logits,
-                               sk_learn_metrics_pred)
-     end_time = time.time()
-     print("epoch n°:", i, " training time:", end_time - start_time, "sec")
-     start_time = time.time()
-     val_res = eval_model(model, val_dataloader, sk_learn_metrics_logits, sk_learn_metrics_pred, device)
-     for key in train_res.keys():
-         writer.add_scalars(key, {"Train " + key: train_res[key], "Val " + key: val_res[key]}, i * len(training_dataloader))
-     end_time = time.time()
-     print("epoch n°:", i, " evaluation time:", end_time - start_time, "sec")
-     torch.save(model, exp_dir + "/model_" + str(i) + ".pt")
- '''
-
-
- # # 9. Fine-tune the entire model
-
- # In[28]:
-
-
- for params in model.parameters():
-     params.requires_grad = True
-
- lr = 1e-5
- optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=1e-4)
- #loss_func = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weight)
- loss_func = torch.nn.BCEWithLogitsLoss()
- scheduler = lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.01)
-
-
- # In[ ]:
-
-
- from sklearn.metrics import roc_auc_score, f1_score, accuracy_score
- import time
- sk_learn_metrics_logits = [roc_auc_score]
- sk_learn_metrics_pred = [f1_score, accuracy_score]
- epoch_num = 2
- finetune_epoch_num = 4
- for i in range(epoch_num, epoch_num + finetune_epoch_num):
-     start_time = time.time()
-     train_res = run_one_epoch(model,
-                               training_dataloader,
-                               optimizer,
-                               loss_func,
-                               scheduler,
-                               device,
-                               writer,
-                               i,
-                               sk_learn_metrics_logits,
-                               sk_learn_metrics_pred)
-     end_time = time.time()
-     print("epoch n°:", i, " training time:", end_time - start_time, "sec")
-     start_time = time.time()
-     val_res = eval_model(model, val_dataloader, sk_learn_metrics_logits, sk_learn_metrics_pred, device)
-     for key in train_res.keys():
-         writer.add_scalars(key, {"Train " + key: train_res[key], "Val " + key: val_res[key]}, i * len(training_dataloader))
-     end_time = time.time()
-     print("epoch n°:", i, " evaluation time:", end_time - start_time, "sec")
-     torch.save(model, exp_dir + "/model_" + str(i) + ".pt")
-
-
- # # 10. Compute test set predictions and submit to Kaggle
- #
- # We will use TTA (Test-Time Augmentation).
- # We can also optionally use several models for prediction and average their results.
-
- # In[30]:
-
-
- def run_inference(model: nn.Module,
-                   dataloader: DataLoader,
-                   device: torch.device):
-
-     model.eval()
-     full_y = torch.Tensor([]).to(device)
-     full_logits = torch.Tensor([]).to(device)
-
-     with torch.no_grad():
-         for X, y in dataloader:
-             X = X.to(device)
-             y = y.to(device)
-             logits = model(X).squeeze()
-
-             full_y = torch.cat([full_y, y])
-             full_logits = torch.cat([full_logits, logits])
-
-     return full_y, full_logits
-
-
- # In[54]:
-
-
- models_paths = ['results/pcam/17_06_2025_12_19_40/model_5.pt']
-
- tta_num = 5
- logits = []
- for model_path in models_paths:
-     pcam_model = torch.load(model_path, weights_only=False)  # load each listed model, not just the first
-     pcam_model = pcam_model.to(device)
-     # One pass per model on the plain test set
-     test_y, test_logits = run_inference(pcam_model, test_dataloader, device)
-     logits.append(test_logits)
-     # Then tta_num passes with random augmentations (note: transform_data_augment
-     # does not include the Normalize step added to the other transforms above)
-     for i in range(0, tta_num):
-         test_set_augment = PcamDatasetKaggle(root=dataset_dir, split="test", transform=deepcopy(transform_data_augment))  # for TTA
-         test_dataloader_augment = DataLoader(test_set_augment, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=6, persistent_workers=True)
-         test_y_augm, test_logits_augm = run_inference(pcam_model, test_dataloader_augment, device)
-         logits.append(test_logits_augm)
-
-
- # In[55]:
-
-
- # Average the logits over all models and TTA passes
- logits_stacked = torch.stack(logits)
- mean_logits = torch.mean(logits_stacked, dim=0, keepdim=True)
-
-
- # In[56]:
-
-
- # Create the submission file with the final predictions
- image_ids = [img.replace('.tif', '') for img in test_set.imgs.tolist()]
- test_preds = torch.sigmoid(mean_logits)
-
- submission_df = pd.DataFrame({
-     'id': image_ids,
-     'label': test_preds.squeeze().detach().cpu().numpy()
- })
-
- submission_df.to_csv(exp_dir + '/submission.csv', index=False)
-
-
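- # Quick check (added): Kaggle expects exactly two columns, `id` and `label`,
- # with one row per test tile; preview a few rows before submitting.
- print(submission_df.head())
- assert list(submission_df.columns) == ['id', 'label']
-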
- # In[57]:
-
-
- sub_path = exp_dir + '/submission.csv'
- get_ipython().system('kaggle competitions submit -c histopathologic-cancer-detection -f {sub_path} -m "DenseNet201 + correct normalization + no ensemble, no 42*42 crop pytorch "')
-
-
- # # 11. Find the best prediction threshold on the validation set
-
- # In[40]:
-
-
- models_paths = ['results/pcam/17_06_2025_12_19_40/model_4.pt']
- pcam_model = torch.load(models_paths[0], weights_only=False)
- pcam_model = pcam_model.to(device)
- val_y, val_logits = run_inference(pcam_model, val_dataloader, device)
- val_y_augment, val_logits_augment = run_inference(pcam_model, val_dataloader_augment, device)
- full_y = torch.cat([val_y, val_y_augment])
- full_logits = torch.cat([val_logits, val_logits_augment])
-
-
- # In[41]:
-
-
- from sklearn.metrics import roc_curve, auc
- fpr, tpr, thresholds = roc_curve(full_y.detach().cpu().numpy(), torch.sigmoid(full_logits).detach().cpu().numpy())
- roc_auc = auc(fpr, tpr)
-
-
- # In[42]:
-
-
- plt.figure(figsize=(8, 6))
- plt.plot(fpr, tpr, color='orange', lw=2, label=f'ROC curve (AUC = {roc_auc:.4f})')
- plt.xlim([0.0, 1.0])
- plt.ylim([0.0, 1.0])
- plt.xlabel('False Positive Rate')
- plt.ylabel('True Positive Rate')
- plt.title('Receiver Operating Characteristic')
- plt.legend(loc="lower right")  # show the AUC label
- plt.grid(alpha=0.3)
- plt.show()
-
-
- # In[43]:
-
-
- # Find the best threshold index: maximize TPR - FPR (Youden's J statistic)
- j_scores = tpr - fpr
- best_idx = np.argmax(j_scores)
- best_threshold = thresholds[best_idx]
-
-
- # In[44]:
-
-
- best_threshold
-
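- # Example (added): apply the selected threshold to turn the validation
- # probabilities into hard labels.
- hard_preds = (torch.sigmoid(full_logits) > float(best_threshold)).float()
- print("positive rate at the best threshold:", hard_preds.mean().item())
-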
requirements.txt CHANGED
@@ -1,8 +1,253 @@
- gradio==5.34.1
+ absl-py==2.3.0
+ aiofiles==24.1.0
+ aiohappyeyeballs==2.6.1
+ aiohttp==3.12.2
+ aiosignal==1.3.2
+ albucore==0.0.24
+ albumentations==2.0.8
+ annotated-types==0.7.0
+ anyio==4.9.0
+ argon2-cffi==23.1.0
+ argon2-cffi-bindings==21.2.0
+ arrow==1.3.0
+ asttokens==3.0.0
+ astunparse==1.6.3
+ async-lru==2.0.5
+ attrs==25.3.0
+ azure-cognitiveservices-search-imagesearch==2.0.1
+ azure-common==1.1.28
+ azure-core==1.34.0
+ azure-mgmt-core==1.5.0
+ babel==2.17.0
+ backcall==0.2.0
+ beartype==0.21.0
+ beautifulsoup4==4.13.4
+ bleach==6.2.0
+ blessed==1.21.0
+ blis==1.3.0
+ catalogue==2.0.10
+ certifi==2025.4.26
+ cffi==1.17.1
+ charset-normalizer==3.4.2
+ click==8.2.1
+ cloudpathlib==0.21.1
+ cloudpickle==3.1.1
+ comm==0.2.2
+ confection==0.1.5
+ contourpy==1.3.2
+ cycler==0.12.1
+ cymem==2.0.11
+ datasets==3.6.0
+ debugpy==1.8.14
+ decorator==5.2.1
+ defusedxml==0.7.1
+ dill==0.3.8
+ docopt==0.6.2
+ execnb==0.1.14
+ executing==2.2.0
+ fastai==2.8.2
+ fastapi==0.115.12
+ fastbook==0.0.29
+ fastcore==1.8.2
+ fastdownload==0.0.7
+ fastjsonschema==2.21.1
+ fastprogress==1.0.3
+ fasttransform==0.0.2
+ ffmpy==0.6.0
+ filelock==3.18.0
+ fonttools==4.58.1
+ fqdn==1.5.1
+ frozenlist==1.6.0
+ fsspec==2025.3.0
+ gdown==5.2.0
+ ghapi==1.0.6
+ gpustat==1.1.1
+ gradio==5.33.1
+ gradio_client==1.10.3
+ graphviz==0.20.3
+ groovy==0.1.2
+ grpcio==1.72.1
+ h11==0.16.0
+ h5py==3.13.0
+ hf-xet==1.1.2
+ httpcore==1.0.9
+ httpx==0.28.1
+ huggingface-hub==0.32.2
+ idna==3.10
+ importlib_metadata==8.7.0
+ ipykernel==6.29.5
+ ipython==8.12.3
+ ipython-genutils==0.2.0
+ ipython_pygments_lexers==1.1.1
+ ipywidgets==7.8.5
+ isodate==0.7.2
+ isoduration==20.11.0
+ jedi==0.19.2
+ Jinja2==3.1.6
+ joblib==1.5.1
+ json5==0.12.0
+ jsonpointer==3.0.0
+ jsonschema==4.24.0
+ jsonschema-specifications==2025.4.1
+ jupyter-events==0.12.0
+ jupyter-lsp==2.2.5
+ jupyter_client==8.6.3
+ jupyter_core==5.8.1
+ jupyter_server==2.16.0
+ jupyter_server_terminals==0.5.3
+ jupyterlab==4.4.3
+ jupyterlab_pygments==0.3.0
+ jupyterlab_server==2.27.3
+ jupyterlab_widgets==1.1.11
+ kaggle==1.7.4.5
+ kagglehub==0.3.12
+ kiwisolver==1.4.8
+ langcodes==3.5.0
+ language_data==1.3.0
+ marisa-trie==1.2.1
+ Markdown==3.8
+ markdown-it-py==3.0.0
+ MarkupSafe==3.0.2
  matplotlib==3.10.3
- numpy==2.3.0
- pandas==2.3.0
- Pillow==11.2.1
- scikit_learn==1.7.0
+ matplotlib-inline==0.1.7
+ mdurl==0.1.2
+ mistune==3.1.3
+ mpmath==1.3.0
+ msrest==0.7.1
+ multidict==6.4.4
+ multiprocess==0.70.16
+ murmurhash==1.0.13
+ nbclient==0.10.2
+ nbconvert==7.16.6
+ nbdev==2.4.2
+ nbformat==5.10.4
+ nest-asyncio==1.6.0
+ networkx==3.4.2
+ notebook==7.4.3
+ notebook_shim==0.2.4
+ numpy==2.2.6
+ nvidia-cublas-cu12==12.6.4.1
+ nvidia-cuda-cupti-cu12==12.6.80
+ nvidia-cuda-nvrtc-cu12==12.6.77
+ nvidia-cuda-runtime-cu12==12.6.77
+ nvidia-cudnn-cu12==9.5.1.17
+ nvidia-cufft-cu12==11.3.0.4
+ nvidia-cufile-cu12==1.11.1.6
+ nvidia-curand-cu12==10.3.7.77
+ nvidia-cusolver-cu12==11.7.1.2
+ nvidia-cusparse-cu12==12.5.4.2
+ nvidia-cusparselt-cu12==0.6.3
+ nvidia-ml-py==12.575.51
+ nvidia-nccl-cu12==2.26.2
+ nvidia-nvjitlink-cu12==12.6.85
+ nvidia-nvtx-cu12==12.6.77
+ oauthlib==3.2.2
+ opencv-python==4.11.0.86
+ opencv-python-headless==4.11.0.86
+ orjson==3.10.18
+ overrides==7.7.0
+ packaging==25.0
+ pandas==2.2.3
+ pandocfilters==1.5.1
+ parso==0.8.4
+ pexpect==4.9.0
+ pickleshare==0.7.5
+ pillow==11.2.1
+ pipreqs==0.5.0
+ platformdirs==4.3.8
+ plum-dispatch==2.5.7
+ preshed==3.0.10
+ prometheus_client==0.22.0
+ prompt_toolkit==3.0.51
+ propcache==0.3.1
+ protobuf==6.31.1
+ psutil==7.0.0
+ ptyprocess==0.7.0
+ pure_eval==0.2.3
+ pyarrow==20.0.0
+ pycparser==2.22
+ pydantic==2.11.5
+ pydantic_core==2.33.2
+ pydub==0.25.1
+ Pygments==2.19.1
+ pyparsing==3.2.3
+ PySocks==1.7.1
+ python-dateutil==2.9.0.post0
+ python-json-logger==3.3.0
+ python-multipart==0.0.20
+ python-slugify==8.0.4
+ pytz==2025.2
+ PyYAML==6.0.2
+ pyzmq==26.4.0
+ referencing==0.36.2
+ regex==2024.11.6
+ requests==2.32.3
+ requests-oauthlib==2.0.0
+ rfc3339-validator==0.1.4
+ rfc3986-validator==0.1.1
+ rich==14.0.0
+ rpds-py==0.25.1
+ ruff==0.11.13
+ safehttpx==0.1.6
+ safetensors==0.5.3
+ scikit-learn==1.6.1
+ scipy==1.15.3
+ semantic-version==2.10.0
+ Send2Trash==1.8.3
+ sentencepiece==0.2.0
+ setuptools==80.9.0
+ shellingham==1.5.4
+ simsimd==6.4.9
+ six==1.17.0
+ smart-open==7.1.0
+ sniffio==1.3.1
+ soupsieve==2.7
+ spacy==3.8.7
+ spacy-legacy==3.0.12
+ spacy-loggers==1.0.5
+ srsly==2.5.1
+ stack-data==0.6.3
+ starlette==0.46.2
+ stringzilla==3.12.5
+ sympy==1.14.0
+ tensorboard==2.19.0
+ tensorboard-data-server==0.7.2
+ tensordict==0.8.3
+ terminado==0.18.1
+ text-unidecode==1.3
+ thinc==8.3.6
+ threadpoolctl==3.6.0
+ tinycss2==1.4.0
+ tokenizers==0.21.1
+ tomlkit==0.13.3
  torch==2.7.0
  torchvision==0.22.0
+ tornado==6.5.1
+ tqdm==4.67.1
+ traitlets==5.14.3
+ transformers==4.52.3
+ triton==3.3.0
+ typer==0.16.0
+ types-python-dateutil==2.9.0.20250516
+ typing-inspection==0.4.1
+ typing_extensions==4.13.2
+ tzdata==2025.2
+ uri-template==1.3.0
+ urllib3==2.4.0
+ uvicorn==0.34.3
+ wasabi==1.1.3
+ watchdog==6.0.0
+ wcwidth==0.2.13
+ weasel==0.4.1
+ webcolors==24.11.1
+ webencodings==0.5.1
+ websocket-client==1.8.0
+ websockets==15.0.1
+ Werkzeug==3.1.3
+ wheel==0.45.1
+ widgetsnbextension==3.6.10
+ wrapt==1.17.2
+ xxhash==3.5.0
+ yarg==0.1.9
+ yarl==1.20.0
+ zipp==3.22.0