Spaces:

adityasync
/

OncoVision-X

Sleeping

File size: 6,151 Bytes
#!/usr/bin/env python3
"""
Malignancy Dataset — LIDC-IDRI Malignancy Classification Data Pipeline

Handles:
  - Loading LIDC-IDRI malignancy annotations (1-5 scale)
  - Binary conversion: 1-2 → benign (0), 4-5 → malignant (1), skip 3
  - PyTorch Dataset for loading 64³ nodule patches with labels
  - 3D augmentation (rotation, flip, noise)

NOT part of the research paper — demo feature only.
"""

import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from pathlib import Path


def prepare_malignancy_data(csv_path):
    """Read LIDC-IDRI malignancy ratings and derive a binary 'label' column.

    The 1-5 malignancy rating is collapsed as follows:
      - rating <= 2  → label 0 (benign)
      - rating >= 4  → label 1 (malignant)
      - anything else (e.g. the indeterminate rating 3) → row is dropped

    A short class-balance summary is printed to stdout.

    Args:
        csv_path: Path (or anything ``pd.read_csv`` accepts) to the LIDC
                  annotations CSV. It must contain a 'malignancy' column;
                  downstream code also expects a 'nodule_id' column.

    Returns:
        pd.DataFrame holding the kept rows, re-indexed from 0, with an
        added 'label' column (0 or 1).

    Raises:
        ValueError: If the CSV has no 'malignancy' column.
    """

    def _to_binary(rating):
        # -1 is a temporary sentinel for rows that get filtered out below.
        if rating <= 2:
            return 0
        if rating >= 4:
            return 1
        return -1

    table = pd.read_csv(csv_path)

    if 'malignancy' not in table.columns:
        raise ValueError(
            f"CSV must have a 'malignancy' column. Found: {list(table.columns)}"
        )

    table['label'] = table['malignancy'].apply(_to_binary)

    # Keep only rows that received a definite benign/malignant label.
    is_definite = table['label'] != -1
    table = table[is_definite].reset_index(drop=True)

    n_benign = (table['label'] == 0).sum()
    n_malignant = (table['label'] == 1).sum()
    # Guard the denominator so an all-benign file does not divide by zero.
    benign_per_malignant = n_benign / max(n_malignant, 1)

    print("Malignancy data prepared:")
    print(f"  Benign nodules:    {n_benign}")
    print(f"  Malignant nodules: {n_malignant}")
    print(f"  Total:             {len(table)}")
    print(f"  Ratio (B:M):       {benign_per_malignant:.1f}:1")

    return table


class MalignancyDataset(Dataset):
    """PyTorch Dataset for malignancy classification.

    Loads nodule patches (64³ per the pipeline description) from disk and
    returns (patch, label) pairs. For each nodule, ``{nodule_id}.npy`` is
    tried first; if it is absent, ``{nodule_id}.npz`` is read and the array
    stored under the ``'patch'`` key is used.

    Args:
        annotations_df: DataFrame with 'nodule_id' and 'label' columns
        patches_dir: Directory containing {nodule_id}.npy (or .npz) patch files
        augment: Whether to apply data augmentation

    Raises:
        FileNotFoundError: If ``patches_dir`` does not exist.
    """

    def __init__(self, annotations_df, patches_dir, augment=False):
        # Re-index so positional access in __getitem__ matches 0..len-1 even
        # when the caller passes a shuffled/split frame.
        self.annotations = annotations_df.reset_index(drop=True)
        self.patches_dir = Path(patches_dir)
        self.augment = augment

        # Fail early instead of on the first sample fetch.
        if not self.patches_dir.exists():
            raise FileNotFoundError(
                f"Patches directory not found: {self.patches_dir}"
            )

    def __len__(self):
        """Return the number of annotated nodules."""
        return len(self.annotations)

    def __getitem__(self, idx):
        """Return the ``(patch_tensor, label)`` pair at position ``idx``.

        Returns:
            patch_tensor: float32 tensor with a leading channel axis added,
                          i.e. shape (1, *patch.shape).
            label: 0-dim ``torch.long`` tensor holding the row's 'label'.

        Raises:
            FileNotFoundError: If neither the .npy nor the .npz patch exists.
        """
        row = self.annotations.iloc[idx]
        patch = self._load_patch(row['nodule_id'])

        if self.augment:
            patch = self._augment(patch)

        # Add channel dim → (1, D, H, W)
        patch_tensor = torch.from_numpy(patch).unsqueeze(0)
        label = torch.tensor(row['label'], dtype=torch.long)

        return patch_tensor, label

    def _load_patch(self, nodule_id):
        """Load one nodule patch from disk as a float32 array."""
        patch_path = self.patches_dir / f"{nodule_id}.npy"
        if patch_path.exists():
            return np.load(patch_path).astype(np.float32)

        # Fallback: try .npz format
        npz_path = self.patches_dir / f"{nodule_id}.npz"
        if npz_path.exists():
            # np.load returns an NpzFile that keeps the archive open; close it
            # deterministically so long-running workers do not leak handles.
            with np.load(npz_path) as archive:
                return archive['patch'].astype(np.float32)

        raise FileNotFoundError(
            f"Patch file not found: {patch_path} or {npz_path}"
        )

    def _augment(self, patch):
        """Apply random 3D augmentations and return the augmented patch.

        Steps: a random multiple-of-90° rotation in a random axis pair, an
        independent 50% flip along each of the three axes, 50% chance of
        additive Gaussian noise (sigma 0.05), then clipping to [-1, 1].

        Per the original author's note, this is meant to be consistent with
        the augmentation used by the main LunaDataset for the detection task.
        """
        # Random 90° rotation along a random axis pair
        k = np.random.randint(0, 4)
        axes = [(0, 1), (0, 2), (1, 2)]
        ax = axes[np.random.randint(0, 3)]
        # .copy() materialises the rotated view with positive strides, which
        # torch.from_numpy requires.
        patch = np.rot90(patch, k=k, axes=ax).copy()

        # Random flip along each axis
        for axis in range(3):
            if np.random.rand() > 0.5:
                patch = np.flip(patch, axis=axis).copy()

        # Gaussian noise
        if np.random.rand() > 0.5:
            noise = np.random.normal(0, 0.05, patch.shape).astype(np.float32)
            patch = patch + noise

        # Clamp to valid range
        # NOTE(review): assumes patches are normalised to [-1, 1] — confirm.
        patch = np.clip(patch, -1.0, 1.0)

        return patch


def create_malignancy_loaders(config):
    """Build the training and validation DataLoaders for malignancy classification.

    The annotations CSV is loaded and binarised, split into train/validation
    subsets stratified on 'label' (fixed random_state of 42), and each subset
    is wrapped in a MalignancyDataset. Only the training dataset is
    augmented, shuffled, and has its last incomplete batch dropped.

    Args:
        config: Configuration dict with 'data' and 'training' sections.
                Keys read from 'data': 'annotations_csv' (required),
                'patches_dir' (required), 'val_ratio' (default 0.2),
                'num_workers' (default 4). Key read from 'training':
                'batch_size' (default 32).

    Returns:
        (train_loader, val_loader)
    """
    # Imported here so scikit-learn is only required when loaders are built.
    from sklearn.model_selection import train_test_split

    data_section = config.get('data', {})
    training_section = config.get('training', {})

    labelled = prepare_malignancy_data(data_section['annotations_csv'])

    # Stratify on the label so both subsets keep the benign/malignant ratio.
    holdout_fraction = data_section.get('val_ratio', 0.2)
    train_rows, val_rows = train_test_split(
        labelled,
        test_size=holdout_fraction,
        stratify=labelled['label'],
        random_state=42,
    )

    print(f"  Train split: {len(train_rows)} samples")
    print(f"  Val split:   {len(val_rows)} samples")

    patch_root = data_section['patches_dir']
    train_set = MalignancyDataset(train_rows, patch_root, augment=True)
    val_set = MalignancyDataset(val_rows, patch_root, augment=False)

    # Settings shared by both loaders.
    shared_kwargs = {
        'batch_size': training_section.get('batch_size', 32),
        'num_workers': data_section.get('num_workers', 4),
        'pin_memory': True,
    }

    train_loader = DataLoader(
        train_set, shuffle=True, drop_last=True, **shared_kwargs
    )
    val_loader = DataLoader(val_set, shuffle=False, **shared_kwargs)

    return train_loader, val_loader