Spaces:

adityasync
/

OncoVision-X

Sleeping

App Files Files Community

OncoVision-X / src /data /malignancy_dataset.py

adityasync

Clean OncoVision-X deployment with LFS

8960670 2 months ago

raw

history blame contribute delete

6.15 kB

	#!/usr/bin/env python3
	"""
	Malignancy Dataset — LIDC-IDRI Malignancy Classification Data Pipeline

	Handles:
	- Loading LIDC-IDRI malignancy annotations (1-5 scale)
	- Binary conversion: 1-2 → benign (0), 4-5 → malignant (1), skip 3
	- PyTorch Dataset for loading 64³ nodule patches with labels
	- 3D augmentation (rotation, flip, noise)

	NOT part of the research paper — demo feature only.
	"""

	import numpy as np
	import pandas as pd
	import torch
	from torch.utils.data import Dataset, DataLoader
	from pathlib import Path


	def prepare_malignancy_data(csv_path):
	    """Read LIDC-IDRI annotations and derive a binary benign/malignant label.

	    The 'malignancy' rating is collapsed into two classes:
	        - rating <= 2 -> label 0 (benign)
	        - rating >= 4 -> label 1 (malignant)
	        - any other rating (e.g. the indeterminate value 3) is dropped

	    A short class-balance summary is printed to stdout.

	    Args:
	        csv_path: Path (or anything ``pd.read_csv`` accepts) of the
	            annotations CSV. It must contain a 'malignancy' column; all
	            other columns are passed through unchanged.

	    Returns:
	        pd.DataFrame: the retained rows, re-indexed from 0, with an added
	        'label' column holding 0 or 1.

	    Raises:
	        ValueError: If the CSV has no 'malignancy' column.
	    """

	    def _binarize(rating):
	        # -1 is a temporary marker for ratings that belong to neither class;
	        # rows carrying it are filtered out below.
	        if rating <= 2:
	            return 0
	        if rating >= 4:
	            return 1
	        return -1

	    table = pd.read_csv(csv_path)

	    if 'malignancy' not in table.columns:
	        raise ValueError(
	            f"CSV must have a 'malignancy' column. Found: {list(table.columns)}"
	        )

	    table['label'] = table['malignancy'].apply(_binarize)

	    # Keep only rows that received a definite class.
	    decided = table['label'] != -1
	    table = table[decided].reset_index(drop=True)

	    n_benign = (table['label'] == 0).sum()
	    n_malignant = (table['label'] == 1).sum()
	    summary_lines = (
	        "Malignancy data prepared:",
	        f" Benign nodules: {n_benign}",
	        f" Malignant nodules: {n_malignant}",
	        f" Total: {len(table)}",
	        # max(..., 1) keeps the ratio printable when no malignant rows remain.
	        f" Ratio (B:M): {n_benign / max(n_malignant, 1):.1f}:1",
	    )
	    for line in summary_lines:
	        print(line)

	    return table


	class MalignancyDataset(Dataset):
	    """PyTorch Dataset yielding (patch, label) pairs for malignancy classification.

	    Each sample is a nodule patch stored on disk as ``{nodule_id}.npy`` or, as
	    a fallback, as ``{nodule_id}.npz`` with the volume under the ``'patch'``
	    key. Patches are nominally 64x64x64; the size is not enforced here.

	    Args:
	        annotations_df: DataFrame with 'nodule_id' and 'label' columns.
	        patches_dir: Directory containing the patch files.
	        augment: Whether to apply random augmentation to each loaded patch.

	    Raises:
	        FileNotFoundError: If ``patches_dir`` does not exist.
	    """

	    def __init__(self, annotations_df, patches_dir, augment=False):
	        # Positional indexing in __getitem__ relies on a clean 0..n-1 index.
	        self.annotations = annotations_df.reset_index(drop=True)
	        self.patches_dir = Path(patches_dir)
	        self.augment = augment

	        # Fail early instead of on the first sample access.
	        if not self.patches_dir.exists():
	            raise FileNotFoundError(
	                f"Patches directory not found: {self.patches_dir}"
	            )

	    def __len__(self):
	        """Return the number of annotated samples."""
	        return len(self.annotations)

	    def __getitem__(self, idx):
	        """Load sample ``idx``.

	        Returns:
	            tuple: ``(patch_tensor, label)`` where ``patch_tensor`` is the
	            float32 patch with a leading channel dimension added and
	            ``label`` is a scalar ``torch.long`` tensor.

	        Raises:
	            FileNotFoundError: If neither the .npy nor the .npz file exists.
	        """
	        # Read each field from its own column. Selecting the whole row with
	        # ``self.annotations.iloc[idx]`` upcasts all values to one common
	        # dtype, so in an all-numeric frame with a float column an integer
	        # id 7 comes back as 7.0 and the lookup would target "7.0.npy".
	        nodule_id = self.annotations['nodule_id'].iloc[idx]
	        label_value = self.annotations['label'].iloc[idx]

	        patch = self._load_patch(nodule_id)

	        if self.augment:
	            patch = self._augment(patch)

	        # Add channel dim: (D, H, W) -> (1, D, H, W)
	        patch_tensor = torch.from_numpy(patch).unsqueeze(0)
	        label = torch.tensor(label_value, dtype=torch.long)

	        return patch_tensor, label

	    def _load_patch(self, nodule_id):
	        """Read the patch for ``nodule_id`` from disk as a float32 array."""
	        patch_path = self.patches_dir / f"{nodule_id}.npy"
	        if patch_path.exists():
	            return np.load(patch_path).astype(np.float32)

	        # Fallback: try .npz format
	        npz_path = self.patches_dir / f"{nodule_id}.npz"
	        if npz_path.exists():
	            # NpzFile keeps the archive open until closed; the context
	            # manager releases the file handle as soon as the array is read.
	            with np.load(npz_path) as archive:
	                return archive['patch'].astype(np.float32)

	        raise FileNotFoundError(
	            f"Patch file not found: {patch_path} or {npz_path}"
	        )

	    def _augment(self, patch):
	        """Apply random 3D augmentations and return the new array.

	        Steps: a random multiple-of-90-degree rotation in a random axis
	        plane, an independent coin-flip mirror along each axis, Gaussian
	        noise (sigma 0.05) with probability 0.5, then clipping to [-1, 1].

	        Per the original author's note, this mirrors the augmentation used by
	        the main LunaDataset for the detection task (not visible in this file).
	        """
	        # Random 90-degree rotation along a random axis pair
	        k = np.random.randint(0, 4)
	        axes = [(0, 1), (0, 2), (1, 2)]
	        ax = axes[np.random.randint(0, 3)]
	        # rot90/flip return views; copy so later steps and torch.from_numpy
	        # see a plain array rather than a strided view.
	        patch = np.rot90(patch, k=k, axes=ax).copy()

	        # Random flip along each axis
	        for axis in range(3):
	            if np.random.rand() > 0.5:
	                patch = np.flip(patch, axis=axis).copy()

	        # Gaussian noise
	        if np.random.rand() > 0.5:
	            noise = np.random.normal(0, 0.05, patch.shape).astype(np.float32)
	            patch = patch + noise

	        # Clamp to valid range.
	        # NOTE(review): presumably patches are normalised to [-1, 1] upstream
	        # -- confirm against the preprocessing step.
	        patch = np.clip(patch, -1.0, 1.0)

	        return patch


	def create_malignancy_loaders(config):
	    """Build the training and validation DataLoaders for malignancy classification.

	    Annotations are loaded through ``prepare_malignancy_data`` and divided with
	    a label-stratified ``train_test_split`` using a fixed ``random_state=42``.
	    The training dataset is augmented, shuffled and drops its last incomplete
	    batch; the validation dataset is not augmented or shuffled.

	    Args:
	        config: Configuration dict with 'data' and 'training' sections.
	            Read from 'data': 'annotations_csv' and 'patches_dir' (both
	            required), 'val_ratio' (default 0.2), 'num_workers' (default 4).
	            Read from 'training': 'batch_size' (default 32).

	    Returns:
	        tuple: (train_loader, val_loader)
	    """
	    from sklearn.model_selection import train_test_split

	    data_section = config.get('data', {})
	    training_section = config.get('training', {})

	    labelled = prepare_malignancy_data(data_section['annotations_csv'])

	    # Stratify on the label so both splits keep the benign/malignant mix.
	    holdout_fraction = data_section.get('val_ratio', 0.2)
	    train_rows, val_rows = train_test_split(
	        labelled,
	        test_size=holdout_fraction,
	        stratify=labelled['label'],
	        random_state=42,
	    )

	    print(f" Train split: {len(train_rows)} samples")
	    print(f" Val split: {len(val_rows)} samples")

	    patch_root = data_section['patches_dir']
	    train_set = MalignancyDataset(train_rows, patch_root, augment=True)
	    val_set = MalignancyDataset(val_rows, patch_root, augment=False)

	    # Settings common to both loaders.
	    shared_kwargs = {
	        'batch_size': training_section.get('batch_size', 32),
	        'num_workers': data_section.get('num_workers', 4),
	        'pin_memory': True,
	    }

	    train_loader = DataLoader(
	        train_set, shuffle=True, drop_last=True, **shared_kwargs
	    )
	    val_loader = DataLoader(val_set, shuffle=False, **shared_kwargs)

	    return train_loader, val_loader