# mae
#
# File size: 14,521 Bytes
#
# 5ffe2e2

# Standard library
import os
from pathlib import Path

# Data handling
import pandas as pd
import numpy as np

# Machine learning
from sklearn.model_selection import train_test_split

class CheXpertDataSplitter:
    """
    Stratified, patient-level train/validation/test splitter for CheXpert.

    Handles:
    - Patient-level splitting (every image of a patient lands in one split,
      which prevents data leakage)
    - Multi-label stratification via per-patient label signatures
    - Class imbalance reporting
    - Study-level bookkeeping

    Progress and diagnostics are reported with ``print``.
    """

    PATHOLOGIES = [
        'No Finding',
        'Enlarged Cardiomediastinum',
        'Cardiomegaly',
        'Lung Opacity',
        'Lung Lesion',
        'Edema',
        'Consolidation',
        'Pneumonia',
        'Atelectasis',
        'Pneumothorax',
        'Pleural Effusion',
        'Pleural Other',
        'Fracture',
        'Support Devices'
    ]

    # Helper columns added during preprocessing; dropped again before saving.
    _TEMP_COLUMNS = ['patient_id', 'study_id']
    # Stratum name shared by all label signatures that are too rare to stratify on.
    _RARE_GROUP = 'RARE_COMBINATION'
    # Replacement value for uncertain (-1) labels, per strategy name.
    _UNCERTAIN_FILL = {'zeros': 0, 'ones': 1}

    def __init__(self, csv_path, val_size=0.15, test_size=0.15, random_state=42,
                 use_frontal_only=True, fill_uncertain='zeros', root=None):
        """
        Initialize the splitter.

        Args:
            csv_path: Path to train.csv from CheXpert-small
            val_size: Validation set proportion (default: 0.15)
            test_size: Test set proportion (default: 0.15)
            random_state: Random seed for reproducibility
            use_frontal_only: Use only frontal view images
            fill_uncertain: How to handle uncertain labels ('zeros', 'ones', 'ignore')
            root: Stored on the instance as ``self.root``; no method of this
                class reads it.
        """
        self.csv_path = csv_path
        self.val_size = val_size
        self.test_size = test_size
        self.random_state = random_state
        self.use_frontal_only = use_frontal_only
        self.fill_uncertain = fill_uncertain
        self.root = root

        print("=" * 80)
        print("CheXpert Data Splitter - Preventing Data Leakage & Class Bias")
        print("=" * 80)

    @staticmethod
    def _split_path_ids(path):
        """
        Return ``(patient_id, study_id)`` parsed from an image path.

        The patient component is located by name (``patient`` followed by
        digits) so the prefix depth does not matter; the study is the path
        component right after it. Paths without such a component fall back to
        the fixed positions 2 and 3.
        """
        parts = str(path).replace('\\', '/').split('/')
        for position, part in enumerate(parts[:-1]):
            if part.startswith('patient') and part[len('patient'):].isdigit():
                return part, parts[position + 1]
        return parts[2], parts[3]

    def load_and_preprocess(self):
        """
        Load the CSV into ``self.df`` and prepare it for splitting.

        Steps: optional frontal-view filter, extraction of ``patient_id`` and
        ``study_id`` from the ``Path`` column, and uncertain-label processing.

        Returns:
            The preprocessed DataFrame (also kept as ``self.df``).
        """
        print("\n[1/5] Loading data...")
        self.df = pd.read_csv(self.csv_path)
        print(f"   Loaded {len(self.df)} images")

        # NOTE: image files are not checked for existence on disk.

        # Filter for frontal views only
        if self.use_frontal_only:
            initial_count = len(self.df)
            self.df = self.df[self.df['Frontal/Lateral'] == 'Frontal'].reset_index(drop=True)
            print(f"   Filtered to frontal views: {len(self.df)} images ({initial_count - len(self.df)} removed)")

        # Extract patient and study IDs from path
        print("\n[2/5] Extracting patient and study IDs...")
        ids = [self._split_path_ids(path) for path in self.df['Path']]
        self.df['patient_id'] = [patient for patient, _ in ids]
        self.df['study_id'] = [study for _, study in ids]

        n_patients = self.df['patient_id'].nunique()
        # Study names repeat across patients, so a study is identified by the
        # (patient, study) pair rather than by the study name alone.
        n_studies = len(self.df[['patient_id', 'study_id']].drop_duplicates())
        images_per_patient = len(self.df) / n_patients if n_patients else 0.0
        print(f"   Unique patients: {n_patients}")
        print(f"   Unique studies: {n_studies}")
        print(f"   Images per patient (avg): {images_per_patient:.2f}")

        # Process uncertain labels
        print("\n[3/5] Processing uncertain labels...")
        self._process_uncertain_labels()

        return self.df

    def _process_uncertain_labels(self):
        """
        Process uncertain labels (-1) based on the chosen strategy.

        'zeros' and 'ones' replace -1 with 0 or 1; 'ignore' keeps -1. Missing
        values are filled with 0 in every case. An unrecognised strategy keeps
        -1 and prints a warning.
        """
        fill_value = self._UNCERTAIN_FILL.get(self.fill_uncertain)
        if fill_value is None and self.fill_uncertain != 'ignore':
            print(f"   ⚠ Unknown fill_uncertain={self.fill_uncertain!r}; uncertain labels are kept as -1")

        uncertain_total = 0
        for pathology in self.PATHOLOGIES:
            if pathology not in self.df.columns:
                continue
            column = self.df[pathology]
            uncertain_total += int((column == -1).sum())
            if fill_value is not None:
                column = column.replace(-1, fill_value)
            # Fill NaN with 0
            self.df[pathology] = column.fillna(0)

        print(f"   Uncertain labels strategy: {self.fill_uncertain}")
        print(f"   Uncertain label entries found: {uncertain_total}")

    def create_stratification_groups(self):
        """
        Create stratification groups based on multi-label combinations.

        Labels are aggregated per patient (maximum over the patient's images),
        turned into a signature string, and signatures held by fewer than
        ``max(5, 0.1% of patients)`` patients are pooled into one rare group.

        Returns:
            DataFrame with one row per patient, including ``label_signature``
            and ``stratification_group`` columns.
        """
        print("\n[4/5] Creating stratification groups (patient-level)...")

        label_cols = [p for p in self.PATHOLOGIES if p in self.df.columns]

        # Group by patient and aggregate labels
        aggregations = {col: 'max' for col in label_cols}
        aggregations['study_id'] = 'first'  # Keep one study_id for reference
        for extra in ('Sex', 'Age'):
            # Demographics are carried along only when the CSV provides them.
            if extra in self.df.columns:
                aggregations[extra] = 'first'
        patient_groups = self.df.groupby('patient_id').agg(aggregations).reset_index()

        # Label signature: one integer per pathology, concatenated in PATHOLOGIES order.
        label_matrix = patient_groups[label_cols].to_numpy()
        patient_groups['label_signature'] = [
            ''.join(str(int(value)) for value in row) for row in label_matrix
        ]

        # For rare combinations, group them together
        signatures = patient_groups['label_signature']
        signature_counts = signatures.value_counts()
        rare_threshold = max(5, int(len(patient_groups) * 0.001))  # At least 5 or 0.1%
        patients_per_signature = signatures.map(signature_counts)
        patient_groups['stratification_group'] = signatures.where(
            patients_per_signature >= rare_threshold, self._RARE_GROUP
        )

        # Print distribution statistics
        n_total = len(patient_groups)
        print(f"\n   Patient-level label distribution:")
        for pathology in label_cols:
            positive_count = (patient_groups[pathology] == 1).sum()
            percentage = positive_count / n_total * 100 if n_total else 0.0
            print(f"   {pathology:30s}: {positive_count:5d} ({percentage:5.2f}%)")

        unique_groups = patient_groups['stratification_group'].nunique()
        print(f"\n   Unique stratification groups: {unique_groups}")
        print(f"   Rare combinations grouped: {(patient_groups['stratification_group'] == self._RARE_GROUP).sum()}")

        return patient_groups

    @staticmethod
    def _consolidate_small_strata(labels):
        """
        Return stratification labels in which every stratum has >= 2 members.

        Single-member strata are pooled into the rare group; if that pool still
        has a single member it is folded into the largest other stratum. Labels
        that already satisfy the condition are returned unchanged.
        """
        rare = CheXpertDataSplitter._RARE_GROUP
        series = pd.Series(labels, dtype=object)
        counts = series.value_counts()
        singles = counts.index[counts < 2]
        if len(singles) == 0:
            return series.to_numpy()

        series = series.where(~series.isin(singles), rare)
        counts = series.value_counts()
        if counts.get(rare, 0) == 1 and len(counts) > 1:
            largest = counts.drop(rare).idxmax()
            series = series.where(series != rare, largest)
        return series.to_numpy()

    def _stratified_split(self, ids, labels, test_size):
        """
        Split ``ids`` (and their ``labels``) with stratification where possible.

        Returns ``(ids_a, ids_b, labels_a, labels_b)``. If stratification is
        rejected, a warning is printed and a plain random split is used instead.
        """
        labels = self._consolidate_small_strata(labels)
        try:
            return train_test_split(
                ids, labels,
                test_size=test_size,
                stratify=labels,
                random_state=self.random_state
            )
        except ValueError as err:
            print(f"   ⚠ Stratified split not possible ({err}); using a random patient-level split")
        return train_test_split(
            ids, labels,
            test_size=test_size,
            random_state=self.random_state
        )

    def perform_split(self, patient_groups):
        """
        Perform stratified train-validation-test split at patient level.

        Args:
            patient_groups: Per-patient frame with ``patient_id`` and
                ``stratification_group`` columns.

        Returns:
            ``(train_df, val_df, test_df)`` - image-level copies of ``self.df``.
            A split whose size is 0 is returned empty.

        Raises:
            ValueError: If a patient ends up in more than one split.
        """
        print("\n[5/5] Performing stratified patient-level split...")

        patient_ids = patient_groups['patient_id'].to_numpy()
        strata = patient_groups['stratification_group'].to_numpy()
        holdout_fraction = self.val_size + self.test_size

        if holdout_fraction <= 0:
            # Nothing is held out: everything is training data.
            train_patients = patient_ids
            val_patients = patient_ids[:0]
            test_patients = patient_ids[:0]
        else:
            # ---- train / (val+test) ----
            train_patients, holdout_patients, _, holdout_strata = self._stratified_split(
                patient_ids, strata, holdout_fraction
            )
            # ---- val / test from the remaining pool ----
            if self.test_size <= 0:
                val_patients, test_patients = holdout_patients, holdout_patients[:0]
            elif self.val_size <= 0:
                val_patients, test_patients = holdout_patients[:0], holdout_patients
            else:
                val_patients, test_patients, _, _ = self._stratified_split(
                    holdout_patients,
                    holdout_strata,
                    self.test_size / holdout_fraction  # proportion of the val+test pool
                )

        print(f"   Train patients: {len(train_patients)}")
        print(f"   Val   patients: {len(val_patients)}")
        print(f"   Test  patients: {len(test_patients)}")

        # Split the full dataframe
        train_df = self.df[self.df['patient_id'].isin(train_patients)].copy()
        val_df = self.df[self.df['patient_id'].isin(val_patients)].copy()
        test_df = self.df[self.df['patient_id'].isin(test_patients)].copy()

        # ---- leakage check (train vs val vs test) ----
        named_splits = [('train', train_df), ('val', val_df), ('test', test_df)]
        for position, (name_a, df_a) in enumerate(named_splits):
            ids_a = set(df_a['patient_id'])
            for name_b, df_b in named_splits[position + 1:]:
                overlap = ids_a.intersection(df_b['patient_id'])
                if overlap:
                    raise ValueError(f"Data leakage between {name_a} and {name_b}: {len(overlap)} patients overlap")
        print("\n   No patient overlap – leakage prevented!")

        return train_df, val_df, test_df

    def run(self, output_dir='.', save_test=True):
        """
        Run the full pipeline: load, stratify, split, verify and save.

        Args:
            output_dir: Directory that receives the CSV files.
            save_test: When false, the test split is not written.

        Returns:
            ``(train_path, val_path, test_path)``; ``test_path`` is ``None``
            when ``save_test`` is false.
        """
        self.load_and_preprocess()
        patient_groups = self.create_stratification_groups()
        train_df, val_df, test_df = self.perform_split(patient_groups)

        self.verify_split_quality(train_df, val_df)
        print("\n--- Train vs Test distribution check ---")
        self.verify_split_quality(train_df, test_df, val_name='Test')

        train_path, val_path = self.save_splits(train_df, val_df, output_dir)
        test_path = self.save_test_split(test_df, output_dir) if save_test else None

        print("\n" + "=" * 80)
        print("Split Complete! (train / val / test)")
        print("=" * 80)
        return train_path, val_path, test_path

    def _write_split(self, split_df, output_dir, filename):
        """Write one split to ``output_dir/filename`` without the helper columns.

        Returns ``(path, number_of_rows_written)``.
        """
        target_dir = Path(output_dir)
        # parents=True so a nested, not-yet-existing output directory works.
        target_dir.mkdir(parents=True, exist_ok=True)
        target = target_dir / filename

        cleaned = split_df.drop(columns=self._TEMP_COLUMNS, errors='ignore')
        cleaned.to_csv(target, index=False)
        return target, len(cleaned)

    def save_test_split(self, test_df, output_dir):
        """Save the test split as ``test_ready.csv`` and return its path."""
        test_path, n_images = self._write_split(test_df, output_dir, 'test_ready.csv')
        print(f"Test set : {test_path} ({n_images} images)")
        return test_path

    def verify_split_quality(self, train_df, val_df, val_name='Val'):
        """
        Verify the quality of the split by comparing label distributions.

        Prints the positive rate of every pathology in both frames, the largest
        gap between them, and a class-imbalance summary of the training frame.

        Args:
            train_df: Training images.
            val_df: Images of the split compared against training.
            val_name: Display name of that split in the report (default 'Val').
        """
        print("\n" + "=" * 80)
        print("Split Quality Verification")
        print("=" * 80)

        if len(train_df) == 0 or len(val_df) == 0:
            # Rates are undefined for an empty frame; say so instead of printing nan.
            print(f"\n⚠ Warning: cannot compare label distributions - the train or {val_name} split is empty")
        else:
            self._report_distribution_gap(train_df, val_df, val_name)

        self._report_class_imbalance(train_df)

    def _report_distribution_gap(self, train_df, val_df, val_name):
        """Print per-pathology positive rates of both frames and their largest gap."""
        val_header = f"{val_name} %"
        print(f"\n{'Pathology':<30s} {'Train %':>10s} {val_header:>10s} {'Difference':>12s}")
        print("-" * 80)

        max_diff = 0.0
        for pathology in self.PATHOLOGIES:
            if pathology not in train_df.columns:
                continue
            train_pos = (train_df[pathology] == 1).sum() / len(train_df) * 100
            val_pos = (val_df[pathology] == 1).sum() / len(val_df) * 100
            diff = abs(train_pos - val_pos)
            max_diff = max(max_diff, diff)
            print(f"{pathology:<30s} {train_pos:>9.2f}% {val_pos:>9.2f}% {diff:>11.2f}%")

        print("-" * 80)
        print(f"Maximum distribution difference: {max_diff:.2f}%")

        if max_diff < 2.0:
            print("✓ Excellent stratification (< 2% difference)")
        elif max_diff < 5.0:
            print("✓ Good stratification (< 5% difference)")
        else:
            print("⚠ Warning: Large distribution differences detected")

    def _report_class_imbalance(self, train_df):
        """Print the negative:positive ratio of every pathology in ``train_df``."""
        print("\n" + "=" * 80)
        print("Class Imbalance Analysis (Train Set)")
        print("=" * 80)

        imbalance_ratios = []
        for pathology in self.PATHOLOGIES:
            if pathology not in train_df.columns:
                continue
            pos = (train_df[pathology] == 1).sum()
            neg = (train_df[pathology] == 0).sum()
            if pos > 0:
                ratio = neg / pos
                imbalance_ratios.append(ratio)
                severity = "Low" if ratio < 5 else "Medium" if ratio < 20 else "High"
                print(f"{pathology:<30s} Ratio: {ratio:>6.2f}:1 [{severity:>6s} imbalance]")

        if imbalance_ratios:
            print(f"\nAverage imbalance ratio: {np.mean(imbalance_ratios):.2f}:1")
        else:
            # No pathology has a positive sample, so there is nothing to average.
            print("\nAverage imbalance ratio: n/a (no positive samples)")

    def save_splits(self, train_df, val_df, output_dir='.'):
        """
        Save train and validation splits to CSV files.

        Returns:
            ``(train_path, val_path)`` pointing at ``train_ready.csv`` and
            ``val_ready.csv`` inside ``output_dir``.
        """
        train_path, n_train = self._write_split(train_df, output_dir, 'train_ready.csv')
        val_path, n_val = self._write_split(val_df, output_dir, 'val_ready.csv')

        print("\n" + "=" * 80)
        print("Files Saved Successfully")
        print("=" * 80)
        print(f"Train set: {train_path} ({n_train} images)")
        print(f"Val set:   {val_path} ({n_val} images)")

        return train_path, val_path

# Main execution
if __name__ == "__main__":
    root = "/content/drive/MyDrive"
    # Configuration
    DATASET_DIR = os.path.join(root, "CheXpert-v1.0-small")
    CHEXPERT_CSV = os.path.join(DATASET_DIR, "train.csv")  # Adjust path as needed
    OUTPUT_DIR = DATASET_DIR
    VAL_SIZE = 0.15
    TEST_SIZE = VAL_SIZE  # test split uses the same proportion as validation
    RANDOM_STATE = 42
    USE_FRONTAL_ONLY = True
    FILL_UNCERTAIN = 'zeros'  # Options: 'zeros', 'ones', 'ignore'

    # Create splitter
    splitter = CheXpertDataSplitter(
        csv_path=CHEXPERT_CSV,
        val_size=VAL_SIZE,
        test_size=TEST_SIZE,
        random_state=RANDOM_STATE,
        use_frontal_only=USE_FRONTAL_ONLY,
        fill_uncertain=FILL_UNCERTAIN,
        root=OUTPUT_DIR
    )

    # Reuse previously written splits only when all three files are present;
    # otherwise the reported test path could point at a file that does not exist.
    ready_paths = [
        os.path.join(OUTPUT_DIR, filename)
        for filename in ("train_ready.csv", "val_ready.csv", "test_ready.csv")
    ]
    if all(os.path.exists(path) for path in ready_paths):
        train_path, val_path, test_path = ready_paths
    else:
        # Run the split
        train_path, val_path, test_path = splitter.run(output_dir=OUTPUT_DIR)

    print("\nYou can now use these files with your CheXpertDataset class:")
    print(f"  train_dataset = CheXpertDataset('{train_path}', root_dir='...', augment=True)")
    print(f"  val_dataset = CheXpertDataset('{val_path}', root_dir='...', augment=False)")
    print(f"  test_dataset = CheXpertDataset('{test_path}', root_dir='...', augment=False)")