File size: 29,320 Bytes

7f61943

# datasets.py

import torch
from torch.utils.data import Dataset
import pandas as pd
import numpy as np
import os
import pyfaidx 
import kipoiseq.transforms.functional 
from rdkit import Chem
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.Chem import rdFingerprintGenerator

# --- Global Config ---
# Enformer typically uses a 196,608 bp input sequence.
# The default here is a quarter of that (196,608 / 4 = 49,152 bp) to speed up training.
ENFORMER_INPUT_SEQ_LENGTH = 49_152 

# Default input locations, written relative to the project root directory
# (relative paths resolve against the current working directory).
GENOME_FASTA_PATH = "data/hg38.fa"
TSS_REGIONS_CSV_PATH = "data/Enformer_genomic_regions_TSSCenteredGenes_FixedOverlapRemoval.csv"

# Path to pseudobulk target data, matching the provided dummy file
PSEUDOBULK_TARGET_DATA_PATH = "data/pseudobulk_dummy.csv"

# ----------------------


class GenomeOneHotEncoder:
    """
    Thin wrapper that turns DNA strings into float32 one-hot arrays via kipoiseq.

    `sequence_length` is recorded on the instance for reference only; nothing
    in this class checks incoming sequences against it.
    """

    def __init__(self, sequence_length: int = ENFORMER_INPUT_SEQ_LENGTH):
        # Stored, not enforced: callers are expected to pass correctly sized input.
        self.sequence_length = sequence_length

    @staticmethod
    def _one_hot_encode(sequence: str) -> np.ndarray:
        """Run kipoiseq's `one_hot_dna` on `sequence` and cast the result to float32."""
        # Per the original author's note, this is the same encoding routine used
        # by the original Enformer code, which keeps the input representation
        # consistent with what a pretrained Enformer has already learned.
        encoded = kipoiseq.transforms.functional.one_hot_dna(sequence)
        return encoded.astype(np.float32)

    def encode(self, seq: str) -> np.ndarray:
        """
        One-hot encode a DNA sequence.

        Args:
            seq (str): DNA sequence string. The caller (normally FastaReader) is
                       expected to supply it uppercase and already at the desired
                       length; no validation happens here.

        Returns:
            np.ndarray: float32 one-hot array (typically shaped (L, 4) for DNA).
        """
        one_hot = GenomeOneHotEncoder._one_hot_encode(seq)
        return one_hot


class FastaReader:
    """
    Reads sequences from a FASTA file using pyfaidx.

    Requests that extend past either end of a chromosome are padded with 'N'
    so the returned string always has exactly the requested length.
    """

    def __init__(self, fasta_path: str):
        """
        Open the FASTA file with pyfaidx.

        Args:
            fasta_path (str): Path to the genome FASTA file.

        Raises:
            pyfaidx.FastaIndexingError: If pyfaidx cannot index the file
                (a diagnostic is printed, then the error is re-raised).
            FileNotFoundError: If the file is reported missing
                (a diagnostic is printed, then the error is re-raised).
        """
        self.fasta_path = fasta_path
        self.genome = None
        try:
            # sequence_always_upper=True asks pyfaidx for uppercase bases.
            self.genome = pyfaidx.Fasta(self.fasta_path, sequence_always_upper=True)
            print(f"Successfully loaded and indexed genome using pyfaidx from: {self.fasta_path}")
        except pyfaidx.FastaIndexingError as e:
            print(f"Error: Could not index FASTA file at {self.fasta_path}.")
            print("Ensure it's a valid FASTA file and the .fai index can be created/read in its directory.")
            print(f"pyfaidx error: {e}")
            raise
        except FileNotFoundError:
            print(f"Error: FASTA file not found at {self.fasta_path}.")
            raise

    def _resolve_chrom_name(self, chrom: str) -> str:
        """Return the name under which `chrom` exists in the FASTA, trying the 'chr'-toggled form as a fallback."""
        if chrom in self.genome:
            return chrom

        # Try the other naming convention ('1' <-> 'chr1').
        if chrom.startswith('chr'):
            alternative_chrom_name = chrom.replace('chr', '', 1)
        else:
            alternative_chrom_name = 'chr' + chrom
        if alternative_chrom_name in self.genome:
            return alternative_chrom_name

        available_chroms_sample = list(self.genome.keys())[:5]
        raise ValueError(
            f"Chromosome '{chrom}' (and alternative '{alternative_chrom_name}') not found in FASTA file. "
            f"Available chromosomes sample: {available_chroms_sample}..."
        )

    def get_sequence(self, chrom: str, start_0based: int, end_0based_exclusive: int) -> str:
        """
        Fetches a DNA sequence for the given 0-based genomic interval.
        Pads with 'N' wherever the interval lies outside the chromosome,
        including intervals that fall entirely before position 0 or entirely
        past the chromosome end (those come back as all 'N').

        Args:
            chrom (str): Chromosome name (e.g., 'chr1'); the 'chr'-prefixed or
                         un-prefixed variant is tried if the name is not found.
            start_0based (int): 0-based start coordinate (inclusive).
            end_0based_exclusive (int): 0-based end coordinate (exclusive).

        Returns:
            str: The DNA sequence, padded with 'N's to match the requested length
                 (end_0based_exclusive - start_0based).

        Raises:
            RuntimeError: If the genome was never loaded, or if the assembled
                sequence length does not equal the requested length (this is
                also what happens when end < start).
            ValueError: If neither `chrom` nor its alternative name is in the FASTA.
        """
        if self.genome is None:
            raise RuntimeError("FastaReader not properly initialized (pyfaidx missing or genome loading failed).")

        true_chrom_name = self._resolve_chrom_name(chrom)
        chrom_len = len(self.genome[true_chrom_name])
        seq_len_requested = end_0based_exclusive - start_0based

        sequence_parts = []

        # Leading padding: bases requested to the left of position 0. Capped at
        # the requested length so that a window lying entirely before the
        # chromosome start yields exactly `seq_len_requested` Ns instead of
        # overshooting and tripping the length check below.
        padding_start_len = max(0, min(-start_0based, seq_len_requested))
        if padding_start_len > 0:
            sequence_parts.append('N' * padding_start_len)

        # Portion of the request that actually overlaps the chromosome.
        fetch_start = max(start_0based, 0)
        fetch_end = min(end_0based_exclusive, chrom_len)
        if fetch_end > fetch_start:
            sequence_parts.append(self.genome[true_chrom_name][fetch_start:fetch_end].seq)

        # Trailing padding: whatever is still missing after the real bases.
        current_len = sum(len(p) for p in sequence_parts)
        padding_end_len = seq_len_requested - current_len
        if padding_end_len > 0:
            sequence_parts.append('N' * padding_end_len)

        final_sequence = "".join(sequence_parts)

        # Sanity check; reachable for malformed requests such as end < start.
        if len(final_sequence) != seq_len_requested:
            raise RuntimeError(
                f"Internal error: Final sequence length {len(final_sequence)} for {true_chrom_name}:{start_0based}-{end_0based_exclusive} "
                f"does not match requested {seq_len_requested}."
            )
        return final_sequence


# --- Main Dataset Classes ---

class TahoeDataset(Dataset):
    """
    PyTorch Dataset for loading Tahoe data for Enformer fine-tuning.
    - Reads genomic regions from a regions CSV.
    - Reads pseudobulk conditions and expression values from a pseudobulk CSV.
    - Inner-joins the two tables on their gene identifier columns, so each
      sample (row of `samples_df`) is one gene-condition pair.
    - Fetches the DNA sequence for the gene, resized to `enformer_input_seq_length`.
    - One-hot encodes the sequence and returns it with that row's expression value.
    """
    # Window length the regions CSV is assumed to describe; used to locate the
    # window centre when a shorter/longer model input is requested.
    ORIGINAL_ENFORMER_WINDOW_SIZE = 196_608

    def __init__(self,
                 tss_regions_csv_path: str,
                 genome_fasta_path: str,
                 pseudobulk_data_path: str,
                 enformer_input_seq_length: int = ENFORMER_INPUT_SEQ_LENGTH,
                 regions_csv_gene_col: str = 'gene_name',        # Gene ID column in tss_regions_csv
                 pseudobulk_csv_gene_col: str = 'gene_id',     # Gene ID column in pseudobulk_data_csv
                 regions_csv_chr_col: str = 'seqnames',      # Chromosome column in tss_regions_csv
                 regions_csv_start_col: str = 'starts',        # 0-based start col in tss_regions_csv
                 regions_csv_end_col: str = 'ends'):           # 0-based exclusive end col in tss_regions_csv
        """
        Load both CSVs, merge them on gene ID, and open the genome FASTA.

        Args:
            tss_regions_csv_path: CSV of genomic regions; must contain the
                gene, chromosome, start and end columns named below.
            genome_fasta_path: FASTA file handed to FastaReader.
            pseudobulk_data_path: CSV with the pseudobulk gene column plus
                'cell_line', 'drug_id', 'drug_dose' and 'expression'.
            enformer_input_seq_length: Length (bp) of the sequence returned per sample.
            regions_csv_gene_col / pseudobulk_csv_gene_col: Join keys.
            regions_csv_chr_col / regions_csv_start_col / regions_csv_end_col:
                Coordinate columns in the regions CSV.

        Raises:
            FileNotFoundError: If either CSV is missing (logged, re-raised).
            ValueError: If a required column is absent from either CSV.
            Exception: Any other load error is logged and re-raised unchanged;
                FastaReader errors propagate as well.
        """
        super().__init__()

        self.enformer_input_seq_length = enformer_input_seq_length
        # Store column names for clarity
        self.regions_gene_col = regions_csv_gene_col
        self.pseudobulk_gene_col = pseudobulk_csv_gene_col
        self.regions_chr_col = regions_csv_chr_col
        self.regions_start_col = regions_csv_start_col
        self.regions_end_col = regions_csv_end_col

        print("Initializing TahoeDataset...")
        print(f"  Target model input sequence length: {self.enformer_input_seq_length} bp")
        print(f"  Genomic regions are assumed to define a {self.ORIGINAL_ENFORMER_WINDOW_SIZE} bp window for centering.")

        # Load genomic regions data
        print(f"  Loading TSS regions from: {tss_regions_csv_path}")
        regions_df = self._read_validated_csv(
            tss_regions_csv_path,
            [self.regions_chr_col, self.regions_gene_col,
             self.regions_start_col, self.regions_end_col],
            kind="regions",
            entry_desc="gene region",
        )

        # Load pseudobulk target data
        print(f"  Loading pseudobulk targets from: {pseudobulk_data_path}")
        pseudobulk_df = self._read_validated_csv(
            pseudobulk_data_path,
            [self.pseudobulk_gene_col, 'cell_line', 'drug_id', 'drug_dose', 'expression'],
            kind="pseudobulk",
            entry_desc="condition",
        )

        # Merge regions with pseudobulk data
        print("  Merging genomic regions with pseudobulk target data...")
        print(f"    Regions gene column: '{self.regions_gene_col}', Pseudobulk gene column: '{self.pseudobulk_gene_col}'")

        # Cast both keys to str so e.g. numeric IDs in one file still match the other.
        regions_df[self.regions_gene_col] = regions_df[self.regions_gene_col].astype(str)
        pseudobulk_df[self.pseudobulk_gene_col] = pseudobulk_df[self.pseudobulk_gene_col].astype(str)

        self.samples_df = pd.merge(
            regions_df,
            pseudobulk_df,
            left_on=self.regions_gene_col,
            right_on=self.pseudobulk_gene_col,
            how='inner'  # Keeps only genes present in both DataFrames
        )

        self._report_merge_outcome(regions_df, pseudobulk_df)

        if 'expression' in self.samples_df and self.samples_df['expression'].isnull().any():
            print("WARNING: NA values found in 'expression' column after merge. These samples might cause errors or yield NaN targets.")
            print("         Consider handling these (e.g., fill with a default or drop rows with dropna(subset=['expression'])).")

        print(f"  Initializing FASTA reader for genome: {genome_fasta_path}")
        self.fasta_reader = FastaReader(genome_fasta_path)
        self.encoder = GenomeOneHotEncoder(sequence_length=self.enformer_input_seq_length)
        print("TahoeDataset initialized successfully.")

    @staticmethod
    def _read_validated_csv(csv_path: str, required_cols: list, kind: str, entry_desc: str) -> pd.DataFrame:
        """Read `csv_path`, check that `required_cols` are present, log and re-raise on any failure."""
        try:
            df = pd.read_csv(csv_path)
            print(f"    Successfully loaded {kind} CSV with {len(df)} {entry_desc} entries.")
            missing_cols = [col for col in required_cols if col not in df.columns]
            if missing_cols:
                raise ValueError(f"Missing columns in {kind} CSV ('{csv_path}'): {missing_cols}. Expected: {required_cols}")
        except FileNotFoundError:
            print(f"FATAL ERROR: {kind.capitalize()} CSV file not found at {csv_path}")
            raise
        except Exception as e:
            # Also reached by the ValueError above, so the missing-column case is logged too.
            print(f"FATAL ERROR loading or validating {kind} CSV: {e}")
            raise
        return df

    def _report_merge_outcome(self, regions_df: pd.DataFrame, pseudobulk_df: pd.DataFrame) -> None:
        """Print how many samples the merge produced and which gene IDs each side lost."""
        if len(self.samples_df) == 0:
            print("WARNING: The merge operation resulted in an empty DataFrame.")
            print(f"  No common genes found between column '{self.regions_gene_col}' in regions CSV ")
            print(f"  and column '{self.pseudobulk_gene_col}' in pseudobulk CSV.")
            print("  Please check that gene identifiers match and are of the same type in both files.")
            # Example gene IDs for debugging:
            if not regions_df.empty:
                print(f"  Sample gene IDs from regions CSV: {regions_df[self.regions_gene_col].unique()[:5].tolist()}")
            if not pseudobulk_df.empty:
                print(f"  Sample gene IDs from pseudobulk CSV: {pseudobulk_df[self.pseudobulk_gene_col].unique()[:5].tolist()}")
            return

        print(f"    Successfully merged data: {len(self.samples_df)} total samples (gene-condition pairs).")

        # Genes in the regions CSV that found no partner in the pseudobulk CSV.
        original_region_genes = set(regions_df[self.regions_gene_col].unique())
        merged_region_genes = set(self.samples_df[self.regions_gene_col].unique())
        dropped_region_genes = original_region_genes - merged_region_genes
        if dropped_region_genes:
            print(f"    WARNING: {len(dropped_region_genes)} unique gene IDs from the regions CSV ('{self.regions_gene_col}') were not found in the pseudobulk CSV ('{self.pseudobulk_gene_col}') and were dropped.")
            print(f"      Examples of dropped region gene IDs: {list(dropped_region_genes)[:5]}")

        # Genes in the pseudobulk CSV that found no partner in the regions CSV.
        original_pseudobulk_genes = set(pseudobulk_df[self.pseudobulk_gene_col].unique())
        merged_pseudobulk_genes = set(self.samples_df[self.pseudobulk_gene_col].unique())
        dropped_pseudobulk_genes = original_pseudobulk_genes - merged_pseudobulk_genes
        if dropped_pseudobulk_genes:
            print(f"    WARNING: {len(dropped_pseudobulk_genes)} unique gene IDs from the pseudobulk CSV ('{self.pseudobulk_gene_col}') were not found in the regions CSV ('{self.regions_gene_col}') and were dropped.")
            print(f"      Examples of dropped pseudobulk gene IDs: {list(dropped_pseudobulk_genes)[:5]}")

    def __len__(self):
        """Number of gene-condition samples."""
        return len(self.samples_df)

    def __getitem__(self, idx: int):
        """
        Build one training example.

        Args:
            idx: Row position in `samples_df` (a tensor index is converted
                with `.tolist()` first). Must satisfy 0 <= idx < len(self).

        Returns:
            tuple: (one-hot sequence as a float32 tensor, target as a float32
            tensor of shape (1,) holding the row's 'expression' value).

        Raises:
            IndexError: If `idx` is outside [0, len(self)).
            KeyError / ValueError / Exception: Row-parsing failures are logged
                and re-raised unchanged.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        if not (0 <= idx < len(self.samples_df)):
            raise IndexError(f"Index {idx} out of bounds for dataset of length {len(self.samples_df)}")

        sample_info = self.samples_df.iloc[idx]

        # Bound up front so the error handlers below can always format their
        # message, even when the failure happens before the gene name is read.
        gene_name_for_logging = "<unknown>"
        try:
            chrom = str(sample_info[self.regions_chr_col])
            # Gene name from the regions CSV (used for merge, should be consistent)
            gene_name_for_logging = str(sample_info[self.regions_gene_col])

            csv_region_start = int(sample_info[self.regions_start_col])
            csv_region_end = int(sample_info[self.regions_end_col])

            expression_value = float(sample_info['expression'])  # 'expression' is the target column
        except KeyError as e:
            print(f"FATAL ERROR in __getitem__ (idx {idx}): Missing expected column {e} in merged samples_df.")
            print(f"  Available columns: {self.samples_df.columns.tolist()}")
            print(f"  Sample info for this index: {sample_info.to_dict() if isinstance(sample_info, pd.Series) else sample_info}")
            raise
        except ValueError as e:
            print(f"FATAL ERROR in __getitem__ (idx {idx}): Could not convert data for gene {gene_name_for_logging}. Error: {e}")
            print(f"  Expression value was: '{sample_info.get('expression', 'N/A')}'")
            raise
        except Exception as e:  # Catch any other unexpected error for this item
            print(f"FATAL ERROR in __getitem__ (idx {idx}) for gene {gene_name_for_logging}: An unexpected error occurred: {type(e).__name__} - {e}")
            raise

        # --- Sequence window calculation ---
        actual_csv_window_len = csv_region_end - csv_region_start
        if actual_csv_window_len != self.ORIGINAL_ENFORMER_WINDOW_SIZE:
            # Warn (but continue) when the CSV region is not the assumed 196,608 bp;
            # the centring below still anchors on csv_region_end.
            print(f"WARNING for gene {gene_name_for_logging} (idx {idx}): Region {chrom}:{csv_region_start}-{csv_region_end} from CSV "
                  f"has length {actual_csv_window_len}bp, but expected {self.ORIGINAL_ENFORMER_WINDOW_SIZE}bp "
                  f"for the original window definition used for centering. Sequence extraction might be affected if assumptions are wrong.")

        # Default to the CSV coordinates; used as-is when no resizing is needed.
        final_seq_start_0based = csv_region_start
        final_seq_end_0based_exclusive = csv_region_end

        if self.enformer_input_seq_length != self.ORIGINAL_ENFORMER_WINDOW_SIZE:
            # Centre of the original window, assuming csv_region_end is that
            # window's exclusive end.
            original_window_center = csv_region_end - (self.ORIGINAL_ENFORMER_WINDOW_SIZE // 2)

            half_target_seq_len = self.enformer_input_seq_length // 2
            final_seq_start_0based = original_window_center - half_target_seq_len
            # start + length (rather than centre + half) keeps the exact target
            # length even when it is odd.
            final_seq_end_0based_exclusive = final_seq_start_0based + self.enformer_input_seq_length

        # Fetch and encode DNA sequence
        dna_sequence = self.fasta_reader.get_sequence(chrom, final_seq_start_0based, final_seq_end_0based_exclusive)
        one_hot_sequence = self.encoder.encode(dna_sequence)
        one_hot_sequence_tensor = torch.tensor(one_hot_sequence, dtype=torch.float32)

        # Target is the specific expression value for this gene-condition pair
        target_tensor = torch.tensor([expression_value], dtype=torch.float32)

        return one_hot_sequence_tensor, target_tensor


# --- Extended Dataset for SMILES ---
class TahoeSMILESDataset(Dataset):
    """
    Gene-condition dataset that, alongside the one-hot DNA sequence, returns:
        - a Morgan fingerprint for the drug (from its SMILES in the drug metadata)
        - the drug dose
        - the target expression
        - identifying metadata (gene, drug, cell line, chromosome, window coordinates)

    Despite the class history it subclasses `Dataset` directly, not TahoeDataset.
    """
    def __init__(self,
                 regions_csv_path: str, # Renamed from tss_regions_csv_path for clarity with config
                 pbulk_parquet_path: str, # Renamed from pseudobulk_data_path for clarity with config
                 drug_meta_csv_path: str, # Renamed from drug_metadata_path for clarity with config
                 fasta_file_path: str,    # Renamed from genome_fasta_path for clarity with config
                 enformer_input_seq_length: int = ENFORMER_INPUT_SEQ_LENGTH,
                 # Morgan fingerprint parameters (from data_config)
                 morgan_fp_radius: int = 2,
                 morgan_fp_nbits: int = 2048,
                 # Column names from regions_csv (from data_config)
                 regions_gene_col: str    = 'gene_name',
                 regions_chr_col: str     = 'seqnames',
                 regions_start_col: str   = 'starts',
                 regions_end_col: str     = 'ends',
                 # Column names from pbulk_parquet (from data_config)
                 pbulk_gene_col: str      = 'gene_id',
                 pbulk_drug_col: str      = 'drug_id',
                 pbulk_dose_col: str      = 'drug_dose',
                 pbulk_expr_col: str      = 'expression',
                 pbulk_cell_line_col: str = 'cell_line',
                 # Column names from drug_meta_csv (from data_config)
                 drug_meta_id_col: str    = 'drug',
                 drug_meta_smiles_col: str = 'canonical_smiles',
                 filter_drugs_by_ids: list = None, # Added from dataset_args
                 regions_strand_col: str = None,    # Added from dataset_args, though not used in current __getitem__
                 regions_set_col: str = 'set',      # New: Name of the column in regions_csv for data splitting
                 target_set: str = None             # New: Specific set to load (e.g., "train", "valid", "test")
                 ):
        """
        Load regions (CSV), pseudobulk targets (Parquet) and drug metadata (CSV),
        merge regions with pseudobulk on gene ID, apply the optional drug and
        split filters, and open the genome FASTA.

        Args:
            regions_csv_path: CSV of genomic regions.
            pbulk_parquet_path: Parquet file of pseudobulk measurements.
            drug_meta_csv_path: CSV mapping drug IDs to SMILES strings.
            fasta_file_path: Genome FASTA handed to FastaReader.
            enformer_input_seq_length: Length (bp) of the sequence returned per sample.
            morgan_fp_radius / morgan_fp_nbits: Morgan fingerprint radius and size.
            regions_* / pbulk_* / drug_meta_*: Column names in the respective tables.
            filter_drugs_by_ids: If truthy, keep only rows whose drug ID is in this list.
            regions_strand_col: Accepted for config compatibility; not used.
            regions_set_col: Column holding the split label.
            target_set: If truthy, keep only rows whose split label equals this value.

        Raises:
            FileNotFoundError: If any input table is missing (logged, re-raised).
            ValueError: If the drug metadata lacks the SMILES or ID column.
            Exception: Other load errors are logged and re-raised unchanged.
        """
        super().__init__()

        # store config
        self.seq_len = enformer_input_seq_length
        self.morgan_fp_radius = morgan_fp_radius
        self.morgan_fp_nbits = morgan_fp_nbits

        self.regions_gene_col    = regions_gene_col
        self.regions_chr_col     = regions_chr_col
        self.regions_start_col   = regions_start_col
        self.regions_end_col     = regions_end_col
        self.regions_set_col     = regions_set_col # Name of the split column

        self.pbulk_gene_col      = pbulk_gene_col
        self.pbulk_drug_col      = pbulk_drug_col
        self.pbulk_dose_col      = pbulk_dose_col
        self.pbulk_expr_col      = pbulk_expr_col
        self.pbulk_cell_line_col = pbulk_cell_line_col

        self.drug_meta_id_col    = drug_meta_id_col
        self.drug_meta_smiles_col= drug_meta_smiles_col

        self.target_set          = target_set # Split value this instance is restricted to

        # One fingerprint generator per dataset instance, reused for every sample.
        self._morgan_gen = rdFingerprintGenerator.GetMorganGenerator(
            radius=self.morgan_fp_radius,
            fpSize=self.morgan_fp_nbits
        )

        # load & merge regions + pseudobulk
        print(f"  Loading TSS regions from: {regions_csv_path}")
        try:
            regs = pd.read_csv(regions_csv_path)
            print(f"    Successfully loaded regions CSV with {len(regs)} gene region entries.")
        except FileNotFoundError:
            print(f"FATAL ERROR: Regions CSV file not found at {regions_csv_path}")
            raise
        except Exception as e:
            print(f"FATAL ERROR loading regions CSV: {e}")
            raise

        print(f"  Loading pseudobulk targets from: {pbulk_parquet_path} (expected Parquet format)")
        try:
            pb = pd.read_parquet(pbulk_parquet_path)
            print(f"    Successfully loaded pseudobulk Parquet file with {len(pb)} entries.")
        except FileNotFoundError:
            print(f"FATAL ERROR: Pseudobulk Parquet file not found at {pbulk_parquet_path}")
            raise
        except Exception as e:
            print(f"FATAL ERROR loading or parsing pseudobulk Parquet file: {e}")
            print("  Ensure the file is a valid Parquet file and you have a Parquet engine like 'pyarrow' or 'fastparquet' installed.")
            raise

        # Ensure gene ID columns are strings for merging
        regs[self.regions_gene_col] = regs[self.regions_gene_col].astype(str)
        pb[self.pbulk_gene_col]     = pb[self.pbulk_gene_col].astype(str)

        print("  Merging genomic regions with pseudobulk target data...")
        print(f"    Regions gene column: '{self.regions_gene_col}', Pseudobulk gene column: '{self.pbulk_gene_col}'")
        self.samples_df = regs.merge(
            pb,
            left_on  = self.regions_gene_col,
            right_on = self.pbulk_gene_col,
            how      = 'inner'
        )

        # Optional drug filter (skipped silently if the drug column is absent).
        if filter_drugs_by_ids and self.pbulk_drug_col in self.samples_df.columns:
            print(f"    Filtering samples to include only drugs: {filter_drugs_by_ids}")
            initial_count = len(self.samples_df)
            self.samples_df = self.samples_df[self.samples_df[self.pbulk_drug_col].isin(filter_drugs_by_ids)]
            print(f"    Retained {len(self.samples_df)} samples after drug filtering (from {initial_count}).")
            if len(self.samples_df) == 0:
                print("WARNING: No samples remaining after filtering by drug IDs. Check your filter_drugs_by_ids list and drug IDs in pbulk data.")

        # Filter by target_set if specified
        if self.target_set:
            if self.regions_set_col in self.samples_df.columns:
                print(f"    Filtering samples for set: '{self.target_set}' using column '{self.regions_set_col}'.")
                initial_count_set_filter = len(self.samples_df)
                self.samples_df = self.samples_df[self.samples_df[self.regions_set_col] == self.target_set].copy()
                print(f"    Retained {len(self.samples_df)} samples after filtering for set '{self.target_set}' (from {initial_count_set_filter}).")
                if len(self.samples_df) == 0:
                    print(f"WARNING: No samples remaining for this dataset instance (target_set='{self.target_set}') after filtering. Check the '{self.regions_set_col}' column in '{regions_csv_path}' for entries matching '{self.target_set}' and their overlap with pseudobulk data.")
            else:
                print(f"WARNING: target_set '{self.target_set}' was specified, but the column '{self.regions_set_col}' was not found in the merged DataFrame. No set-specific filtering was applied for this dataset instance. This instance will contain all data that matched other criteria.")

        # load drug metadata
        print(f"  Loading drug metadata from: {drug_meta_csv_path}")
        try:
            dm = pd.read_csv(drug_meta_csv_path)
            print(f"    Successfully loaded drug metadata with {len(dm)} entries.")
        except FileNotFoundError:
            print(f"FATAL ERROR: Drug metadata CSV not found at {drug_meta_csv_path}")
            raise
        except Exception as e:
            print(f"FATAL ERROR loading drug metadata CSV: {e}")
            raise

        # Ensure SMILES and ID columns are present and fill NA SMILES with empty string
        if self.drug_meta_smiles_col not in dm.columns:
            raise ValueError(f"SMILES column '{self.drug_meta_smiles_col}' not found in drug metadata.")
        if self.drug_meta_id_col not in dm.columns:
            raise ValueError(f"Drug ID column '{self.drug_meta_id_col}' not found in drug metadata.")
        dm[self.drug_meta_smiles_col] = dm[self.drug_meta_smiles_col].fillna('').astype(str)
        # Indexed by drug ID so __getitem__ can look SMILES up with .loc.
        self.drug_meta = dm.set_index(self.drug_meta_id_col)

        # fasta reader & one-hot encoder
        self.fasta_reader = FastaReader(fasta_file_path)
        self.encoder      = GenomeOneHotEncoder(sequence_length=self.seq_len)
        print("TahoeSMILESDataset initialized.")

    def _generate_morgan_fingerprint(self, smiles_string: str) -> np.ndarray:
        """
        Generate a Morgan fingerprint from a SMILES string using the generator API.

        Returns a float32 vector of length `morgan_fp_nbits`. An empty SMILES,
        a SMILES that RDKit cannot parse, or any error raised while
        fingerprinting all yield an all-zero vector rather than an exception.
        """
        if smiles_string:
            try:
                mol = Chem.MolFromSmiles(smiles_string)
                if mol:
                    # Use the generator's NumPy helper:
                    return self._morgan_gen.GetFingerprintAsNumPy(mol).astype(np.float32)
            except Exception:
                # Deliberate best-effort: one bad drug entry must not abort a
                # whole training run, so fall through to the zero vector.
                pass
        return np.zeros(self.morgan_fp_nbits, dtype=np.float32)

    def __len__(self):
        """Number of gene-condition samples after merging and filtering."""
        return len(self.samples_df)

    def __getitem__(self, idx):
        """
        Build one example.

        Args:
            idx: Row position in `samples_df`; a tensor index is converted with
                `.tolist()` first, matching TahoeDataset.

        Returns:
            tuple: (seq_tensor, morgan_fp_tensor, dose_tensor, tgt_tensor,
            gene_id, drug_id, cell_line, chrom, start, end) where the first
            four are float32 tensors (dose and target have shape (1,)), the IDs
            and chrom are str, and start/end are the 0-based window actually
            fetched.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        row = self.samples_df.iloc[idx]

        # --- DNA sequence ---
        chrom = str(row[self.regions_chr_col])
        start = int(row[self.regions_start_col])
        end   = int(row[self.regions_end_col])
        region_len = end - start
        if self.seq_len != region_len:
            # Re-centre a window of seq_len on the region's midpoint. The end is
            # derived from start + seq_len so the window has exactly seq_len
            # bases even when seq_len is odd (centre + half would be one short).
            center = end - region_len // 2
            start  = center - self.seq_len // 2
            end    = start + self.seq_len

        seq = self.fasta_reader.get_sequence(chrom, start, end)
        oh  = self.encoder.encode(seq)
        seq_tensor = torch.tensor(oh, dtype=torch.float32)

        # --- Morgan Fingerprint ---
        drug_id_for_fp = row[self.pbulk_drug_col]
        smiles_string = ''  # stays empty for unknown drugs -> zero fingerprint
        if drug_id_for_fp in self.drug_meta.index:
            smiles_string = self.drug_meta.loc[drug_id_for_fp, self.drug_meta_smiles_col]
            # Duplicate drug IDs make .loc return a Series; take the first entry.
            if isinstance(smiles_string, pd.Series):
                smiles_string = smiles_string.iloc[0]

        morgan_fp = self._generate_morgan_fingerprint(str(smiles_string)) # Ensure it's a string
        morgan_fp_tensor = torch.tensor(morgan_fp, dtype=torch.float32)

        # --- dose & target ---
        dose_val = float(row[self.pbulk_dose_col])
        expression_val = float(row[self.pbulk_expr_col])

        dose_tensor = torch.tensor([dose_val], dtype=torch.float32)
        tgt_tensor  = torch.tensor([expression_val], dtype=torch.float32)

        # --- Metadata for Logging ---
        gene_id_meta = str(row[self.pbulk_gene_col])
        drug_id_meta = str(row[self.pbulk_drug_col])
        cell_line_meta = str(row[self.pbulk_cell_line_col])

        return seq_tensor, morgan_fp_tensor, dose_tensor, tgt_tensor, gene_id_meta, drug_id_meta, cell_line_meta, chrom, start, end