"""Data-loading utilities for the cardiovascular risk dataset.

Reads the CSV, integer-encodes the categorical columns and the target,
standardizes the features, and exposes the result as PyTorch ``DataLoader``
objects shaped for a recurrent model.
"""

import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Identifier column; dropped when present because it is not a feature.
_ID_COLUMN = 'Patient_ID'
# Feature columns that are passed through LabelEncoder before scaling.
_CATEGORICAL_COLUMNS = ('smoking_status', 'family_history_heart_disease')
# Column holding the class label to predict.
_TARGET_COLUMN = 'risk_category'


class CardiovascularDataset(Dataset):
    """Map-style dataset pairing a float32 feature tensor with int64 labels."""

    def __init__(self, X, y):
        """Store features and labels as tensors.

        Args:
            X: Array-like of features; converted to ``torch.float32``.
            y: Array-like of integer class labels; converted to ``torch.long``.

        Raises:
            ValueError: If ``X`` and ``y`` hold a different number of samples.
        """
        # torch.as_tensor with an explicit dtype is the documented replacement
        # for the legacy FloatTensor/LongTensor constructors and states the
        # target dtype explicitly regardless of the input's dtype.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.long)
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples, "
                f"got {len(self.X)} and {len(self.y)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair at position ``idx``."""
        return self.X[idx], self.y[idx]


def get_data_loaders(csv_path, batch_size=32, *, test_size=0.2,
                     random_state=42, stratify=False):
    """Build train/test ``DataLoader`` objects from the risk dataset CSV.

    Args:
        csv_path: Path of the CSV file to read.
        batch_size: Number of samples per batch in both loaders.
        test_size: Fraction (or absolute count) of samples held out for
            testing, forwarded to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.
        stratify: When true, the split preserves the class proportions of the
            target. Defaults to ``False``, i.e. a plain random split.

    Returns:
        A tuple ``(train_loader, test_loader, num_features, num_classes)``.
        Each batch of features has shape ``(batch, num_features, 1)``.

    Raises:
        KeyError: If a categorical or target column is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Fail early with every missing column listed, instead of a bare KeyError
    # on whichever column happens to be accessed first.
    required = (*_CATEGORICAL_COLUMNS, _TARGET_COLUMN)
    missing = [column for column in required if column not in df.columns]
    if missing:
        raise KeyError(
            f"{csv_path!r} is missing required column(s): {missing}"
        )

    # The identifier is optional in the file, hence errors='ignore'.
    df = df.drop(columns=_ID_COLUMN, errors='ignore')

    # Each categorical feature gets its own independently fitted encoder.
    for column in _CATEGORICAL_COLUMNS:
        df[column] = LabelEncoder().fit_transform(df[column])

    # LabelEncoder numbers classes in sorted order, so string labels such as
    # 'High', 'Low', 'Medium' would map to 0, 1, 2 respectively.
    target_encoder = LabelEncoder()
    df[_TARGET_COLUMN] = target_encoder.fit_transform(df[_TARGET_COLUMN])

    X = df.drop(columns=_TARGET_COLUMN).values
    y = df[_TARGET_COLUMN].values

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # Fit the scaler on the training split only so that test-set statistics
    # do not leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Shape for an RNN as (batch, seq_len, input_size): the features are
    # treated as a sequence of length num_features with one value per step.
    X_train = X_train[:, :, np.newaxis]
    X_test = X_test[:, :, np.newaxis]

    train_loader = DataLoader(
        CardiovascularDataset(X_train, y_train),
        batch_size=batch_size,
        shuffle=True,
    )
    test_loader = DataLoader(
        CardiovascularDataset(X_test, y_test),
        batch_size=batch_size,
        shuffle=False,
    )

    return (
        train_loader,
        test_loader,
        X_train.shape[1],
        len(target_encoder.classes_),
    )


def main():
    """Load the default dataset and print the shapes of one training batch."""
    train_loader, test_loader, num_features, num_classes = get_data_loaders(
        'cardiovascular_risk_dataset.csv'
    )
    print(f"Number of features: {num_features}")
    print(f"Number of classes: {num_classes}")

    for X, y in train_loader:
        print(f"Batch X shape: {X.shape}")
        print(f"Batch y shape: {y.shape}")
        break


if __name__ == "__main__":
    main()