Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| import torch | |
| from torch.utils.data import DataLoader, Dataset | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
class CardiovascularDataset(Dataset):
    """Map-style dataset pairing feature tensors with integer class labels.

    Args:
        X: Array-like (NumPy array, nested list or tensor) of features whose
            first axis indexes samples. Stored as a float32 tensor.
        y: Array-like of integer class labels, one per sample. Stored as an
            int64 tensor.

    Raises:
        ValueError: If ``X`` and ``y`` are scalars or do not hold the same
            number of samples.
    """

    def __init__(self, X, y):
        # torch.as_tensor with an explicit dtype replaces the legacy
        # FloatTensor/LongTensor constructors: it also accepts tensors of a
        # different dtype and never interprets a bare int as a size.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.int64)

        # __len__ is driven by y alone, so a mismatch would otherwise go
        # unnoticed until an index error (or silently dropped samples).
        if self.X.ndim == 0 or self.y.ndim == 0:
            raise ValueError("X and y must each have a leading sample axis")
        if len(self.X) != len(self.y):
            raise ValueError(
                f"X and y must have the same number of samples "
                f"(got {len(self.X)} and {len(self.y)})"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]
def get_data_loaders(csv_path, batch_size=32, *, test_size=0.2,
                     random_state=42, stratify=False):
    """Load a cardiovascular-risk CSV and build train/test DataLoaders.

    The CSV must contain a ``risk_category`` column (the target). An optional
    ``Patient_ID`` column is dropped. Categorical feature columns are
    integer-encoded, all features are standardised using statistics from the
    training split only, and each sample is reshaped to
    ``(num_features, 1)`` so the features can be fed to an RNN as a sequence
    with one value per step.

    Args:
        csv_path: Path (or any source accepted by ``pandas.read_csv``) of the
            dataset.
        batch_size: Batch size used by both loaders.
        test_size: Fraction (or absolute count) of samples held out for the
            test split; forwarded to ``train_test_split``.
        random_state: Seed for the train/test split.
        stratify: If true, keep the class proportions of ``risk_category``
            equal across the two splits.

    Returns:
        A tuple ``(train_loader, test_loader, num_features, num_classes)``
        where ``num_features`` is the per-sample sequence length and
        ``num_classes`` is the number of distinct target labels in the file.

    Raises:
        KeyError: If the ``risk_category`` column is missing.
    """
    df = pd.read_csv(csv_path)

    target_col = 'risk_category'
    if target_col not in df.columns:
        raise KeyError(
            f"required target column '{target_col}' not found in {csv_path!r}"
        )

    # Patient_ID is an identifier, not a predictive feature.
    if 'Patient_ID' in df.columns:
        df = df.drop('Patient_ID', axis=1)

    # Integer-encode categorical features. The two known categorical columns
    # are always encoded (as before); any other non-numeric feature column is
    # encoded as well so that it cannot break the scaler further down.
    known_categoricals = ('smoking_status', 'family_history_heart_disease')
    for column in list(df.columns):
        if column == target_col:
            continue
        is_numeric = pd.api.types.is_numeric_dtype(df[column])
        if column in known_categoricals or not is_numeric:
            df[column] = LabelEncoder().fit_transform(df[column])

    # Encode the target. LabelEncoder numbers the classes in sorted order, so
    # string labels High / Low / Medium become 0 / 1 / 2 respectively.
    le_risk = LabelEncoder()
    df[target_col] = le_risk.fit_transform(df[target_col])

    X = df.drop(target_col, axis=1).values
    y = df[target_col].values

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=y if stratify else None,
    )

    # Fit the scaler on the training split only, so no test-set statistics
    # leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Reshape for an RNN: (batch, seq_len, input_size). The features are
    # treated as a sequence of length num_features with one value per step.
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

    train_dataset = CardiovascularDataset(X_train, y_train)
    test_dataset = CardiovascularDataset(X_test, y_test)

    # Shuffle only the training data; evaluation order stays fixed.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    return train_loader, test_loader, X_train.shape[1], len(le_risk.classes_)
if __name__ == "__main__":
    # Smoke test: build the loaders from the bundled CSV and report the
    # dataset dimensions plus the shapes of the first training batch.
    loaders = get_data_loaders('cardiovascular_risk_dataset.csv')
    train_loader, test_loader, num_features, num_classes = loaders

    print(f"Number of features: {num_features}")
    print(f"Number of classes: {num_classes}")

    # Only the first batch is inspected; nothing is printed if the loader
    # yields no batches.
    first_batch = next(iter(train_loader), None)
    if first_batch is not None:
        X, y = first_batch
        print(f"Batch X shape: {X.shape}")
        print(f"Batch y shape: {y.shape}")