Spaces:
Sleeping
Sleeping
File size: 2,733 Bytes
f4135ce | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 | import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
class CardiovascularDataset(Dataset):
    """Map-style dataset pairing a feature tensor with integer class labels.

    Args:
        X: Array-like of features whose first axis indexes samples. It is
            converted to a ``torch.float32`` tensor.
        y: Array-like of integer class labels, one per sample. It is
            converted to a ``torch.int64`` tensor.

    Raises:
        ValueError: If ``X`` and ``y`` do not contain the same number of
            samples.
    """

    def __init__(self, X, y):
        # torch.as_tensor with an explicit dtype replaces the legacy
        # FloatTensor/LongTensor constructors, which interpret a bare int as
        # a size and refuse tensors of a different dtype.
        # NOTE: as_tensor may share memory with the input when no dtype
        # conversion is required.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.int64)

        # Fail early: __len__ follows y, so a mismatch would otherwise show
        # up as an IndexError mid-epoch or as silently ignored feature rows.
        if (
            self.X.dim() == 0
            or self.y.dim() == 0
            or self.X.shape[0] != self.y.shape[0]
        ):
            raise ValueError(
                "X and y must contain the same number of samples, got "
                f"shapes {tuple(self.X.shape)} and {tuple(self.y.shape)}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.y)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]
def get_data_loaders(csv_path, batch_size=32, *, test_size=0.2, random_state=42):
    """Load the cardiovascular-risk CSV and build train/test DataLoaders.

    The pipeline drops the ``Patient_ID`` column when present, label-encodes
    the ``smoking_status`` and ``family_history_heart_disease`` feature
    columns and the ``risk_category`` target, splits the rows into train and
    test sets, standardizes every feature column using statistics fitted on
    the training split only, and reshapes each sample to
    ``(num_features, 1)`` so it can be fed to an RNN as a sequence with one
    value per step.

    Args:
        csv_path: Path of the CSV file to read.
        batch_size: Batch size used by both loaders.
        test_size: Share of the rows held out for the test split; forwarded
            to ``train_test_split``.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible.

    Returns:
        A tuple ``(train_loader, test_loader, num_features, num_classes)``.

    Raises:
        KeyError: If one of the encoded feature columns or the
            ``risk_category`` column is missing from the CSV.
    """
    df = pd.read_csv(csv_path)

    # Patient_ID is an identifier, not a predictive feature.
    if 'Patient_ID' in df.columns:
        df = df.drop('Patient_ID', axis=1)

    # Integer-encode the categorical feature columns, one encoder per column.
    for column in ('smoking_status', 'family_history_heart_disease'):
        df[column] = LabelEncoder().fit_transform(df[column])

    # Encode the target. LabelEncoder numbers classes in sorted order, so
    # labels such as High/Low/Medium would become 0/1/2 (alphabetical), not
    # an ordinal low-to-high ranking.
    le_risk = LabelEncoder()
    df['risk_category'] = le_risk.fit_transform(df['risk_category'])

    X = df.drop('risk_category', axis=1).values
    y = df['risk_category'].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training split only so that no test-set
    # statistics leak into training.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Reshape for an RNN as (batch, seq_len, input_size): each feature is
    # treated as one step of a sequence carrying a single value.
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], 1)

    train_loader = DataLoader(
        CardiovascularDataset(X_train, y_train),
        batch_size=batch_size,
        shuffle=True,
    )
    test_loader = DataLoader(
        CardiovascularDataset(X_test, y_test),
        batch_size=batch_size,
        shuffle=False,
    )

    num_features = X_train.shape[1]
    num_classes = len(le_risk.classes_)
    return train_loader, test_loader, num_features, num_classes
if __name__ == "__main__":
    # Smoke test: build the loaders from the default CSV and report the
    # feature count, the class count, and the shapes of one training batch.
    loaders = get_data_loaders('cardiovascular_risk_dataset.csv')
    train_loader, test_loader, num_features, num_classes = loaders

    print(f"Number of features: {num_features}")
    print(f"Number of classes: {num_classes}")

    # Only the first batch is inspected; an empty loader prints nothing.
    first_batch = next(iter(train_loader), None)
    if first_batch is not None:
        batch_features, batch_labels = first_batch
        print(f"Batch X shape: {batch_features.shape}")
        print(f"Batch y shape: {batch_labels.shape}")
|