File size: 2,733 Bytes
f4135ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

class CardiovascularDataset(Dataset):
    """PyTorch Dataset pairing a feature matrix with integer class labels.

    Parameters
    ----------
    X : array-like
        Feature data; converted to a ``float32`` tensor.
    y : array-like
        Class labels; converted to an ``int64`` (long) tensor, as required
        by losses such as ``nn.CrossEntropyLoss``.
    """

    def __init__(self, X, y):
        # torch.as_tensor replaces the legacy FloatTensor/LongTensor type
        # constructors: dtype is explicit, and an extra copy is avoided
        # when the input already has the target dtype.
        self.X = torch.as_tensor(X, dtype=torch.float32)
        self.y = torch.as_tensor(y, dtype=torch.int64)

    def __len__(self):
        # One sample per label.
        return len(self.y)

    def __getitem__(self, idx):
        # Returns the (features, label) pair for a single sample.
        return self.X[idx], self.y[idx]

def get_data_loaders(csv_path, batch_size=32):
    """Load the cardiovascular-risk CSV and build train/test DataLoaders.

    Returns a tuple ``(train_loader, test_loader, seq_len, num_classes)``
    where ``seq_len`` is the per-sample sequence length fed to the RNN
    (one scalar feature per timestep) and ``num_classes`` the number of
    distinct risk labels.
    """
    frame = pd.read_csv(csv_path)

    # The patient identifier carries no predictive signal.
    if 'Patient_ID' in frame.columns:
        frame = frame.drop('Patient_ID', axis=1)

    # Integer-encode the categorical predictors in place.
    for column in ('smoking_status', 'family_history_heart_disease'):
        frame[column] = LabelEncoder().fit_transform(frame[column])

    # Integer-encode the target. LabelEncoder orders classes
    # alphabetically, so High -> 0, Low -> 1, Medium -> 2.
    target_encoder = LabelEncoder()
    frame['risk_category'] = target_encoder.fit_transform(frame['risk_category'])

    features = frame.drop('risk_category', axis=1).values
    labels = frame['risk_category'].values

    # Hold out 20% for evaluation; the fixed seed keeps the split reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training split only, then apply it to both.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Shape (batch, seq_len, 1): each scalar feature becomes one RNN timestep.
    X_train = X_train[:, :, np.newaxis]
    X_test = X_test[:, :, np.newaxis]

    train_loader = DataLoader(
        CardiovascularDataset(X_train, y_train),
        batch_size=batch_size,
        shuffle=True,
    )
    test_loader = DataLoader(
        CardiovascularDataset(X_test, y_test),
        batch_size=batch_size,
        shuffle=False,
    )

    return train_loader, test_loader, X_train.shape[1], len(target_encoder.classes_)

if __name__ == "__main__":
    train_loader, test_loader, num_features, num_classes = get_data_loaders('cardiovascular_risk_dataset.csv')
    print(f"Number of features: {num_features}")
    print(f"Number of classes: {num_classes}")
    for X, y in train_loader:
        print(f"Batch X shape: {X.shape}")
        print(f"Batch y shape: {y.shape}")
        break