File size: 3,604 Bytes
c52261f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
# Author: Juan Parras & Patricia A. Apellániz
# Email: patricia.alonsod@upm.es
# Date: 05/08/2025

# Package imports
import os

import pandas as pd

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split


def scale_numerical_data(data):
    """Standardize the continuous-looking columns of ``data`` in place.

    A column is scaled when it holds more than 10 distinct values; columns
    with 10 or fewer distinct values (binary flags, small ordinal codes) are
    left untouched. Scaled columns end up with zero mean and unit population
    standard deviation (numpy's default ``ddof=0``).

    Args:
        data: DataFrame whose columns are all numeric. It is modified in
            place: the scaled columns are replaced by float columns.

    Returns:
        The same DataFrame object that was passed in.
    """
    cols_to_scale = [col for col in data.columns if len(data[col].unique()) > 10]
    if not cols_to_scale:
        # Nothing qualifies for scaling; hand the frame back unchanged.
        return data

    # Work on an explicit float array so integer columns are scaled correctly.
    values = data[cols_to_scale].to_numpy(dtype=float)
    scaled = (values - values.mean(axis=0)) / values.std(axis=0)

    # Replace the columns instead of writing through ``.loc``: an in-place
    # ``.loc`` write of floats into integer columns is an incompatible-dtype
    # upcast, which recent pandas versions warn about or reject.
    data[cols_to_scale] = scaled
    return data


def _subsample(data, n_patients):
    """Return a reproducible random subset of at most ``n_patients`` rows."""
    # Cap the request at the number of available rows: DataFrame.sample
    # raises when asked for more rows than exist (sampling without replacement).
    n_rows = min(n_patients, len(data))
    return data.sample(n=n_rows, random_state=0).reset_index(drop=True)


def _encode_obesity_columns(data):
    """Replace the categorical obesity columns by integer codes, in place.

    Any value that is not listed in a column's table is encoded as 0.
    """
    yes_no = {'yes': 1}
    frequency = {'Always': 3, 'Frequently': 2, 'Sometimes': 1}
    encodings = {
        'Gender': {'Female': 1},
        'family_history_with_overweight': yes_no,
        'FAVC': yes_no,
        'CAEC': frequency,
        'SMOKE': yes_no,
        'SCC': yes_no,
        'CALC': frequency,
        'MTRANS': {'Automobile': 4, 'Motorbike': 3, 'Bike': 2, 'Public_Transportation': 1},
        'NObeyesdad': {
            'Obesity_Type_III': 6,
            'Obesity_Type_II': 5,
            'Obesity_Type_I': 4,
            'Overweight_Level_II': 3,
            'Overweight_Level_I': 2,
            'Normal_Weight': 1,
        },
    }
    for column, codes in encodings.items():
        data[column] = data[column].apply(lambda value, codes=codes: codes.get(value, 0))
    return data


def load_data(dataset_name, args, test_split=0.2, n_patients=1000):
    """Load a dataset and split it into training and test sets.

    Supported ``dataset_name`` values:

    * ``'heart'``, ``'diabetes_h'``, ``'diabetes_130'``: read from
      ``<data_folder>/<dataset_name>_data.csv``; the last column is the target.
    * ``'obesity'``, ``'obesity_bin'``: read from ``<data_folder>/obesity.csv``;
      categorical columns are encoded as integers, missing values are imputed
      with the column mean and the features are standardized.
      ``'obesity_bin'`` additionally binarizes the target (1 for the three
      obesity types, 0 otherwise).
    * ``'breast_cancer'``: loaded through scikit-learn; spaces in column names
      are replaced by underscores and the features are standardized. This
      branch does not use ``n_patients``.

    Args:
        dataset_name: Name of the dataset to load (see above).
        args: Mapping holding the key ``'data_folder'``, the directory that
            contains the CSV files.
        test_split: Fraction of the rows assigned to the test set.
        n_patients: Maximum number of rows kept from the CSV-based datasets.
            When the file has fewer rows, all of them are used.

    Returns:
        The tuple ``(x_train, x_test, y_train, y_test)``.

    Raises:
        ValueError: If ``dataset_name`` is not one of the supported names.
    """
    if dataset_name in ['heart', 'diabetes_h', 'diabetes_130']:
        # These data are already scaled and the column names do not have spaces
        data = pd.read_csv(os.path.join(args['data_folder'], f"{dataset_name}_data.csv"))
        data = _subsample(data, n_patients)
        target_name = data.columns[-1]
        x, y = data.drop(columns=[target_name]), data[target_name]

    elif dataset_name in ('obesity', 'obesity_bin'):
        # See https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition
        data = pd.read_csv(os.path.join(args['data_folder'], 'obesity.csv'))
        data = _subsample(data, n_patients)

        # Convert all variables to numeric
        data = _encode_obesity_columns(data)

        target_name = 'NObeyesdad'
        if dataset_name == 'obesity_bin':
            # Binary classification: codes 4-6 are the three obesity types
            data[target_name] = (data[target_name] > 3).astype('int64')

        # Impute missing values
        data = data.fillna(data.mean())
        x, y = data.drop(columns=[target_name]), data[target_name]
        x = scale_numerical_data(x)

    elif dataset_name == 'breast_cancer':
        data = load_breast_cancer(as_frame=True)
        x, y = data.data, data.target

        # Rename columns to remove spaces
        x.columns = [col.replace(' ', '_') for col in x.columns]
        x = scale_numerical_data(x)

    else:
        raise ValueError(f"Data name {dataset_name} not found")

    # Split the data into training and test sets.
    # NOTE: for the scaled datasets the scaling statistics above were computed
    # on all rows, i.e. before this split.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_split, random_state=0)

    return x_train, x_test, y_train, y_test