Mar Elizo
clean deploy
c52261f
# Author: Juan Parras & Patricia A. Apellániz
# Email: patricia.alonsod@upm.es
# Date: 05/08/2025
# Package imports
import os
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
def scale_numerical_data(data):
cols_to_scale = [col for col in data.columns if len(data[col].unique()) > 10]
data_norm = data[cols_to_scale].values
data_norm = (data_norm - data_norm.mean(axis=0)) / data_norm.std(axis=0)
data.loc[:, cols_to_scale] = data_norm
return data
def load_data(dataset_name, args, test_split=0.2, n_patients=1000):
if dataset_name in ['heart', 'diabetes_h', 'diabetes_130']:
# These data are already scaled and the column names do not have spaces
data = pd.read_csv(os.path.join(args['data_folder'], f"{dataset_name}_data.csv"))
# Keep only n_patients
data = data.sample(n=n_patients, random_state=0).reset_index(drop=True)
target_name = data.columns[-1]
x, y = data.drop(columns=[target_name]), data[target_name]
elif dataset_name == 'obesity' or dataset_name == 'obesity_bin': # See https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition
data = pd.read_csv(os.path.join(args['data_folder'], 'obesity.csv'))
# Keep only n_patients
data = data.sample(n=n_patients, random_state=0).reset_index(drop=True)
# Convert all variables to numeric
data['Gender'] = data['Gender'].apply(lambda x: 1 if x == 'Female' else 0)
data['family_history_with_overweight'] = data['family_history_with_overweight'].apply(lambda x: 1 if x == 'yes' else 0)
data['FAVC'] = data['FAVC'].apply(lambda x: 1 if x == 'yes' else 0)
data['CAEC'] = data['CAEC'].apply(lambda x: 3 if x == 'Always' else (2 if x == 'Frequently' else (1 if x == 'Sometimes' else 0)))
data['SMOKE'] = data['SMOKE'].apply(lambda x: 1 if x == 'yes' else 0)
data['SCC'] = data['SCC'].apply(lambda x: 1 if x == 'yes' else 0)
data['CALC'] = data['CALC'].apply(lambda x: 3 if x == 'Always' else (2 if x == 'Frequently' else (1 if x == 'Sometimes' else 0)))
data['MTRANS'] = data['MTRANS'].apply(lambda x: 4 if x == 'Automobile' else (3 if x == 'Motorbike' else (2 if x == 'Bike' else (1 if x == 'Public_Transportation' else 0))))
data['NObeyesdad'] = data['NObeyesdad'].apply(lambda x: 6 if x == 'Obesity_Type_III' else (5 if x == 'Obesity_Type_II' else (4 if x == 'Obesity_Type_I' else (3 if x == 'Overweight_Level_II' else (2 if x == 'Overweight_Level_I' else (1 if x == 'Normal_Weight' else 0))))))
if dataset_name == 'obesity_bin':
data['NObeyesdad'] = data['NObeyesdad'].apply(lambda x: 1 if x > 3 else 0) # Binary classification
# Impute missing values
data = data.fillna(data.mean())
target_name = 'NObeyesdad'
x, y = data.drop(columns=[target_name]), data[target_name]
x = scale_numerical_data(x)
elif dataset_name == 'breast_cancer':
data = load_breast_cancer(as_frame=True)
x, y = data.data, data.target
# Rename columns to remove spaces
new_cols = [col.replace(' ', '_') for col in x.columns]
x.columns = new_cols
x = scale_numerical_data(x)
else:
raise ValueError(f"Data name {dataset_name} not found")
# Split the data into training and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_split, random_state=0)
return x_train, x_test, y_train, y_test