Spaces:
Sleeping
Sleeping
| # Author: Juan Parras & Patricia A. Apellániz | |
| # Email: patricia.alonsod@upm.es | |
| # Date: 05/08/2025 | |
| # Package imports | |
| import os | |
| import pandas as pd | |
| from sklearn.datasets import load_breast_cancer | |
| from sklearn.model_selection import train_test_split | |
| def scale_numerical_data(data): | |
| cols_to_scale = [col for col in data.columns if len(data[col].unique()) > 10] | |
| data_norm = data[cols_to_scale].values | |
| data_norm = (data_norm - data_norm.mean(axis=0)) / data_norm.std(axis=0) | |
| data.loc[:, cols_to_scale] = data_norm | |
| return data | |
| def load_data(dataset_name, args, test_split=0.2, n_patients=1000): | |
| if dataset_name in ['heart', 'diabetes_h', 'diabetes_130']: | |
| # These data are already scaled and the column names do not have spaces | |
| data = pd.read_csv(os.path.join(args['data_folder'], f"{dataset_name}_data.csv")) | |
| # Keep only n_patients | |
| data = data.sample(n=n_patients, random_state=0).reset_index(drop=True) | |
| target_name = data.columns[-1] | |
| x, y = data.drop(columns=[target_name]), data[target_name] | |
| elif dataset_name == 'obesity' or dataset_name == 'obesity_bin': # See https://archive.ics.uci.edu/dataset/544/estimation+of+obesity+levels+based+on+eating+habits+and+physical+condition | |
| data = pd.read_csv(os.path.join(args['data_folder'], 'obesity.csv')) | |
| # Keep only n_patients | |
| data = data.sample(n=n_patients, random_state=0).reset_index(drop=True) | |
| # Convert all variables to numeric | |
| data['Gender'] = data['Gender'].apply(lambda x: 1 if x == 'Female' else 0) | |
| data['family_history_with_overweight'] = data['family_history_with_overweight'].apply(lambda x: 1 if x == 'yes' else 0) | |
| data['FAVC'] = data['FAVC'].apply(lambda x: 1 if x == 'yes' else 0) | |
| data['CAEC'] = data['CAEC'].apply(lambda x: 3 if x == 'Always' else (2 if x == 'Frequently' else (1 if x == 'Sometimes' else 0))) | |
| data['SMOKE'] = data['SMOKE'].apply(lambda x: 1 if x == 'yes' else 0) | |
| data['SCC'] = data['SCC'].apply(lambda x: 1 if x == 'yes' else 0) | |
| data['CALC'] = data['CALC'].apply(lambda x: 3 if x == 'Always' else (2 if x == 'Frequently' else (1 if x == 'Sometimes' else 0))) | |
| data['MTRANS'] = data['MTRANS'].apply(lambda x: 4 if x == 'Automobile' else (3 if x == 'Motorbike' else (2 if x == 'Bike' else (1 if x == 'Public_Transportation' else 0)))) | |
| data['NObeyesdad'] = data['NObeyesdad'].apply(lambda x: 6 if x == 'Obesity_Type_III' else (5 if x == 'Obesity_Type_II' else (4 if x == 'Obesity_Type_I' else (3 if x == 'Overweight_Level_II' else (2 if x == 'Overweight_Level_I' else (1 if x == 'Normal_Weight' else 0)))))) | |
| if dataset_name == 'obesity_bin': | |
| data['NObeyesdad'] = data['NObeyesdad'].apply(lambda x: 1 if x > 3 else 0) # Binary classification | |
| # Impute missing values | |
| data = data.fillna(data.mean()) | |
| target_name = 'NObeyesdad' | |
| x, y = data.drop(columns=[target_name]), data[target_name] | |
| x = scale_numerical_data(x) | |
| elif dataset_name == 'breast_cancer': | |
| data = load_breast_cancer(as_frame=True) | |
| x, y = data.data, data.target | |
| # Rename columns to remove spaces | |
| new_cols = [col.replace(' ', '_') for col in x.columns] | |
| x.columns = new_cols | |
| x = scale_numerical_data(x) | |
| else: | |
| raise ValueError(f"Data name {dataset_name} not found") | |
| # Split the data into training and test sets | |
| x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=test_split, random_state=0) | |
| return x_train, x_test, y_train, y_test |