# NOTE(review): removed build-log residue ("Spaces:", "Build error" x2) left
# at the top of the file by a broken extraction/CI paste.
# Contains classes and functions for handling and transforming
# features based on the JSON file information.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
class FeatureHandler:
    """Applies JSON-configured preprocessing (imputation, scaling, target
    encoding, train/test splitting) to a tabular dataset."""

    def __init__(self, json_content):
        # Parsed JSON configuration; get_split_dataset expects a
        # "design_state_data" section (session_info, target, train).
        self.json_content = json_content

    def impute_missing_values(self, feature_details, X_train, X_test=None):
        """Fill missing values in X_train (and optionally X_test) in place.

        feature_details maps feature name -> details dict; only features
        whose details["missing_values"] == "Impute" are filled, using the
        strategy named in details["impute_with"] (mean/average, median, or
        mode/most frequent). Test-set gaps are always filled with statistics
        computed on the TRAINING data so no test information leaks in.

        Returns the (X_train, X_test) pair; X_test is None when not given.
        """
        strategy_cols = {"mean": [], "median": [], "mode": []}
        for feature, details in feature_details.items():
            if details["missing_values"] != "Impute":
                continue
            method = details["impute_with"].lower()
            if "mean" in method or "average" in method:
                strategy_cols["mean"].append(feature)
            elif "median" in method:
                strategy_cols["median"].append(feature)
            elif "mode" in method or "most frequent" in method:
                strategy_cols["mode"].append(feature)

        for strategy, cols in strategy_cols.items():
            if not cols:
                continue
            # Fill values always come from the training split only.
            if strategy == "mean":
                fill = X_train[cols].mean()
            elif strategy == "median":
                fill = X_train[cols].median()
            else:  # mode: take the first modal value per column
                fill = X_train[cols].mode().iloc[0]
            X_train[cols] = X_train[cols].fillna(fill)
            if X_test is not None:
                X_test[cols] = X_test[cols].fillna(fill)
        # TODO: Add imputation for categorical features
        return X_train, X_test

    def scale_features(self, feature_details, X_train, X_test=None):
        """Rescale features according to details["rescaling"] in place.

        Fixes two defects in the original: the test split was re-fit
        (fit_transform) instead of reusing the training statistics, and a
        single `scaler` variable was shared between branches, so when both
        scaler kinds were requested the test MinMax columns were scaled
        with the StandardScaler. Each scaler is now fit on X_train and
        only applied (transform) to X_test.

        Returns the (X_train, X_test) pair; X_test is None when not given.
        """
        for scaler_cls, wanted in ((MinMaxScaler, "MinMaxScaler"),
                                   (StandardScaler, "StandardScaler")):
            cols = [feature for feature, details in feature_details.items()
                    if details["rescaling"] == wanted]
            if not cols:
                continue
            scaler = scaler_cls()
            X_train[cols] = scaler.fit_transform(X_train[cols])
            if X_test is not None:
                # transform only — never fit on the test split
                X_test[cols] = scaler.transform(X_test[cols])
        return X_train, X_test

    def transform_X_features(self, X_train, X_test, feature_details):
        """Impute then scale the design matrices.

        Fixes the original's missing return statement (the method computed
        the transformed frames and then implicitly returned None).
        """
        X_train, X_test = self.impute_missing_values(feature_details, X_train, X_test)
        X_train, X_test = self.scale_features(feature_details, X_train, X_test)
        return X_train, X_test

    def tokenize_target_variable(self, y_train, y_test):
        """One-hot encode the target after dropping its hyphenated prefix.

        Assumes labels look like "Iris-setosa" — TODO confirm for datasets
        other than Iris; a label without "-" would raise IndexError.

        Returns (y_train_encoded, y_test_encoded). The test frame is
        reindexed to the training columns so both always share the same
        column layout even when a class is absent from the test split.
        """
        y_train_encoded = pd.get_dummies(
            y_train.apply(lambda label: label.split("-")[1]), prefix="Iris")
        y_test_encoded = pd.get_dummies(
            y_test.apply(lambda label: label.split("-")[1]), prefix="Iris")
        # Align columns so a class missing from the test split cannot
        # change the encoded shape.
        y_test_encoded = y_test_encoded.reindex(
            columns=y_train_encoded.columns, fill_value=0)
        return y_train_encoded, y_test_encoded

    def transform_y_features(self, y_train, y_test, feature_details, target_variable):
        # Not implemented yet; tokenize_target_variable is the current
        # target-encoding path.
        pass

    def get_split_dataset(self, selected_features):
        """Load the configured CSV and split it into train/test sets.

        Reads the dataset filename, target column, train ratio, and random
        seed from self.json_content["design_state_data"]; the CSV is
        expected under the local "data/" directory.

        Returns X_train, X_test, y_train, y_test.
        """
        design_state = self.json_content["design_state_data"]
        dataset = design_state["session_info"]["dataset"]
        target_variable = design_state["target"]["target"]
        train_info = design_state["train"]
        df = pd.read_csv("data/" + dataset)
        X = df[selected_features]
        y = df[target_variable]
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            train_size=train_info["train_ratio"],
            random_state=train_info["random_seed"],
        )
        return X_train, X_test, y_train, y_test