# Source: DataFlowPro/src/feature_handler.py (notebook checkpoint copy)
# Contains classes and functions for handling and transforming
# features based on the JSON file information.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
class FeatureHandler:
    """Applies the dataset transformations described in a model-design JSON.

    The JSON configuration (parsed into a dict) drives which features are
    imputed, how they are rescaled, and how the dataset is loaded and split.
    """

    def __init__(self, json_content):
        # Parsed JSON configuration consumed by get_split_dataset and the
        # transform methods. Expected to contain a "design_state_data" key.
        self.json_content = json_content
def impute_missing_values(self, feature_details, X_train, X_test=None):
mean_impute_features = []
median_impute_features = []
mode_impute_features = []
for feature, details in feature_details.items():
if details["missing_values"] == "Impute":
if "mean" in details["impute_with"].lower() or "average" in details["impute_with"].lower():
mean_impute_features.append(feature)
elif "median" in details["impute_with"].lower():
median_impute_features.append(feature)
elif "mode" in details["impute_with"].lower() or "most frequent" in details["impute_with"].lower():
mode_impute_features.append(feature)
if mean_impute_features:
X_train[mean_impute_features] = X_train[mean_impute_features].fillna(X_train[mean_impute_features].mean())
if median_impute_features:
X_train[median_impute_features] = X_train[median_impute_features].fillna(X_train[median_impute_features].median())
if mode_impute_features:
X_train[mode_impute_features] = X_train[mode_impute_features].fillna(X_train[mode_impute_features].mode().iloc[0])
if X_test is not None:
if mean_impute_features:
X_test[mean_impute_features] = X_test[mean_impute_features].fillna(X_train[mean_impute_features].mean())
if median_impute_features:
X_test[median_impute_features] = X_test[median_impute_features].fillna(X_train[median_impute_features].median())
if mode_impute_features:
X_test[mode_impute_features] = X_test[mode_impute_features].fillna(X_train[mode_impute_features].mode().iloc[0])
return X_train, X_test
# TODO: Add imputation for categorical features
def scale_features(self, feature_details, X_train, X_test=None):
min_max_scaler_features = []
standard_scaler_features = []
for feature, details in feature_details.items():
if details["rescaling"] == "MinMaxScaler":
min_max_scaler_features.append(feature)
elif details["rescaling"] == "StandardScaler":
standard_scaler_features.append(feature)
if min_max_scaler_features:
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train[min_max_scaler_features])
X_train[min_max_scaler_features] = X_train_scaled
if standard_scaler_features:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train[standard_scaler_features])
X_train[standard_scaler_features] = X_train_scaled
if X_test is not None:
if min_max_scaler_features:
X_test_scaled = scaler.fit_transform(X_test[min_max_scaler_features])
X_test[min_max_scaler_features] = X_test_scaled
if standard_scaler_features:
X_test_scaled = scaler.fit_transform(X_test[standard_scaler_features])
X_test[standard_scaler_features] = X_test_scaled
return X_train, X_test
def transform_X_features(self, X_train, X_test, feature_details):
X_train_transformed, X_test_transformed = self.impute_missing_values(feature_details, X_train, X_test)
X_train_transformed, X_test_transformed = self.scale_features(feature_details, X_train_transformed, X_test_transformed)
# tokenize and hash the target variable
def tokenize_target_variable(self, y_train, y_test):
# tokenize the target variable
y_train_tokenized = y_train.apply(lambda x: x.split("-")[1])
y_train_encoded = pd.get_dummies(y_train_tokenized, prefix="Iris")
y_test_tokenized = y_test.apply(lambda x: x.split("-")[1])
y_test_encoded = pd.get_dummies(y_test_tokenized, prefix="Iris")
return y_train_encoded, y_test_encoded
    def transform_y_features(self, y_train, y_test, feature_details, target_variable):
        """Transform the target variable for both splits.

        Placeholder — not implemented yet. Intended to apply the
        target-variable transformations described by ``feature_details`` /
        ``target_variable`` (presumably delegating to
        tokenize_target_variable for categorical targets — TODO confirm).
        Currently returns None.
        """
        pass
def get_split_dataset(self, selected_features):
design_state = self.json_content["design_state_data"]
dataset = design_state["session_info"]["dataset"]
target_variable = design_state["target"]["target"]
train_info = design_state["train"]
train_ratio = train_info["train_ratio"]
random_seed = train_info["random_seed"]
DATASET_PATH = "data/"+dataset
df = pd.read_csv(DATASET_PATH)
X = df[selected_features]
Y = df[target_variable]
X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=train_ratio,
random_state=random_seed)
return X_train, X_test, y_train, y_test