File size: 5,264 Bytes
ad98fbf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
# Contains classes and functions for handling and transforming 
# features based on the JSON file information.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class FeatureHandler:
    """Impute, scale and split dataset features driven by a JSON configuration.

    ``json_content`` is the parsed configuration. Within this class only
    ``get_split_dataset`` reads it (under the ``"design_state_data"`` key);
    the other methods receive their per-feature settings as arguments.
    """

    def __init__(self, json_content):
        self.json_content = json_content

    @staticmethod
    def _imputation_strategy(impute_with):
        """Map a free-text ``impute_with`` setting to 'mean', 'median' or 'mode'.

        Returns None when the setting is not a string or names no known
        strategy, in which case the feature is left untouched.
        """
        if not isinstance(impute_with, str):
            return None
        text = impute_with.lower()
        if "mean" in text or "average" in text:
            return "mean"
        if "median" in text:
            return "median"
        if "mode" in text or "most frequent" in text:
            return "mode"
        return None

    @staticmethod
    def _compute_fill_values(frame, strategy):
        """Return per-column fill values of ``frame`` for ``strategy``, or None."""
        if strategy == "mean":
            return frame.mean()
        if strategy == "median":
            return frame.median()
        modes = frame.mode()
        # mode() yields an empty frame when every value is missing; there is
        # nothing sensible to fill with in that case.
        if modes.empty:
            return None
        return modes.iloc[0]

    # TODO: Add imputation for categorical features
    def impute_missing_values(self, feature_details, X_train, X_test=None):
        """Fill missing values using statistics computed on the training set.

        Args:
            feature_details: Mapping of feature name to its settings dict.
                A feature is imputed when ``missing_values`` equals
                ``"Impute"`` and ``impute_with`` mentions mean/average,
                median, or mode/most frequent. Entries lacking these keys
                are skipped.
            X_train: Training DataFrame; modified in place.
            X_test: Optional test DataFrame; modified in place and filled
                with the *training* statistics.

        Returns:
            The tuple ``(X_train, X_test)``; ``X_test`` is None if not given.
        """
        columns_by_strategy = {"mean": [], "median": [], "mode": []}
        for feature, details in feature_details.items():
            if details.get("missing_values") != "Impute":
                continue
            strategy = self._imputation_strategy(details.get("impute_with"))
            if strategy is not None:
                columns_by_strategy[strategy].append(feature)

        for strategy, columns in columns_by_strategy.items():
            if not columns:
                continue
            # Compute the statistics once, before filling, and reuse them
            # for the test set so both splits get the same fill values.
            fill_values = self._compute_fill_values(X_train[columns], strategy)
            if fill_values is None:
                continue
            X_train[columns] = X_train[columns].fillna(fill_values)
            if X_test is not None:
                X_test[columns] = X_test[columns].fillna(fill_values)
        return X_train, X_test

    @staticmethod
    def _fit_and_apply_scaler(scaler, columns, X_train, X_test):
        """Fit ``scaler`` on the training columns and apply it to both splits."""
        X_train[columns] = scaler.fit_transform(X_train[columns])
        if X_test is not None:
            # transform(), not fit_transform(): the test set must be scaled
            # with the parameters learned from the training set.
            X_test[columns] = scaler.transform(X_test[columns])

    def scale_features(self, feature_details, X_train, X_test=None):
        """Rescale features with the scaler named in their ``rescaling`` setting.

        Features whose ``rescaling`` is ``"MinMaxScaler"`` or
        ``"StandardScaler"`` are scaled; all others are left unchanged.
        Each scaler is fitted on ``X_train`` only and then applied to
        ``X_test`` when one is given.

        Args:
            feature_details: Mapping of feature name to its settings dict.
            X_train: Training DataFrame; modified in place.
            X_test: Optional test DataFrame; modified in place.

        Returns:
            The tuple ``(X_train, X_test)``; ``X_test`` is None if not given.
        """
        min_max_features = [
            feature
            for feature, details in feature_details.items()
            if details.get("rescaling") == "MinMaxScaler"
        ]
        standard_features = [
            feature
            for feature, details in feature_details.items()
            if details.get("rescaling") == "StandardScaler"
        ]

        # Each group gets its own scaler instance so one fit never
        # overwrites the other.
        if min_max_features:
            self._fit_and_apply_scaler(
                MinMaxScaler(), min_max_features, X_train, X_test
            )
        if standard_features:
            self._fit_and_apply_scaler(
                StandardScaler(), standard_features, X_train, X_test
            )
        return X_train, X_test

    def transform_X_features(self, X_train, X_test, feature_details):
        """Impute and then scale the feature matrices.

        The input frames are modified in place by the underlying steps;
        the transformed pair is also returned for convenience.

        Returns:
            The tuple ``(X_train_transformed, X_test_transformed)``.
        """
        X_train_transformed, X_test_transformed = self.impute_missing_values(
            feature_details, X_train, X_test
        )
        X_train_transformed, X_test_transformed = self.scale_features(
            feature_details, X_train_transformed, X_test_transformed
        )
        return X_train_transformed, X_test_transformed

    def tokenize_target_variable(self, y_train, y_test):
        """One-hot encode target labels of the form ``"<prefix>-<class>"``.

        Each label is split on ``"-"`` and the second token is used as the
        class name; the dummy columns are prefixed with ``"Iris"``. The test
        encoding always has the same columns, in the same order, as the
        training encoding. A test label whose class does not occur in
        ``y_train`` yields an all-zero row.

        Returns:
            The tuple ``(y_train_encoded, y_test_encoded)`` of DataFrames.
        """
        def class_token(label):
            return label.split("-")[1]

        train_tokens = y_train.apply(class_token).astype("category")
        y_train_encoded = pd.get_dummies(train_tokens, prefix="Iris")

        test_tokens = y_test.apply(class_token)
        # Mask classes unseen in training explicitly, then share the training
        # categories so both encodings expose identical columns.
        test_tokens = test_tokens.where(
            test_tokens.isin(train_tokens.cat.categories)
        )
        test_tokens = test_tokens.astype(train_tokens.dtype)
        y_test_encoded = pd.get_dummies(test_tokens, prefix="Iris")
        return y_train_encoded, y_test_encoded

    def transform_y_features(self, y_train, y_test, feature_details, target_variable):
        """Placeholder for target transformation; not implemented, returns None."""
        pass

    def get_split_dataset(self, selected_features):
        """Load the configured dataset and split it into train and test sets.

        Reads the dataset file name, target column, train ratio and random
        seed from ``self.json_content["design_state_data"]`` and loads the
        CSV from the ``data/`` directory.

        Args:
            selected_features: Column names to use as the feature matrix.

        Returns:
            The tuple ``(X_train, X_test, y_train, y_test)``.
        """
        design_state = self.json_content["design_state_data"]
        dataset = design_state["session_info"]["dataset"]
        target_variable = design_state["target"]["target"]

        train_info = design_state["train"]
        train_ratio = train_info["train_ratio"]
        random_seed = train_info["random_seed"]

        dataset_path = "data/" + dataset
        df = pd.read_csv(dataset_path)
        X = df[selected_features]
        Y = df[target_variable]

        X_train, X_test, y_train, y_test = train_test_split(
            X, Y, train_size=train_ratio, random_state=random_seed
        )

        return X_train, X_test, y_train, y_test