Spaces:

anly656
/

dr_jones

Sleeping

File size: 37,967 Bytes

8643b59

"""
@author: Edward R Jones
@version 1.34
@copyright 2020 - Edward R Jones, all rights reserved.
"""
#from DT import DT
import sys
import warnings
import numpy  as np
import pandas as pd
from sklearn import preprocessing
from sklearn.impute  import SimpleImputer
from copy import deepcopy #Used to create sentiment word dictionary

import pickle
from enum import Enum
#Class DT - DataType This is setup to provide a clean
#notation for data maps used by ReplaceImputeEncode
class DT(Enum):
    """Data-type codes used in the data maps read by ReplaceImputeEncode.

    Each member's value is the one-character code recognized by the RIE code.
    Enum treats a repeated value as an alias of the first member defined with
    that value, so ``DT.Ignore is DT.ID`` and every lower-case name is the
    same object as its capitalized counterpart (``DT.interval is
    DT.Interval``).
    """
    # @attributes: characters recognized in RIE code
    Interval = 'I' #Expected values (lowest value, highest value)
    Binary   = 'B' #Expected values (class0, class1)
    Nominal  = 'N' #Expected values (class0, class1, ... classk)
    Ordinal  = 'O' #Expected values ordered classes (class0, class1, ...)
    String   = 'S' #Expected values ("")
    ID       = 'Z' #Expected values ("")
    Label    = 'L' #Expected values ("")
    Text     = 'T' #Expected values ("")
    Ignore   = 'Z' #Expected values ("") - alias of ID (same value)
    interval = 'I' #Allow lower case
    binary   = 'B' #Allow lower case
    nominal  = 'N' #Allow lower case
    ordinal  = 'O' #Allow lower case
    string   = 'S' #Allow lower case
    id       = 'Z' #Allow lower case
    label    = 'L' #Allow lower case
    text     = 'T' #Allow lower case
    ignore   = 'Z' #Allow lower case

    # @methods
    @staticmethod
    def getDataTypes():
        """Return the list of data types accepted in a data map.

        The list has nine entries; DT.Ignore is an alias of DT.ID, so that
        member appears twice.
        """
        return [
                DT.Interval,
                DT.Binary,
                DT.Nominal,
                DT.Ordinal,
                DT.ID,
                DT.Label,
                DT.Text,
                DT.String,
                DT.Ignore,
                ]

    @staticmethod
    def convertDataType(atype):
        """Return the display string ('DT.<Name>') for a data type.

        Members are reported under their canonical name, so DT.Ignore (an
        alias of DT.ID) yields 'DT.ID'.  Anything that is not a DT member
        yields 'DT.Ignore'.
        """
        if isinstance(atype, DT):
            return 'DT.' + atype.name
        return 'DT.Ignore'
    
"""
class ReplaceImputeEncode

@parameters:
    *** __init__() ***
    data_map - The metadata dictionary.  
                
    nominal_encoding - Can be 'one-hot', 'SAS' or default None.
    
    interval_scale   - Can be 'std', 'robust' or default None.
    
    no_impute        - default None or list of attributes to exclude from 
                       imputation
    
    drop             - True or default False.  True drops the last nominal 
                       encoded column.  False keeps all nominal encoded 
                       columns.
    
    display          - True or default False.  True displays the number of 
                       missing and outliers found in the data.
                        
    *** fit_transform () ***
    df  - a pandas DataFrame containing the data description
                        by the metadata found in data_map (required)
    data_map - See above description.
    
@Cautions:
    The incoming dataframe, df, is deep copied so that changes to the
    data are only held within the class object self.df_copy.  The
    data_map is not copied; it is stored by reference in
    self.features_map.  All binary and nominal values are encoded to 
    numeric values.
    
    The method draft_data_map returns a draft of the data_map based upon
    the data.  This must be examined to ensure the data types and 
    allowed values are correct.  This behavior is controlled by max_n
    and max_s.  See API and examples for details.
"""

class ReplaceImputeEncode(object):
    def __init__(self, data_map=None, binary_encoding=None,
               nominal_encoding=None, interval_scale=None, no_impute=None, 
               no_encode=None, drop=False, display=False): 
        if interval_scale=='None' or interval_scale=='none':
            self.interval_scale=None
        else:
            self.interval_scale=interval_scale
        self.go_flag = False
        self.features_map = data_map
        self.drop    = drop
        self.display = display
        self.interval_scale = interval_scale
        self.no_impute = no_impute
        self.no_encode = no_encode
        if binary_encoding=='None' or binary_encoding=='none':
            self.binary_encoding = None
        else:
            self.binary_encoding = binary_encoding
        #nominal_encoding can be 'SAS' or 'one-hot'
        if binary_encoding != 'SAS' and binary_encoding != 'one-hot' \
            and binary_encoding != None:
            raise ValueError("***Call to ReplaceImputeEncode invalid. "+
                 "***   binary_encoding="+binary_encoding+" is invalid."+
                 "***   must use None, 'one-hot' or 'SAS'")
            sys.exit()
        if nominal_encoding=='None' or nominal_encoding=='none':
            self.nominal_encoding = None
        else:
            self.nominal_encoding = nominal_encoding
        #nominal_encoding can be 'SAS' or 'one-hot'
        if nominal_encoding != 'SAS' and nominal_encoding != 'one-hot' \
            and nominal_encoding != None:
            raise ValueError("***Call to ReplaceImputeEncode invalid. "+
                 "***   nominal_encoding="+nominal_encoding+" is invalid."+
                 "***   must use None, 'one-hot' or 'SAS'")
            sys.exit()
        if interval_scale != 'std' and interval_scale != 'robust' \
            and interval_scale != None:
            raise ValueError("***Call to ReplaceImputeEncode invalid. "+
                     "***   interval_scale="+interval_scale+" is invalid."+
                     "***   must use None, 'std' or 'robust'")
            sys.exit()
        if data_map==None:
            print("Attributes Map is required.")
            print("Please pass map using data_map attribute.")
            print("If one is not available, try creating one using "+
                  "call to draft_data_map(df)")
            return
        if type(data_map)==str:
            try:
                self.features_map = self.load_data_map(data_map)
            except:
                raise ValueError("Unable to load data map:", data_map)
                sys.exit()
        elif type(data_map)==dict:
            self.features_map = data_map
        else:
            raise ValueError("Supplied Data Map not Dictionary or File")
            sys.exit()
        self.interval_attributes = []
        self.nominal_attributes  = []
        self.binary_attributes   = []
        self.onehot_attributes   = []
        self.onehot_cats         = []
        self.hot_drop_list       = []
        self.missing_counts      = {}
        self.outlier_counts      = {}
        for feature,v in self.features_map.items():
            # Initialize data map missing and outlier counters to zero
            self.missing_counts[feature] = 0
            self.outlier_counts[feature] = 0

            if v[0] not in DT.getDataTypes():
                raise TypeError(
                  "\n***Data Map in call to ReplaceImputeEncode invalid.\n"+
                  "***Data Type for '"+ feature + "' is not recognized. "+
                  "\n***Valid types are: DT.Interval, DT.Binary, DT.Nominal, "+
                  "DT.Text, DT.String, DT.ID, or DT.Ignore")
            if v[0]==DT.Interval:
                self.interval_attributes.append(feature)
            else:
                if v[0]==DT.Binary:
                    self.binary_attributes.append(feature)
                else:
                    if v[0]!=DT.Binary and v[0]!=DT.Nominal: 
                        # Ignore, don't touch this attribute
                        continue
                    # Attribute must be Nominal
                    self.nominal_attributes.append(feature)
                    # Setup column names for Nominal encoding
                    n_cat = len(v[1])
                    self.onehot_cats.append(list(v[1]))
                    data_type = type(v[1][n_cat-1])
                    if self.drop == True:
                        n_cat -= 1
                    for i in range(n_cat):
                        if type(v[1][i]) != data_type:
                            raise TypeError(
                              "\n***Classes invalid for--> '"+feature+"'"+
                              "\n***Must be all numeric or strings, not both.")
                        if type(v[1][i])==int:
                            my_str = feature+str(v[1][i])
                        else:
                            my_str = feature+("%i" %i)+":"+str(v[1][i])[0:10]
                        self.onehot_attributes.append(my_str)

        self.n_interval = len(self.interval_attributes)
        self.n_binary   = len(self.binary_attributes)
        self.n_nominal  = len(self.nominal_attributes)
        self.n_onehot   = len(self.onehot_attributes)
        self.cat        = self.n_binary + self.n_nominal
        if nominal_encoding=='SAS' and drop==False and self.n_nominal>0:
            raise ValueError("***Call to ReplaceImputeEncode invalid. "+
                  "***nominal_encoding='SAS' requested with drop=False "+
                  "***'SAS' encoding requires drop=True")
            sys.exit()
        self.col = []
        for i in range(self.n_interval):
            self.col.append(self.interval_attributes[i])
        for i in range(self.n_binary):
            self.col.append(self.binary_attributes[i])
        if self.nominal_encoding==None:
            for i in range(self.n_nominal):
                self.col.append(self.nominal_attributes[i])
        else:
            for i in range(self.n_onehot):
                self.col.append(self.onehot_attributes[i])

        self.go_flag = True
        
    def fit(self, df, data_map=None):
        self.df_copy = deepcopy(df)
        #self.df_copy = df
        if data_map==None and self.features_map==None:
            raise ValueError("  Call to ReplaceImputeEncode missing required"+
              " Data Map.\n Use function draft_data_map to draft a map.")
            sys.exit()
        if type(self.features_map)==dict:
            pass
        elif type(data_map)==str:
            try:
                self.features_map = self.load_data_map(data_map)
            except:
                raise ValueError("Unable to load data map:", data_map)
                sys.exit()
        elif type(data_map)==dict:
            self.features_map = data_map
        else:
            raise ValueError("Supplied Data Map not Dictionary or File")
            sys.exit()
            
        self.interval_attributes = []
        self.nominal_attributes  = []
        self.binary_attributes   = []
        self.onehot_attributes   = []
        self.onehot_cats         = []
        self.hot_drop_list       = []
        for feature,v in self.features_map.items():
            if v[0] not in DT.getDataTypes():
                raise TypeError(
                  "\n***Data Map in call to ReplaceImputeEncode invalid.\n"+
                  "***Data Type for '"+ feature + "' is not recognized. "+
                  "\n***Valid types are: DT.Interval, DT.Binary, DT.Nominal, "+
                  "DT.Text, DT.String, DT.ID, or DT.Ignore")
            if v[0]==DT.Interval:
                self.interval_attributes.append(feature)
            else:
                if v[0]==DT.Binary:
                    self.binary_attributes.append(feature)
                else:
                    if v[0]==DT.Nominal:
                        self.nominal_attributes.append(feature)
                        self.onehot_cats.append(list(v[1]))
                        for i in range(len(v[1])):
                            if type(v[1][i])==int:
                                my_str = feature+str(v[1][i])
                            else:
                                my_str = feature+("%i" %i)+":"+ \
                                                str(v[1][i])[0:10]
                            self.onehot_attributes.append(my_str)
                        if self.drop==True:
                            self.hot_drop_list.append(my_str)
                    else:
                        if v[0] in DT.getDataTypes():
                            continue
                        else:
                        # Data Map Invalid
                            raise TypeError( 
                  "***Data Map in call to ReplaceImputeEncode invalid.\n"+
                  "***Data Type for '"+ feature + "' invalid")
                        sys.exit()
        self.n_interval = len(self.interval_attributes)
        self.n_binary   = len(self.binary_attributes)
        self.n_nominal  = len(self.nominal_attributes)
        self.n_onehot   = len(self.onehot_attributes)
        self.cat        = self.n_binary + self.n_nominal
        self.n_obs      = df.shape[0]
        self.n_ignored  = df.shape[1] - \
                         self.n_interval-self.n_binary-self.n_nominal
        self.col = []
        for i in range(self.n_interval):
            self.col.append(self.interval_attributes[i])
        for i in range(self.n_binary):
            self.col.append(self.binary_attributes[i])
        if self.nominal_encoding==None:
            for i in range(self.n_nominal):
                self.col.append(self.nominal_attributes[i])
        else:
            for i in range(self.n_onehot):
                self.col.append(self.onehot_attributes[i])

        if self.display:
            print("\n********** Data Preprocessing ***********")
            print("Features Dictionary Contains:\n%i Interval," \
                  %self.n_interval, "\n%i Binary," %self.n_binary,\
                  "\n%i Nominal, and" %self.n_nominal, \
                  "\n%i Excluded Attribute(s).\n" %self.n_ignored)
            print("Data contains %i observations & %i columns.\n" %df.shape)
        self.initial_missing = df.isnull().sum()
        self.feature_names = np.array(df.columns.values)
        for feature in self.feature_names:
            if self.initial_missing[feature]>(self.n_obs/2):
                warnings.warn(feature+":has more than 50% missing." +
                              "Recommend setting Data Type set to DT.Ignore.")
        # Initialize number missing in attribute_map
        for feature,v in self.features_map.items():
            try:
                self.missing_counts[feature] = self.initial_missing[feature]
            except:
                raise RuntimeError(feature + " is not found in Data_Map. ")
                sys.exit()

        # Scan for outliers among interval attributes
        nan_map = df.isnull()
        for index in df.iterrows():
            i = index[0]
        # Check for outliers in interval attributes
            for feature, v in self.features_map.items():
                if nan_map.loc[i,feature]==True:
                    continue
                if v[0]==DT.Interval: # Interval Attribute
                    if type(v[1]) != tuple or len(v[1]) != 2:
                       raise ValueError("\n" +\
                          "***Call to ReplaceImputeEncode invalid.\n"+\
                          "***   Attribute Map has invalid description " +\
                          "for " +feature)
                       sys.exit()
                    l_limit = v[1][0]
                    u_limit = v[1][1]
                    # Check if the value is numeric before comparing
                    try:
                        value = df.loc[i,feature]
                        if pd.isna(value) or pd.isnull(value):
                            continue  # Skip NaN/None values
                        # Try to convert to float for comparison
                        numeric_value = float(value)
                        if numeric_value > u_limit or numeric_value < l_limit:
                            self.outlier_counts[feature] += 1
                            self.df_copy.loc[i,feature] = None
                    except (ValueError, TypeError):
                        # If conversion fails, treat as outlier and convert to None
                        self.outlier_counts[feature] += 1
                        self.df_copy.loc[i,feature] = None
                else: 
                    if v[0]!=DT.Binary and v[0]!=DT.Nominal: 
                        # don't touch this attribute
                        continue
                    # Categorical Attribute
                    in_cat = False
                    for cat in v[1]:
                        if df.loc[i,feature]==cat:
                            in_cat=True
                            continue
                    if in_cat==False:
                        self.df_copy.loc[i,feature] = None
                        self.outlier_counts[feature] += 1
        if self.display:
            print("\nAttribute Counts")
            max_label = 0
            for k, v in self.features_map.items():
                if len(k) > max_label:
                    max_label = len(k)
            max_label += 2
            label_format = ("{:.<%i" %(max_label+5))+"s}{:>8s}{:>10s}" 
            print(label_format.format('', 'Missing', 'Outliers'))
            label_format = ("{:.<%i" %max_label)+"s}{:10d}{:10d}"
            for k,v in self.features_map.items():
                print(label_format.format(k, self.missing_counts[k], \
                                          self.outlier_counts[k]))
                #print(label_format.format(k, v[2][0], v[2][1]))
            
    def draft_data_map(self, df, max_n=10, max_s=30, display_map=True,
                       out=None, replace=False):
        feature_names = np.array(df.columns.values)
        draft_features_map = {}
        print("\nGenerating DATA_MAP for use in ReplaceImputeEncode.")
        print("String attributes with fewer than", max_s, 
                  "unique values are labeled as Binary or Nominal; "+
                  "otherwise Text or String.")
        print("Numerical attributes with fewer than", max_n,
                  "unique values are considered Binary or Nominal;"+
                  " otherwise Interval")
        
        for feature in feature_names:
            n        = df[feature].value_counts()
            n_values = n.index
            astring  = False
            for v in n_values:
                if (type(v)==str):
                    astring = True
                    break
            if astring == False:
                min_ = round(df[feature].min()-0.5,4)
                max_ = round(df[feature].max()+0.5,4)
            if astring == False:
                #Numerical attribute
                if len(n) < max_n:
                    # Numerical Attribute is Binary or Nominal
                    a   = df[feature].unique()
                    # Look for string in a
                    j = 0
                    for i in range(len(a)):
                        if type(a[i])==str:
                            j += 1
                    if j>0:
                        print("WARNING: ", feature, "contains both numbers "+
                              "and strings. Dropping from draft data map.")
                        break
                    a.sort()
                    categories = tuple(a)
                    if len(a) == 2:
                        draft_features_map[feature]=[DT.Binary, 
                                          categories]
                    else:
                        draft_features_map[feature]=[DT.Nominal,
                                          categories]
                else:
                    # Attribute is Interval
                    draft_features_map[feature]=["DT.Interval",
                                      (min_, max_)]

            else:
                # String Attribute is Binary, Nominal or Text or String
                if len(n) < max_s: 
                    # String Attribute is Binary or Nominal
                    a = df[feature].unique()
                    # Look for nan in a
                    no_nan = False
                    while no_nan == False:
                        j = -1
                        for i in range(len(a)):
                            if type(a[i]) != str:
                                j = i
                        if j>=0:
                            a = np.delete(a,j)
                        else:
                            no_nan=True
                    a.sort()
                    categories = tuple(a)
                    if len(a) == 2:
                        draft_features_map[feature]=[DT.Binary, 
                                          categories]
                    else:
                        draft_features_map[feature]=[DT.Nominal,
                                          categories]
                else:
                    k = df[feature].str.len().max()
                    if k>100:
                        # Set attribute to text field
                        draft_features_map[feature]=[DT.Text,("")]
                    else:
                        draft_features_map[feature]=[DT.String,("")]
        if display_map:
            # print the features map
            print("************* DRAFT DATA MAP **************\n")
            print("data_map = {")
            for feature,v in draft_features_map.items():
                w = DT.convertDataType(v[0])
                s = "\t["
                if len(feature)<5:
                    s = "\t\t["
                print("\t'"+feature+"':",s,str(w),",",v[1],"],")
            print("\n}")
        if replace==True:
            # Use this draft map for RIE processing
            self.features_map = draft_features_map
            print("Using Draft Data Map for ReplaceImputeEncode.\n"+
                          "Review Draft for Data Type Accuracy.")
        if out != None:
            #Save this draft map as a pickle file <out>
            self.save_data_map(draft_features_map, out)
        print(draft_features_map)
        return draft_features_map
    
    def update_feature(self, feature, datatype, dataval):
        if type(feature)!=str:
            raise ValueError("feature name not string")
            sys.exit()
        if datatype!=DT.Interval and datatype!=DT.Binary and \
           datatype!=DT.Nominal and datatype!=DT.Text and \
           datatype!=DT.Ignore and datatype!=DT.String:
            raise ValueError("Data Type Value Invalid")
            sys.exit()
        if type(self.features_map)!=dict:
            self.features_map = {}
        self.features_map[feature] = [datatype, dataval]
    
    def save_data_map(self, data_map, fname):
        if type(data_map)!=dict:
            raise RuntimeError("Data Map invalid")
            sys.exit()
        try:
            with open(fname, 'wb') as f:
                pickle.dump(data_map, f, 
                            pickle.DEFAULT_PROTOCOL)
            print("Data Map Saved to Pickle File: ", fname)
        except:
            warnings.warn("Cannot save data map into file: ")
    
    def get_data_map(self): 
        # Returns current data map
        return self.features_map
        
    def load_data_map(self, fname):
        try:
            with open(fname, 'rb') as f:
                data_map = pickle.load(f)
        except:
            raise ValueError("Unable to load data map from:", fname)
            sys.exit()
        if type(data_map)!=dict:
                raise ValueError("Unable to load data map from:", fname)
                sys.exit()
        return data_map
            
    def display_data_map(self):
        # Display Data Map Dictionary
        try:
            if self.features_map==None:
                raise RuntimeError("Data Map Does not Exist")
        except:
            raise RuntimeError("Data Map Does not Exist")
        # print the features map
        print("************* CURRENT DATA MAP **************\n")
        print("data_map = {")
        for feature,v in self.features_map.items():
            w = DT.convertDataType(v[0])
            c = ','
            if w=='DT.Ignore' or w=='DT.Binary':
                c = '  ,'
            elif w=='DT.Nominal':
                c = ' ,'
            elif w=='DT.Text' or w=='DT.String':
                c = '    ,'
            s = "\t["
            if len(feature)<5:
                s = "\t\t["
            print("\t'"+feature+"':",s,str(w), c, v[1],"],")
        print("\n}")
    
    def impute(self):
        self.impute_interval()
        self.impute_binary()
        self.impute_nominal()
        self.imputed_data()
            
    def impute_interval(self):
        if (self.n_interval==0):
            self.imputed_interval_data = np.empty((self.n_obs, 0))
            return
        # Put the interval data from the dataframe into a numpy array
        #depricated= self.df_copy.as_matrix(columns=self.interval_attributes)
        interval_data= self.df_copy[self.interval_attributes].values
        # Create the Imputer for the Interval Data
        #self.interval_imputer = preprocessing.Imputer(strategy='mean')
        self.interval_imputer = SimpleImputer(strategy='mean')
        # Impute the missing values in the Interval data
        self.imputed_interval_data = \
            self.interval_imputer.fit_transform(interval_data)
            
    def impute_binary(self):
        if (self.n_binary==0):
            self.imputed_binary_data = np.empty((self.n_obs, 0))
            return
        # Put the binary data from the dataframe into a numpy array
        #cat_df = df[self.binary_attributes]
        cat_df = pd.DataFrame(columns=self.binary_attributes)
        for feature in self.binary_attributes:
            #cat_df[feature]= self.df_copy[feature].astype('category').cat.codes
            cat_df[feature]= self.df_copy[feature].astype('category')
            cat_df.loc[cat_df[feature]==-1, feature] = None
        #Depricated cat_array = cat_df.as_matrix()
        cat_array = cat_df.values
        # Create Imputer for Categorical Data
        #cat_imputer = preprocessing.Imputer(strategy='most_frequent')
        cat_imputer = SimpleImputer(strategy='most_frequent')
        # Impute the missing values in the Categorical Data
        self.imputed_binary_data = \
            cat_imputer.fit_transform(cat_array)
            
    def impute_nominal(self):
        if (self.n_nominal==0):
            self.imputed_nominal_data = np.empty((self.n_obs, 0))
            return
        # Put the nominal data from the dataframe into a numpy array
        cat_df  = pd.DataFrame(columns=self.nominal_attributes)
        for feature in self.nominal_attributes:
            #self.cat_df[feature]= self.df_copy[feature].astype('category').cat.codes
            cat_df[feature]= self.df_copy[feature].astype('category')
            cat_df.loc[cat_df[feature]==-1, feature] = None
        #Depricated cat_array = cat_df.as_matrix()
        #print(cat_array[0])
        cat_array = cat_df.values
        # Create Imputer for Categorical Data
        #cat_imputer = preprocessing.Imputer(strategy='most_frequent')
        cat_imputer = SimpleImputer(strategy='most_frequent')
        # Impute the missing values in the Categorical Data
        self.imputed_nominal_data = \
            cat_imputer.fit_transform(cat_array)
            
    def imputed_data(self):
        # Bring Interval and Categorial Data Together into a dataframe
        # The Imputed Data
        # Col is not the same as self.col.  col contains the main attribute
        # names, self.col contains the one-hot names
        col = self.interval_attributes + self.binary_attributes + \
                self.nominal_attributes
        # if no_impute is given, replace these attributes with their 
        # original, missing values
        if self.no_impute != None:
            idx = []
            for i in range(len(self.no_impute)):
                idx.append(-1)
            for i in range(len(col)):
                for j in range(len(self.no_impute)):
                    if col[i] == self.no_impute[j]:
                        idx[j] = i
                        break
            for j in range(len(self.no_impute)):
                k = idx[j]
                if k < 0:
                    warnings.warn("  \nArgument "+self.no_impute[j]+ \
                                  " in 'no_impute' is invalid.\n")
                    break
                if k<self.n_interval:
                    for i in range(self.n_obs):
                        self.imputed_interval_data[i,k] = \
                             self.df_copy[self.no_impute[j]][i]
                else:
                    if k < self.n_interval + self.n_binary:
                        k = k - self.n_interval
                        for i in range(self.n_obs):
                            self.imputed_binary_data[i,k] = \
                                 self.df_copy[self.no_impute[j]][i]
                    else:
                        k = k - self.n_interval - self.n_binary
                        for i in range(self.n_obs):
                            self.imputed_nominal_data[i,k] = \
                                 self.df_copy[self.no_impute[j]][i]
       
        self.data_imputed= \
                np.hstack((self.imputed_interval_data,\
                           self.imputed_binary_data, \
                           self.imputed_nominal_data))

        self.imputed_data_df = \
                pd.DataFrame(self.data_imputed, columns=col)
            
    def scale_encode(self):

        self.standardize_interval()
        self.encode_binary()
        self.encode_nominal()
        self.encoded_data()
            
    def standardize_interval(self):
        if (self.n_interval==0 or self.interval_scale==None):
            self.scaled_interval_data = self.imputed_interval_data
            return
        # Standardize Interval Data using Z-Scores
        if self.interval_scale=='std':
            scaler = preprocessing.StandardScaler() 
            scaler.fit(self.imputed_interval_data)
            self.scaled_interval_data = \
                scaler.transform(self.imputed_interval_data)
        # Standardize Interval Data using median and IQR
        if self.interval_scale=='robust':
            scaler = preprocessing.RobustScaler() 
            scaler.fit(self.imputed_interval_data)
            self.scaled_interval_data = \
                scaler.transform(self.imputed_interval_data)
            
    def encode_binary(self):
        # Uses 1 and -1 encoding for binary instead of 0, 1
        # SAS uses the 1, -1 convention
        if self.n_binary == 0 or self.binary_encoding == None:
            return
        if self.binary_encoding == 'SAS':
            low = -1
        else:  # One-hot encoding
            low = 0
        for j in range(self.n_binary):
            k = self.imputed_binary_data[0:,j].argmin()
            smallest = self.imputed_binary_data[k,j]
            for i in range(self.n_obs):
                if self.imputed_binary_data[i,j] == smallest:
                    self.imputed_binary_data[i,j] = low
                else:
                    self.imputed_binary_data[i,j] = 1
   
    def encode_nominal(self):
        """One-hot encode the imputed nominal data, optionally SAS-encoding it.

        The one-hot array is stored in ``self.hot_array``.  When
        ``nominal_encoding`` is 'SAS', ``self.sas_encoded`` is also built:
        for each nominal attribute the last one-hot column is subtracted
        from each of the attribute's other one-hot columns and the last
        column itself is left out.

        Nothing is done when there are no nominal attributes or when
        ``nominal_encoding`` is None.

        Raises:
            RuntimeError: if the one-hot array has more columns than the
                total number of categories listed in ``features_map`` for
                the nominal attributes.
        """
        if self.n_nominal == 0 or self.nominal_encoding is None:
            return
        # Create an instance of the OneHotEncoder & Selecting Attributes
        # Attributes must all be non-negative integers
        # Missing values may show up as -1 values, which will cause an error
        onehot = preprocessing.OneHotEncoder(categories=self.onehot_cats)
        self.hot_array = \
            onehot.fit_transform(self.imputed_nominal_data).toarray()

        # Number of categories listed in the data map for each nominal
        # attribute (second entry of the attribute's features_map value).
        n_features = [len(self.features_map[self.nominal_attributes[i]][1])
                      for i in range(self.n_nominal)]
        nominal_categories = sum(n_features)
        n_hot_columns = self.hot_array.shape[1]
        if nominal_categories < n_hot_columns:
            # Build one message string; passing the pieces as separate
            # arguments would make the exception text render as a tuple.
            raise RuntimeError(
                '  Call to ReplaceImputeEncode Invalid.'
                '  Number of one-hot columns is {} '
                'but nominal categories is {}.'
                '  Data contains more nominal attributes than '
                'found in the data_map.'.format(n_hot_columns,
                                                nominal_categories))

        # SAS Encoding subtracts the last one-hot vector from the others,
        # for each nominal attribute.
        if self.nominal_encoding == 'SAS':
            n_rows = self.n_obs
            self.sas_encoded = \
                np.zeros((n_rows, self.n_onehot - self.n_nominal))
            ilast = -1   # index of the current attribute's last one-hot column
            idx1 = 0     # next column to fill in sas_encoded
            idx2 = 0     # next column to read from hot_array
            for m in n_features:
                ilast += m
                for _ in range(m - 1):
                    # Whole-column subtraction replaces the per-row loop.
                    self.sas_encoded[:, idx1] = \
                        self.hot_array[:n_rows, idx2] - \
                        self.hot_array[:n_rows, ilast]
                    idx1 += 1
                    idx2 += 1
                # Skip the attribute's last one-hot column.
                idx2 += 1
            
    def encoded_data(self):
        """Bring the encoded and scaled data together into a dataframe.

        The interval block (scaled when ``interval_scale`` is set, imputed
        otherwise), the binary block and, when nominal attributes exist,
        the nominal block selected by ``nominal_encoding`` (None, 'SAS' or
        'one-hot') are stacked column-wise into ``self.data_encoded``.  The
        result is wrapped in ``self.encoded_data_df`` using ``self.col`` as
        the column names.

        When ``drop`` is True the names in ``hot_drop_list`` are removed
        from ``self.col``: before the dataframe is built for 'SAS' (the
        SAS array is stacked in place of the one-hot array), and together
        with the matching dataframe columns afterwards for 'one-hot'.
        """
        # Select the nominal block for the requested encoding.  None marks
        # an unrecognised encoding, for which data_encoded is left untouched.
        if self.n_nominal == 0:
            nominal_block = ()
        elif self.nominal_encoding is None:
            nominal_block = (self.imputed_nominal_data,)
        elif self.nominal_encoding == 'SAS':
            nominal_block = (self.sas_encoded,)
        elif self.nominal_encoding == 'one-hot':
            nominal_block = (self.hot_array,)
        else:
            nominal_block = None

        if nominal_block is not None:
            if self.interval_scale is None:
                interval_block = self.imputed_interval_data
            else:
                interval_block = self.scaled_interval_data
            self.data_encoded = np.hstack(
                (interval_block, self.imputed_binary_data) + nominal_block)

        # NOTE: `== True` is kept deliberately so that non-bool values of
        # `drop` behave exactly as they did before.
        drop_requested = self.drop == True
        if self.n_nominal > 0 and self.nominal_encoding == 'SAS' \
                and drop_requested:
            # Names must be removed before the dataframe is built.
            for i in range(self.n_nominal):
                self.col.remove(self.hot_drop_list[i])

        # data_encoded array ready for conversion to dataframe
        self.encoded_data_df = \
            pd.DataFrame(self.data_encoded, columns=self.col)

        if self.nominal_encoding == 'one-hot' and drop_requested:
            # Drop the listed one-hot columns from the dataframe and keep
            # self.col in step with the remaining columns.
            self.encoded_data_df = \
                self.encoded_data_df.drop(self.hot_drop_list, axis=1)
            for i in range(self.n_nominal):
                self.col.remove(self.hot_drop_list[i])
                
    def transform(self):
        """Impute, scale and encode the data, then set column dtypes.

        Runs ``impute()`` and ``scale_encode()``, then casts every column
        of ``self.encoded_data_df``:

        * interval attributes -> 'float64'
        * one-hot attributes  -> 'int'
        * binary attributes   -> 'int' when ``binary_encoding`` is set
        * everything else (including binary attributes when
          ``binary_encoding`` is None) -> the dtype of the same-named
          column in ``self.df_copy``

        A warning is printed for each column that ends up holding a
        single distinct value.

        Returns:
            The encoded dataframe, ``self.encoded_data_df``.
        """
        self.impute()
        self.scale_encode()
        # Snapshot the column names; the columns are reassigned in the loop.
        for feature in list(self.encoded_data_df.columns):
            if feature in self.interval_attributes:
                target_dtype = 'float64'
            elif feature in self.onehot_attributes:
                target_dtype = 'int'
            elif feature in self.binary_attributes and \
                    self.binary_encoding is not None:
                target_dtype = 'int'
            else:
                # The former `type(column) == int` test compared a Series
                # type with int and so was never true; these columns have
                # always been cast back to their dtype in df_copy.
                target_dtype = self.df_copy[feature].dtype
            self.encoded_data_df[feature] = \
                self.encoded_data_df[feature].astype(target_dtype)
            # Check for constant data columns
            if len(self.encoded_data_df[feature].value_counts()) == 1:
                print("WARNING:  Data for ", feature, " is constant.")
        return self.encoded_data_df
        
    def fit_transform(self, df, data_map=None):
        """Fit on ``df`` (with an optional ``data_map``) and transform it.

        Calls ``fit(df, data_map)`` followed by ``transform()``.

        Returns:
            The encoded dataframe produced by ``transform()``, which is
            also available as ``self.encoded_data_df``.
        """
        self.fit(df, data_map)
        encoded = self.transform()
        return encoded