Spaces:

anly656
/

dr_jones

Sleeping

App Files Files Community

dr_jones / AdvancedAnalytics /ReplaceImputeEncode.py

anly656

Upload 50 files

8643b59 verified about 2 months ago

raw

history blame contribute delete

38 kB

	"""
	@author: Edward R Jones
	@version 1.34
	@copyright 2020 - Edward R Jones, all rights reserved.
	"""
	#from DT import DT
	import sys
	import warnings
	import numpy as np
	import pandas as pd
	from sklearn import preprocessing
	from sklearn.impute import SimpleImputer
	from copy import deepcopy #Used to create sentiment word dictionary

	import pickle
	from enum import Enum
	#Class DT - DataType This is setup to provide a clean
	#notation for data maps used by ReplaceImputeEncode
	class DT(Enum):
	    """Data-type codes recognized in ReplaceImputeEncode data maps.

	    A data-map entry pairs one of these members with the expected values
	    for the attribute.  The lower-case names are aliases of the
	    capitalized members.  ``Ignore`` (like ``id`` and ``ignore``) shares
	    the value 'Z' with ``ID``; Enum turns a repeated value into an alias,
	    so ``DT.Ignore is DT.ID``.
	    """

	    # @attributes: characters recognized in RIE code
	    Interval = 'I'  # Expected values (lowest value, highest value)
	    Binary = 'B'    # Expected values (class0, class1)
	    Nominal = 'N'   # Expected values (class0, class1, ... classk)
	    Ordinal = 'O'   # Expected values ordered classes (class0, class1, ...)
	    String = 'S'    # Expected values ("")
	    ID = 'Z'        # Expected values ("")
	    Label = 'L'     # Expected values ("")
	    Text = 'T'      # Expected values ("")
	    Ignore = 'Z'    # Expected values (""); alias of ID (same value)
	    interval = 'I'  # Allow lower case
	    binary = 'B'    # Allow lower case
	    nominal = 'N'   # Allow lower case
	    ordinal = 'O'   # Allow lower case
	    string = 'S'    # Allow lower case
	    id = 'Z'        # Allow lower case
	    label = 'L'     # Allow lower case
	    text = 'T'      # Allow lower case
	    ignore = 'Z'    # Allow lower case

	    # @methods
	    @staticmethod
	    def getDataTypes():
	        """Return the list of data types accepted in a data map."""
	        return [
	            DT.Interval,
	            DT.Binary,
	            DT.Nominal,
	            DT.Ordinal,
	            DT.ID,
	            DT.Label,
	            DT.Text,
	            DT.String,
	            DT.Ignore,
	        ]

	    @staticmethod
	    def convertDataType(atype):
	        """Return the printable name ('DT.<Name>') for a data type.

	        Anything that does not compare equal to a listed member is
	        reported as 'DT.Ignore'.  Because ``DT.Ignore`` is an alias of
	        ``DT.ID``, passing it returns 'DT.ID'.
	        """
	        named_members = (DT.Interval, DT.Binary, DT.Nominal, DT.Ordinal,
	                         DT.String, DT.ID, DT.Label, DT.Text)
	        for member in named_members:
	            if atype == member:
	                return 'DT.' + member.name
	        return 'DT.Ignore'

	"""
	class ReplaceImputeEncode

	@parameters:
	* __init__() *
	data_map - The metadata dictionary.

	binary_encoding - Can be 'one-hot', 'SAS' or default None.

	nominal_encoding - Can be 'one-hot', 'SAS' or default None.

	interval_scale - Can be 'std', 'robust' or default None.

	no_impute - default None or list of attributes to exclude from
	imputation

	drop - True or default False. True drops the last nominal
	encoded column. False keeps all nominal encoded
	columns.

	display - True or default False. True displays the number of
	missing and outliers found in the data.

	* fit_transform () *
	df - a pandas DataFrame containing the data described
	by the metadata found in data_map (required)
	data_map - See above description.

	@Cautions:
	The incoming dataframe, df, is deep copied so that changes to the
	data are only held within the class object self.df_copy. The
	data_map is stored, without copying, in self.features_map. Binary
	and nominal values are encoded to numeric values only when
	binary_encoding or nominal_encoding is requested.

	The method draft_data_map returns a draft of the data_map based upon
	the data. This must be examined to ensure the data types and
	allowed values are correct. This behavior is controlled by max_n
	and max_s. See API and examples for details.
	"""

	class ReplaceImputeEncode(object):
	def __init__(self, data_map=None, binary_encoding=None,
	nominal_encoding=None, interval_scale=None, no_impute=None,
	no_encode=None, drop=False, display=False):
	if interval_scale=='None' or interval_scale=='none':
	self.interval_scale=None
	else:
	self.interval_scale=interval_scale
	self.go_flag = False
	self.features_map = data_map
	self.drop = drop
	self.display = display
	self.interval_scale = interval_scale
	self.no_impute = no_impute
	self.no_encode = no_encode
	if binary_encoding=='None' or binary_encoding=='none':
	self.binary_encoding = None
	else:
	self.binary_encoding = binary_encoding
	#nominal_encoding can be 'SAS' or 'one-hot'
	if binary_encoding != 'SAS' and binary_encoding != 'one-hot' \
	and binary_encoding != None:
	raise ValueError("***Call to ReplaceImputeEncode invalid. "+
	"*** binary_encoding="+binary_encoding+" is invalid."+
	"*** must use None, 'one-hot' or 'SAS'")
	sys.exit()
	if nominal_encoding=='None' or nominal_encoding=='none':
	self.nominal_encoding = None
	else:
	self.nominal_encoding = nominal_encoding
	#nominal_encoding can be 'SAS' or 'one-hot'
	if nominal_encoding != 'SAS' and nominal_encoding != 'one-hot' \
	and nominal_encoding != None:
	raise ValueError("***Call to ReplaceImputeEncode invalid. "+
	"*** nominal_encoding="+nominal_encoding+" is invalid."+
	"*** must use None, 'one-hot' or 'SAS'")
	sys.exit()
	if interval_scale != 'std' and interval_scale != 'robust' \
	and interval_scale != None:
	raise ValueError("***Call to ReplaceImputeEncode invalid. "+
	"*** interval_scale="+interval_scale+" is invalid."+
	"*** must use None, 'std' or 'robust'")
	sys.exit()
	if data_map==None:
	print("Attributes Map is required.")
	print("Please pass map using data_map attribute.")
	print("If one is not available, try creating one using "+
	"call to draft_data_map(df)")
	return
	if type(data_map)==str:
	try:
	self.features_map = self.load_data_map(data_map)
	except:
	raise ValueError("Unable to load data map:", data_map)
	sys.exit()
	elif type(data_map)==dict:
	self.features_map = data_map
	else:
	raise ValueError("Supplied Data Map not Dictionary or File")
	sys.exit()
	self.interval_attributes = []
	self.nominal_attributes = []
	self.binary_attributes = []
	self.onehot_attributes = []
	self.onehot_cats = []
	self.hot_drop_list = []
	self.missing_counts = {}
	self.outlier_counts = {}
	for feature,v in self.features_map.items():
	# Initialize data map missing and outlier counters to zero
	self.missing_counts[feature] = 0
	self.outlier_counts[feature] = 0

	if v[0] not in DT.getDataTypes():
	raise TypeError(
	"\n***Data Map in call to ReplaceImputeEncode invalid.\n"+
	"***Data Type for '"+ feature + "' is not recognized. "+
	"\n***Valid types are: DT.Interval, DT.Binary, DT.Nominal, "+
	"DT.Text, DT.String, DT.ID, or DT.Ignore")
	if v[0]==DT.Interval:
	self.interval_attributes.append(feature)
	else:
	if v[0]==DT.Binary:
	self.binary_attributes.append(feature)
	else:
	if v[0]!=DT.Binary and v[0]!=DT.Nominal:
	# Ignore, don't touch this attribute
	continue
	# Attribute must be Nominal
	self.nominal_attributes.append(feature)
	# Setup column names for Nominal encoding
	n_cat = len(v[1])
	self.onehot_cats.append(list(v[1]))
	data_type = type(v[1][n_cat-1])
	if self.drop == True:
	n_cat -= 1
	for i in range(n_cat):
	if type(v[1][i]) != data_type:
	raise TypeError(
	"\n***Classes invalid for--> '"+feature+"'"+
	"\n***Must be all numeric or strings, not both.")
	if type(v[1][i])==int:
	my_str = feature+str(v[1][i])
	else:
	my_str = feature+("%i" %i)+":"+str(v[1][i])[0:10]
	self.onehot_attributes.append(my_str)

	self.n_interval = len(self.interval_attributes)
	self.n_binary = len(self.binary_attributes)
	self.n_nominal = len(self.nominal_attributes)
	self.n_onehot = len(self.onehot_attributes)
	self.cat = self.n_binary + self.n_nominal
	if nominal_encoding=='SAS' and drop==False and self.n_nominal>0:
	raise ValueError("***Call to ReplaceImputeEncode invalid. "+
	"***nominal_encoding='SAS' requested with drop=False "+
	"***'SAS' encoding requires drop=True")
	sys.exit()
	self.col = []
	for i in range(self.n_interval):
	self.col.append(self.interval_attributes[i])
	for i in range(self.n_binary):
	self.col.append(self.binary_attributes[i])
	if self.nominal_encoding==None:
	for i in range(self.n_nominal):
	self.col.append(self.nominal_attributes[i])
	else:
	for i in range(self.n_onehot):
	self.col.append(self.onehot_attributes[i])

	self.go_flag = True

	def fit(self, df, data_map=None):
	self.df_copy = deepcopy(df)
	#self.df_copy = df
	if data_map==None and self.features_map==None:
	raise ValueError(" Call to ReplaceImputeEncode missing required"+
	" Data Map.\n Use function draft_data_map to draft a map.")
	sys.exit()
	if type(self.features_map)==dict:
	pass
	elif type(data_map)==str:
	try:
	self.features_map = self.load_data_map(data_map)
	except:
	raise ValueError("Unable to load data map:", data_map)
	sys.exit()
	elif type(data_map)==dict:
	self.features_map = data_map
	else:
	raise ValueError("Supplied Data Map not Dictionary or File")
	sys.exit()

	self.interval_attributes = []
	self.nominal_attributes = []
	self.binary_attributes = []
	self.onehot_attributes = []
	self.onehot_cats = []
	self.hot_drop_list = []
	for feature,v in self.features_map.items():
	if v[0] not in DT.getDataTypes():
	raise TypeError(
	"\n***Data Map in call to ReplaceImputeEncode invalid.\n"+
	"***Data Type for '"+ feature + "' is not recognized. "+
	"\n***Valid types are: DT.Interval, DT.Binary, DT.Nominal, "+
	"DT.Text, DT.String, DT.ID, or DT.Ignore")
	if v[0]==DT.Interval:
	self.interval_attributes.append(feature)
	else:
	if v[0]==DT.Binary:
	self.binary_attributes.append(feature)
	else:
	if v[0]==DT.Nominal:
	self.nominal_attributes.append(feature)
	self.onehot_cats.append(list(v[1]))
	for i in range(len(v[1])):
	if type(v[1][i])==int:
	my_str = feature+str(v[1][i])
	else:
	my_str = feature+("%i" %i)+":"+ \
	str(v[1][i])[0:10]
	self.onehot_attributes.append(my_str)
	if self.drop==True:
	self.hot_drop_list.append(my_str)
	else:
	if v[0] in DT.getDataTypes():
	continue
	else:
	# Data Map Invalid
	raise TypeError(
	"***Data Map in call to ReplaceImputeEncode invalid.\n"+
	"***Data Type for '"+ feature + "' invalid")
	sys.exit()
	self.n_interval = len(self.interval_attributes)
	self.n_binary = len(self.binary_attributes)
	self.n_nominal = len(self.nominal_attributes)
	self.n_onehot = len(self.onehot_attributes)
	self.cat = self.n_binary + self.n_nominal
	self.n_obs = df.shape[0]
	self.n_ignored = df.shape[1] - \
	self.n_interval-self.n_binary-self.n_nominal
	self.col = []
	for i in range(self.n_interval):
	self.col.append(self.interval_attributes[i])
	for i in range(self.n_binary):
	self.col.append(self.binary_attributes[i])
	if self.nominal_encoding==None:
	for i in range(self.n_nominal):
	self.col.append(self.nominal_attributes[i])
	else:
	for i in range(self.n_onehot):
	self.col.append(self.onehot_attributes[i])

	if self.display:
	print("\n******** Data Preprocessing *********")
	print("Features Dictionary Contains:\n%i Interval," \
	%self.n_interval, "\n%i Binary," %self.n_binary,\
	"\n%i Nominal, and" %self.n_nominal, \
	"\n%i Excluded Attribute(s).\n" %self.n_ignored)
	print("Data contains %i observations & %i columns.\n" %df.shape)
	self.initial_missing = df.isnull().sum()
	self.feature_names = np.array(df.columns.values)
	for feature in self.feature_names:
	if self.initial_missing[feature]>(self.n_obs/2):
	warnings.warn(feature+":has more than 50% missing." +
	"Recommend setting Data Type set to DT.Ignore.")
	# Initialize number missing in attribute_map
	for feature,v in self.features_map.items():
	try:
	self.missing_counts[feature] = self.initial_missing[feature]
	except:
	raise RuntimeError(feature + " is not found in Data_Map. ")
	sys.exit()

	# Scan for outliers among interval attributes
	nan_map = df.isnull()
	for index in df.iterrows():
	i = index[0]
	# Check for outliers in interval attributes
	for feature, v in self.features_map.items():
	if nan_map.loc[i,feature]==True:
	continue
	if v[0]==DT.Interval: # Interval Attribute
	if type(v[1]) != tuple or len(v[1]) != 2:
	raise ValueError("\n" +\
	"***Call to ReplaceImputeEncode invalid.\n"+\
	"*** Attribute Map has invalid description " +\
	"for " +feature)
	sys.exit()
	l_limit = v[1][0]
	u_limit = v[1][1]
	# Check if the value is numeric before comparing
	try:
	value = df.loc[i,feature]
	if pd.isna(value) or pd.isnull(value):
	continue # Skip NaN/None values
	# Try to convert to float for comparison
	numeric_value = float(value)
	if numeric_value > u_limit or numeric_value < l_limit:
	self.outlier_counts[feature] += 1
	self.df_copy.loc[i,feature] = None
	except (ValueError, TypeError):
	# If conversion fails, treat as outlier and convert to None
	self.outlier_counts[feature] += 1
	self.df_copy.loc[i,feature] = None
	else:
	if v[0]!=DT.Binary and v[0]!=DT.Nominal:
	# don't touch this attribute
	continue
	# Categorical Attribute
	in_cat = False
	for cat in v[1]:
	if df.loc[i,feature]==cat:
	in_cat=True
	continue
	if in_cat==False:
	self.df_copy.loc[i,feature] = None
	self.outlier_counts[feature] += 1
	if self.display:
	print("\nAttribute Counts")
	max_label = 0
	for k, v in self.features_map.items():
	if len(k) > max_label:
	max_label = len(k)
	max_label += 2
	label_format = ("{:.<%i" %(max_label+5))+"s}{:>8s}{:>10s}"
	print(label_format.format('', 'Missing', 'Outliers'))
	label_format = ("{:.<%i" %max_label)+"s}{:10d}{:10d}"
	for k,v in self.features_map.items():
	print(label_format.format(k, self.missing_counts[k], \
	self.outlier_counts[k]))
	#print(label_format.format(k, v[2][0], v[2][1]))

	def draft_data_map(self, df, max_n=10, max_s=30, display_map=True,
	out=None, replace=False):
	feature_names = np.array(df.columns.values)
	draft_features_map = {}
	print("\nGenerating DATA_MAP for use in ReplaceImputeEncode.")
	print("String attributes with fewer than", max_s,
	"unique values are labeled as Binary or Nominal; "+
	"otherwise Text or String.")
	print("Numerical attributes with fewer than", max_n,
	"unique values are considered Binary or Nominal;"+
	" otherwise Interval")

	for feature in feature_names:
	n = df[feature].value_counts()
	n_values = n.index
	astring = False
	for v in n_values:
	if (type(v)==str):
	astring = True
	break
	if astring == False:
	min_ = round(df[feature].min()-0.5,4)
	max_ = round(df[feature].max()+0.5,4)
	if astring == False:
	#Numerical attribute
	if len(n) < max_n:
	# Numerical Attribute is Binary or Nominal
	a = df[feature].unique()
	# Look for string in a
	j = 0
	for i in range(len(a)):
	if type(a[i])==str:
	j += 1
	if j>0:
	print("WARNING: ", feature, "contains both numbers "+
	"and strings. Dropping from draft data map.")
	break
	a.sort()
	categories = tuple(a)
	if len(a) == 2:
	draft_features_map[feature]=[DT.Binary,
	categories]
	else:
	draft_features_map[feature]=[DT.Nominal,
	categories]
	else:
	# Attribute is Interval
	draft_features_map[feature]=["DT.Interval",
	(min_, max_)]

	else:
	# String Attribute is Binary, Nominal or Text or String
	if len(n) < max_s:
	# String Attribute is Binary or Nominal
	a = df[feature].unique()
	# Look for nan in a
	no_nan = False
	while no_nan == False:
	j = -1
	for i in range(len(a)):
	if type(a[i]) != str:
	j = i
	if j>=0:
	a = np.delete(a,j)
	else:
	no_nan=True
	a.sort()
	categories = tuple(a)
	if len(a) == 2:
	draft_features_map[feature]=[DT.Binary,
	categories]
	else:
	draft_features_map[feature]=[DT.Nominal,
	categories]
	else:
	k = df[feature].str.len().max()
	if k>100:
	# Set attribute to text field
	draft_features_map[feature]=[DT.Text,("")]
	else:
	draft_features_map[feature]=[DT.String,("")]
	if display_map:
	# print the features map
	print("*********** DRAFT DATA MAP ************\n")
	print("data_map = {")
	for feature,v in draft_features_map.items():
	w = DT.convertDataType(v[0])
	s = "\t["
	if len(feature)<5:
	s = "\t\t["
	print("\t'"+feature+"':",s,str(w),",",v[1],"],")
	print("\n}")
	if replace==True:
	# Use this draft map for RIE processing
	self.features_map = draft_features_map
	print("Using Draft Data Map for ReplaceImputeEncode.\n"+
	"Review Draft for Data Type Accuracy.")
	if out != None:
	#Save this draft map as a pickle file <out>
	self.save_data_map(draft_features_map, out)
	print(draft_features_map)
	return draft_features_map

	def update_feature(self, feature, datatype, dataval):
	if type(feature)!=str:
	raise ValueError("feature name not string")
	sys.exit()
	if datatype!=DT.Interval and datatype!=DT.Binary and \
	datatype!=DT.Nominal and datatype!=DT.Text and \
	datatype!=DT.Ignore and datatype!=DT.String:
	raise ValueError("Data Type Value Invalid")
	sys.exit()
	if type(self.features_map)!=dict:
	self.features_map = {}
	self.features_map[feature] = [datatype, dataval]

	def save_data_map(self, data_map, fname):
	if type(data_map)!=dict:
	raise RuntimeError("Data Map invalid")
	sys.exit()
	try:
	with open(fname, 'wb') as f:
	pickle.dump(data_map, f,
	pickle.DEFAULT_PROTOCOL)
	print("Data Map Saved to Pickle File: ", fname)
	except:
	warnings.warn("Cannot save data map into file: ")

	def get_data_map(self):
	# Returns current data map
	return self.features_map

	def load_data_map(self, fname):
	try:
	with open(fname, 'rb') as f:
	data_map = pickle.load(f)
	except:
	raise ValueError("Unable to load data map from:", fname)
	sys.exit()
	if type(data_map)!=dict:
	raise ValueError("Unable to load data map from:", fname)
	sys.exit()
	return data_map

	def display_data_map(self):
	# Display Data Map Dictionary
	try:
	if self.features_map==None:
	raise RuntimeError("Data Map Does not Exist")
	except:
	raise RuntimeError("Data Map Does not Exist")
	# print the features map
	print("*********** CURRENT DATA MAP ************\n")
	print("data_map = {")
	for feature,v in self.features_map.items():
	w = DT.convertDataType(v[0])
	c = ','
	if w=='DT.Ignore' or w=='DT.Binary':
	c = ' ,'
	elif w=='DT.Nominal':
	c = ' ,'
	elif w=='DT.Text' or w=='DT.String':
	c = ' ,'
	s = "\t["
	if len(feature)<5:
	s = "\t\t["
	print("\t'"+feature+"':",s,str(w), c, v[1],"],")
	print("\n}")

	def impute(self):
	self.impute_interval()
	self.impute_binary()
	self.impute_nominal()
	self.imputed_data()

	def impute_interval(self):
	if (self.n_interval==0):
	self.imputed_interval_data = np.empty((self.n_obs, 0))
	return
	# Put the interval data from the dataframe into a numpy array
	#depricated= self.df_copy.as_matrix(columns=self.interval_attributes)
	interval_data= self.df_copy[self.interval_attributes].values
	# Create the Imputer for the Interval Data
	#self.interval_imputer = preprocessing.Imputer(strategy='mean')
	self.interval_imputer = SimpleImputer(strategy='mean')
	# Impute the missing values in the Interval data
	self.imputed_interval_data = \
	self.interval_imputer.fit_transform(interval_data)

	def impute_binary(self):
	if (self.n_binary==0):
	self.imputed_binary_data = np.empty((self.n_obs, 0))
	return
	# Put the binary data from the dataframe into a numpy array
	#cat_df = df[self.binary_attributes]
	cat_df = pd.DataFrame(columns=self.binary_attributes)
	for feature in self.binary_attributes:
	#cat_df[feature]= self.df_copy[feature].astype('category').cat.codes
	cat_df[feature]= self.df_copy[feature].astype('category')
	cat_df.loc[cat_df[feature]==-1, feature] = None
	#Depricated cat_array = cat_df.as_matrix()
	cat_array = cat_df.values
	# Create Imputer for Categorical Data
	#cat_imputer = preprocessing.Imputer(strategy='most_frequent')
	cat_imputer = SimpleImputer(strategy='most_frequent')
	# Impute the missing values in the Categorical Data
	self.imputed_binary_data = \
	cat_imputer.fit_transform(cat_array)

	def impute_nominal(self):
	if (self.n_nominal==0):
	self.imputed_nominal_data = np.empty((self.n_obs, 0))
	return
	# Put the nominal data from the dataframe into a numpy array
	cat_df = pd.DataFrame(columns=self.nominal_attributes)
	for feature in self.nominal_attributes:
	#self.cat_df[feature]= self.df_copy[feature].astype('category').cat.codes
	cat_df[feature]= self.df_copy[feature].astype('category')
	cat_df.loc[cat_df[feature]==-1, feature] = None
	#Depricated cat_array = cat_df.as_matrix()
	#print(cat_array[0])
	cat_array = cat_df.values
	# Create Imputer for Categorical Data
	#cat_imputer = preprocessing.Imputer(strategy='most_frequent')
	cat_imputer = SimpleImputer(strategy='most_frequent')
	# Impute the missing values in the Categorical Data
	self.imputed_nominal_data = \
	cat_imputer.fit_transform(cat_array)

	def imputed_data(self):
	# Bring Interval and Categorial Data Together into a dataframe
	# The Imputed Data
	# Col is not the same as self.col. col contains the main attribute
	# names, self.col contains the one-hot names
	col = self.interval_attributes + self.binary_attributes + \
	self.nominal_attributes
	# if no_impute is given, replace these attributes with their
	# original, missing values
	if self.no_impute != None:
	idx = []
	for i in range(len(self.no_impute)):
	idx.append(-1)
	for i in range(len(col)):
	for j in range(len(self.no_impute)):
	if col[i] == self.no_impute[j]:
	idx[j] = i
	break
	for j in range(len(self.no_impute)):
	k = idx[j]
	if k < 0:
	warnings.warn(" \nArgument "+self.no_impute[j]+ \
	" in 'no_impute' is invalid.\n")
	break
	if k<self.n_interval:
	for i in range(self.n_obs):
	self.imputed_interval_data[i,k] = \
	self.df_copy[self.no_impute[j]][i]
	else:
	if k < self.n_interval + self.n_binary:
	k = k - self.n_interval
	for i in range(self.n_obs):
	self.imputed_binary_data[i,k] = \
	self.df_copy[self.no_impute[j]][i]
	else:
	k = k - self.n_interval - self.n_binary
	for i in range(self.n_obs):
	self.imputed_nominal_data[i,k] = \
	self.df_copy[self.no_impute[j]][i]

	self.data_imputed= \
	np.hstack((self.imputed_interval_data,\
	self.imputed_binary_data, \
	self.imputed_nominal_data))

	self.imputed_data_df = \
	pd.DataFrame(self.data_imputed, columns=col)

	def scale_encode(self):

	self.standardize_interval()
	self.encode_binary()
	self.encode_nominal()
	self.encoded_data()

	def standardize_interval(self):
	if (self.n_interval==0 or self.interval_scale==None):
	self.scaled_interval_data = self.imputed_interval_data
	return
	# Standardize Interval Data using Z-Scores
	if self.interval_scale=='std':
	scaler = preprocessing.StandardScaler()
	scaler.fit(self.imputed_interval_data)
	self.scaled_interval_data = \
	scaler.transform(self.imputed_interval_data)
	# Standardize Interval Data using median and IQR
	if self.interval_scale=='robust':
	scaler = preprocessing.RobustScaler()
	scaler.fit(self.imputed_interval_data)
	self.scaled_interval_data = \
	scaler.transform(self.imputed_interval_data)

	def encode_binary(self):
	# Uses 1 and -1 encoding for binary instead of 0, 1
	# SAS uses the 1, -1 convention
	if self.n_binary == 0 or self.binary_encoding == None:
	return
	if self.binary_encoding == 'SAS':
	low = -1
	else: # One-hot encoding
	low = 0
	for j in range(self.n_binary):
	k = self.imputed_binary_data[0:,j].argmin()
	smallest = self.imputed_binary_data[k,j]
	for i in range(self.n_obs):
	if self.imputed_binary_data[i,j] == smallest:
	self.imputed_binary_data[i,j] = low
	else:
	self.imputed_binary_data[i,j] = 1

	def encode_nominal(self):
	if (self.n_nominal==0 or self.nominal_encoding==None):
	return
	# Create an instance of the OneHotEncoder & Selecting Attributes
	# Attributes must all be non-negative integers
	# Missing values may show up as -1 values, which will cause an error
	onehot = preprocessing.OneHotEncoder(categories=self.onehot_cats)
	self.hot_array = \
	onehot.fit_transform(self.imputed_nominal_data).toarray()
	n_features = []
	nominal_categories = 0
	for i in range(self.n_nominal):
	feature = self.nominal_attributes[i]
	v = self.features_map[feature]
	n_features.append(len(v[1]))
	nominal_categories += len(v[1])
	if nominal_categories < self.hot_array.shape[1]:
	raise RuntimeError(' Call to ReplaceImputeEncode Invalid '+ \
	' Number of one-hot columns is', self.hot_array.shape[1], \
	'but nominal categories is ', nominal_categories, \
	' Data contains more nominal attributes than '+ \
	'found in the data_map.')
	sys.exit()

	# SAS Encoding subtracts the last one-hot vector from the others,
	# for each nominal attribute.
	if self.nominal_encoding == 'SAS':
	self.sas_encoded = \
	np.zeros((self.n_obs, (self.n_onehot-self.n_nominal)))
	ilast = -1
	idx1 = 0
	idx2 = 0
	for l in range(self.n_nominal):
	m = n_features[l]
	ilast = ilast + m
	for j in range(m-1):
	for i in range(self.n_obs):
	last = self.hot_array[i,ilast]
	self.sas_encoded[i,idx1] = \
	self.hot_array[i,idx2] - last
	idx1 += 1
	idx2 += 1
	idx2 += 1

	def encoded_data(self):
	# Bring encoded and scaled data together into a dataframe
	# The Imputed and Encoded Data
	if self.n_nominal==0:
	if self.interval_scale==None:
	self.data_encoded = np.hstack((self.imputed_interval_data, \
	self.imputed_binary_data))
	else:
	self.data_encoded = np.hstack((self.scaled_interval_data, \
	self.imputed_binary_data))
	if self.drop==True:
	for i in range(self.n_nominal): #WEIRD! n_nominal=0?
	self.col.remove(self.hot_drop_list[i])

	if self.n_nominal>0 and self.nominal_encoding==None:
	if self.interval_scale==None:
	self.data_encoded = np.hstack((self.imputed_interval_data, \
	self.imputed_binary_data, \
	self.imputed_nominal_data))
	else:
	self.data_encoded = np.hstack((self.scaled_interval_data, \
	self.imputed_binary_data, \
	self.imputed_nominal_data))

	if self.n_nominal>0 and self.nominal_encoding == 'SAS':
	if self.interval_scale==None:
	self.data_encoded = np.hstack((self.imputed_interval_data, \
	self.imputed_binary_data, \
	self.sas_encoded))
	else:
	self.data_encoded = np.hstack((self.scaled_interval_data, \
	self.imputed_binary_data, \
	self.sas_encoded))
	if self.drop==True:
	for i in range(self.n_nominal):
	self.col.remove(self.hot_drop_list[i])

	if self.n_nominal>0 and self.nominal_encoding == 'one-hot':
	if self.interval_scale==None:
	self.data_encoded = np.hstack((self.imputed_interval_data, \
	self.imputed_binary_data, \
	self.hot_array))
	else:
	self.data_encoded = np.hstack((self.scaled_interval_data, \
	self.imputed_binary_data, \
	self.hot_array))
	# data_encoded array ready for conversion to dataframe

	self.encoded_data_df = \
	pd.DataFrame(self.data_encoded, columns=self.col)

	if self.nominal_encoding == 'one-hot' and self.drop==True:
	self.encoded_data_df = \
	self.encoded_data_df.drop(self.hot_drop_list, axis=1)
	for i in range(self.n_nominal):
	self.col.remove(self.hot_drop_list[i])

	def transform(self):
	self.impute()
	self.scale_encode()
	#Check for constant data columns
	feature_names = np.array(self.encoded_data_df.columns.values)
	for feature in feature_names:
	if feature in self.interval_attributes:
	self.encoded_data_df[feature] = \
	self.encoded_data_df[feature].astype('float64')
	elif feature in self.onehot_attributes:
	self.encoded_data_df[feature] = \
	self.encoded_data_df[feature].astype('int')
	elif feature in self.binary_attributes:
	if self.binary_encoding != None:
	self.encoded_data_df[feature] = \
	self.encoded_data_df[feature].astype('int')
	else:
	self.encoded_data_df[feature] = \
	self.encoded_data_df[feature].astype(\
	self.df_copy[feature].dtype)
	else:
	if type(self.encoded_data_df[feature]) == int:
	self.encoded_data_df[feature] = \
	self.encoded_data_df[feature].astype('int')
	else:
	self.encoded_data_df[feature] = \
	self.encoded_data_df[feature].astype(\
	self.df_copy[feature].dtype)
	n = self.encoded_data_df[feature].value_counts()
	if len(n)==1:
	print("WARNING: Data for ", feature, " is constant.")
	return self.encoded_data_df

	def fit_transform(self, df, data_map=None):
	self.fit(df, data_map)
	self.transform()
	return self.encoded_data_df