File size: 2,520 Bytes
a648b7a |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Derive and label-encode product/store features from a raw DataFrame.

    ``transform`` adds three derived columns:

    * ``Product_Id_Cd`` - the first two characters of ``Product_Id``.
    * ``Product_Sugar_Content_Corr`` - ``Product_Sugar_Content`` with the
      substring ``'reg'`` replaced by ``'Regular'``.
    * ``Operation_Years`` - ``reference_year - Store_Establishment_Year``.

    It then label-encodes every column in ``_ENCODED_COLUMNS``, adds an
    encoded ``Store`` column built from ``Store_Id``, and drops the columns
    in ``_DROPPED_COLUMNS``.

    Each categorical column gets its own ``LabelEncoder``. Sharing a single
    encoder across columns does not work, because every ``fit`` call
    replaces the classes learned by the previous one.

    Parameters
    ----------
    reference_year : int, default=2013
        Year that ``Store_Establishment_Year`` is subtracted from. The
        default is the value ``transform`` has always emitted, so existing
        pipelines produce the same ``Operation_Years``.

    Attributes
    ----------
    encoders_ : dict
        Column name -> fitted ``LabelEncoder``; set by ``fit``.
    le_prod : LabelEncoder
        Encoder for ``Product_Id_Cd`` (after ``fit`` this is the same object
        as ``encoders_['Product_Id_Cd']``).
    le_store : LabelEncoder
        Encoder for ``Store_Id``.
    """

    # Categorical columns that are replaced by integer codes.
    _ENCODED_COLUMNS = (
        'Product_Sugar_Content_Corr',
        'Store_Size',
        'Store_Location_City_Type',
        'Store_Type',
        'Product_Id_Cd',
    )
    # Raw columns removed from the output of ``transform``.
    _DROPPED_COLUMNS = [
        'Product_Id',
        'Store_Id',
        'Product_Sugar_Content',
        'Product_Type',
        'Store_Establishment_Year',
    ]
    # Code assigned to a category that was not present when fitting.
    _UNSEEN_CODE = -1

    def __init__(self, reference_year=2013):
        self.reference_year = reference_year
        # The encoders are stored on the instance so the mapping learned in
        # ``fit`` can be applied consistently to new data.
        self.le_prod = LabelEncoder()
        self.le_store = LabelEncoder()

    def _derive(self, X):
        """Return a copy of ``X`` with the three derived columns added."""
        derived = X.copy()
        derived['Product_Id_Cd'] = derived['Product_Id'].str[:2]
        # 'reg' contains no regex metacharacters, so a literal replacement
        # gives the same result as the regex form.
        derived['Product_Sugar_Content_Corr'] = (
            derived['Product_Sugar_Content'].str.replace(
                'reg', 'Regular', regex=False
            )
        )
        derived['Operation_Years'] = (
            self.reference_year - derived['Store_Establishment_Year']
        )
        return derived

    @classmethod
    def _encode(cls, values, encoder):
        """Map ``values`` to the integer codes of a fitted ``encoder``.

        Values that are not among ``encoder.classes_`` receive
        ``_UNSEEN_CODE`` individually, rather than the whole column being
        discarded.
        """
        codes = {label: code for code, label in enumerate(encoder.classes_)}
        # Cast to object first so a categorical column maps to plain values
        # and ``fillna`` can insert the sentinel.
        mapped = values.astype(object).map(codes)
        return mapped.fillna(cls._UNSEEN_CODE).astype(int)

    def fit(self, X, y=None):
        """Learn the label encodings from ``X`` without modifying it.

        Parameters
        ----------
        X : pandas.DataFrame
            Must contain ``Product_Id``, ``Product_Sugar_Content``,
            ``Store_Establishment_Year``, ``Store_Size``,
            ``Store_Location_City_Type``, ``Store_Type`` and ``Store_Id``.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        FeatureEngineer
            ``self``, to allow chaining.
        """
        # Work on a copy: fitting must not add columns to the caller's frame.
        derived = self._derive(X)
        self.encoders_ = {
            column: LabelEncoder().fit(derived[column])
            for column in self._ENCODED_COLUMNS
        }
        # Keep the historical attribute pointing at the Product_Id_Cd encoder.
        self.le_prod = self.encoders_['Product_Id_Cd']
        self.le_store.fit(X['Store_Id'])
        return self

    def transform(self, X):
        """Return an engineered copy of ``X``; ``X`` itself is not modified.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame with the same columns required by ``fit`` plus
            ``Product_Type`` (which is dropped from the output).

        Returns
        -------
        pandas.DataFrame
            Copy of ``X`` with the derived columns added, the categorical
            columns replaced by integer codes (``-1`` for categories not
            seen in ``fit``), a ``Store`` code column added, and the columns
            in ``_DROPPED_COLUMNS`` removed.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        KeyError
            If a required column is missing from ``X``.
        """
        engineered = self._derive(X)
        for column, encoder in self.encoders_.items():
            engineered[column] = self._encode(engineered[column], encoder)
        engineered['Store'] = self._encode(engineered['Store_Id'], self.le_store)
        # Drop the raw columns that have been turned into new features.
        return engineered.drop(columns=self._DROPPED_COLUMNS)