import os
import sys
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder, StandardScaler

from mlpipeline.entity import FeatureEngineeringConfig, FeatureEngineeringArtifact
from mlpipeline.logging.logger import get_logger
from mlpipeline.exception import FeatureEngineeringException
from mlpipeline.utils.common import save_object

logger = get_logger(__name__)


class FeatureEngineering:
    """Turn the raw train/test CSVs into model-ready feature CSVs.

    The steps, in order: missing-value imputation, label encoding of object
    columns, one interaction feature, low-variance feature removal and
    standard scaling. Encoders, the variance selector and the scaler are
    fitted on the training frame and re-applied to the test frame.
    """

    def __init__(self, config: FeatureEngineeringConfig):
        """Store the configuration and initialise the (unfitted) transformers.

        Args:
            config: Supplies ``train_path``, ``test_path``, ``root_dir``,
                ``output_train_path`` and ``output_test_path``.
        """
        self.config = config
        self.label_encoders = {}
        self.scaler = None
        # Set by _remove_low_variance(is_train=True); None means "not fitted".
        self.variance_selector = None

    def engineer_features(self) -> FeatureEngineeringArtifact:
        """Run the full feature-engineering pipeline and persist its outputs.

        Reads the train/test CSVs named in the config, transforms both
        frames, writes them to the configured output paths and saves the
        fitted scaler and label encoders to ``<root_dir>/preprocessor.pkl``.

        Returns:
            An artifact carrying the output paths and a summary message.

        Raises:
            FeatureEngineeringException: Wraps any exception raised while
                reading, transforming or writing the data.
        """
        try:
            logger.info("Starting feature engineering")

            train_df = pd.read_csv(self.config.train_path)
            test_df = pd.read_csv(self.config.test_path)

            train_df = self._handle_missing_values(train_df)
            test_df = self._handle_missing_values(test_df)

            # Train must be processed first: it fits the per-column encoders.
            train_df = self._encode_categorical(train_df, is_train=True)
            test_df = self._encode_categorical(test_df, is_train=False)

            train_df = self._create_interaction_features(train_df)
            test_df = self._create_interaction_features(test_df)

            # Train must be processed first: it fits the variance selector.
            train_df = self._remove_low_variance(train_df, is_train=True)
            test_df = self._remove_low_variance(test_df, is_train=False)

            numeric_cols = train_df.select_dtypes(include=[np.number]).columns.tolist()
            if 'target' in numeric_cols:
                # The label column is never scaled.
                numeric_cols.remove('target')

            if numeric_cols:
                self.scaler = StandardScaler()
                train_df[numeric_cols] = self.scaler.fit_transform(train_df[numeric_cols])
                test_df[numeric_cols] = self.scaler.transform(test_df[numeric_cols])

            os.makedirs(self.config.root_dir, exist_ok=True)
            train_df.to_csv(self.config.output_train_path, index=False)
            test_df.to_csv(self.config.output_test_path, index=False)

            # NOTE(review): the fitted variance selector is not part of the
            # saved preprocessor; confirm whether inference code needs it.
            preprocessor_path = Path(self.config.root_dir) / "preprocessor.pkl"
            save_object(preprocessor_path, {
                'scaler': self.scaler,
                'label_encoders': self.label_encoders
            })

            logger.info(f"Feature engineering completed. Train shape: {train_df.shape}, Test shape: {test_df.shape}")

            return FeatureEngineeringArtifact(
                train_features_path=self.config.output_train_path,
                test_features_path=self.config.output_test_path,
                is_engineered=True,
                message=f"Features engineered: {train_df.shape[1]} features"
            )
        except Exception as e:
            # Chain explicitly so the original traceback stays attached.
            raise FeatureEngineeringException(str(e), sys) from e

    def _handle_missing_values(self, df):
        """Fill missing values column by column and return the same frame.

        ``float64``/``int64`` columns are filled with their median; every
        other column is filled with its most frequent value, or the string
        ``'missing'`` when the column has no mode (all values missing).

        Args:
            df: Frame to impute; it is modified in place.

        Returns:
            The same ``df`` object with missing values filled.
        """
        for col in df.columns:
            column = df[col]
            if not column.isna().any():
                continue

            if column.dtype in [np.float64, np.int64]:
                fill_value = column.median()
            else:
                modes = column.mode()
                fill_value = modes.iloc[0] if not modes.empty else 'missing'

            # Assign back instead of `df[col].fillna(..., inplace=True)`:
            # the chained in-place form operates on a temporary and does not
            # update `df` when pandas Copy-on-Write is active.
            df[col] = column.fillna(fill_value)
        return df

    def _encode_categorical(self, df, is_train=True):
        """Label-encode every object-dtype column of ``df``.

        Args:
            df: Frame to encode; it is modified in place.
            is_train: When true, fit a new ``LabelEncoder`` per column and
                store it in ``self.label_encoders``. When false, reuse the
                stored encoders; labels not seen during fitting become
                ``-1`` and columns without a stored encoder are left
                untouched.

        Returns:
            The same ``df`` object with encoded columns.
        """
        categorical_cols = df.select_dtypes(include=['object']).columns

        for col in categorical_cols:
            if is_train:
                encoder = LabelEncoder()
                df[col] = encoder.fit_transform(df[col].astype(str))
                self.label_encoders[col] = encoder
            elif col in self.label_encoders:
                encoder = self.label_encoders[col]
                # A label's code is its position in `classes_`, so one dict
                # lookup per cell replaces one `transform([x])` call per cell.
                code_by_label = {
                    label: code for code, label in enumerate(encoder.classes_)
                }
                encoded = df[col].astype(str).map(code_by_label)
                df[col] = encoded.fillna(-1).astype(np.int64)
        return df

    def _create_interaction_features(self, df):
        """Add the product of the first two numeric non-target columns.

        The new column is named ``'<first>_x_<second>'``. Frames with fewer
        than two such columns are returned unchanged.

        Args:
            df: Frame to extend; it is modified in place.

        Returns:
            The same ``df`` object.
        """
        numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
        if 'target' in numeric_cols:
            numeric_cols.remove('target')

        if len(numeric_cols) >= 2:
            first, second = numeric_cols[0], numeric_cols[1]
            df[f'{first}_x_{second}'] = df[first] * df[second]
        return df

    def _remove_low_variance(self, df, is_train=True, threshold=0.01):
        """Drop feature columns whose variance does not exceed ``threshold``.

        The ``'target'`` column, when present, is excluded from selection
        and appended again as the last column of the result.

        Args:
            df: Frame to filter.
            is_train: When true, fit a new ``VarianceThreshold`` on the
                feature columns; when false, reuse the fitted selector.
            threshold: Variance cut-off passed to ``VarianceThreshold``.

        Returns:
            A new frame with the selected features (plus target), or ``df``
            unchanged when no selector has been fitted yet.
        """
        has_target = 'target' in df.columns
        features = df.drop(columns=['target']) if has_target else df

        if is_train:
            self.variance_selector = VarianceThreshold(threshold=threshold)
            self.variance_selector.fit(features)

        # getattr keeps this safe for instances created without __init__.
        selector = getattr(self, 'variance_selector', None)
        if selector is None:
            return df

        features_selected = pd.DataFrame(
            selector.transform(features),
            columns=features.columns[selector.get_support()],
            index=features.index
        )
        if has_target:
            return pd.concat([features_selected, df['target']], axis=1)
        return features_selected