Spaces:
Running
Running
| import pandas as pd | |
| import numpy as np | |
| from pathlib import Path | |
| from sklearn.preprocessing import StandardScaler, LabelEncoder | |
| from sklearn.feature_selection import VarianceThreshold | |
| from mlpipeline.entity import FeatureEngineeringConfig, FeatureEngineeringArtifact | |
| from mlpipeline.logging.logger import get_logger | |
| from mlpipeline.exception import FeatureEngineeringException | |
| from mlpipeline.utils.common import save_object | |
| import sys | |
| import os | |
| logger = get_logger(__name__) | |
class FeatureEngineering:
    """Fit feature transformations on the training split and replay them on the test split.

    The pipeline run by :meth:`engineer_features` is, in order: missing-value
    imputation, label encoding of object columns, one pairwise interaction
    feature, low-variance feature removal and standard scaling.  Everything
    that is fitted (fill values, label encoders, variance selector, scaler) is
    kept on the instance and persisted next to the output CSV files.

    Attributes read from ``config``: ``train_path``, ``test_path``,
    ``root_dir``, ``output_train_path`` and ``output_test_path``.
    """

    # Label column name; it is never used as a feature (interaction, variance
    # filtering and scaling all skip it).
    _TARGET_COLUMN = 'target'

    def __init__(self, config: "FeatureEngineeringConfig"):
        self.config = config
        # Column name -> fitted LabelEncoder (populated on the training split).
        self.label_encoders = {}
        # Fitted StandardScaler, or None until engineer_features() has run.
        self.scaler = None
        # Fitted VarianceThreshold, or None until fitted on the training split.
        self.variance_selector = None
        # Column name -> imputation value learned from the training split.
        self.fill_values = {}

    def engineer_features(self) -> "FeatureEngineeringArtifact":
        """Run the full feature-engineering pipeline and write its outputs.

        Reads the train/test CSV files named in the config, transforms both,
        writes the transformed frames to the configured output paths and saves
        the fitted preprocessing objects to ``<root_dir>/preprocessor.pkl``.

        Returns:
            A ``FeatureEngineeringArtifact`` pointing at the two output files.

        Raises:
            FeatureEngineeringException: wraps any exception raised while
                reading, transforming or writing the data.
        """
        try:
            logger.info("Starting feature engineering")
            train_df = pd.read_csv(self.config.train_path)
            test_df = pd.read_csv(self.config.test_path)

            # Every step is fitted on train first, then replayed on test.
            train_df = self._handle_missing_values(train_df, is_train=True)
            test_df = self._handle_missing_values(test_df, is_train=False)
            train_df = self._encode_categorical(train_df, is_train=True)
            test_df = self._encode_categorical(test_df, is_train=False)
            train_df = self._create_interaction_features(train_df)
            test_df = self._create_interaction_features(test_df)
            train_df = self._remove_low_variance(train_df, is_train=True)
            test_df = self._remove_low_variance(test_df, is_train=False)

            numeric_cols = self._numeric_feature_columns(train_df)
            if numeric_cols:
                self.scaler = StandardScaler()
                train_df[numeric_cols] = self.scaler.fit_transform(train_df[numeric_cols])
                test_df[numeric_cols] = self.scaler.transform(test_df[numeric_cols])

            os.makedirs(self.config.root_dir, exist_ok=True)
            # The output files are not required to live directly in root_dir,
            # so make sure their own parent directories exist as well.
            for output_path in (self.config.output_train_path, self.config.output_test_path):
                Path(output_path).parent.mkdir(parents=True, exist_ok=True)
            train_df.to_csv(self.config.output_train_path, index=False)
            test_df.to_csv(self.config.output_test_path, index=False)

            preprocessor_path = Path(self.config.root_dir) / "preprocessor.pkl"
            # Persist every fitted object so the same transformation can be
            # reproduced later; 'scaler' and 'label_encoders' keep their
            # original keys, the other two entries are additions.
            save_object(preprocessor_path, {
                'scaler': self.scaler,
                'label_encoders': self.label_encoders,
                'variance_selector': self.variance_selector,
                'fill_values': self.fill_values,
            })

            logger.info(f"Feature engineering completed. Train shape: {train_df.shape}, Test shape: {test_df.shape}")
            return FeatureEngineeringArtifact(
                train_features_path=self.config.output_train_path,
                test_features_path=self.config.output_test_path,
                is_engineered=True,
                message=f"Features engineered: {train_df.shape[1]} features"
            )
        except Exception as e:
            # Chain explicitly so the original traceback stays attached.
            raise FeatureEngineeringException(str(e), sys) from e

    def _numeric_feature_columns(self, df):
        """Return the names of the numeric columns of ``df``, excluding the target."""
        return [
            col for col in df.select_dtypes(include=[np.number]).columns
            if col != self._TARGET_COLUMN
        ]

    @staticmethod
    def _compute_fill_value(series):
        """Return the imputation value for one column.

        Numeric (non-boolean) columns use the median; every other column uses
        its most frequent value, or the string ``'missing'`` when the column
        has no non-null value at all.
        """
        is_number = (
            pd.api.types.is_numeric_dtype(series)
            and not pd.api.types.is_bool_dtype(series)
        )
        if is_number:
            return series.median()
        modes = series.mode()
        return modes.iloc[0] if not modes.empty else 'missing'

    def _handle_missing_values(self, df, is_train=True):
        """Fill missing values column by column and return ``df``.

        Args:
            df: Frame to impute; its columns are replaced in place.
            is_train: When True, fill values are computed from ``df`` and
                remembered in ``self.fill_values``.  When False, the remembered
                value for a column is reused; columns that were never seen in
                training fall back to a value computed from ``df`` itself.

        Returns:
            The same ``df`` object with missing values filled.
        """
        for col in df.columns:
            series = df[col]
            if is_train or col not in self.fill_values:
                fill_value = self._compute_fill_value(series)
                if is_train:
                    self.fill_values[col] = fill_value
            else:
                fill_value = self.fill_values[col]
            # Assign the result back instead of `df[col].fillna(..., inplace=True)`:
            # the chained in-place form operates on a temporary and does not
            # update `df` when pandas Copy-on-Write is active.
            df[col] = series.fillna(fill_value)
        return df

    def _encode_categorical(self, df, is_train=True):
        """Label-encode the object-dtype columns of ``df`` and return ``df``.

        Args:
            df: Frame to encode; its columns are replaced in place.
            is_train: When True, a new ``LabelEncoder`` is fitted per column
                and stored in ``self.label_encoders``.  When False, stored
                encoders are applied and labels not seen during fitting are
                encoded as ``-1``; object columns without a stored encoder are
                left untouched.

        Returns:
            The same ``df`` object with encoded columns.
        """
        categorical_cols = df.select_dtypes(include=['object']).columns
        for col in categorical_cols:
            values = df[col].astype(str)
            if is_train:
                encoder = LabelEncoder()
                df[col] = encoder.fit_transform(values)
                self.label_encoders[col] = encoder
            elif col in self.label_encoders:
                # A label's code is its position in `classes_`, so one dict
                # lookup per row replaces a `transform([x])` call per row.
                classes = self.label_encoders[col].classes_
                lookup = {str(label): code for code, label in enumerate(classes)}
                df[col] = values.map(lookup).fillna(-1).astype('int64')
        return df

    def _create_interaction_features(self, df):
        """Add the product of the first two numeric feature columns to ``df``.

        The new column is named ``"<first>_x_<second>"``.  Nothing is added
        when fewer than two numeric feature columns exist.  The target column
        is never used as an operand.

        Returns:
            The same ``df`` object, possibly with one extra column.
        """
        numeric_cols = self._numeric_feature_columns(df)
        if len(numeric_cols) >= 2:
            first, second = numeric_cols[0], numeric_cols[1]
            df[f'{first}_x_{second}'] = df[first] * df[second]
        return df

    def _remove_low_variance(self, df, is_train=True, threshold=0.01):
        """Drop feature columns whose variance does not exceed ``threshold``.

        Args:
            df: Frame to filter; the target column, if present, is always kept
                and appended after the selected features.
            is_train: When True, a ``VarianceThreshold`` selector is fitted on
                the feature columns of ``df`` and stored on the instance.
                When False, the stored selector is applied.
            threshold: Variance threshold passed to ``VarianceThreshold``;
                only used when ``is_train`` is True.

        Returns:
            A new frame with the selected features (plus target), or ``df``
            unchanged when no selector has been fitted yet.
        """
        if self._TARGET_COLUMN in df.columns:
            target = df[self._TARGET_COLUMN]
            features = df.drop(columns=[self._TARGET_COLUMN])
        else:
            target = None
            features = df

        if is_train:
            selector = VarianceThreshold(threshold=threshold)
            selector.fit(features)
            self.variance_selector = selector

        selector = self.variance_selector
        if selector is None:
            # Nothing fitted yet (test split processed before train).
            return df

        features_selected = pd.DataFrame(
            selector.transform(features),
            columns=features.columns[selector.get_support()],
            index=features.index
        )
        if target is not None:
            return pd.concat([features_selected, target], axis=1)
        return features_selected