Spaces:
Sleeping
Sleeping
File size: 5,252 Bytes
# a7d80f2 (scraped file-viewer header; presumably the commit hash of this revision)
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.feature_selection import VarianceThreshold
from mlpipeline.entity import FeatureEngineeringConfig, FeatureEngineeringArtifact
from mlpipeline.logging.logger import get_logger
from mlpipeline.exception import FeatureEngineeringException
from mlpipeline.utils.common import save_object
import sys
import os
# Module-level logger obtained from the project's logging helper, keyed by this module's name.
logger = get_logger(__name__)
class FeatureEngineering:
    """Build model-ready feature tables from the train and test CSV files.

    Every transformation (imputation values, label encoders, variance
    selector, scaler) is fitted on the training split only and then re-applied
    to the test split, so both splits are transformed consistently.
    """

    # Label column: excluded from interaction features, variance filtering and
    # scaling (it is still imputed and, if non-numeric, label-encoded).
    TARGET_COLUMN = "target"

    def __init__(self, config: "FeatureEngineeringConfig") -> None:
        """Store the configuration and initialise all fitted state to empty.

        Args:
            config: Object exposing ``train_path``, ``test_path``, ``root_dir``,
                ``output_train_path`` and ``output_test_path``.
        """
        self.config = config
        self.label_encoders = {}       # column name -> fitted LabelEncoder
        self.scaler = None             # fitted StandardScaler, or None if nothing was scaled
        self.variance_selector = None  # fitted VarianceThreshold, or None before fitting
        self.fill_values = {}          # column name -> imputation value learned on train

    def engineer_features(self) -> "FeatureEngineeringArtifact":
        """Run the full pipeline and write the engineered CSVs and preprocessor.

        Steps, in order: impute missing values, label-encode object columns,
        add one interaction feature, drop low-variance features, standard-scale
        the numeric features. The fitted objects are saved to
        ``<root_dir>/preprocessor.pkl`` via ``save_object``.

        Returns:
            A ``FeatureEngineeringArtifact`` pointing at the two output CSVs.

        Raises:
            FeatureEngineeringException: Wraps any error raised along the way;
                the original exception is attached as ``__cause__``.
        """
        try:
            logger.info("Starting feature engineering")
            train_df = pd.read_csv(self.config.train_path)
            test_df = pd.read_csv(self.config.test_path)

            # Fit on train first, then apply the fitted state to test.
            train_df = self._handle_missing_values(train_df, is_train=True)
            test_df = self._handle_missing_values(test_df, is_train=False)

            train_df = self._encode_categorical(train_df, is_train=True)
            test_df = self._encode_categorical(test_df, is_train=False)

            train_df = self._create_interaction_features(train_df)
            test_df = self._create_interaction_features(test_df)

            train_df = self._remove_low_variance(train_df, is_train=True)
            test_df = self._remove_low_variance(test_df, is_train=False)

            numeric_cols = [
                col
                for col in train_df.select_dtypes(include=[np.number]).columns
                if col != self.TARGET_COLUMN
            ]
            if numeric_cols:
                self.scaler = StandardScaler()
                train_df[numeric_cols] = self.scaler.fit_transform(train_df[numeric_cols])
                test_df[numeric_cols] = self.scaler.transform(test_df[numeric_cols])

            os.makedirs(self.config.root_dir, exist_ok=True)
            # The output files are not guaranteed to live directly in root_dir.
            for out_path in (self.config.output_train_path, self.config.output_test_path):
                Path(out_path).parent.mkdir(parents=True, exist_ok=True)
            train_df.to_csv(self.config.output_train_path, index=False)
            test_df.to_csv(self.config.output_test_path, index=False)

            # Persist every fitted object so the same pipeline can be replayed
            # later; 'scaler' and 'label_encoders' keep their original keys.
            preprocessor_path = Path(self.config.root_dir) / "preprocessor.pkl"
            save_object(preprocessor_path, {
                'scaler': self.scaler,
                'label_encoders': self.label_encoders,
                'variance_selector': self.variance_selector,
                'fill_values': self.fill_values,
            })

            logger.info(f"Feature engineering completed. Train shape: {train_df.shape}, Test shape: {test_df.shape}")
            return FeatureEngineeringArtifact(
                train_features_path=self.config.output_train_path,
                test_features_path=self.config.output_test_path,
                is_engineered=True,
                message=f"Features engineered: {train_df.shape[1]} features"
            )
        except Exception as e:
            raise FeatureEngineeringException(str(e), sys) from e

    @staticmethod
    def _compute_fill_value(series):
        """Return the imputation value for one column.

        Numeric (non-boolean) columns use the median; everything else uses the
        most frequent value, or the string ``'missing'`` when the column has no
        non-null value at all.
        """
        is_numeric = pd.api.types.is_numeric_dtype(series)
        if is_numeric and not pd.api.types.is_bool_dtype(series):
            return series.median()
        modes = series.mode()
        return modes.iloc[0] if not modes.empty else 'missing'

    def _handle_missing_values(self, df, is_train=True):
        """Fill missing values column by column and return the same frame.

        Args:
            df: Frame to impute; its columns are reassigned in place.
            is_train: When True (the default) the fill value of every column is
                computed from ``df`` and remembered in ``self.fill_values``.
                When False the remembered value is reused; a column that was
                never seen during training falls back to its own statistics.

        Returns:
            ``df`` with missing values filled.
        """
        for col in df.columns:
            if is_train or col not in self.fill_values:
                fill = self._compute_fill_value(df[col])
                if is_train:
                    self.fill_values[col] = fill
            else:
                fill = self.fill_values[col]
            # Plain reassignment instead of ``df[col].fillna(..., inplace=True)``:
            # the chained in-place form is deprecated and does nothing under
            # pandas Copy-on-Write.
            df[col] = df[col].fillna(fill)
        return df

    def _encode_categorical(self, df, is_train=True):
        """Label-encode categorical columns and return the same frame.

        Args:
            df: Frame to encode; its columns are reassigned in place.
            is_train: When True a new ``LabelEncoder`` is fitted for every
                object-dtype column and stored in ``self.label_encoders``.
                When False every column that has a fitted encoder is mapped
                through it, and labels unseen during training become ``-1``.

        Returns:
            ``df`` with the encoded columns replaced by integer codes.
        """
        if is_train:
            for col in df.select_dtypes(include=['object']).columns:
                encoder = LabelEncoder()
                df[col] = encoder.fit_transform(df[col].astype(str))
                self.label_encoders[col] = encoder
            return df

        # Drive the loop from the fitted encoders, not from the test dtypes, so
        # a column encoded on train is encoded here even if this CSV parsed it
        # as numeric.
        for col, encoder in self.label_encoders.items():
            if col not in df.columns:
                continue
            # One dict lookup per cell instead of one ``transform([x])`` call.
            codes = {label: code for code, label in enumerate(encoder.classes_)}
            df[col] = df[col].astype(str).map(codes).fillna(-1).astype(np.int64)
        return df

    def _create_interaction_features(self, df):
        """Add the product of the first two numeric feature columns.

        The new column is named ``<first>_x_<second>``. The target column is
        never used as a factor, and nothing is added when fewer than two
        numeric feature columns exist.

        Returns:
            ``df``, with the extra column when one was created.
        """
        numeric_cols = [
            col
            for col in df.select_dtypes(include=[np.number]).columns
            if col != self.TARGET_COLUMN
        ]
        if len(numeric_cols) >= 2:
            first, second = numeric_cols[0], numeric_cols[1]
            df[f'{first}_x_{second}'] = df[first] * df[second]
        return df

    def _remove_low_variance(self, df, is_train=True, threshold=0.01):
        """Drop feature columns whose variance does not exceed ``threshold``.

        Args:
            df: Frame to filter; the target column, if present, is set aside
                and re-attached as the last column of the result.
            is_train: When True a ``VarianceThreshold`` is fitted on the
                features and stored in ``self.variance_selector``; when False
                the stored selector is reused.
            threshold: Variance cut-off passed to ``VarianceThreshold``.

        Returns:
            A new frame with the selected features (plus the target), or ``df``
            unchanged when no selector has been fitted yet.
        """
        target_col = self.TARGET_COLUMN
        has_target = target_col in df.columns
        features = df.drop(columns=[target_col]) if has_target else df

        if is_train:
            self.variance_selector = VarianceThreshold(threshold=threshold)
            self.variance_selector.fit(features)

        # getattr keeps this safe for instances created without __init__.
        selector = getattr(self, 'variance_selector', None)
        if selector is None:
            return df

        kept_columns = features.columns[selector.get_support()]
        selected = pd.DataFrame(
            selector.transform(features),
            columns=kept_columns,
            index=features.index,
        )
        if has_target:
            return pd.concat([selected, df[target_col]], axis=1)
        return selected