import pandas as pd import numpy as np from sklearn.base import BaseEstimator, TransformerMixin class FeatureEngineer(BaseEstimator, TransformerMixin): def fit(self, X, y=None): return self def transform(self, X): # Ensure X is a DataFrame and copy it. if isinstance(X, pd.DataFrame): df = X.copy() else: # These are the expected column names after initial preprocessing # They should be consistent with the features defined in the overall dataset. df.columns = (df.columns .str.strip() .str.replace(" ","_") .str.replace(r"[^\w]","_",regex=True) .str.lower() ) core_sensor_cols = df.columns.tolist() # ===== diff features for col_name in df.select_dtypes(include=np.number).columns: df[f"{col_name}_diff"] = df[col_name].diff() # ===== rolling mean for col_name in core_sensor_cols: if col_name in df.columns: df[f"{col_name}_roll5"] = df[col_name].rolling(5).mean() # ===== anomaly flag (3-sigma) for col_name in core_sensor_cols: if col_name in df.columns: std = df[col_name].std() if std > 1e-9: # Use a small epsilon to check for non-zero std df[f"{col_name}_anom"] = (df[col_name].diff().abs() > 3 * std).astype(int) else: df[f"{col_name}_anom"] = 0 # No anomaly if data is constant # ===== aggregates # Corrected: Use actual string column names instead of integer indices df["temp_gap"] = df['lub_oil_temp'] - df['coolant_temp'] # oil vs coolant df["pressure_sum"] = df[['lub_oil_pressure','fuel_pressure', 'coolant_pressure']].sum(axis=1) df = df.fillna(0) # Return DataFrame with new column names for easier debugging and feature name extraction return df