Spaces:
Running
Running
File size: 2,033 Bytes
5fd18c2 7b86f6a be4c9ca a82e8ef 98db584 a82e8ef 279b85e a82e8ef 279b85e a82e8ef 279b85e a82e8ef |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 |
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureEngineer(BaseEstimator, TransformerMixin):
    """Stateless transformer that derives extra features from sensor readings.

    For an input table, ``transform`` returns a copy with:

    * normalized column names (stripped, non-word characters replaced by
      ``_``, lower-cased);
    * ``<col>_diff``  - first difference of every numeric column;
    * ``<col>_roll5`` - rolling mean over ``ROLL_WINDOW`` rows;
    * ``<col>_anom``  - 1 where the absolute row-to-row change exceeds
      ``ANOMALY_SIGMA`` times the column's standard deviation, else 0;
    * ``temp_gap``     - ``lub_oil_temp - coolant_temp``;
    * ``pressure_sum`` - row-wise sum of the three pressure columns;
    * every remaining NaN replaced by 0.

    The tuning values are class-level constants rather than ``__init__``
    parameters, so the class keeps a parameterless constructor.
    """

    ROLL_WINDOW = 5        # rows in the rolling-mean window
    ANOMALY_SIGMA = 3      # multiples of std that count as an anomalous jump
    STD_EPSILON = 1e-9     # below this a column is treated as constant
    # Columns (after name normalization) needed by the aggregate features.
    REQUIRED_COLUMNS = (
        "lub_oil_temp",
        "coolant_temp",
        "lub_oil_pressure",
        "fuel_pressure",
        "coolant_pressure",
    )

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the transformer learns no state."""
        return self

    def transform(self, X) -> pd.DataFrame:
        """Return a feature-enriched copy of ``X``.

        Args:
            X: A ``pandas.DataFrame`` or anything ``pandas.DataFrame(...)``
                accepts. The input is never modified.

        Returns:
            A new DataFrame holding the (renamed) input columns followed by
            the derived features, with NaNs filled with 0.

        Raises:
            KeyError: If any of ``REQUIRED_COLUMNS`` is absent after the
                column names have been normalized.
        """
        # Always bind ``df``: non-DataFrame input is converted instead of
        # leaving the name undefined.
        df = X.copy() if isinstance(X, pd.DataFrame) else pd.DataFrame(X)

        # Normalize for every input, so headers such as "Lub oil temp"
        # resolve to the names the aggregates below look up.
        df.columns = self._normalize_columns(df.columns)

        # Fail early with a readable message instead of a bare KeyError
        # after all the per-column work has been done.
        missing = [c for c in self.REQUIRED_COLUMNS if c not in df.columns]
        if missing:
            raise KeyError(
                f"FeatureEngineer.transform: missing required column(s): {missing}"
            )

        # Snapshot the source columns before adding features so derived
        # columns are never themselves differenced or smoothed.
        diff_cols = df.select_dtypes(include=np.number).columns.tolist()
        # Rolling mean / std are only defined for numeric-like data; skipping
        # text and datetime columns (e.g. a timestamp) avoids a crash.
        sensor_cols = df.select_dtypes(include=[np.number, "bool"]).columns.tolist()

        # ===== diff features
        for col in diff_cols:
            df[f"{col}_diff"] = df[col].diff()

        # ===== rolling mean
        for col in sensor_cols:
            df[f"{col}_roll{self.ROLL_WINDOW}"] = (
                df[col].rolling(self.ROLL_WINDOW).mean()
            )

        # ===== anomaly flag (N-sigma jump between consecutive rows)
        for col in sensor_cols:
            std = df[col].std()
            # std is NaN/NA for fewer than two rows and ~0 for constant
            # data; in both cases no jump can be flagged.
            if pd.notna(std) and std > self.STD_EPSILON:
                jump = df[col].diff().abs()
                df[f"{col}_anom"] = (jump > self.ANOMALY_SIGMA * std).astype(int)
            else:
                df[f"{col}_anom"] = 0

        # ===== aggregates
        df["temp_gap"] = df["lub_oil_temp"] - df["coolant_temp"]  # oil vs coolant
        df["pressure_sum"] = df[
            ["lub_oil_pressure", "fuel_pressure", "coolant_pressure"]
        ].sum(axis=1)

        # diff/rolling leave NaNs in the leading rows; replace them (and any
        # NaNs already present in the input) with 0.
        return df.fillna(0)

    @staticmethod
    def _normalize_columns(columns) -> pd.Index:
        """Return ``columns`` stripped, with non-word chars as ``_``, lower-cased."""
        return (
            pd.Index(columns)
            .astype(str)  # the .str accessor rejects non-string labels
            .str.strip()
            .str.replace(r"\W", "_", regex=True)  # also covers spaces
            .str.lower()
        )
|