File size: 2,033 Bytes
5fd18c2
7b86f6a
be4c9ca
a82e8ef
 
 
 
 
 
 
 
 
 
 
 
 
98db584
a82e8ef
 
 
279b85e
a82e8ef
 
279b85e
a82e8ef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279b85e
 
 
a82e8ef
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
class FeatureEngineer(BaseEstimator, TransformerMixin):

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        # Ensure X is a DataFrame and copy it.
        if isinstance(X, pd.DataFrame):
            df = X.copy()
        else:
            # These are the expected column names after initial preprocessing
            # They should be consistent with the features defined in the overall dataset.

            df.columns = (df.columns
                           .str.strip()
                           .str.replace(" ","_")
                           .str.replace(r"[^\w]","_",regex=True)
                           .str.lower()
        )

        core_sensor_cols = df.columns.tolist()
        # ===== diff features
        for col_name in df.select_dtypes(include=np.number).columns:
            df[f"{col_name}_diff"] = df[col_name].diff()

        # ===== rolling mean
        for col_name in core_sensor_cols:
            if col_name in df.columns:
                df[f"{col_name}_roll5"] = df[col_name].rolling(5).mean()

        # ===== anomaly flag (3-sigma)
        for col_name in core_sensor_cols:
            if col_name in df.columns:
                std = df[col_name].std()
                if std > 1e-9: # Use a small epsilon to check for non-zero std
                    df[f"{col_name}_anom"] = (df[col_name].diff().abs() > 3 * std).astype(int)
                else:
                    df[f"{col_name}_anom"] = 0 # No anomaly if data is constant

        # ===== aggregates
        # Corrected: Use actual string column names instead of integer indices

        df["temp_gap"] = df['lub_oil_temp'] - df['coolant_temp']   # oil vs coolant
        df["pressure_sum"] = df[['lub_oil_pressure','fuel_pressure', 'coolant_pressure']].sum(axis=1)

        df = df.fillna(0)

        # Return DataFrame with new column names for easier debugging and feature name extraction
        return df