Spaces:
Sleeping
Sleeping
Upload train.py with huggingface_hub
Browse files
train.py
CHANGED
|
@@ -31,6 +31,7 @@ from sklearn.ensemble import RandomForestClassifier # Added for RandomForest
|
|
| 31 |
from featureengineer import FeatureEngineer
|
| 32 |
from outliercapper import OutlierCapper
|
| 33 |
|
|
|
|
| 34 |
api = HfApi()
|
| 35 |
|
| 36 |
Xtrain_path = "hf://datasets/sudhirpgcmma02/Engine_PM/Xtrain.csv"
|
|
@@ -56,32 +57,19 @@ class FeatureEngineer(BaseEstimator, TransformerMixin):
|
|
| 56 |
else:
|
| 57 |
# These are the expected column names after initial preprocessing
|
| 58 |
# They should be consistent with the features defined in the overall dataset.
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
col=self.columns_
|
| 64 |
-
#df = pd.DataFrame(X, columns=expected_column_names)
|
| 65 |
-
df = pd.DataFrame(X, columns=col)
|
| 66 |
-
|
| 67 |
-
df.columns = (df.columns
|
| 68 |
.str.strip()
|
| 69 |
.str.replace(" ","_")
|
| 70 |
.str.replace(r"[^\w]","_",regex=True)
|
|
|
|
| 71 |
)
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
coolant_pressure_col = 'Coolant_pressure'
|
| 77 |
-
lub_oil_temp_col = 'lub_oil_temp'
|
| 78 |
-
coolant_temp_col = 'Coolant_temp'
|
| 79 |
-
|
| 80 |
-
core_sensor_cols = [
|
| 81 |
-
engine_rpm_col, lub_oil_pressure_col, fuel_pressure_col,
|
| 82 |
-
coolant_pressure_col, lub_oil_temp_col, coolant_temp_col
|
| 83 |
-
]
|
| 84 |
-
|
| 85 |
# ===== diff features
|
| 86 |
for col_name in df.select_dtypes(include=np.number).columns:
|
| 87 |
df[f"{col_name}_diff"] = df[col_name].diff()
|
|
@@ -102,8 +90,9 @@ class FeatureEngineer(BaseEstimator, TransformerMixin):
|
|
| 102 |
|
| 103 |
# ===== aggregates
|
| 104 |
# Corrected: Use actual string column names instead of integer indices
|
| 105 |
-
|
| 106 |
-
df["
|
|
|
|
| 107 |
|
| 108 |
df = df.fillna(0)
|
| 109 |
|
|
@@ -162,12 +151,26 @@ df.columns = (df.columns
|
|
| 162 |
.str.strip()
|
| 163 |
.str.replace(" ","_")
|
| 164 |
.str.replace(r"[^\w]","_",regex=True)
|
|
|
|
| 165 |
)
|
| 166 |
-
print(df.head(10))
|
| 167 |
|
| 168 |
# Split into X (features) and y (target)
|
| 169 |
-
Xtrain =X_train.copy()
|
|
|
|
| 170 |
ytrain =y_train.copy()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 171 |
print("########################### independent, dependent varial split completed ################################")
|
| 172 |
|
| 173 |
# Extract column names as lists for the ColumnTransformer
|
|
@@ -483,6 +486,7 @@ for name,model in best_model.named_estimators_.items():
|
|
| 483 |
print(f"\n * Base model - {name}")
|
| 484 |
pprint(model.get_params())
|
| 485 |
|
|
|
|
| 486 |
# printing the model performance (FP / FN evaluation)
|
| 487 |
print("best slected model | classification report \n",classification_report(ytest, y_pred))
|
| 488 |
print("best slected model | confusion matrix \n",confusion_matrix(ytest, y_pred))
|
|
@@ -567,3 +571,4 @@ api.upload_file(
|
|
| 567 |
repo_id=repo_id,
|
| 568 |
repo_type=repo_type,
|
| 569 |
)
|
|
|
|
|
|
| 31 |
from featureengineer import FeatureEngineer
|
| 32 |
from outliercapper import OutlierCapper
|
| 33 |
|
| 34 |
+
|
| 35 |
api = HfApi()
|
| 36 |
|
| 37 |
Xtrain_path = "hf://datasets/sudhirpgcmma02/Engine_PM/Xtrain.csv"
|
|
|
|
| 57 |
else:
|
| 58 |
# These are the expected column names after initial preprocessing
|
| 59 |
# They should be consistent with the features defined in the overall dataset.
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
print("columna names #######################\n",df.columns)
|
| 63 |
+
df.columns = (df.columns
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
.str.strip()
|
| 65 |
.str.replace(" ","_")
|
| 66 |
.str.replace(r"[^\w]","_",regex=True)
|
| 67 |
+
.str.lower()
|
| 68 |
)
|
| 69 |
+
print("columna names #######################\n",df.columns)
|
| 70 |
+
|
| 71 |
+
core_sensor_cols =df.columns.tolist()
|
| 72 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
# ===== diff features
|
| 74 |
for col_name in df.select_dtypes(include=np.number).columns:
|
| 75 |
df[f"{col_name}_diff"] = df[col_name].diff()
|
|
|
|
| 90 |
|
| 91 |
# ===== aggregates
|
| 92 |
# Corrected: Use actual string column names instead of integer indices
|
| 93 |
+
|
| 94 |
+
df["temp_gap"] = df['lub_oil_temp'] - df['coolant_temp'] # oil vs coolant
|
| 95 |
+
df["pressure_sum"] = df[['lub_oil_pressure','fuel_pressure','coolant_pressure']].sum(axis=1)
|
| 96 |
|
| 97 |
df = df.fillna(0)
|
| 98 |
|
|
|
|
| 151 |
.str.strip()
|
| 152 |
.str.replace(" ","_")
|
| 153 |
.str.replace(r"[^\w]","_",regex=True)
|
| 154 |
+
.str.lower()
|
| 155 |
)
|
| 156 |
+
print("printing 10 row",df.head(10))
|
| 157 |
|
| 158 |
# Split into X (features) and y (target)
|
| 159 |
+
#Xtrain =X_train.copy()
|
| 160 |
+
Xtrain=df.copy()
|
| 161 |
ytrain =y_train.copy()
|
| 162 |
+
|
| 163 |
+
ytrain.columns=(ytrain.columns
|
| 164 |
+
.str.strip()
|
| 165 |
+
.str.replace(" ","_")
|
| 166 |
+
.str.replace(r"[^\w]","_",regex=True)
|
| 167 |
+
)
|
| 168 |
+
Xtest.columns=(Xtest.columns
|
| 169 |
+
.str.strip()
|
| 170 |
+
.str.replace(" ","_")
|
| 171 |
+
.str.replace(r"[^\w]","_",regex=True)
|
| 172 |
+
.str.lower()
|
| 173 |
+
)
|
| 174 |
print("########################### independent, dependent varial split completed ################################")
|
| 175 |
|
| 176 |
# Extract column names as lists for the ColumnTransformer
|
|
|
|
| 486 |
print(f"\n * Base model - {name}")
|
| 487 |
pprint(model.get_params())
|
| 488 |
|
| 489 |
+
|
| 490 |
# printing the model performance (FP / FN evaluation)
|
| 491 |
print("best slected model | classification report \n",classification_report(ytest, y_pred))
|
| 492 |
print("best slected model | confusion matrix \n",confusion_matrix(ytest, y_pred))
|
|
|
|
| 571 |
repo_id=repo_id,
|
| 572 |
repo_type=repo_type,
|
| 573 |
)
|
| 574 |
+
|