sudhirpgcmma02 committed on
Commit
c1360e0
·
verified ·
1 Parent(s): b7c087b

Upload train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. train.py +31 -26
train.py CHANGED
@@ -31,6 +31,7 @@ from sklearn.ensemble import RandomForestClassifier # Added for RandomForest
31
  from featureengineer import FeatureEngineer
32
  from outliercapper import OutlierCapper
33
 
 
34
  api = HfApi()
35
 
36
  Xtrain_path = "hf://datasets/sudhirpgcmma02/Engine_PM/Xtrain.csv"
@@ -56,32 +57,19 @@ class FeatureEngineer(BaseEstimator, TransformerMixin):
56
  else:
57
  # These are the expected column names after initial preprocessing
58
  # They should be consistent with the features defined in the overall dataset.
59
- """expected_column_names = [
60
- 'Engine_rpm', 'Lub_oil_pressure', 'Fuel_pressure',
61
- 'Coolant_pressure', 'lub_oil_temp', 'Coolant_temp'
62
- ]"""
63
- col=self.columns_
64
- #df = pd.DataFrame(X, columns=expected_column_names)
65
- df = pd.DataFrame(X, columns=col)
66
-
67
- df.columns = (df.columns
68
  .str.strip()
69
  .str.replace(" ","_")
70
  .str.replace(r"[^\w]","_",regex=True)
 
71
  )
72
-
73
- engine_rpm_col = 'Engine_rpm'
74
- lub_oil_pressure_col = 'Lub_oil_pressure'
75
- fuel_pressure_col = 'Fuel_pressure'
76
- coolant_pressure_col = 'Coolant_pressure'
77
- lub_oil_temp_col = 'lub_oil_temp'
78
- coolant_temp_col = 'Coolant_temp'
79
-
80
- core_sensor_cols = [
81
- engine_rpm_col, lub_oil_pressure_col, fuel_pressure_col,
82
- coolant_pressure_col, lub_oil_temp_col, coolant_temp_col
83
- ]
84
-
85
  # ===== diff features
86
  for col_name in df.select_dtypes(include=np.number).columns:
87
  df[f"{col_name}_diff"] = df[col_name].diff()
@@ -102,8 +90,9 @@ class FeatureEngineer(BaseEstimator, TransformerMixin):
102
 
103
  # ===== aggregates
104
  # Corrected: Use actual string column names instead of integer indices
105
- df["temp_gap"] = df[lub_oil_temp_col] - df[coolant_temp_col] # oil vs coolant
106
- df["pressure_sum"] = df[[lub_oil_pressure_col, fuel_pressure_col, coolant_pressure_col]].sum(axis=1)
 
107
 
108
  df = df.fillna(0)
109
 
@@ -162,12 +151,26 @@ df.columns = (df.columns
162
  .str.strip()
163
  .str.replace(" ","_")
164
  .str.replace(r"[^\w]","_",regex=True)
 
165
  )
166
- print(df.head(10))
167
 
168
  # Split into X (features) and y (target)
169
- Xtrain =X_train.copy()
 
170
  ytrain =y_train.copy()
 
 
 
 
 
 
 
 
 
 
 
 
171
  print("########################### independent, dependent varial split completed ################################")
172
 
173
  # Extract column names as lists for the ColumnTransformer
@@ -483,6 +486,7 @@ for name,model in best_model.named_estimators_.items():
483
  print(f"\n * Base model - {name}")
484
  pprint(model.get_params())
485
 
 
486
  # printing the model performance (FP / FN evaluation)
487
  print("best slected model | classification report \n",classification_report(ytest, y_pred))
488
  print("best slected model | confusion matrix \n",confusion_matrix(ytest, y_pred))
@@ -567,3 +571,4 @@ api.upload_file(
567
  repo_id=repo_id,
568
  repo_type=repo_type,
569
  )
 
 
31
  from featureengineer import FeatureEngineer
32
  from outliercapper import OutlierCapper
33
 
34
+
35
  api = HfApi()
36
 
37
  Xtrain_path = "hf://datasets/sudhirpgcmma02/Engine_PM/Xtrain.csv"
 
57
  else:
58
  # These are the expected column names after initial preprocessing
59
  # They should be consistent with the features defined in the overall dataset.
60
+
61
+
62
+ print("columna names #######################\n",df.columns)
63
+ df.columns = (df.columns
 
 
 
 
 
64
  .str.strip()
65
  .str.replace(" ","_")
66
  .str.replace(r"[^\w]","_",regex=True)
67
+ .str.lower()
68
  )
69
+ print("columna names #######################\n",df.columns)
70
+
71
+ core_sensor_cols =df.columns.tolist()
72
+
 
 
 
 
 
 
 
 
 
73
  # ===== diff features
74
  for col_name in df.select_dtypes(include=np.number).columns:
75
  df[f"{col_name}_diff"] = df[col_name].diff()
 
90
 
91
  # ===== aggregates
92
  # Corrected: Use actual string column names instead of integer indices
93
+
94
+ df["temp_gap"] = df['lub_oil_temp'] - df['coolant_temp'] # oil vs coolant
95
+ df["pressure_sum"] = df[['lub_oil_pressure','fuel_pressure','coolant_pressure']].sum(axis=1)
96
 
97
  df = df.fillna(0)
98
 
 
151
  .str.strip()
152
  .str.replace(" ","_")
153
  .str.replace(r"[^\w]","_",regex=True)
154
+ .str.lower()
155
  )
156
+ print("printing 10 row",df.head(10))
157
 
158
  # Split into X (features) and y (target)
159
+ #Xtrain =X_train.copy()
160
+ Xtrain=df.copy()
161
  ytrain =y_train.copy()
162
+
163
+ ytrain.columns=(ytrain.columns
164
+ .str.strip()
165
+ .str.replace(" ","_")
166
+ .str.replace(r"[^\w]","_",regex=True)
167
+ )
168
+ Xtest.columns=(Xtest.columns
169
+ .str.strip()
170
+ .str.replace(" ","_")
171
+ .str.replace(r"[^\w]","_",regex=True)
172
+ .str.lower()
173
+ )
174
  print("########################### independent, dependent varial split completed ################################")
175
 
176
  # Extract column names as lists for the ColumnTransformer
 
486
  print(f"\n * Base model - {name}")
487
  pprint(model.get_params())
488
 
489
+
490
  # printing the model performance (FP / FN evaluation)
491
  print("best slected model | classification report \n",classification_report(ytest, y_pred))
492
  print("best slected model | confusion matrix \n",confusion_matrix(ytest, y_pred))
 
571
  repo_id=repo_id,
572
  repo_type=repo_type,
573
  )
574
+