harikrishna1985 committed on
Commit
7585e98
·
verified ·
1 Parent(s): 66cae1e

Upload src/02_train.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. src/02_train.py +259 -0
src/02_train.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from pathlib import Path
4
+
5
+ import joblib
6
+ import pandas as pd
7
+ from huggingface_hub import hf_hub_download, HfApi
8
+
9
+ from sklearn.metrics import accuracy_score, f1_score
10
+ from sklearn.model_selection import ParameterGrid
11
+
12
+ from sklearn.tree import DecisionTreeClassifier
13
+ from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
14
+
15
+ try:
16
+ from xgboost import XGBClassifier
17
+ XGBOOST_AVAILABLE = True
18
+ except Exception:
19
+ XGBOOST_AVAILABLE = False
20
+
21
+
22
+ # =========================
23
+ # CONFIG
24
+ # =========================
25
# Hugging Face Hub repositories: the dataset repo the CSVs are downloaded
# from and the model repo the training artifacts are uploaded to.
DATASET_REPO_ID = "harikrishna1985/Engine_data"
MODEL_REPO_ID = "harikrishna1985/predictive-maintenance-model"

# Paths of the train/test CSV files inside the dataset repo.
TRAIN_FILENAME = "processed/train.csv"
TEST_FILENAME = "processed/test.csv"

# Name of the label column (normalised again before use in the helpers).
TARGET_COLUMN = "engine_condition"

# Local output directory. It is created at import time so that the files
# written after tuning always have a parent directory.
LOCAL_ARTIFACTS_DIR = Path("artifacts")
LOCAL_ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

# Files produced by the tuning step and later uploaded to the model repo.
BEST_MODEL_FILE = LOCAL_ARTIFACTS_DIR.joinpath("best_model.pkl")
RESULTS_FILE = LOCAL_ARTIFACTS_DIR.joinpath("tuning_results.csv")
BEST_MODEL_INFO_FILE = LOCAL_ARTIFACTS_DIR.joinpath("best_model_info.json")
39
+
40
+
41
+ # =========================
42
+ # HELPERS
43
+ # =========================
44
def get_hf_api() -> HfApi:
    """Build a Hugging Face Hub API client.

    The access token is read from the ``HF_TOKEN`` environment variable; when
    the variable is unset, ``None`` is passed as the token.
    """
    return HfApi(token=os.environ.get("HF_TOKEN"))
47
+
48
+
49
def download_train_test() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Download the train and test CSVs from the dataset repo and load them.

    Returns:
        ``(train_df, test_df)`` as parsed by ``pd.read_csv``. The shape of
        each frame is printed before returning.
    """
    # Fetch both files first (train, then test), then parse them.
    train_path, test_path = [
        hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=remote_name,
            repo_type="dataset",
        )
        for remote_name in (TRAIN_FILENAME, TEST_FILENAME)
    ]

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")
    return train_df, test_df
67
+
68
+
69
def _normalize_column_name(name: str) -> str:
    """Return *name* stripped, lower-cased, with spaces replaced by underscores."""
    return name.strip().lower().replace(" ", "_")


def prepare_features(train_df: pd.DataFrame, test_df: pd.DataFrame, target_column=None):
    """Split train/test frames into aligned feature matrices and targets.

    Column names of both frames are normalised (stripped, lower-cased,
    spaces -> underscores) before the target column is looked up. Only
    feature columns present in both frames are kept, non-numeric columns are
    one-hot encoded, and the test matrix is aligned to the training columns
    (columns missing from the test matrix are filled with 0).

    The input frames are not modified: normalisation is done on renamed
    copies, so callers keep their original column labels.

    Args:
        train_df: Training data including the target column.
        test_df: Test data including the target column.
        target_column: Name of the label column. Defaults to the module-level
            ``TARGET_COLUMN`` when ``None``. It is normalised the same way as
            the frame columns.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.

    Raises:
        ValueError: If the normalised target column is missing from either
            frame.
    """
    if target_column is None:
        target_column = TARGET_COLUMN
    target_col_clean = _normalize_column_name(target_column)

    # rename() returns new frames; assigning to ``df.columns`` (as done
    # previously) would overwrite the caller's column labels in place.
    train_df = train_df.rename(columns=_normalize_column_name)
    test_df = test_df.rename(columns=_normalize_column_name)

    if target_col_clean not in train_df.columns or target_col_clean not in test_df.columns:
        raise ValueError(f"Target column '{target_col_clean}' not found in train/test data.")

    X_train = train_df.drop(columns=[target_col_clean])
    y_train = train_df[target_col_clean]

    X_test = test_df.drop(columns=[target_col_clean])
    y_test = test_df[target_col_clean]

    # Keep only the columns both splits share, in training order.
    common_cols = [c for c in X_train.columns if c in X_test.columns]
    X_train = X_train[common_cols]
    X_test = X_test[common_cols]

    # One-hot encode categoricals, if any.
    X_train = pd.get_dummies(X_train, drop_first=False)
    X_test = pd.get_dummies(X_test, drop_first=False)

    # A left join makes the training columns authoritative: dummy columns
    # seen only in test are dropped, ones missing from test are filled with 0.
    X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

    return X_train, X_test, y_train, y_test
96
+
97
+
98
def build_model_candidates():
    """Return the model families to tune, keyed by a short name.

    Each value is a dict with two entries: ``"model_class"`` (the estimator
    class to instantiate) and ``"param_grid"`` (constructor keyword -> list of
    values to try). The ``"xgboost"`` entry is appended only when
    ``XGBOOST_AVAILABLE`` is true.
    """
    # (name, estimator class, parameter grid), in the order they are tried.
    specs = [
        (
            "decision_tree",
            DecisionTreeClassifier,
            {
                "max_depth": [3, 5, 10, None],
                "min_samples_split": [2, 5],
                "random_state": [42],
            },
        ),
        (
            "random_forest",
            RandomForestClassifier,
            {
                "n_estimators": [100, 200],
                "max_depth": [5, 10, None],
                "min_samples_split": [2, 5],
                "random_state": [42],
                "n_jobs": [-1],
            },
        ),
        (
            "adaboost",
            AdaBoostClassifier,
            {
                "n_estimators": [50, 100, 200],
                "learning_rate": [0.5, 1.0],
                "random_state": [42],
            },
        ),
        (
            "gradient_boosting",
            GradientBoostingClassifier,
            {
                "n_estimators": [100, 200],
                "learning_rate": [0.05, 0.1],
                "max_depth": [3, 5],
                "random_state": [42],
            },
        ),
        (
            "bagging",
            BaggingClassifier,
            {
                "n_estimators": [50, 100],
                "random_state": [42],
            },
        ),
    ]

    # XGBoost is optional; it is offered only if its import succeeded.
    if XGBOOST_AVAILABLE:
        specs.append(
            (
                "xgboost",
                XGBClassifier,
                {
                    "n_estimators": [100, 200],
                    "max_depth": [3, 5],
                    "learning_rate": [0.05, 0.1],
                    "subsample": [0.8, 1.0],
                    "colsample_bytree": [0.8, 1.0],
                    "random_state": [42],
                    "eval_metric": ["mlogloss"],
                },
            )
        )

    return {
        name: {"model_class": estimator, "param_grid": grid}
        for name, estimator, grid in specs
    }
159
+
160
+
161
def train_and_tune(X_train, y_train, X_test, y_test):
    """Fit every candidate/parameter combination and persist the best one.

    For each entry from ``build_model_candidates()`` every combination of its
    parameter grid is fitted on the training data and scored on the test data
    (accuracy and weighted F1). The combination with the highest weighted F1
    wins. Combinations that raise are reported and skipped.

    Side effects: writes all scores (sorted by weighted F1, descending) to
    ``RESULTS_FILE``, the winning fitted model to ``BEST_MODEL_FILE`` and its
    description to ``BEST_MODEL_INFO_FILE``.

    Returns:
        ``(best_model, best_info)`` - the fitted estimator and a dict holding
        its name, parameters, scores, feature columns and target column.

    Raises:
        RuntimeError: If no combination could be trained.
    """
    # NOTE(review): the winner is chosen by its score on the test split, so
    # the reported test metrics are optimistic for the selected model.
    leaderboard = []
    champion = None
    best_info = None
    top_f1 = -1

    for model_name, spec in build_model_candidates().items():
        estimator_cls = spec["model_class"]
        combos = list(ParameterGrid(spec["param_grid"]))

        print(f"\nTraining model: {model_name}")
        print(f"Parameter combinations: {len(combos)}")

        for params in combos:
            # Best effort: one failing combination must not abort the search.
            try:
                model = estimator_cls(**params)
                model.fit(X_train, y_train)
                predicted = model.predict(X_test)

                acc = accuracy_score(y_test, predicted)
                f1 = f1_score(y_test, predicted, average="weighted")
                scores = {"accuracy": acc, "f1_weighted": f1}

                leaderboard.append(
                    {
                        "model_name": model_name,
                        "params": json.dumps(params),
                        **scores,
                    }
                )

                if f1 > top_f1:
                    top_f1 = f1
                    champion = model
                    best_info = {
                        "model_name": model_name,
                        "params": params,
                        **scores,
                        "feature_columns": X_train.columns.tolist(),
                        "target_column": TARGET_COLUMN.strip().lower().replace(" ", "_"),
                    }

                print(f"{model_name} | params={params} | acc={acc:.4f} | f1={f1:.4f}")

            except Exception as e:
                print(f"Skipping params due to error: {params} | error={e}")

    trained_ok = champion is not None and best_info is not None
    if not trained_ok:
        raise RuntimeError("No model was trained successfully.")

    ranked = pd.DataFrame(leaderboard)
    ranked = ranked.sort_values(by="f1_weighted", ascending=False)
    ranked.to_csv(RESULTS_FILE, index=False)

    joblib.dump(champion, BEST_MODEL_FILE)
    with open(BEST_MODEL_INFO_FILE, "w", encoding="utf-8") as f:
        json.dump(best_info, f, indent=2)

    print(f"\nBest model saved to: {BEST_MODEL_FILE}")
    print(f"Tuning results saved to: {RESULTS_FILE}")
    print(f"Best model info saved to: {BEST_MODEL_INFO_FILE}")
    print(f"Best model: {best_info['model_name']} | f1={best_info['f1_weighted']:.4f}")

    return champion, best_info
227
+
228
+
229
def upload_model_artifacts():
    """Upload the tuning artifacts to the Hugging Face model repo.

    The best model, the tuning results and the best-model info file are
    uploaded one at a time, in that order, to ``MODEL_REPO_ID`` using the
    client from ``get_hf_api()``.
    """
    client = get_hf_api()

    # Name in the repo -> local file; dict order is the upload order.
    uploads = {
        "best_model.pkl": BEST_MODEL_FILE,
        "tuning_results.csv": RESULTS_FILE,
        "best_model_info.json": BEST_MODEL_INFO_FILE,
    }

    for path_in_repo, artifact in uploads.items():
        local_file = str(artifact)
        print(f"Uploading {local_file} -> {path_in_repo}")
        client.upload_file(
            path_or_fileobj=local_file,
            path_in_repo=path_in_repo,
            repo_id=MODEL_REPO_ID,
            repo_type="model",
        )

    print("Best model and tuning artifacts uploaded successfully to HF model repo.")
248
+
249
+
250
def main():
    """Run the pipeline end to end: download, prepare, tune, upload."""
    frames = download_train_test()
    X_train, X_test, y_train, y_test = prepare_features(*frames)

    # train_and_tune takes (X_train, y_train, X_test, y_test), which differs
    # from the order returned above. Its return value is not needed here:
    # the artifacts reach the upload step through the files it writes.
    train_and_tune(X_train, y_train, X_test, y_test)

    upload_model_artifacts()
    print("Training completed successfully.")


if __name__ == "__main__":
    main()