Spaces:

harikrishna1985
/

predictive-maintenance-space

Sleeping

App Files Files Community

harikrishna1985 committed 23 days ago

Commit

7585e98

verified ·

1 Parent(s): 66cae1e

Upload src/02_train.py with huggingface_hub

Browse files

Files changed (1) hide show

src/02_train.py +259 -0

src/02_train.py ADDED Viewed

	@@ -0,0 +1,259 @@

+import os
+import json
+from pathlib import Path
+import joblib
+import pandas as pd
+from huggingface_hub import hf_hub_download, HfApi
+from sklearn.metrics import accuracy_score, f1_score
+from sklearn.model_selection import ParameterGrid
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
+try:  # xgboost is treated as an optional dependency
+    from xgboost import XGBClassifier
+    XGBOOST_AVAILABLE = True
+except Exception:  # any failure while importing xgboost disables that candidate instead of aborting the script
+    XGBOOST_AVAILABLE = False  # checked by build_model_candidates() before adding the "xgboost" entry
+# =========================
+# CONFIG: Hugging Face repo ids, dataset file names, and local artifact paths
+# =========================
+DATASET_REPO_ID = "harikrishna1985/Engine_data"  # dataset repo the train/test CSVs are downloaded from
+MODEL_REPO_ID = "harikrishna1985/predictive-maintenance-model"  # model repo that receives the uploaded artifacts
+TRAIN_FILENAME = "processed/train.csv"  # path of the training CSV inside the dataset repo
+TEST_FILENAME = "processed/test.csv"  # path of the test CSV inside the dataset repo
+TARGET_COLUMN = "engine_condition"  # label column; normalised (strip/lower/underscores) before it is looked up
+LOCAL_ARTIFACTS_DIR = Path("artifacts")  # relative path, resolved against the current working directory
+LOCAL_ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)  # import-time side effect: the directory is created when the module loads
+BEST_MODEL_FILE = LOCAL_ARTIFACTS_DIR / "best_model.pkl"  # joblib dump of the best estimator
+RESULTS_FILE = LOCAL_ARTIFACTS_DIR / "tuning_results.csv"  # one row per successfully evaluated parameter combination
+BEST_MODEL_INFO_FILE = LOCAL_ARTIFACTS_DIR / "best_model_info.json"  # JSON metadata for the best model (name, params, metrics, feature columns)
+# =========================
+# HELPERS
+# =========================
+def get_hf_api() -> HfApi:
+    token = os.getenv("HF_TOKEN")
+    return HfApi(token=token)
+def download_train_test() -> tuple[pd.DataFrame, pd.DataFrame]:
+    train_path = hf_hub_download(
+        repo_id=DATASET_REPO_ID,
+        filename=TRAIN_FILENAME,
+        repo_type="dataset",
+    )
+    test_path = hf_hub_download(
+        repo_id=DATASET_REPO_ID,
+        filename=TEST_FILENAME,
+        repo_type="dataset",
+    )
+    train_df = pd.read_csv(train_path)
+    test_df = pd.read_csv(test_path)
+    print(f"Train shape: {train_df.shape}")
+    print(f"Test shape: {test_df.shape}")
+    return train_df, test_df
+def prepare_features(train_df: pd.DataFrame, test_df: pd.DataFrame):
+    target_col_clean = TARGET_COLUMN.strip().lower().replace(" ", "_")
+    train_df.columns = [c.strip().lower().replace(" ", "_") for c in train_df.columns]
+    test_df.columns = [c.strip().lower().replace(" ", "_") for c in test_df.columns]
+    if target_col_clean not in train_df.columns or target_col_clean not in test_df.columns:
+        raise ValueError(f"Target column '{target_col_clean}' not found in train/test data.")
+    X_train = train_df.drop(columns=[target_col_clean])
+    y_train = train_df[target_col_clean]
+    X_test = test_df.drop(columns=[target_col_clean])
+    y_test = test_df[target_col_clean]
+    # keep common columns only, same order
+    common_cols = [c for c in X_train.columns if c in X_test.columns]
+    X_train = X_train[common_cols]
+    X_test = X_test[common_cols]
+    # one-hot encode categoricals if any
+    X_train = pd.get_dummies(X_train, drop_first=False)
+    X_test = pd.get_dummies(X_test, drop_first=False)
+    X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)
+    return X_train, X_test, y_train, y_test
+def build_model_candidates():
+    candidates = {
+        "decision_tree": {
+            "model_class": DecisionTreeClassifier,
+            "param_grid": {
+                "max_depth": [3, 5, 10, None],
+                "min_samples_split": [2, 5],
+                "random_state": [42],
+            },
+        },
+        "random_forest": {
+            "model_class": RandomForestClassifier,
+            "param_grid": {
+                "n_estimators": [100, 200],
+                "max_depth": [5, 10, None],
+                "min_samples_split": [2, 5],
+                "random_state": [42],
+                "n_jobs": [-1],
+            },
+        },
+        "adaboost": {
+            "model_class": AdaBoostClassifier,
+            "param_grid": {
+                "n_estimators": [50, 100, 200],
+                "learning_rate": [0.5, 1.0],
+                "random_state": [42],
+            },
+        },
+        "gradient_boosting": {
+            "model_class": GradientBoostingClassifier,
+            "param_grid": {
+                "n_estimators": [100, 200],
+                "learning_rate": [0.05, 0.1],
+                "max_depth": [3, 5],
+                "random_state": [42],
+            },
+        },
+        "bagging": {
+            "model_class": BaggingClassifier,
+            "param_grid": {
+                "n_estimators": [50, 100],
+                "random_state": [42],
+            },
+        },
+    }
+    if XGBOOST_AVAILABLE:
+        candidates["xgboost"] = {
+            "model_class": XGBClassifier,
+            "param_grid": {
+                "n_estimators": [100, 200],
+                "max_depth": [3, 5],
+                "learning_rate": [0.05, 0.1],
+                "subsample": [0.8, 1.0],
+                "colsample_bytree": [0.8, 1.0],
+                "random_state": [42],
+                "eval_metric": ["mlogloss"],
+            },
+        }
+    return candidates
+def train_and_tune(X_train, y_train, X_test, y_test):
+    candidates = build_model_candidates()
+    all_results = []
+    best_model = None
+    best_score = -1
+    best_info = None
+    for model_name, model_spec in candidates.items():
+        model_class = model_spec["model_class"]
+        grid = list(ParameterGrid(model_spec["param_grid"]))
+        print(f"\nTraining model: {model_name}")
+        print(f"Parameter combinations: {len(grid)}")
+        for params in grid:
+            try:
+                model = model_class(**params)
+                model.fit(X_train, y_train)
+                preds = model.predict(X_test)
+                acc = accuracy_score(y_test, preds)
+                f1 = f1_score(y_test, preds, average="weighted")
+                row = {
+                    "model_name": model_name,
+                    "params": json.dumps(params),
+                    "accuracy": acc,
+                    "f1_weighted": f1,
+                }
+                all_results.append(row)
+                if f1 > best_score:
+                    best_score = f1
+                    best_model = model
+                    best_info = {
+                        "model_name": model_name,
+                        "params": params,
+                        "accuracy": acc,
+                        "f1_weighted": f1,
+                        "feature_columns": X_train.columns.tolist(),
+                        "target_column": TARGET_COLUMN.strip().lower().replace(" ", "_"),
+                    }
+                print(f"{model_name} | params={params} | acc={acc:.4f} | f1={f1:.4f}")
+            except Exception as e:
+                print(f"Skipping params due to error: {params} | error={e}")
+    if best_model is None or best_info is None:
+        raise RuntimeError("No model was trained successfully.")
+    results_df = pd.DataFrame(all_results).sort_values(by="f1_weighted", ascending=False)
+    results_df.to_csv(RESULTS_FILE, index=False)
+    joblib.dump(best_model, BEST_MODEL_FILE)
+    with open(BEST_MODEL_INFO_FILE, "w", encoding="utf-8") as f:
+        json.dump(best_info, f, indent=2)
+    print(f"\nBest model saved to: {BEST_MODEL_FILE}")
+    print(f"Tuning results saved to: {RESULTS_FILE}")
+    print(f"Best model info saved to: {BEST_MODEL_INFO_FILE}")
+    print(f"Best model: {best_info['model_name']} | f1={best_info['f1_weighted']:.4f}")
+    return best_model, best_info
+def upload_model_artifacts():
+    api = get_hf_api()
+    files_to_upload = [
+        (str(BEST_MODEL_FILE), "best_model.pkl"),
+        (str(RESULTS_FILE), "tuning_results.csv"),
+        (str(BEST_MODEL_INFO_FILE), "best_model_info.json"),
+    ]
+    for local_file, path_in_repo in files_to_upload:
+        print(f"Uploading {local_file} -> {path_in_repo}")
+        api.upload_file(
+            path_or_fileobj=local_file,
+            path_in_repo=path_in_repo,
+            repo_id=MODEL_REPO_ID,
+            repo_type="model",
+        )
+    print("Best model and tuning artifacts uploaded successfully to HF model repo.")
+def main():
+    train_df, test_df = download_train_test()
+    X_train, X_test, y_train, y_test = prepare_features(train_df, test_df)
+    train_and_tune(X_train, y_train, X_test, y_test)
+    upload_model_artifacts()
+    print("Training completed successfully.")
+if __name__ == "__main__":
+    main()