Spaces:

harikrishna1985
/

predictive-maintenance-space

Sleeping

File size: 8,172 Bytes

7585e98

import os
import json
from pathlib import Path

import joblib
import pandas as pd
from huggingface_hub import hf_hub_download, HfApi

from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier

try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except Exception:
    XGBOOST_AVAILABLE = False


# =========================
# CONFIG
# =========================
DATASET_REPO_ID = "harikrishna1985/Engine_data"
MODEL_REPO_ID = "harikrishna1985/predictive-maintenance-model"

TRAIN_FILENAME = "processed/train.csv"
TEST_FILENAME = "processed/test.csv"

TARGET_COLUMN = "engine_condition"

LOCAL_ARTIFACTS_DIR = Path("artifacts")
LOCAL_ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)

BEST_MODEL_FILE = LOCAL_ARTIFACTS_DIR / "best_model.pkl"
RESULTS_FILE = LOCAL_ARTIFACTS_DIR / "tuning_results.csv"
BEST_MODEL_INFO_FILE = LOCAL_ARTIFACTS_DIR / "best_model_info.json"


# =========================
# HELPERS
# =========================
def get_hf_api() -> HfApi:
    token = os.getenv("HF_TOKEN")
    return HfApi(token=token)


def download_train_test() -> tuple[pd.DataFrame, pd.DataFrame]:
    train_path = hf_hub_download(
        repo_id=DATASET_REPO_ID,
        filename=TRAIN_FILENAME,
        repo_type="dataset",
    )
    test_path = hf_hub_download(
        repo_id=DATASET_REPO_ID,
        filename=TEST_FILENAME,
        repo_type="dataset",
    )

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")
    return train_df, test_df


def prepare_features(train_df: pd.DataFrame, test_df: pd.DataFrame):
    target_col_clean = TARGET_COLUMN.strip().lower().replace(" ", "_")

    train_df.columns = [c.strip().lower().replace(" ", "_") for c in train_df.columns]
    test_df.columns = [c.strip().lower().replace(" ", "_") for c in test_df.columns]

    if target_col_clean not in train_df.columns or target_col_clean not in test_df.columns:
        raise ValueError(f"Target column '{target_col_clean}' not found in train/test data.")

    X_train = train_df.drop(columns=[target_col_clean])
    y_train = train_df[target_col_clean]

    X_test = test_df.drop(columns=[target_col_clean])
    y_test = test_df[target_col_clean]

    # keep common columns only, same order
    common_cols = [c for c in X_train.columns if c in X_test.columns]
    X_train = X_train[common_cols]
    X_test = X_test[common_cols]

    # one-hot encode categoricals if any
    X_train = pd.get_dummies(X_train, drop_first=False)
    X_test = pd.get_dummies(X_test, drop_first=False)

    X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)

    return X_train, X_test, y_train, y_test


def build_model_candidates():
    candidates = {
        "decision_tree": {
            "model_class": DecisionTreeClassifier,
            "param_grid": {
                "max_depth": [3, 5, 10, None],
                "min_samples_split": [2, 5],
                "random_state": [42],
            },
        },
        "random_forest": {
            "model_class": RandomForestClassifier,
            "param_grid": {
                "n_estimators": [100, 200],
                "max_depth": [5, 10, None],
                "min_samples_split": [2, 5],
                "random_state": [42],
                "n_jobs": [-1],
            },
        },
        "adaboost": {
            "model_class": AdaBoostClassifier,
            "param_grid": {
                "n_estimators": [50, 100, 200],
                "learning_rate": [0.5, 1.0],
                "random_state": [42],
            },
        },
        "gradient_boosting": {
            "model_class": GradientBoostingClassifier,
            "param_grid": {
                "n_estimators": [100, 200],
                "learning_rate": [0.05, 0.1],
                "max_depth": [3, 5],
                "random_state": [42],
            },
        },
        "bagging": {
            "model_class": BaggingClassifier,
            "param_grid": {
                "n_estimators": [50, 100],
                "random_state": [42],
            },
        },
    }

    if XGBOOST_AVAILABLE:
        candidates["xgboost"] = {
            "model_class": XGBClassifier,
            "param_grid": {
                "n_estimators": [100, 200],
                "max_depth": [3, 5],
                "learning_rate": [0.05, 0.1],
                "subsample": [0.8, 1.0],
                "colsample_bytree": [0.8, 1.0],
                "random_state": [42],
                "eval_metric": ["mlogloss"],
            },
        }

    return candidates


def train_and_tune(X_train, y_train, X_test, y_test):
    candidates = build_model_candidates()

    all_results = []
    best_model = None
    best_score = -1
    best_info = None

    for model_name, model_spec in candidates.items():
        model_class = model_spec["model_class"]
        grid = list(ParameterGrid(model_spec["param_grid"]))

        print(f"\nTraining model: {model_name}")
        print(f"Parameter combinations: {len(grid)}")

        for params in grid:
            try:
                model = model_class(**params)
                model.fit(X_train, y_train)

                preds = model.predict(X_test)

                acc = accuracy_score(y_test, preds)
                f1 = f1_score(y_test, preds, average="weighted")

                row = {
                    "model_name": model_name,
                    "params": json.dumps(params),
                    "accuracy": acc,
                    "f1_weighted": f1,
                }
                all_results.append(row)

                if f1 > best_score:
                    best_score = f1
                    best_model = model
                    best_info = {
                        "model_name": model_name,
                        "params": params,
                        "accuracy": acc,
                        "f1_weighted": f1,
                        "feature_columns": X_train.columns.tolist(),
                        "target_column": TARGET_COLUMN.strip().lower().replace(" ", "_"),
                    }

                print(f"{model_name} | params={params} | acc={acc:.4f} | f1={f1:.4f}")

            except Exception as e:
                print(f"Skipping params due to error: {params} | error={e}")

    if best_model is None or best_info is None:
        raise RuntimeError("No model was trained successfully.")

    results_df = pd.DataFrame(all_results).sort_values(by="f1_weighted", ascending=False)
    results_df.to_csv(RESULTS_FILE, index=False)

    joblib.dump(best_model, BEST_MODEL_FILE)
    with open(BEST_MODEL_INFO_FILE, "w", encoding="utf-8") as f:
        json.dump(best_info, f, indent=2)

    print(f"\nBest model saved to: {BEST_MODEL_FILE}")
    print(f"Tuning results saved to: {RESULTS_FILE}")
    print(f"Best model info saved to: {BEST_MODEL_INFO_FILE}")
    print(f"Best model: {best_info['model_name']} | f1={best_info['f1_weighted']:.4f}")

    return best_model, best_info


def upload_model_artifacts():
    api = get_hf_api()

    files_to_upload = [
        (str(BEST_MODEL_FILE), "best_model.pkl"),
        (str(RESULTS_FILE), "tuning_results.csv"),
        (str(BEST_MODEL_INFO_FILE), "best_model_info.json"),
    ]

    for local_file, path_in_repo in files_to_upload:
        print(f"Uploading {local_file} -> {path_in_repo}")
        api.upload_file(
            path_or_fileobj=local_file,
            path_in_repo=path_in_repo,
            repo_id=MODEL_REPO_ID,
            repo_type="model",
        )

    print("Best model and tuning artifacts uploaded successfully to HF model repo.")


def main():
    train_df, test_df = download_train_test()
    X_train, X_test, y_train, y_test = prepare_features(train_df, test_df)
    train_and_tune(X_train, y_train, X_test, y_test)
    upload_model_artifacts()
    print("Training completed successfully.")


if __name__ == "__main__":
    main()