# Uploaded by harikrishna1985 as src/02_train.py via huggingface_hub (rev 7585e98)
import os
import json
from pathlib import Path
import joblib
import pandas as pd
from huggingface_hub import hf_hub_download, HfApi
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import ParameterGrid
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
# Optional dependency: xgboost. If the import fails for any reason, the
# XGBoost candidate is simply omitted from the model search space below.
try:
    from xgboost import XGBClassifier
    XGBOOST_AVAILABLE = True
except Exception:
    XGBOOST_AVAILABLE = False
# =========================
# CONFIG
# =========================
# HF dataset repo holding the already-processed train/test splits.
DATASET_REPO_ID = "harikrishna1985/Engine_data"
# HF model repo that receives the best model and tuning artifacts.
MODEL_REPO_ID = "harikrishna1985/predictive-maintenance-model"
TRAIN_FILENAME = "processed/train.csv"
TEST_FILENAME = "processed/test.csv"
# Label column name; normalized to snake_case before lookup in prepare_features.
TARGET_COLUMN = "engine_condition"
# Local output directory for artifacts — created eagerly at import time.
LOCAL_ARTIFACTS_DIR = Path("artifacts")
LOCAL_ARTIFACTS_DIR.mkdir(parents=True, exist_ok=True)
BEST_MODEL_FILE = LOCAL_ARTIFACTS_DIR / "best_model.pkl"
RESULTS_FILE = LOCAL_ARTIFACTS_DIR / "tuning_results.csv"
BEST_MODEL_INFO_FILE = LOCAL_ARTIFACTS_DIR / "best_model_info.json"
# =========================
# HELPERS
# =========================
def get_hf_api() -> HfApi:
    """Build an HfApi client, authenticating via the HF_TOKEN env var when set."""
    return HfApi(token=os.getenv("HF_TOKEN"))
def download_train_test() -> tuple[pd.DataFrame, pd.DataFrame]:
    """Download the processed train/test CSVs from the HF dataset repo.

    Returns the pair (train_df, test_df) and prints their shapes.
    """
    local_paths = [
        hf_hub_download(
            repo_id=DATASET_REPO_ID,
            filename=remote_name,
            repo_type="dataset",
        )
        for remote_name in (TRAIN_FILENAME, TEST_FILENAME)
    ]
    train_df, test_df = (pd.read_csv(path) for path in local_paths)
    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")
    return train_df, test_df
def prepare_features(train_df: pd.DataFrame, test_df: pd.DataFrame):
    """Split train/test frames into aligned feature matrices and label vectors.

    Column names are normalized to snake_case, the target column is separated,
    only columns common to both splits are kept (in train order), categoricals
    are one-hot encoded, and the test matrix is aligned to exactly the train
    matrix's columns (missing dummies filled with 0).

    Args:
        train_df: training split including the target column.
        test_df: test split including the target column.

    Returns:
        (X_train, X_test, y_train, y_test)

    Raises:
        ValueError: if the normalized target column is missing from either split.
    """
    def _normalize(name: str) -> str:
        # Single place for the snake_case rule applied to both headers and target.
        return name.strip().lower().replace(" ", "_")

    target_col_clean = _normalize(TARGET_COLUMN)
    # FIX: rename() returns new frames; the original assigned `.columns` on the
    # arguments, mutating the caller's DataFrames as a hidden side effect.
    train_df = train_df.rename(columns=_normalize)
    test_df = test_df.rename(columns=_normalize)
    if target_col_clean not in train_df.columns or target_col_clean not in test_df.columns:
        raise ValueError(f"Target column '{target_col_clean}' not found in train/test data.")
    X_train = train_df.drop(columns=[target_col_clean])
    y_train = train_df[target_col_clean]
    X_test = test_df.drop(columns=[target_col_clean])
    y_test = test_df[target_col_clean]
    # keep common columns only, preserving the train split's order
    common_cols = [c for c in X_train.columns if c in X_test.columns]
    X_train = X_train[common_cols]
    X_test = X_test[common_cols]
    # one-hot encode categoricals if any; left-join align gives test exactly
    # train's dummy columns, filling dummies unseen in test with 0
    X_train = pd.get_dummies(X_train, drop_first=False)
    X_test = pd.get_dummies(X_test, drop_first=False)
    X_train, X_test = X_train.align(X_test, join="left", axis=1, fill_value=0)
    return X_train, X_test, y_train, y_test
def build_model_candidates():
    """Return the model search space as name -> spec mappings.

    Each spec carries the estimator class under "model_class" and a grid of
    constructor kwargs under "param_grid". The XGBoost entry is added only
    when the optional dependency imported successfully.
    """
    seed = 42
    candidates = {}
    candidates["decision_tree"] = {
        "model_class": DecisionTreeClassifier,
        "param_grid": {
            "max_depth": [3, 5, 10, None],
            "min_samples_split": [2, 5],
            "random_state": [seed],
        },
    }
    candidates["random_forest"] = {
        "model_class": RandomForestClassifier,
        "param_grid": {
            "n_estimators": [100, 200],
            "max_depth": [5, 10, None],
            "min_samples_split": [2, 5],
            "random_state": [seed],
            "n_jobs": [-1],
        },
    }
    candidates["adaboost"] = {
        "model_class": AdaBoostClassifier,
        "param_grid": {
            "n_estimators": [50, 100, 200],
            "learning_rate": [0.5, 1.0],
            "random_state": [seed],
        },
    }
    candidates["gradient_boosting"] = {
        "model_class": GradientBoostingClassifier,
        "param_grid": {
            "n_estimators": [100, 200],
            "learning_rate": [0.05, 0.1],
            "max_depth": [3, 5],
            "random_state": [seed],
        },
    }
    candidates["bagging"] = {
        "model_class": BaggingClassifier,
        "param_grid": {
            "n_estimators": [50, 100],
            "random_state": [seed],
        },
    }
    if XGBOOST_AVAILABLE:
        candidates["xgboost"] = {
            "model_class": XGBClassifier,
            "param_grid": {
                "n_estimators": [100, 200],
                "max_depth": [3, 5],
                "learning_rate": [0.05, 0.1],
                "subsample": [0.8, 1.0],
                "colsample_bytree": [0.8, 1.0],
                "random_state": [seed],
                "eval_metric": ["mlogloss"],
            },
        }
    return candidates
def train_and_tune(X_train, y_train, X_test, y_test):
    """Exhaustively train every candidate/param combination and keep the best.

    Saves three local artifacts: the pickled best model (BEST_MODEL_FILE), a
    CSV of every evaluated combination (RESULTS_FILE), and a JSON description
    of the winner (BEST_MODEL_INFO_FILE). Returns (best_model, best_info).

    Raises:
        RuntimeError: if every parameter combination of every model failed.

    NOTE(review): selection is scored on the *test* set, so the reported
    best metrics are optimistically biased — consider a validation split.
    """
    candidates = build_model_candidates()
    all_results = []
    best_model = None
    best_score = -1  # weighted F1 lies in [0, 1], so any real score wins
    best_info = None
    for model_name, model_spec in candidates.items():
        model_class = model_spec["model_class"]
        # Materialize the full cartesian product of the param grid.
        grid = list(ParameterGrid(model_spec["param_grid"]))
        print(f"\nTraining model: {model_name}")
        print(f"Parameter combinations: {len(grid)}")
        for params in grid:
            try:
                model = model_class(**params)
                model.fit(X_train, y_train)
                preds = model.predict(X_test)
                acc = accuracy_score(y_test, preds)
                # Selection metric: weighted F1 (robust to class imbalance).
                f1 = f1_score(y_test, preds, average="weighted")
                row = {
                    "model_name": model_name,
                    "params": json.dumps(params),  # serialized so the CSV stays flat
                    "accuracy": acc,
                    "f1_weighted": f1,
                }
                all_results.append(row)
                if f1 > best_score:
                    best_score = f1
                    best_model = model
                    best_info = {
                        "model_name": model_name,
                        "params": params,
                        "accuracy": acc,
                        "f1_weighted": f1,
                        # Recorded so inference can reproduce the exact column order.
                        "feature_columns": X_train.columns.tolist(),
                        "target_column": TARGET_COLUMN.strip().lower().replace(" ", "_"),
                    }
                print(f"{model_name} | params={params} | acc={acc:.4f} | f1={f1:.4f}")
            except Exception as e:
                # Best-effort sweep: a failing combination is logged and
                # skipped rather than aborting the whole search.
                print(f"Skipping params due to error: {params} | error={e}")
    if best_model is None or best_info is None:
        raise RuntimeError("No model was trained successfully.")
    results_df = pd.DataFrame(all_results).sort_values(by="f1_weighted", ascending=False)
    results_df.to_csv(RESULTS_FILE, index=False)
    joblib.dump(best_model, BEST_MODEL_FILE)
    with open(BEST_MODEL_INFO_FILE, "w", encoding="utf-8") as f:
        json.dump(best_info, f, indent=2)
    print(f"\nBest model saved to: {BEST_MODEL_FILE}")
    print(f"Tuning results saved to: {RESULTS_FILE}")
    print(f"Best model info saved to: {BEST_MODEL_INFO_FILE}")
    print(f"Best model: {best_info['model_name']} | f1={best_info['f1_weighted']:.4f}")
    return best_model, best_info
def upload_model_artifacts():
    """Push the saved model, tuning results and metadata to the HF model repo."""
    api = get_hf_api()
    # local path -> destination path inside the model repo
    artifact_map = {
        str(BEST_MODEL_FILE): "best_model.pkl",
        str(RESULTS_FILE): "tuning_results.csv",
        str(BEST_MODEL_INFO_FILE): "best_model_info.json",
    }
    for local_file, path_in_repo in artifact_map.items():
        print(f"Uploading {local_file} -> {path_in_repo}")
        api.upload_file(
            path_or_fileobj=local_file,
            path_in_repo=path_in_repo,
            repo_id=MODEL_REPO_ID,
            repo_type="model",
        )
    print("Best model and tuning artifacts uploaded successfully to HF model repo.")
def main():
    """Run the full pipeline: fetch data, tune candidates, upload artifacts."""
    train_frame, test_frame = download_train_test()
    features_train, features_test, labels_train, labels_test = prepare_features(
        train_frame, test_frame
    )
    train_and_tune(features_train, labels_train, features_test, labels_test)
    upload_model_artifacts()
    print("Training completed successfully.")


if __name__ == "__main__":
    main()