SharleyK commited on
Commit
9d8621a
·
verified ·
1 Parent(s): adf74a1

Upload folder using huggingface_hub

Browse files
.github/workflows/pipeline.yml ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# End-to-end ML pipeline: register data -> EDA -> prepare -> train -> register model.
name: Predictive Maintenance ML Pipeline

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

env:
  PYTHON_VERSION: '3.11'
  HF_USERNAME: 'SharleyK'
  DATASET_NAME: 'PredictiveMaintenance'
  MODEL_NAME: 'engine-predictive-maintenance'
  MLFLOW_TRACKING_URI: 'file:./mlruns'

jobs:
  data-registration:
    name: Data Registration
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Reuse the single source of truth declared in env (the hard-coded
          # bare 3.11 ignored PYTHON_VERSION; quoting also avoids YAML
          # float parsing for versions like 3.10).
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Create project folders
        run: |
          python scripts/01_create_folders.py

      - name: Register data to Hugging Face
        env:
          # BUG FIX: GitHub Actions expressions use ${{ ... }}; the original
          # ${ secrets.HF_TOKEN } was passed through as literal text, so the
          # scripts received a bogus token.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/02_register_data.py

      - name: Verify data registration
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/03_verify_data.py

  exploratory-analysis:
    name: Exploratory Data Analysis
    runs-on: ubuntu-latest
    needs: data-registration

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run EDA analysis
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/05_run_eda.py

  data-preparation:
    name: Data Preparation
    runs-on: ubuntu-latest
    needs: exploratory-analysis

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Clean, split, and upload run in ONE job so the intermediate CSVs
      # written to data/ survive between the scripts (runners are ephemeral).
      - name: Clean and prepare data
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/07_clean_data.py
          python scripts/09_train_test_split.py
          python scripts/10_upload_processed_data.py

  model-training:
    name: Model Training & Experimentation
    runs-on: ubuntu-latest
    needs: data-preparation

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Train all models
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/13_train_decision_tree.py
          python scripts/14_train_bagging.py
          python scripts/15_train_random_forest.py
          python scripts/16_train_adaboost.py
          python scripts/17_train_gradient_boosting.py
          python scripts/18_train_xgboost.py

      - name: Compare and register best model
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/19_compare_models.py
          python scripts/20_register_best_model.py
predictive-maintenance-pipeline.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:942403e9ef0aae01674d35092c2bf19c2367a99e0100bbdf70039ba3c1abb30b
3
+ size 16138
predictive-maintenance-pipeline/.github/workflows/pipeline.yml ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# End-to-end ML pipeline: register data -> EDA -> prepare -> train -> register model.
name: Predictive Maintenance ML Pipeline

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

env:
  PYTHON_VERSION: '3.11'
  HF_USERNAME: 'SharleyK'
  DATASET_NAME: 'PredictiveMaintenance'
  MODEL_NAME: 'engine-predictive-maintenance'
  MLFLOW_TRACKING_URI: 'file:./mlruns'

jobs:
  data-registration:
    name: Data Registration
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Reuse the single source of truth declared in env (the hard-coded
          # bare 3.11 ignored PYTHON_VERSION; quoting also avoids YAML
          # float parsing for versions like 3.10).
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Create project folders
        run: |
          python scripts/01_create_folders.py

      - name: Register data to Hugging Face
        env:
          # BUG FIX: GitHub Actions expressions use ${{ ... }}; the original
          # ${ secrets.HF_TOKEN } was passed through as literal text, so the
          # scripts received a bogus token.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/02_register_data.py

      - name: Verify data registration
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/03_verify_data.py

  exploratory-analysis:
    name: Exploratory Data Analysis
    runs-on: ubuntu-latest
    needs: data-registration

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run EDA analysis
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/05_run_eda.py

  data-preparation:
    name: Data Preparation
    runs-on: ubuntu-latest
    needs: exploratory-analysis

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Clean, split, and upload run in ONE job so the intermediate CSVs
      # written to data/ survive between the scripts (runners are ephemeral).
      - name: Clean and prepare data
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/07_clean_data.py
          python scripts/09_train_test_split.py
          python scripts/10_upload_processed_data.py

  model-training:
    name: Model Training & Experimentation
    runs-on: ubuntu-latest
    needs: data-preparation

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Train all models
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/13_train_decision_tree.py
          python scripts/14_train_bagging.py
          python scripts/15_train_random_forest.py
          python scripts/16_train_adaboost.py
          python scripts/17_train_gradient_boosting.py
          python scripts/18_train_xgboost.py

      - name: Compare and register best model
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/19_compare_models.py
          python scripts/20_register_best_model.py
predictive-maintenance-pipeline/.gitignore ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.so
5
+ .Python
6
+ venv/
7
+ env/
8
+
9
+ # Jupyter
10
+ .ipynb_checkpoints
11
+
12
+ # Data (optional - uncomment if you want to track data files)
13
+ # *.csv
14
+ # *.xlsx
15
+
16
+ # Models
17
+ models/*.pkl
18
+ !models/.gitkeep
19
+
20
+ # MLflow
21
+ mlruns/
22
+
23
+ # IDE
24
+ .vscode/
25
+ .idea/
26
+
27
+ # OS
28
+ .DS_Store
29
+ Thumbs.db
30
+
31
+ # Secrets
32
+ .env
33
+ *.token
predictive-maintenance-pipeline/README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Predictive Maintenance ML Pipeline
2
+
3
+ Automated end-to-end machine learning pipeline for engine predictive maintenance using GitHub Actions.
4
+
5
+ ## Quick Start
6
+
7
+ ### 1. Prerequisites
8
+ - GitHub account
9
+ - Hugging Face account with token
10
+ - Python 3.11 (the version pinned in the CI workflow)
11
+
12
+ ### 2. Setup
13
+
14
+ **Add GitHub Secret:**
15
+ 1. Go to Repository Settings → Secrets → Actions
16
+ 2. Add secret: `HF_TOKEN` = your Hugging Face token
17
+
18
+ **Create Hugging Face Repositories:**
19
+ - Dataset: `SharleyK/PredictiveMaintenance`
20
+ - Model: `SharleyK/engine-predictive-maintenance`
21
+
22
+ ### 3. Usage
23
+
24
+ **Upload your data:**
25
+ ```bash
26
+ # Place engine_data.csv in data/ folder
27
+ ```
28
+
29
+ **Push to GitHub:**
30
+ ```bash
31
+ git add .
32
+ git commit -m "Initial commit"
33
+ git push origin main
34
+ ```
35
+
36
+ The pipeline will run automatically!
37
+
38
+ ## Pipeline Overview
39
+
40
+ 1. **Data Registration** - Upload data to Hugging Face
41
+ 2. **EDA** - Exploratory data analysis
42
+ 3. **Data Preparation** - Clean and split data
43
+ 4. **Model Training** - Train 6 models with MLflow
44
+ - Decision Tree
45
+ - Bagging
46
+ - Random Forest
47
+ - AdaBoost
48
+ - Gradient Boosting
49
+ - XGBoost
50
+ 5. **Model Registration** - Upload best model to HF
51
+
52
+ ## Project Structure
53
+
54
+ ```
55
+ predictive-maintenance-pipeline/
56
+ ├── .github/workflows/pipeline.yml # GitHub Actions workflow
57
+ ├── scripts/ # Python scripts
58
+ ├── data/ # Data files
59
+ ├── models/ # Trained models
60
+ ├── outputs/ # Results
61
+ └── requirements.txt # Dependencies
62
+ ```
63
+
64
+ ## Results
65
+
66
+ After pipeline execution:
67
+ - Data available at: `https://huggingface.co/datasets/SharleyK/PredictiveMaintenance`
68
+ - Model available at: `https://huggingface.co/SharleyK/engine-predictive-maintenance`
69
+ - MLflow experiments in `mlruns/` folder
70
+
71
+ ## Documentation
72
+
73
+ - Full implementation guide in repository
74
+ - MLflow UI: `mlflow ui --backend-store-uri file:./mlruns`
75
+
76
+ ## Support
77
+
78
+ For issues, create a GitHub issue in this repository.
predictive-maintenance-pipeline/requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas==2.0.3
2
+ numpy==1.24.3
3
+ scikit-learn==1.3.0
4
+ matplotlib==3.7.2
5
+ seaborn==0.12.2
6
+ plotly==5.15.0
7
+ xgboost==1.7.6
8
+ mlflow==2.5.0
9
+ huggingface-hub==0.16.4
10
+ datasets==2.14.0
11
+ joblib==1.3.1
12
+ python-dotenv==1.0.0
predictive-maintenance-pipeline/scripts/01_create_folders.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Create the project's folder skeleton (idempotent)."""
import os
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directories the pipeline expects to exist; the nested outputs/* entries
# are listed explicitly so the script also works on an empty checkout.
REQUIRED_DIRS = ('data', 'models', 'outputs', 'outputs/eda', 'outputs/models', 'mlruns')

for path in REQUIRED_DIRS:
    # exist_ok makes reruns safe: an already-present directory is a no-op.
    os.makedirs(path, exist_ok=True)
    logger.info("✓ Created: %s", path)

logger.info("✓ Folder structure created successfully!")
predictive-maintenance-pipeline/scripts/02_register_data.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Register dataset to Hugging Face Hub"""
import os
import logging
from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Fail fast with a clear message rather than an opaque 401 from the Hub.
if not HF_TOKEN:
    raise ValueError("HF_TOKEN not set!")

logger.info(f"Registering dataset: {repo_id}")

api = HfApi(token=HF_TOKEN)

# Create the dataset repo only if it does not already exist (EAFP: probe
# with repo_info, create on RepositoryNotFoundError).
try:
    api.repo_info(repo_id=repo_id, repo_type="dataset")
    logger.info(f"✓ Repository exists: {repo_id}")
except RepositoryNotFoundError:
    create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN, private=False)
    logger.info(f"✓ Created repository: {repo_id}")

# Upload the raw CSV when present in the local checkout; silently skipped
# otherwise (best-effort — the verify step catches a missing upload).
if os.path.exists("data/engine_data.csv"):
    api.upload_file(
        path_or_fileobj="data/engine_data.csv",
        path_in_repo="engine_data.csv",
        repo_id=repo_id,
        repo_type="dataset",
        token=HF_TOKEN
    )
    logger.info("✓ Uploaded engine_data.csv")

logger.info("✓ Data registration completed!")
predictive-maintenance-pipeline/scripts/03_verify_data.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Verify data on Hugging Face"""
import os
import logging
import pandas as pd
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

logger.info(f"Verifying dataset: {repo_id}")

# Fetch the CSV that 02_register_data.py uploaded; hf_hub_download raises
# if the file or repo is missing, which fails this CI step as intended.
downloaded_file = hf_hub_download(
    repo_id=repo_id,
    repo_type="dataset",
    filename="engine_data.csv",
    token=HF_TOKEN)

# Smoke check: the file must parse as CSV; shape/columns go to the CI log.
df = pd.read_csv(downloaded_file)
logger.info(f"✓ Dataset shape: {df.shape}")
logger.info(f"✓ Columns: {list(df.columns)}")
logger.info("✓ Data verification completed!")
predictive-maintenance-pipeline/scripts/05_run_eda.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Run Exploratory Data Analysis.

Downloads the raw dataset from the Hugging Face Hub, then writes a
histogram + box plot per numeric feature and a correlation heatmap
to outputs/eda/.
"""
import os
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

logger.info("Starting EDA...")

# Download data
file_path = hf_hub_download(
    repo_id=repo_id,
    repo_type="dataset",
    filename="engine_data.csv",
    token=HF_TOKEN)

df = pd.read_csv(file_path)

os.makedirs("outputs/eda", exist_ok=True)

# Restrict to numeric columns once; reused by both the univariate plots
# and the correlation matrix below.
numeric_df = df.select_dtypes(include=[np.number])

# Univariate Analysis: histogram + box plot side by side for each feature.
logger.info("Creating univariate plots...")
for col in numeric_df.columns:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    df[col].hist(ax=ax1, bins=30)
    ax1.set_title(f'{col} Distribution')
    df.boxplot(column=col, ax=ax2)
    ax2.set_title(f'{col} Box Plot')
    plt.tight_layout()
    plt.savefig(f'outputs/eda/{col}_univariate.png')
    plt.close()

# Correlation Heatmap.
# BUG FIX: df.corr() raises in pandas >= 2.0 (requirements pin 2.0.3) when
# the frame contains non-numeric columns; correlate the numeric subset only.
logger.info("Creating correlation heatmap...")
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.tight_layout()
plt.savefig('outputs/eda/correlation_heatmap.png')
plt.close()

logger.info("✓ EDA completed! Plots saved to outputs/eda/")
predictive-maintenance-pipeline/scripts/07_clean_data.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Clean and prepare data.

Downloads the raw dataset from the Hugging Face Hub, standardizes column
names, drops duplicates and missing rows, and writes data/cleaned_data.csv
for the downstream train/test split step.
"""
import os
import logging
import pandas as pd
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

logger.info("Cleaning data...")

# Download data
file_path = hf_hub_download(repo_id=repo_id, repo_type="dataset",
                            filename="engine_data.csv", token=HF_TOKEN)

df = pd.read_csv(file_path)

logger.info(f"Original shape: {df.shape}")
logger.info(f"Original columns: {df.columns.tolist()}")

# Standardize column names to lowercase with underscores so downstream
# scripts can rely on a fixed schema regardless of source naming.
df.columns = df.columns.str.lower().str.replace(' ', '_')
logger.info(f"Standardized columns: {df.columns.tolist()}")

# Verify the target column exists before doing any work that depends on it.
if 'engine_condition' not in df.columns:
    logger.error("Target column 'engine_condition' not found after standardization!")
    logger.error(f"Available columns: {df.columns.tolist()}")
    raise KeyError("Missing expected target column")

# Remove duplicates
initial_rows = df.shape[0]
df = df.drop_duplicates()
logger.info(f"After removing duplicates: {df.shape} (removed {initial_rows - df.shape[0]} rows)")

# Handle missing values (if any)
initial_rows = df.shape[0]
df = df.dropna()
logger.info(f"After dropping NA: {df.shape} (removed {initial_rows - df.shape[0]} rows)")

# Log target distribution.
# BUG FIX: the original f-string contained a literal newline inside a
# single-quoted string ("Target distribution:<newline>..."), which is a
# SyntaxError; use an explicit \n via lazy logging args instead.
logger.info("Target distribution:\n%s", df['engine_condition'].value_counts())

# Save cleaned data
os.makedirs("data", exist_ok=True)
df.to_csv("data/cleaned_data.csv", index=False)

logger.info("✓ Data cleaning completed!")
logger.info(f"✓ Final columns: {df.columns.tolist()}")
predictive-maintenance-pipeline/scripts/09_train_test_split.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Split data into train and test sets"""
import os
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Splitting data...")

# Load cleaned data (written by scripts/07_clean_data.py in the same job)
df = pd.read_csv("data/cleaned_data.csv")

# Separate features and target
X = df.drop('engine_condition', axis=1)
y = df['engine_condition']

# Split data; stratify keeps the class balance identical in both splits,
# and the fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

logger.info(f"Train shape: {X_train.shape}")
logger.info(f"Test shape: {X_test.shape}")

# Scale features: fit on the training split only, then apply the fitted
# transform to the test split (avoids data leakage).
# NOTE(review): the fitted scaler is not persisted, so inference-time
# inputs cannot be scaled identically later — consider joblib.dump'ing it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save as DataFrames (the scaler returns bare numpy arrays; restore the
# original column names and re-attach the untouched target column)
train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
train_df['engine_condition'] = y_train.values

test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
test_df['engine_condition'] = y_test.values

train_df.to_csv('data/train_scaled.csv', index=False)
test_df.to_csv('data/test_scaled.csv', index=False)

logger.info("✓ Train-test split completed!")
predictive-maintenance-pipeline/scripts/10_upload_processed_data.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Upload processed data to Hugging Face"""
import os
import logging
from huggingface_hub import HfApi

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

api = HfApi(token=HF_TOKEN)

logger.info("Uploading processed data...")

# Upload train data (written by scripts/09_train_test_split.py)
api.upload_file(
    path_or_fileobj="data/train_scaled.csv",
    path_in_repo="train_scaled.csv",
    repo_id=repo_id,
    repo_type="dataset",
    token=HF_TOKEN
)
logger.info("✓ Uploaded train_scaled.csv")

# Upload test data
api.upload_file(
    path_or_fileobj="data/test_scaled.csv",
    path_in_repo="test_scaled.csv",
    repo_id=repo_id,
    repo_type="dataset",
    token=HF_TOKEN
)
logger.info("✓ Uploaded test_scaled.csv")

logger.info("✓ Data upload completed!")
predictive-maintenance-pipeline/scripts/13_train_decision_tree.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train Decision Tree model with MLflow tracking"""
import os
import logging
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from huggingface_hub import hf_hub_download
import joblib
import json

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Set up MLflow (local file store; the same experiment name is shared by
# all trainer scripts so runs appear side by side)
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

logger.info("Loading data from Hugging Face...")

# Download train and test data (uploaded by scripts/10_upload_processed_data.py)
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset",
                             filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset",
                            filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training Decision Tree...")

# Hyperparameter search space for the tree.
param_grid = {
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

with mlflow.start_run(run_name="Decision_Tree"):
    mlflow.set_tag("model_type", "Decision Tree")

    # 5-fold grid search optimizing F1.
    # NOTE(review): scoring='f1' and the unqualified metric calls below use
    # the 'binary' average — assumes engine_condition is binary; confirm.
    dt_model = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Log parameters
    mlflow.log_params(grid_search.best_params_)

    # Make predictions
    y_pred = best_model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # Log model
    mlflow.sklearn.log_model(best_model, "model")

    # Save model locally
    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/decision_tree.pkl")

    # Save metrics (consumed later by scripts/19_compare_models.py)
    metrics = {
        "model": "Decision Tree",
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4)
    }

    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/decision_tree_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)

    logger.info(f"✓ Decision Tree trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/14_train_bagging.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train Bagging model with MLflow tracking.

Loads the scaled train/test splits from the Hugging Face Hub, grid-searches
a BaggingClassifier, logs params/metrics/model to MLflow, and saves the
fitted model plus a metrics JSON for the model-comparison step.
"""
import os
import json
import logging
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import BaggingClassifier
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Shared experiment so all trainer runs appear side by side.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

# Load data
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training Bagging...")

param_grid = {'n_estimators': [50, 100, 200], 'max_samples': [0.5, 0.7, 1.0]}

with mlflow.start_run(run_name="Bagging"):
    mlflow.set_tag("model_type", "Bagging")

    model = BaggingClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)

    # Evaluate on the held-out test split.
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(best_model, "model")

    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/bagging.pkl")

    # BUG FIX: this script imported json but never wrote the metrics file,
    # unlike 13_train_decision_tree.py; the comparison step
    # (scripts/19_compare_models.py) needs a per-model metrics JSON.
    metrics = {
        "model": "Bagging",
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4)
    }
    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/bagging_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)

    logger.info(f"✓ Bagging trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/15_train_random_forest.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train Random Forest model with MLflow tracking.

Loads the scaled train/test splits from the Hugging Face Hub, grid-searches
a RandomForestClassifier, logs params/metrics/model to MLflow, and saves the
fitted model plus a metrics JSON for the model-comparison step.
"""
import os
import json
import logging
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Shared experiment so all trainer runs appear side by side.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

# Load data
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training Random Forest...")

param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5]}

with mlflow.start_run(run_name="Random_Forest"):
    mlflow.set_tag("model_type", "Random Forest")

    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)

    # Evaluate on the held-out test split.
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(best_model, "model")

    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/random_forest.pkl")

    # BUG FIX: this script imported json but never wrote the metrics file,
    # unlike 13_train_decision_tree.py; the comparison step
    # (scripts/19_compare_models.py) needs a per-model metrics JSON.
    metrics = {
        "model": "Random Forest",
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4)
    }
    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/random_forest_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)

    logger.info(f"✓ Random Forest trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/16_train_adaboost.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train AdaBoost model with MLflow tracking.

Loads the scaled train/test splits from the Hugging Face Hub, grid-searches
an AdaBoostClassifier, logs params/metrics/model to MLflow, and saves the
fitted model plus a metrics JSON for the model-comparison step.
"""
import os
import json
import logging
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Shared experiment so all trainer runs appear side by side.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

# Load data
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training AdaBoost...")

param_grid = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]}

with mlflow.start_run(run_name="AdaBoost"):
    mlflow.set_tag("model_type", "AdaBoost")

    model = AdaBoostClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)

    # Evaluate on the held-out test split.
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(best_model, "model")

    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/adaboost.pkl")

    # BUG FIX: this script imported json but never wrote the metrics file,
    # unlike 13_train_decision_tree.py; the comparison step
    # (scripts/19_compare_models.py) needs a per-model metrics JSON.
    metrics = {
        "model": "AdaBoost",
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4)
    }
    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/adaboost_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)

    logger.info(f"✓ AdaBoost trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/17_train_gradient_boosting.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train a Gradient Boosting model with MLflow tracking.

Downloads the pre-scaled train/test splits from the Hugging Face dataset
repo, grid-searches a GradientBoostingClassifier, logs params/metrics to
MLflow, and saves both the fitted model and a metrics JSON that the
model-comparison step (scripts/19_compare_models.py) consumes.
"""
import json
import logging
import os

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from huggingface_hub import hf_hub_download
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hugging Face dataset coordinates; overridable via environment for forks.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Local file-backed MLflow store (matches the workflow's MLFLOW_TRACKING_URI).
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

# Load the already-scaled splits registered earlier in the pipeline.
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training Gradient Boosting...")

param_grid = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}

with mlflow.start_run(run_name="Gradient_Boosting"):
    mlflow.set_tag("model_type", "Gradient Boosting")

    model = GradientBoostingClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)

    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(best_model, "model")

    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/gradient_boosting.pkl")

    # Persist metrics where scripts/19_compare_models.py looks for them
    # (outputs/models/*_metrics.json). Previously `json` was imported but
    # never used and no metrics file was written, so model comparison
    # found nothing. "Gradient Boosting" lowercased/underscored by script
    # 20 matches the models/gradient_boosting.pkl filename above.
    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/gradient_boosting_metrics.json", "w") as fh:
        json.dump(
            {
                "model": "Gradient Boosting",
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1_score": f1,
            },
            fh,
            indent=2,
        )

logger.info(f"✓ Gradient Boosting trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/18_train_xgboost.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train an XGBoost model with MLflow tracking.

Downloads the pre-scaled train/test splits from the Hugging Face dataset
repo, grid-searches an XGBClassifier, logs params/metrics to MLflow, and
saves both the fitted model and a metrics JSON that the model-comparison
step (scripts/19_compare_models.py) consumes.
"""
import json
import logging
import os

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from huggingface_hub import hf_hub_download
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hugging Face dataset coordinates; overridable via environment for forks.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Local file-backed MLflow store (matches the workflow's MLFLOW_TRACKING_URI).
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

# Load the already-scaled splits registered earlier in the pipeline.
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training XGBoost...")

param_grid = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5, 7]}

with mlflow.start_run(run_name="XGBoost"):
    mlflow.set_tag("model_type", "XGBoost")

    model = XGBClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)

    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(best_model, "model")

    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/xgboost.pkl")

    # Persist metrics where scripts/19_compare_models.py looks for them
    # (outputs/models/*_metrics.json). Previously `json` was imported but
    # never used and no metrics file was written, so model comparison
    # found nothing. "XGBoost" lowercased by script 20 matches the
    # models/xgboost.pkl filename above.
    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/xgboost_metrics.json", "w") as fh:
        json.dump(
            {
                "model": "XGBoost",
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1_score": f1,
            },
            fh,
            indent=2,
        )

logger.info(f"✓ XGBoost trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/19_compare_models.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Compare all trained models.

Collects the per-model ``*_metrics.json`` files written by the training
scripts into a single DataFrame, ranks models by F1 score, and writes
``outputs/model_comparison.csv`` for the registration step (script 20).
"""
import json
import logging
import os

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Comparing models...")

# Gather every metrics file emitted by scripts 13-18.
results = []
for file in os.listdir("outputs/models"):
    if file.endswith("_metrics.json"):
        with open(f"outputs/models/{file}", "r") as f:
            results.append(json.load(f))

# Fail loudly (instead of a cryptic KeyError from sort_values on an empty
# frame) if no training script produced metrics.
if not results:
    raise SystemExit("No *_metrics.json files found in outputs/models - run the training scripts first.")

df = pd.DataFrame(results)
df = df.sort_values("f1_score", ascending=False)

df.to_csv("outputs/model_comparison.csv", index=False)

# Original literals had a raw line break inside the string (a syntax
# error); use explicit \n escapes instead.
logger.info("\nModel Comparison:")
logger.info(f"\n{df.to_string()}")
logger.info(f"\n✓ Best Model: {df.iloc[0]['model']} (F1: {df.iloc[0]['f1_score']:.4f})")
predictive-maintenance-pipeline/scripts/20_register_best_model.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Register the best model to the Hugging Face Hub.

Reads the ranking produced by scripts/19_compare_models.py, derives the
winning model's pickle filename, ensures the target model repo exists,
and uploads the pickle as ``best_model.pkl``.
"""
import logging
import os

import pandas as pd
from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; overridable via environment for forks.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
MODEL_NAME = os.getenv("MODEL_NAME", "engine-predictive-maintenance")
repo_id = f"{HF_USERNAME}/{MODEL_NAME}"

api = HfApi(token=HF_TOKEN)

# Top row of the comparison CSV is the best model (sorted by F1 in
# script 19). "Gradient Boosting" -> "gradient_boosting" matches the
# pickle filenames written by the training scripts.
# (Replaces the previous inline `__import__('pandas')` hack with a
# normal top-of-file import.)
comparison = pd.read_csv("outputs/model_comparison.csv")
best_model_name = comparison.iloc[0]['model'].lower().replace(' ', '_')

logger.info(f"Registering best model: {best_model_name}")

# Create the model repo only if it does not already exist.
try:
    api.repo_info(repo_id=repo_id, repo_type="model")
except RepositoryNotFoundError:
    create_repo(repo_id=repo_id, repo_type="model", token=HF_TOKEN)

# Upload the winning pickle under a stable name consumers can rely on.
api.upload_file(
    path_or_fileobj=f"models/{best_model_name}.pkl",
    path_in_repo="best_model.pkl",
    repo_id=repo_id,
    repo_type="model",
    token=HF_TOKEN
)

logger.info(f"✓ Model registered to Hugging Face: {repo_id}")