SharleyK commited on
Commit
9d8621a
·
verified ·
1 Parent(s): adf74a1

Upload folder using huggingface_hub

Browse files
.github/workflows/pipeline.yml ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# End-to-end ML pipeline: register data -> EDA -> prepare -> train -> register model.
name: Predictive Maintenance ML Pipeline

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

env:
  PYTHON_VERSION: '3.11'
  HF_USERNAME: 'SharleyK'
  DATASET_NAME: 'PredictiveMaintenance'
  MODEL_NAME: 'engine-predictive-maintenance'
  MLFLOW_TRACKING_URI: 'file:./mlruns'

jobs:
  data-registration:
    name: Data Registration
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Reuse the single source of truth declared in env (the hard-coded
          # bare 3.11 ignored PYTHON_VERSION; quoting also avoids YAML
          # float parsing for versions like 3.10).
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Create project folders
        run: |
          python scripts/01_create_folders.py

      - name: Register data to Hugging Face
        env:
          # BUG FIX: GitHub Actions expressions use ${{ ... }}; the original
          # ${ secrets.HF_TOKEN } was passed through as literal text, so the
          # scripts received a bogus token.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/02_register_data.py

      - name: Verify data registration
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/03_verify_data.py

  exploratory-analysis:
    name: Exploratory Data Analysis
    runs-on: ubuntu-latest
    needs: data-registration

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run EDA analysis
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/05_run_eda.py

  data-preparation:
    name: Data Preparation
    runs-on: ubuntu-latest
    needs: exploratory-analysis

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Clean, split, and upload run in ONE job so the intermediate CSVs
      # written to data/ survive between the scripts (runners are ephemeral).
      - name: Clean and prepare data
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/07_clean_data.py
          python scripts/09_train_test_split.py
          python scripts/10_upload_processed_data.py

  model-training:
    name: Model Training & Experimentation
    runs-on: ubuntu-latest
    needs: data-preparation

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Train all models
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/13_train_decision_tree.py
          python scripts/14_train_bagging.py
          python scripts/15_train_random_forest.py
          python scripts/16_train_adaboost.py
          python scripts/17_train_gradient_boosting.py
          python scripts/18_train_xgboost.py

      - name: Compare and register best model
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/19_compare_models.py
          python scripts/20_register_best_model.py
predictive-maintenance-pipeline.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:942403e9ef0aae01674d35092c2bf19c2367a99e0100bbdf70039ba3c1abb30b
3
+ size 16138
predictive-maintenance-pipeline/.github/workflows/pipeline.yml ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# End-to-end ML pipeline: register data -> EDA -> prepare -> train -> register model.
name: Predictive Maintenance ML Pipeline

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]
  workflow_dispatch:

env:
  PYTHON_VERSION: '3.11'
  HF_USERNAME: 'SharleyK'
  DATASET_NAME: 'PredictiveMaintenance'
  MODEL_NAME: 'engine-predictive-maintenance'
  MLFLOW_TRACKING_URI: 'file:./mlruns'

jobs:
  data-registration:
    name: Data Registration
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          # Reuse the single source of truth declared in env (the hard-coded
          # bare 3.11 ignored PYTHON_VERSION; quoting also avoids YAML
          # float parsing for versions like 3.10).
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Create project folders
        run: |
          python scripts/01_create_folders.py

      - name: Register data to Hugging Face
        env:
          # BUG FIX: GitHub Actions expressions use ${{ ... }}; the original
          # ${ secrets.HF_TOKEN } was passed through as literal text, so the
          # scripts received a bogus token.
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/02_register_data.py

      - name: Verify data registration
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/03_verify_data.py

  exploratory-analysis:
    name: Exploratory Data Analysis
    runs-on: ubuntu-latest
    needs: data-registration

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Run EDA analysis
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/05_run_eda.py

  data-preparation:
    name: Data Preparation
    runs-on: ubuntu-latest
    needs: exploratory-analysis

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      # Clean, split, and upload run in ONE job so the intermediate CSVs
      # written to data/ survive between the scripts (runners are ephemeral).
      - name: Clean and prepare data
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/07_clean_data.py
          python scripts/09_train_test_split.py
          python scripts/10_upload_processed_data.py

  model-training:
    name: Model Training & Experimentation
    runs-on: ubuntu-latest
    needs: data-preparation

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: ${{ env.PYTHON_VERSION }}

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Train all models
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/13_train_decision_tree.py
          python scripts/14_train_bagging.py
          python scripts/15_train_random_forest.py
          python scripts/16_train_adaboost.py
          python scripts/17_train_gradient_boosting.py
          python scripts/18_train_xgboost.py

      - name: Compare and register best model
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: |
          python scripts/19_compare_models.py
          python scripts/20_register_best_model.py
predictive-maintenance-pipeline/.gitignore ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *.so
5
+ .Python
6
+ venv/
7
+ env/
8
+
9
+ # Jupyter
10
+ .ipynb_checkpoints
11
+
12
+ # Data (optional - uncomment if you want to track data files)
13
+ # *.csv
14
+ # *.xlsx
15
+
16
+ # Models
17
+ models/*.pkl
18
+ !models/.gitkeep
19
+
20
+ # MLflow
21
+ mlruns/
22
+
23
+ # IDE
24
+ .vscode/
25
+ .idea/
26
+
27
+ # OS
28
+ .DS_Store
29
+ Thumbs.db
30
+
31
+ # Secrets
32
+ .env
33
+ *.token
predictive-maintenance-pipeline/README.md ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Predictive Maintenance ML Pipeline
2
+
3
+ Automated end-to-end machine learning pipeline for engine predictive maintenance using GitHub Actions.
4
+
5
+ ## Quick Start
6
+
7
+ ### 1. Prerequisites
8
+ - GitHub account
9
+ - Hugging Face account with token
10
+ - Python 3.11 (the version pinned in the CI workflow)
11
+
12
+ ### 2. Setup
13
+
14
+ **Add GitHub Secret:**
15
+ 1. Go to Repository Settings → Secrets → Actions
16
+ 2. Add secret: `HF_TOKEN` = your Hugging Face token
17
+
18
+ **Create Hugging Face Repositories:**
19
+ - Dataset: `SharleyK/PredictiveMaintenance`
20
+ - Model: `SharleyK/engine-predictive-maintenance`
21
+
22
+ ### 3. Usage
23
+
24
+ **Upload your data:**
25
+ ```bash
26
+ # Place engine_data.csv in data/ folder
27
+ ```
28
+
29
+ **Push to GitHub:**
30
+ ```bash
31
+ git add .
32
+ git commit -m "Initial commit"
33
+ git push origin main
34
+ ```
35
+
36
+ The pipeline will run automatically!
37
+
38
+ ## Pipeline Overview
39
+
40
+ 1. **Data Registration** - Upload data to Hugging Face
41
+ 2. **EDA** - Exploratory data analysis
42
+ 3. **Data Preparation** - Clean and split data
43
+ 4. **Model Training** - Train 6 models with MLflow
44
+ - Decision Tree
45
+ - Bagging
46
+ - Random Forest
47
+ - AdaBoost
48
+ - Gradient Boosting
49
+ - XGBoost
50
+ 5. **Model Registration** - Upload best model to HF
51
+
52
+ ## Project Structure
53
+
54
+ ```
55
+ predictive-maintenance-pipeline/
56
+ ├── .github/workflows/pipeline.yml # GitHub Actions workflow
57
+ ├── scripts/ # Python scripts
58
+ ├── data/ # Data files
59
+ ├── models/ # Trained models
60
+ ├── outputs/ # Results
61
+ └── requirements.txt # Dependencies
62
+ ```
63
+
64
+ ## Results
65
+
66
+ After pipeline execution:
67
+ - Data available at: `https://huggingface.co/datasets/SharleyK/PredictiveMaintenance`
68
+ - Model available at: `https://huggingface.co/SharleyK/engine-predictive-maintenance`
69
+ - MLflow experiments in `mlruns/` folder
70
+
71
+ ## Documentation
72
+
73
+ - Full implementation guide in repository
74
+ - MLflow UI: `mlflow ui --backend-store-uri file:./mlruns`
75
+
76
+ ## Support
77
+
78
+ For issues, create a GitHub issue in this repository.
predictive-maintenance-pipeline/requirements.txt ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas==2.0.3
2
+ numpy==1.24.3
3
+ scikit-learn==1.3.0
4
+ matplotlib==3.7.2
5
+ seaborn==0.12.2
6
+ plotly==5.15.0
7
+ xgboost==1.7.6
8
+ mlflow==2.5.0
9
+ huggingface-hub==0.16.4
10
+ datasets==2.14.0
11
+ joblib==1.3.1
12
+ python-dotenv==1.0.0
predictive-maintenance-pipeline/scripts/01_create_folders.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Create the project's folder skeleton (idempotent)."""
import os
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Directories the pipeline expects to exist; the nested outputs/* entries
# are listed explicitly so the script also works on an empty checkout.
REQUIRED_DIRS = ('data', 'models', 'outputs', 'outputs/eda', 'outputs/models', 'mlruns')

for path in REQUIRED_DIRS:
    # exist_ok makes reruns safe: an already-present directory is a no-op.
    os.makedirs(path, exist_ok=True)
    logger.info("✓ Created: %s", path)

logger.info("✓ Folder structure created successfully!")
predictive-maintenance-pipeline/scripts/02_register_data.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Register dataset to Hugging Face Hub"""
import os
import logging
from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Fail fast with a clear message rather than an opaque 401 from the Hub.
if not HF_TOKEN:
    raise ValueError("HF_TOKEN not set!")

logger.info(f"Registering dataset: {repo_id}")

api = HfApi(token=HF_TOKEN)

# Create the dataset repo only if it does not already exist (EAFP: probe
# with repo_info, create on RepositoryNotFoundError).
try:
    api.repo_info(repo_id=repo_id, repo_type="dataset")
    logger.info(f"✓ Repository exists: {repo_id}")
except RepositoryNotFoundError:
    create_repo(repo_id=repo_id, repo_type="dataset", token=HF_TOKEN, private=False)
    logger.info(f"✓ Created repository: {repo_id}")

# Upload the raw CSV when present in the local checkout; silently skipped
# otherwise (best-effort — the verify step catches a missing upload).
if os.path.exists("data/engine_data.csv"):
    api.upload_file(
        path_or_fileobj="data/engine_data.csv",
        path_in_repo="engine_data.csv",
        repo_id=repo_id,
        repo_type="dataset",
        token=HF_TOKEN
    )
    logger.info("✓ Uploaded engine_data.csv")

logger.info("✓ Data registration completed!")
predictive-maintenance-pipeline/scripts/03_verify_data.py ADDED
@@ -0,0 +1,27 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Verify data on Hugging Face"""
import os
import logging
import pandas as pd
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

logger.info(f"Verifying dataset: {repo_id}")

# Fetch the CSV that 02_register_data.py uploaded; hf_hub_download raises
# if the file or repo is missing, which fails this CI step as intended.
downloaded_file = hf_hub_download(
    repo_id=repo_id,
    repo_type="dataset",
    filename="engine_data.csv",
    token=HF_TOKEN)

# Smoke check: the file must parse as CSV; shape/columns go to the CI log.
df = pd.read_csv(downloaded_file)
logger.info(f"✓ Dataset shape: {df.shape}")
logger.info(f"✓ Columns: {list(df.columns)}")
logger.info("✓ Data verification completed!")
predictive-maintenance-pipeline/scripts/05_run_eda.py ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Run Exploratory Data Analysis.

Downloads the raw dataset from the Hugging Face Hub, then writes a
histogram + box plot per numeric feature and a correlation heatmap
to outputs/eda/.
"""
import os
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

logger.info("Starting EDA...")

# Download data
file_path = hf_hub_download(
    repo_id=repo_id,
    repo_type="dataset",
    filename="engine_data.csv",
    token=HF_TOKEN)

df = pd.read_csv(file_path)

os.makedirs("outputs/eda", exist_ok=True)

# Restrict to numeric columns once; reused by both the univariate plots
# and the correlation matrix below.
numeric_df = df.select_dtypes(include=[np.number])

# Univariate Analysis: histogram + box plot side by side for each feature.
logger.info("Creating univariate plots...")
for col in numeric_df.columns:
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
    df[col].hist(ax=ax1, bins=30)
    ax1.set_title(f'{col} Distribution')
    df.boxplot(column=col, ax=ax2)
    ax2.set_title(f'{col} Box Plot')
    plt.tight_layout()
    plt.savefig(f'outputs/eda/{col}_univariate.png')
    plt.close()

# Correlation Heatmap.
# BUG FIX: df.corr() raises in pandas >= 2.0 (requirements pin 2.0.3) when
# the frame contains non-numeric columns; correlate the numeric subset only.
logger.info("Creating correlation heatmap...")
plt.figure(figsize=(10, 8))
sns.heatmap(numeric_df.corr(), annot=True, cmap='coolwarm', fmt='.2f')
plt.title('Correlation Matrix')
plt.tight_layout()
plt.savefig('outputs/eda/correlation_heatmap.png')
plt.close()

logger.info("✓ EDA completed! Plots saved to outputs/eda/")
predictive-maintenance-pipeline/scripts/07_clean_data.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Clean and prepare data.

Downloads the raw dataset from the Hugging Face Hub, standardizes column
names, drops duplicates and missing rows, and writes data/cleaned_data.csv
for the downstream train/test split step.
"""
import os
import logging
import pandas as pd
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

logger.info("Cleaning data...")

# Download data
file_path = hf_hub_download(repo_id=repo_id, repo_type="dataset",
                            filename="engine_data.csv", token=HF_TOKEN)

df = pd.read_csv(file_path)

logger.info(f"Original shape: {df.shape}")
logger.info(f"Original columns: {df.columns.tolist()}")

# Standardize column names to lowercase with underscores so downstream
# scripts can rely on a fixed schema regardless of source naming.
df.columns = df.columns.str.lower().str.replace(' ', '_')
logger.info(f"Standardized columns: {df.columns.tolist()}")

# Verify the target column exists before doing any work that depends on it.
if 'engine_condition' not in df.columns:
    logger.error("Target column 'engine_condition' not found after standardization!")
    logger.error(f"Available columns: {df.columns.tolist()}")
    raise KeyError("Missing expected target column")

# Remove duplicates
initial_rows = df.shape[0]
df = df.drop_duplicates()
logger.info(f"After removing duplicates: {df.shape} (removed {initial_rows - df.shape[0]} rows)")

# Handle missing values (if any)
initial_rows = df.shape[0]
df = df.dropna()
logger.info(f"After dropping NA: {df.shape} (removed {initial_rows - df.shape[0]} rows)")

# Log target distribution.
# BUG FIX: the original f-string contained a literal newline inside a
# single-quoted string ("Target distribution:<newline>..."), which is a
# SyntaxError; use an explicit \n via lazy logging args instead.
logger.info("Target distribution:\n%s", df['engine_condition'].value_counts())

# Save cleaned data
os.makedirs("data", exist_ok=True)
df.to_csv("data/cleaned_data.csv", index=False)

logger.info("✓ Data cleaning completed!")
logger.info(f"✓ Final columns: {df.columns.tolist()}")
predictive-maintenance-pipeline/scripts/09_train_test_split.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Split data into train and test sets"""
import os
import logging
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Splitting data...")

# Load cleaned data (written by scripts/07_clean_data.py in the same job)
df = pd.read_csv("data/cleaned_data.csv")

# Separate features and target
X = df.drop('engine_condition', axis=1)
y = df['engine_condition']

# Split data; stratify keeps the class balance identical in both splits,
# and the fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

logger.info(f"Train shape: {X_train.shape}")
logger.info(f"Test shape: {X_test.shape}")

# Scale features: fit on the training split only, then apply the fitted
# transform to the test split (avoids data leakage).
# NOTE(review): the fitted scaler is not persisted, so inference-time
# inputs cannot be scaled identically later — consider joblib.dump'ing it.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Save as DataFrames (the scaler returns bare numpy arrays; restore the
# original column names and re-attach the untouched target column)
train_df = pd.DataFrame(X_train_scaled, columns=X.columns)
train_df['engine_condition'] = y_train.values

test_df = pd.DataFrame(X_test_scaled, columns=X.columns)
test_df['engine_condition'] = y_test.values

train_df.to_csv('data/train_scaled.csv', index=False)
test_df.to_csv('data/test_scaled.csv', index=False)

logger.info("✓ Train-test split completed!")
predictive-maintenance-pipeline/scripts/10_upload_processed_data.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Upload processed data to Hugging Face"""
import os
import logging
from huggingface_hub import HfApi

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

api = HfApi(token=HF_TOKEN)

logger.info("Uploading processed data...")

# Upload train data (written by scripts/09_train_test_split.py)
api.upload_file(
    path_or_fileobj="data/train_scaled.csv",
    path_in_repo="train_scaled.csv",
    repo_id=repo_id,
    repo_type="dataset",
    token=HF_TOKEN
)
logger.info("✓ Uploaded train_scaled.csv")

# Upload test data
api.upload_file(
    path_or_fileobj="data/test_scaled.csv",
    path_in_repo="test_scaled.csv",
    repo_id=repo_id,
    repo_type="dataset",
    token=HF_TOKEN
)
logger.info("✓ Uploaded test_scaled.csv")

logger.info("✓ Data upload completed!")
predictive-maintenance-pipeline/scripts/13_train_decision_tree.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train Decision Tree model with MLflow tracking"""
import os
import logging
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from huggingface_hub import hf_hub_download
import joblib
import json

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Set up MLflow (local file store; the same experiment name is shared by
# all trainer scripts so runs appear side by side)
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

logger.info("Loading data from Hugging Face...")

# Download train and test data (uploaded by scripts/10_upload_processed_data.py)
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset",
                             filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset",
                            filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training Decision Tree...")

# Hyperparameter search space for the tree.
param_grid = {
    'max_depth': [5, 10, 15, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

with mlflow.start_run(run_name="Decision_Tree"):
    mlflow.set_tag("model_type", "Decision Tree")

    # 5-fold grid search optimizing F1.
    # NOTE(review): scoring='f1' and the unqualified metric calls below use
    # the 'binary' average — assumes engine_condition is binary; confirm.
    dt_model = DecisionTreeClassifier(random_state=42)
    grid_search = GridSearchCV(dt_model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_

    # Log parameters
    mlflow.log_params(grid_search.best_params_)

    # Make predictions
    y_pred = best_model.predict(X_test)

    # Calculate metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Log metrics
    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    # Log model
    mlflow.sklearn.log_model(best_model, "model")

    # Save model locally
    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/decision_tree.pkl")

    # Save metrics (consumed later by scripts/19_compare_models.py)
    metrics = {
        "model": "Decision Tree",
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4)
    }

    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/decision_tree_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)

    logger.info(f"✓ Decision Tree trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/14_train_bagging.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train Bagging model with MLflow tracking.

Loads the scaled train/test splits from the Hugging Face Hub, grid-searches
a BaggingClassifier, logs params/metrics/model to MLflow, and saves the
fitted model plus a metrics JSON for the model-comparison step.
"""
import os
import json
import logging
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import BaggingClassifier
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Shared experiment so all trainer runs appear side by side.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

# Load data
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training Bagging...")

param_grid = {'n_estimators': [50, 100, 200], 'max_samples': [0.5, 0.7, 1.0]}

with mlflow.start_run(run_name="Bagging"):
    mlflow.set_tag("model_type", "Bagging")

    model = BaggingClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)

    # Evaluate on the held-out test split.
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(best_model, "model")

    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/bagging.pkl")

    # BUG FIX: this script imported json but never wrote the metrics file,
    # unlike 13_train_decision_tree.py; the comparison step
    # (scripts/19_compare_models.py) needs a per-model metrics JSON.
    metrics = {
        "model": "Bagging",
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4)
    }
    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/bagging_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)

    logger.info(f"✓ Bagging trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/15_train_random_forest.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train Random Forest model with MLflow tracking.

Loads the scaled train/test splits from the Hugging Face Hub, grid-searches
a RandomForestClassifier, logs params/metrics/model to MLflow, and saves the
fitted model plus a metrics JSON for the model-comparison step.
"""
import os
import json
import logging
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Shared experiment so all trainer runs appear side by side.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

# Load data
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training Random Forest...")

param_grid = {'n_estimators': [100, 200], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5]}

with mlflow.start_run(run_name="Random_Forest"):
    mlflow.set_tag("model_type", "Random Forest")

    model = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)

    # Evaluate on the held-out test split.
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(best_model, "model")

    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/random_forest.pkl")

    # BUG FIX: this script imported json but never wrote the metrics file,
    # unlike 13_train_decision_tree.py; the comparison step
    # (scripts/19_compare_models.py) needs a per-model metrics JSON.
    metrics = {
        "model": "Random Forest",
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4)
    }
    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/random_forest_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)

    logger.info(f"✓ Random Forest trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/16_train_adaboost.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train AdaBoost model with MLflow tracking.

Loads the scaled train/test splits from the Hugging Face Hub, grid-searches
an AdaBoostClassifier, logs params/metrics/model to MLflow, and saves the
fitted model plus a metrics JSON for the model-comparison step.
"""
import os
import json
import logging
import joblib
import pandas as pd
import mlflow
import mlflow.sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import AdaBoostClassifier
from huggingface_hub import hf_hub_download

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; env vars are injected by the GitHub Actions workflow.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Shared experiment so all trainer runs appear side by side.
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

# Load data
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training AdaBoost...")

param_grid = {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]}

with mlflow.start_run(run_name="AdaBoost"):
    mlflow.set_tag("model_type", "AdaBoost")

    model = AdaBoostClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)

    # Evaluate on the held-out test split.
    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(best_model, "model")

    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/adaboost.pkl")

    # BUG FIX: this script imported json but never wrote the metrics file,
    # unlike 13_train_decision_tree.py; the comparison step
    # (scripts/19_compare_models.py) needs a per-model metrics JSON.
    metrics = {
        "model": "AdaBoost",
        "accuracy": round(accuracy, 4),
        "precision": round(precision, 4),
        "recall": round(recall, 4),
        "f1_score": round(f1, 4)
    }
    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/adaboost_metrics.json", "w") as f:
        json.dump(metrics, f, indent=4)

    logger.info(f"✓ AdaBoost trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/17_train_gradient_boosting.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train a Gradient Boosting model with MLflow tracking.

Downloads the pre-scaled train/test splits from the Hugging Face dataset
repo, grid-searches a GradientBoostingClassifier, logs params/metrics to
MLflow, and saves both the fitted model and a metrics JSON that the
model-comparison step (scripts/19_compare_models.py) consumes.
"""
import json
import logging
import os

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from huggingface_hub import hf_hub_download
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hugging Face dataset coordinates; overridable via environment for forks.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Local file-backed MLflow store (matches the workflow's MLFLOW_TRACKING_URI).
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

# Load the already-scaled splits registered earlier in the pipeline.
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training Gradient Boosting...")

param_grid = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5]}

with mlflow.start_run(run_name="Gradient_Boosting"):
    mlflow.set_tag("model_type", "Gradient Boosting")

    model = GradientBoostingClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)

    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(best_model, "model")

    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/gradient_boosting.pkl")

    # Persist metrics where scripts/19_compare_models.py looks for them
    # (outputs/models/*_metrics.json). Previously `json` was imported but
    # never used and no metrics file was written, so model comparison
    # found nothing. "Gradient Boosting" lowercased/underscored by script
    # 20 matches the models/gradient_boosting.pkl filename above.
    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/gradient_boosting_metrics.json", "w") as fh:
        json.dump(
            {
                "model": "Gradient Boosting",
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1_score": f1,
            },
            fh,
            indent=2,
        )

logger.info(f"✓ Gradient Boosting trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/18_train_xgboost.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Train an XGBoost model with MLflow tracking.

Downloads the pre-scaled train/test splits from the Hugging Face dataset
repo, grid-searches an XGBClassifier, logs params/metrics to MLflow, and
saves both the fitted model and a metrics JSON that the model-comparison
step (scripts/19_compare_models.py) consumes.
"""
import json
import logging
import os

import joblib
import mlflow
import mlflow.sklearn
import pandas as pd
from huggingface_hub import hf_hub_download
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV
from xgboost import XGBClassifier

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hugging Face dataset coordinates; overridable via environment for forks.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
DATASET_NAME = os.getenv("DATASET_NAME", "PredictiveMaintenance")
repo_id = f"{HF_USERNAME}/{DATASET_NAME}"

# Local file-backed MLflow store (matches the workflow's MLFLOW_TRACKING_URI).
mlflow.set_tracking_uri("file:./mlruns")
mlflow.set_experiment("Predictive_Maintenance")

# Load the already-scaled splits registered earlier in the pipeline.
train_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="train_scaled.csv", token=HF_TOKEN)
test_file = hf_hub_download(repo_id=repo_id, repo_type="dataset", filename="test_scaled.csv", token=HF_TOKEN)

train_df = pd.read_csv(train_file)
test_df = pd.read_csv(test_file)

X_train = train_df.drop('engine_condition', axis=1)
y_train = train_df['engine_condition']
X_test = test_df.drop('engine_condition', axis=1)
y_test = test_df['engine_condition']

logger.info("Training XGBoost...")

param_grid = {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1], 'max_depth': [3, 5, 7]}

with mlflow.start_run(run_name="XGBoost"):
    mlflow.set_tag("model_type", "XGBoost")

    model = XGBClassifier(random_state=42)
    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='f1', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    mlflow.log_params(grid_search.best_params_)

    y_pred = best_model.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    mlflow.log_metric("accuracy", accuracy)
    mlflow.log_metric("precision", precision)
    mlflow.log_metric("recall", recall)
    mlflow.log_metric("f1_score", f1)

    mlflow.sklearn.log_model(best_model, "model")

    os.makedirs("models", exist_ok=True)
    joblib.dump(best_model, "models/xgboost.pkl")

    # Persist metrics where scripts/19_compare_models.py looks for them
    # (outputs/models/*_metrics.json). Previously `json` was imported but
    # never used and no metrics file was written, so model comparison
    # found nothing. "XGBoost" lowercased by script 20 matches the
    # models/xgboost.pkl filename above.
    os.makedirs("outputs/models", exist_ok=True)
    with open("outputs/models/xgboost_metrics.json", "w") as fh:
        json.dump(
            {
                "model": "XGBoost",
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall,
                "f1_score": f1,
            },
            fh,
            indent=2,
        )

logger.info(f"✓ XGBoost trained! F1-Score: {f1:.4f}")
predictive-maintenance-pipeline/scripts/19_compare_models.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Compare all trained models.

Collects the per-model ``*_metrics.json`` files written by the training
scripts into a single DataFrame, ranks models by F1 score, and writes
``outputs/model_comparison.csv`` for the registration step (script 20).
"""
import json
import logging
import os

import pandas as pd

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

logger.info("Comparing models...")

# Gather every metrics file emitted by scripts 13-18.
results = []
for file in os.listdir("outputs/models"):
    if file.endswith("_metrics.json"):
        with open(f"outputs/models/{file}", "r") as f:
            results.append(json.load(f))

# Fail loudly (instead of a cryptic KeyError from sort_values on an empty
# frame) if no training script produced metrics.
if not results:
    raise SystemExit("No *_metrics.json files found in outputs/models - run the training scripts first.")

df = pd.DataFrame(results)
df = df.sort_values("f1_score", ascending=False)

df.to_csv("outputs/model_comparison.csv", index=False)

# Original literals had a raw line break inside the string (a syntax
# error); use explicit \n escapes instead.
logger.info("\nModel Comparison:")
logger.info(f"\n{df.to_string()}")
logger.info(f"\n✓ Best Model: {df.iloc[0]['model']} (F1: {df.iloc[0]['f1_score']:.4f})")
predictive-maintenance-pipeline/scripts/20_register_best_model.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env python3
"""Register the best model to the Hugging Face Hub.

Reads the ranking produced by scripts/19_compare_models.py, derives the
winning model's pickle filename, ensures the target model repo exists,
and uploads the pickle as ``best_model.pkl``.
"""
import logging
import os

import pandas as pd
from huggingface_hub import HfApi, create_repo
from huggingface_hub.utils import RepositoryNotFoundError

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hub coordinates; overridable via environment for forks.
HF_TOKEN = os.getenv("HF_TOKEN")
HF_USERNAME = os.getenv("HF_USERNAME", "SharleyK")
MODEL_NAME = os.getenv("MODEL_NAME", "engine-predictive-maintenance")
repo_id = f"{HF_USERNAME}/{MODEL_NAME}"

api = HfApi(token=HF_TOKEN)

# Top row of the comparison CSV is the best model (sorted by F1 in
# script 19). "Gradient Boosting" -> "gradient_boosting" matches the
# pickle filenames written by the training scripts.
# (Replaces the previous inline `__import__('pandas')` hack with a
# normal top-of-file import.)
comparison = pd.read_csv("outputs/model_comparison.csv")
best_model_name = comparison.iloc[0]['model'].lower().replace(' ', '_')

logger.info(f"Registering best model: {best_model_name}")

# Create the model repo only if it does not already exist.
try:
    api.repo_info(repo_id=repo_id, repo_type="model")
except RepositoryNotFoundError:
    create_repo(repo_id=repo_id, repo_type="model", token=HF_TOKEN)

# Upload the winning pickle under a stable name consumers can rely on.
api.upload_file(
    path_or_fileobj=f"models/{best_model_name}.pkl",
    path_in_repo="best_model.pkl",
    repo_id=repo_id,
    repo_type="model",
    token=HF_TOKEN
)

logger.info(f"✓ Model registered to Hugging Face: {repo_id}")