Upload folder using huggingface_hub
Browse files- .gitignore +63 -0
- APP_INTERFACE_GUIDE.md +182 -0
- CONFIGURATION_GUIDE.md +153 -0
- Dockerfile +21 -0
- GITHUB_REPO_INFO.md +71 -0
- MANUAL_VS_AUTOMATIC.md +235 -0
- Notebook_AML_and_MLOps_Project (2).ipynb +1808 -0
- README.md +120 -10
- UPLOAD_GUIDE.md +223 -0
- USERNAME_SUMMARY.md +69 -0
- WHAT_TO_UPLOAD.md +265 -0
- requirements.txt +11 -0
- src/app.py +387 -0
- src/config.py +89 -0
- src/data_prep.py +154 -0
- src/data_register.py +59 -0
- src/deploy_to_hf.py +68 -0
- src/eda.py +85 -0
- src/hf_data_utils.py +143 -0
- src/hf_model_utils.py +91 -0
- src/inference.py +97 -0
- src/train.py +202 -0
.gitignore
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
*.egg-info/
|
| 8 |
+
dist/
|
| 9 |
+
build/
|
| 10 |
+
|
| 11 |
+
# Virtual Environment
|
| 12 |
+
.venv/
|
| 13 |
+
venv/
|
| 14 |
+
ENV/
|
| 15 |
+
env/
|
| 16 |
+
|
| 17 |
+
# Environment variables and secrets
|
| 18 |
+
.env
|
| 19 |
+
.env.local
|
| 20 |
+
.env.*.local
|
| 21 |
+
*.env
|
| 22 |
+
secrets/
|
| 23 |
+
*.key
|
| 24 |
+
*.pem
|
| 25 |
+
|
| 26 |
+
# MLflow
|
| 27 |
+
mlruns/
|
| 28 |
+
.mlflow/
|
| 29 |
+
|
| 30 |
+
# IDE
|
| 31 |
+
.vscode/
|
| 32 |
+
.idea/
|
| 33 |
+
*.swp
|
| 34 |
+
*.swo
|
| 35 |
+
*~
|
| 36 |
+
|
| 37 |
+
# OS
|
| 38 |
+
.DS_Store
|
| 39 |
+
Thumbs.db
|
| 40 |
+
|
| 41 |
+
# Logs
|
| 42 |
+
*.log
|
| 43 |
+
*.out
|
| 44 |
+
|
| 45 |
+
# Jupyter Notebook checkpoints
|
| 46 |
+
.ipynb_checkpoints/
|
| 47 |
+
|
| 48 |
+
# Model files (optional - uncomment if you don't want to track large model files)
|
| 49 |
+
# models/*.joblib
|
| 50 |
+
# models/*.pkl
|
| 51 |
+
|
| 52 |
+
# Data files (optional - uncomment if data is too large)
|
| 53 |
+
# data/*.csv
|
| 54 |
+
# data/processed/*.csv
|
| 55 |
+
|
| 56 |
+
# Temporary files
|
| 57 |
+
*.tmp
|
| 58 |
+
*.bak
|
| 59 |
+
*.cache
|
| 60 |
+
|
| 61 |
+
# Hugging Face tokens (never commit these!)
|
| 62 |
+
**/token*
|
| 63 |
+
**/*token*
|
APP_INTERFACE_GUIDE.md
ADDED
|
@@ -0,0 +1,182 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Streamlit App Interface Guide
|
| 2 |
+
|
| 3 |
+
## 🚀 Accessing the App
|
| 4 |
+
|
| 5 |
+
The Streamlit app should now be running. Open your web browser and navigate to:
|
| 6 |
+
|
| 7 |
+
**http://localhost:8501**
|
| 8 |
+
|
| 9 |
+
If port 8501 is busy, Streamlit will automatically use the next available port (8502, 8503, etc.). Check the terminal output for the exact URL.
|
| 10 |
+
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
## 📱 Interface Overview
|
| 14 |
+
|
| 15 |
+
### **Main Page Layout**
|
| 16 |
+
|
| 17 |
+
```
|
| 18 |
+
┌─────────────────────────────────────────────────────────┐
|
| 19 |
+
│ 🛠️ Engine Predictive Maintenance – Failure Prediction │
|
| 20 |
+
├─────────────────────────────────────────────────────────┤
|
| 21 |
+
│ │
|
| 22 |
+
│ This app predicts whether an engine is operating │
|
| 23 |
+
│ normally (0) or requires maintenance / at risk of │
|
| 24 |
+
│ failure (1) based on real-time sensor readings. │
|
| 25 |
+
│ │
|
| 26 |
+
│ Adjust the sensor values below and click Predict to │
|
| 27 |
+
│ see the model's classification and the probability of │
|
| 28 |
+
│ a potential fault. │
|
| 29 |
+
│ │
|
| 30 |
+
├─────────────────────────────────────────────────────────┤
|
| 31 |
+
│ Input Sensor Readings │
|
| 32 |
+
├─────────────────────────────────────────────────────────┤
|
| 33 |
+
│ │
|
| 34 |
+
│ ┌─────────────────────┐ ┌─────────────────────┐ │
|
| 35 |
+
│ │ Engine RPM │ │ Coolant Pressure │ │
|
| 36 |
+
│ │ [800.0] │ │ [2.0] │ │
|
| 37 |
+
│ │ │ │ │ │
|
| 38 |
+
│ │ Lub Oil Pressure │ │ Lub Oil Temperature │ │
|
| 39 |
+
│ │ [3.0] │ │ [80.0] │ │
|
| 40 |
+
│ │ │ │ │ │
|
| 41 |
+
│ │ Fuel Pressure │ │ Coolant Temperature │ │
|
| 42 |
+
│ │ [10.0] │ │ [80.0] │ │
|
| 43 |
+
│ └─────────────────────┘ └─────────────────────┘ │
|
| 44 |
+
│ │
|
| 45 |
+
│ [Predict Button] │
|
| 46 |
+
│ │
|
| 47 |
+
└─────────────────────────────────────────────────────────┘
|
| 48 |
+
|
| 49 |
+
┌─────────────────────────────────────────────────────────┐
|
| 50 |
+
│ Sidebar: Model Source │
|
| 51 |
+
├─────────────────────────────────────────────────────────┤
|
| 52 |
+
│ Load model from: │
|
| 53 |
+
│ ○ Hugging Face Hub │
|
| 54 |
+
│ ● Local file │
|
| 55 |
+
│ │
|
| 56 |
+
│ Note: On Hugging Face Spaces, the model is typically │
|
| 57 |
+
│ loaded from the model hub. Locally, you can choose │
|
| 58 |
+
│ either source as long as you have run the training │
|
| 59 |
+
│ pipeline or configured your HF token. │
|
| 60 |
+
└─────────────────────────────────────────────────────────┘
|
| 61 |
+
```
|
| 62 |
+
|
| 63 |
+
---
|
| 64 |
+
|
| 65 |
+
## 🎯 How to Use the Interface
|
| 66 |
+
|
| 67 |
+
### **Step 1: Select Model Source (Sidebar)**
|
| 68 |
+
- Choose **"Local file"** if you've run `python src/train.py` locally
|
| 69 |
+
- Choose **"Hugging Face Hub"** if you've uploaded the model to HF and set `HF_TOKEN`
|
| 70 |
+
|
| 71 |
+
### **Step 2: Enter Sensor Values**
|
| 72 |
+
|
| 73 |
+
Adjust the 6 sensor inputs:
|
| 74 |
+
|
| 75 |
+
1. **Engine RPM** (0-4000)
|
| 76 |
+
- Default: 800.0
|
| 77 |
+
- Typical range: 500-2000 RPM
|
| 78 |
+
|
| 79 |
+
2. **Lub Oil Pressure** (0-10 bar/kPa)
|
| 80 |
+
- Default: 3.0
|
| 81 |
+
- Typical range: 2-5 bar
|
| 82 |
+
|
| 83 |
+
3. **Fuel Pressure** (0-30 bar/kPa)
|
| 84 |
+
- Default: 10.0
|
| 85 |
+
- Typical range: 5-20 bar
|
| 86 |
+
|
| 87 |
+
4. **Coolant Pressure** (0-10 bar/kPa)
|
| 88 |
+
- Default: 2.0
|
| 89 |
+
- Typical range: 1-4 bar
|
| 90 |
+
|
| 91 |
+
5. **Lub Oil Temperature** (0-150°C)
|
| 92 |
+
- Default: 80.0
|
| 93 |
+
- Typical range: 70-90°C
|
| 94 |
+
|
| 95 |
+
6. **Coolant Temperature** (0-150°C)
|
| 96 |
+
- Default: 80.0
|
| 97 |
+
- Typical range: 70-90°C
|
| 98 |
+
|
| 99 |
+
### **Step 3: Click "Predict"**
|
| 100 |
+
|
| 101 |
+
After clicking the **Predict** button, you'll see one of two results:
|
| 102 |
+
|
| 103 |
+
#### ✅ **Normal Operation**
|
| 104 |
+
```
|
| 105 |
+
✅ The engine is LIKELY OPERATING NORMALLY (probability of fault X.XX%).
|
| 106 |
+
```
|
| 107 |
+
|
| 108 |
+
#### 🚨 **Faulty / Requires Maintenance**
|
| 109 |
+
```
|
| 110 |
+
🚨 The engine is LIKELY FAULTY / REQUIRES MAINTENANCE (probability XX.XX%).
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
---
|
| 114 |
+
|
| 115 |
+
## 📊 Example Predictions
|
| 116 |
+
|
| 117 |
+
### **Example 1: Normal Engine**
|
| 118 |
+
- Engine RPM: 1200
|
| 119 |
+
- Lub Oil Pressure: 3.5
|
| 120 |
+
- Fuel Pressure: 12.0
|
| 121 |
+
- Coolant Pressure: 2.5
|
| 122 |
+
- Lub Oil Temperature: 82.0
|
| 123 |
+
- Coolant Temperature: 85.0
|
| 124 |
+
- **Result**: ✅ Normal operation (low fault probability)
|
| 125 |
+
|
| 126 |
+
### **Example 2: Faulty Engine**
|
| 127 |
+
- Engine RPM: 400
|
| 128 |
+
- Lub Oil Pressure: 1.5
|
| 129 |
+
- Fuel Pressure: 5.0
|
| 130 |
+
- Coolant Pressure: 1.0
|
| 131 |
+
- Lub Oil Temperature: 95.0
|
| 132 |
+
- Coolant Temperature: 100.0
|
| 133 |
+
- **Result**: 🚨 Requires maintenance (high fault probability)
|
| 134 |
+
|
| 135 |
+
---
|
| 136 |
+
|
| 137 |
+
## 🔧 Troubleshooting
|
| 138 |
+
|
| 139 |
+
### **App Not Loading?**
|
| 140 |
+
1. Check terminal for errors
|
| 141 |
+
2. Verify port 8501 is available: `lsof -ti:8501`
|
| 142 |
+
3. Try a different port: `streamlit run src/app.py --server.port 8502`
|
| 143 |
+
|
| 144 |
+
### **Model Not Found Error?**
|
| 145 |
+
1. **For Local**: Run `python src/train.py` first to create `models/best_model.joblib`
|
| 146 |
+
2. **For HF**: Set `HF_TOKEN` and `HF_MODEL_REPO` environment variables
|
| 147 |
+
|
| 148 |
+
### **Import Errors?**
|
| 149 |
+
1. Activate virtual environment: `source .venv/bin/activate`
|
| 150 |
+
2. Install dependencies: `pip install -r requirements.txt`
|
| 151 |
+
|
| 152 |
+
---
|
| 153 |
+
|
| 154 |
+
## 🎨 Interface Features
|
| 155 |
+
|
| 156 |
+
- **Clean, centered layout** for easy input
|
| 157 |
+
- **Two-column form** for organized sensor inputs
|
| 158 |
+
- **Real-time prediction** with probability scores
|
| 159 |
+
- **Color-coded results**: Green for normal, Red for faulty
|
| 160 |
+
- **Sidebar model selection** for flexibility
|
| 161 |
+
- **Responsive design** that works on different screen sizes
|
| 162 |
+
|
| 163 |
+
---
|
| 164 |
+
|
| 165 |
+
## 📸 Screenshots for Your Report
|
| 166 |
+
|
| 167 |
+
When documenting this in your final report, you can:
|
| 168 |
+
1. Take a screenshot of the input form
|
| 169 |
+
2. Take a screenshot showing a "Normal" prediction
|
| 170 |
+
3. Take a screenshot showing a "Faulty" prediction
|
| 171 |
+
4. Include the URL: `http://localhost:8501` (or your deployed HF Space URL)
|
| 172 |
+
|
| 173 |
+
---
|
| 174 |
+
|
| 175 |
+
## 🚀 Next Steps
|
| 176 |
+
|
| 177 |
+
1. **Test the app locally** with different sensor values
|
| 178 |
+
2. **Deploy to Hugging Face Space** using `python src/deploy_to_hf.py`
|
| 179 |
+
3. **Include screenshots** in your final report/notebook
|
| 180 |
+
4. **Document the interface** in your submission
|
| 181 |
+
|
| 182 |
+
The app is ready to use! 🎉
|
CONFIGURATION_GUIDE.md
ADDED
|
@@ -0,0 +1,153 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration Guide: Hugging Face & GitHub Setup
|
| 2 |
+
|
| 3 |
+
This guide shows you exactly where to configure your Hugging Face Space details and GitHub repository information.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## 1. Hugging Face Configuration
|
| 8 |
+
|
| 9 |
+
### Option A: Update `src/config.py` (Recommended for Local Testing)
|
| 10 |
+
|
| 11 |
+
Edit `/Users/ananttripathi/Desktop/mlops/src/config.py` and replace the placeholder values:
|
| 12 |
+
|
| 13 |
+
```python
|
| 14 |
+
# Lines 58-62 in src/config.py
|
| 15 |
+
HF_DATASET_REPO = os.getenv(
|
| 16 |
+
"HF_DATASET_REPO", "your-username/engine-maintenance-dataset" # <-- Replace "your-username"
|
| 17 |
+
)
|
| 18 |
+
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "your-username/engine-maintenance-model") # <-- Replace
|
| 19 |
+
HF_SPACE_REPO = os.getenv("HF_SPACE_REPO", "your-username/engine-maintenance-space") # <-- Replace
|
| 20 |
+
```
|
| 21 |
+
|
| 22 |
+
**Example** (HF username: `ananttripathiak`, GitHub username: `ananttripathi`):
|
| 23 |
+
```python
|
| 24 |
+
HF_DATASET_REPO = os.getenv(
|
| 25 |
+
"HF_DATASET_REPO", "ananttripathiak/engine-maintenance-dataset"
|
| 26 |
+
)
|
| 27 |
+
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "ananttripathiak/engine-maintenance-model")
|
| 28 |
+
HF_SPACE_REPO = os.getenv("HF_SPACE_REPO", "ananttripathiak/engine-maintenance-space")
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
### Option B: Set Environment Variables (For Local Testing)
|
| 32 |
+
|
| 33 |
+
Instead of editing `config.py`, you can export these in your terminal:
|
| 34 |
+
|
| 35 |
+
```bash
|
| 36 |
+
export HF_TOKEN="hf_your_token_here"
|
| 37 |
+
export HF_DATASET_REPO="your-username/engine-maintenance-dataset"
|
| 38 |
+
export HF_MODEL_REPO="your-username/engine-maintenance-model"
|
| 39 |
+
export HF_SPACE_REPO="your-username/engine-maintenance-space"
|
| 40 |
+
```
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## 2. GitHub Repository Configuration
|
| 45 |
+
|
| 46 |
+
### A. GitHub Secrets (Required for GitHub Actions)
|
| 47 |
+
|
| 48 |
+
Go to your GitHub repository → **Settings** → **Secrets and variables** → **Actions** → **New repository secret**
|
| 49 |
+
|
| 50 |
+
Add these 4 secrets:
|
| 51 |
+
|
| 52 |
+
1. **`HF_TOKEN`**
|
| 53 |
+
- Value: Your Hugging Face access token (get it from https://huggingface.co/settings/tokens)
|
| 54 |
+
- Example: `hf_xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx`
|
| 55 |
+
|
| 56 |
+
2. **`HF_DATASET_REPO`**
|
| 57 |
+
- Value: Your Hugging Face dataset repo ID
|
| 58 |
+
- Example: `ananttripathiak/engine-maintenance-dataset`
|
| 59 |
+
|
| 60 |
+
3. **`HF_MODEL_REPO`**
|
| 61 |
+
- Value: Your Hugging Face model repo ID
|
| 62 |
+
- Example: `ananttripathiak/engine-maintenance-model`
|
| 63 |
+
|
| 64 |
+
4. **`HF_SPACE_REPO`**
|
| 65 |
+
- Value: Your Hugging Face Space repo ID
|
| 66 |
+
- Example: `ananttripathiak/engine-maintenance-space`
|
| 67 |
+
|
| 68 |
+
**Note:** The GitHub Actions workflow (`.github/workflows/pipeline.yml`) automatically reads these secrets. No code changes needed!
|
| 69 |
+
|
| 70 |
+
### B. Update README.md with Your GitHub Repo URL
|
| 71 |
+
|
| 72 |
+
Edit `/Users/ananttripathi/Desktop/mlops/README.md` and add your GitHub repository link:
|
| 73 |
+
|
| 74 |
+
```markdown
|
| 75 |
+
## GitHub Repository
|
| 76 |
+
|
| 77 |
+
- **Repository URL**: https://github.com/your-username/engine-predictive-maintenance
|
| 78 |
+
- **GitHub Actions**: https://github.com/your-username/engine-predictive-maintenance/actions
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
---
|
| 82 |
+
|
| 83 |
+
## 3. Quick Setup Checklist
|
| 84 |
+
|
| 85 |
+
- [ ] **Hugging Face Account**: Create account at https://huggingface.co
|
| 86 |
+
- [ ] **HF Access Token**: Generate at https://huggingface.co/settings/tokens (needs `write` permission)
|
| 87 |
+
- [ ] **Update `src/config.py`**: Replace `"username"` with your actual HF username
|
| 88 |
+
- [ ] **Create HF Repos** (optional - scripts will create them automatically):
|
| 89 |
+
- Dataset repo: `your-username/engine-maintenance-dataset`
|
| 90 |
+
- Model repo: `your-username/engine-maintenance-model`
|
| 91 |
+
- Space repo: `your-username/engine-maintenance-space`
|
| 92 |
+
- [ ] **GitHub Repository**: Create a new repo and push this `mlops` folder
|
| 93 |
+
- [ ] **GitHub Secrets**: Add the 4 secrets listed above in your GitHub repo settings
|
| 94 |
+
- [ ] **Test Locally**: Run `python src/data_register.py` to verify HF connection
|
| 95 |
+
|
| 96 |
+
---
|
| 97 |
+
|
| 98 |
+
## 4. Testing Your Configuration
|
| 99 |
+
|
| 100 |
+
### Test Local Configuration
|
| 101 |
+
|
| 102 |
+
```bash
|
| 103 |
+
cd /Users/ananttripathi/Desktop/mlops
|
| 104 |
+
source .venv/bin/activate
|
| 105 |
+
|
| 106 |
+
# Set your HF token (if not in config.py)
|
| 107 |
+
export HF_TOKEN="hf_your_token_here"
|
| 108 |
+
|
| 109 |
+
# Test data registration
|
| 110 |
+
python src/data_register.py
|
| 111 |
+
|
| 112 |
+
# Test data preparation
|
| 113 |
+
python src/data_prep.py
|
| 114 |
+
|
| 115 |
+
# Test model training
|
| 116 |
+
python src/train.py
|
| 117 |
+
|
| 118 |
+
# Test deployment
|
| 119 |
+
python src/deploy_to_hf.py
|
| 120 |
+
```
|
| 121 |
+
|
| 122 |
+
### Test GitHub Actions
|
| 123 |
+
|
| 124 |
+
1. Push your code to GitHub:
|
| 125 |
+
```bash
|
| 126 |
+
git add .
|
| 127 |
+
git commit -m "Initial commit: Predictive maintenance MLOps pipeline"
|
| 128 |
+
git push origin main
|
| 129 |
+
```
|
| 130 |
+
|
| 131 |
+
2. Go to your GitHub repo → **Actions** tab
|
| 132 |
+
3. You should see the "Predictive Maintenance Pipeline" workflow running
|
| 133 |
+
4. All 4 jobs should complete successfully (green checkmarks)
|
| 134 |
+
|
| 135 |
+
---
|
| 136 |
+
|
| 137 |
+
## 5. Where Each Configuration is Used
|
| 138 |
+
|
| 139 |
+
| Configuration | Used In | Purpose |
|
| 140 |
+
|--------------|---------|---------|
|
| 141 |
+
| `HF_DATASET_REPO` | `src/data_register.py`, `src/data_prep.py`, `src/train.py` | Dataset storage and retrieval |
|
| 142 |
+
| `HF_MODEL_REPO` | `src/train.py`, `src/inference.py`, `src/app.py` | Model storage and loading |
|
| 143 |
+
| `HF_SPACE_REPO` | `src/deploy_to_hf.py` | Streamlit app deployment |
|
| 144 |
+
| `HF_TOKEN` | All HF-related scripts | Authentication |
|
| 145 |
+
| GitHub Secrets | `.github/workflows/pipeline.yml` | CI/CD automation |
|
| 146 |
+
|
| 147 |
+
---
|
| 148 |
+
|
| 149 |
+
## Need Help?
|
| 150 |
+
|
| 151 |
+
- **Hugging Face Docs**: https://huggingface.co/docs/hub
|
| 152 |
+
- **GitHub Actions Docs**: https://docs.github.com/en/actions
|
| 153 |
+
- **GitHub Secrets**: https://docs.github.com/en/actions/security-guides/encrypted-secrets
|
Dockerfile
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
ENV PYTHONUNBUFFERED=1 \
|
| 6 |
+
PIP_NO_CACHE_DIR=1
|
| 7 |
+
|
| 8 |
+
# Install system dependencies (if needed for some Python packages)
|
| 9 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 10 |
+
build-essential \
|
| 11 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 12 |
+
|
| 13 |
+
COPY requirements.txt .
|
| 14 |
+
RUN pip install --upgrade pip && pip install -r requirements.txt
|
| 15 |
+
|
| 16 |
+
COPY . .
|
| 17 |
+
|
| 18 |
+
EXPOSE 7860
|
| 19 |
+
|
| 20 |
+
CMD ["streamlit", "run", "src/app.py", "--server.port", "7860", "--server.address", "0.0.0.0"]
|
| 21 |
+
|
GITHUB_REPO_INFO.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GitHub Repository Information
|
| 2 |
+
|
| 3 |
+
## Repository Details
|
| 4 |
+
|
| 5 |
+
- **GitHub Username**: `ananttripathi`
|
| 6 |
+
- **Repository Name**: `engine-predictive-maintenance` (or your chosen name)
|
| 7 |
+
- **Full Repository URL**: `https://github.com/ananttripathi/engine-predictive-maintenance`
|
| 8 |
+
|
| 9 |
+
## Where to Add This Information
|
| 10 |
+
|
| 11 |
+
### 1. In Your Final Notebook/Report
|
| 12 |
+
|
| 13 |
+
Add a section like this:
|
| 14 |
+
|
| 15 |
+
```markdown
|
| 16 |
+
## GitHub Repository
|
| 17 |
+
|
| 18 |
+
- **Repository URL**: https://github.com/ananttripathi/engine-predictive-maintenance
|
| 19 |
+
- **GitHub Actions Workflow**: https://github.com/ananttripathi/engine-predictive-maintenance/actions
|
| 20 |
+
- **Repository Structure**: [Screenshot of folder structure]
|
| 21 |
+
- **Workflow Execution**: [Screenshot of successful pipeline runs]
|
| 22 |
+
```
|
| 23 |
+
|
| 24 |
+
### 2. In README.md
|
| 25 |
+
|
| 26 |
+
The README already includes a placeholder. After you create the repo, update it with your actual repo name.
|
| 27 |
+
|
| 28 |
+
### 3. GitHub Actions Workflow
|
| 29 |
+
|
| 30 |
+
**No changes needed!** The `.github/workflows/pipeline.yml` file automatically runs in the context of whatever repository it's pushed to. It doesn't need the repo URL hardcoded.
|
| 31 |
+
|
| 32 |
+
## Steps to Set Up GitHub Repository
|
| 33 |
+
|
| 34 |
+
1. **Create the repository on GitHub:**
|
| 35 |
+
- Go to https://github.com/new
|
| 36 |
+
- Repository name: `engine-predictive-maintenance` (or your choice)
|
| 37 |
+
- Description: "Predictive Maintenance MLOps Pipeline for Engine Failure Classification"
|
| 38 |
+
- Choose Public or Private
|
| 39 |
+
- **Don't** initialize with README, .gitignore, or license (we already have files)
|
| 40 |
+
|
| 41 |
+
2. **Push your code:**
|
| 42 |
+
```bash
|
| 43 |
+
cd /Users/ananttripathi/Desktop/mlops
|
| 44 |
+
git init
|
| 45 |
+
git add .
|
| 46 |
+
git commit -m "Initial commit: Predictive maintenance MLOps pipeline"
|
| 47 |
+
git branch -M main
|
| 48 |
+
git remote add origin https://github.com/ananttripathi/engine-predictive-maintenance.git
|
| 49 |
+
git push -u origin main
|
| 50 |
+
```
|
| 51 |
+
|
| 52 |
+
3. **Add GitHub Secrets:**
|
| 53 |
+
- Go to: https://github.com/ananttripathi/engine-predictive-maintenance/settings/secrets/actions
|
| 54 |
+
- Click "New repository secret"
|
| 55 |
+
- Add these 4 secrets:
|
| 56 |
+
- `HF_TOKEN` → Your Hugging Face token
|
| 57 |
+
- `HF_DATASET_REPO` → `ananttripathiak/engine-maintenance-dataset`
|
| 58 |
+
- `HF_MODEL_REPO` → `ananttripathiak/engine-maintenance-model`
|
| 59 |
+
- `HF_SPACE_REPO` → `ananttripathiak/engine-maintenance-space`
|
| 60 |
+
|
| 61 |
+
4. **Verify GitHub Actions:**
|
| 62 |
+
- After pushing, go to: https://github.com/ananttripathi/engine-predictive-maintenance/actions
|
| 63 |
+
- You should see the "Predictive Maintenance Pipeline" workflow running
|
| 64 |
+
- All 4 jobs should complete successfully
|
| 65 |
+
|
| 66 |
+
## Important Notes
|
| 67 |
+
|
| 68 |
+
- **GitHub username** (`ananttripathi`) is used for the repository URL
|
| 69 |
+
- **Hugging Face username** might be different - check your HF account and update `src/config.py` if needed
|
| 70 |
+
- The GitHub Actions workflow reads secrets automatically - no code changes needed
|
| 71 |
+
- Repository name can be anything you want (e.g., `engine-predictive-maintenance`, `mlops-project`, etc.)
|
MANUAL_VS_AUTOMATIC.md
ADDED
|
@@ -0,0 +1,235 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Manual vs Automatic Uploads - Quick Guide
|
| 2 |
+
|
| 3 |
+
## 🚀 AUTOMATIC Uploads (Via Scripts)
|
| 4 |
+
|
| 5 |
+
These happen automatically when you run the scripts. **You don't need to manually upload anything to Hugging Face.**
|
| 6 |
+
|
| 7 |
+
### ✅ Automatic: Hugging Face Dataset Repo
|
| 8 |
+
**Script**: `python src/data_register.py` and `python src/data_prep.py`
|
| 9 |
+
|
| 10 |
+
**What gets uploaded automatically:**
|
| 11 |
+
- ✅ `data/engine_data.csv` → HF Dataset Repo
|
| 12 |
+
- ✅ `data/processed/train.csv` → HF Dataset Repo
|
| 13 |
+
- ✅ `data/processed/test.csv` → HF Dataset Repo
|
| 14 |
+
|
| 15 |
+
**You just run:**
|
| 16 |
+
```bash
|
| 17 |
+
python src/data_register.py # Auto-uploads raw data
|
| 18 |
+
python src/data_prep.py # Auto-uploads train/test
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
**That's it!** The scripts handle everything - creating the repo, uploading files, etc.
|
| 22 |
+
|
| 23 |
+
---
|
| 24 |
+
|
| 25 |
+
### ✅ Automatic: Hugging Face Model Repo
|
| 26 |
+
**Script**: `python src/train.py`
|
| 27 |
+
|
| 28 |
+
**What gets uploaded automatically:**
|
| 29 |
+
- ✅ `models/best_model.joblib` → HF Model Repo (as `model.joblib`)
|
| 30 |
+
|
| 31 |
+
**You just run:**
|
| 32 |
+
```bash
|
| 33 |
+
python src/train.py
|
| 34 |
+
```
|
| 35 |
+
|
| 36 |
+
**That's it!** The script:
|
| 37 |
+
1. Trains the model
|
| 38 |
+
2. Saves it locally
|
| 39 |
+
3. Automatically uploads to HF Model Repo
|
| 40 |
+
|
| 41 |
+
---
|
| 42 |
+
|
| 43 |
+
### ✅ Automatic: Hugging Face Space (Streamlit App)
|
| 44 |
+
**Script**: `python src/deploy_to_hf.py`
|
| 45 |
+
|
| 46 |
+
**What gets uploaded automatically:**
|
| 47 |
+
- ✅ `src/app.py` (Streamlit app)
|
| 48 |
+
- ✅ `src/inference.py`
|
| 49 |
+
- ✅ `src/config.py`
|
| 50 |
+
- ✅ `Dockerfile`
|
| 51 |
+
- ✅ `requirements.txt`
|
| 52 |
+
- ✅ Other `src/*.py` files needed
|
| 53 |
+
|
| 54 |
+
**You just run:**
|
| 55 |
+
```bash
|
| 56 |
+
python src/deploy_to_hf.py
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
**That's it!** The script:
|
| 60 |
+
1. Creates/updates the HF Space
|
| 61 |
+
2. Uploads all deployment files
|
| 62 |
+
3. Configures it as a Streamlit app
|
| 63 |
+
|
| 64 |
+
---
|
| 65 |
+
|
| 66 |
+
## 📤 MANUAL Uploads (You Do This)
|
| 67 |
+
|
| 68 |
+
Only **ONE** thing needs to be done manually:
|
| 69 |
+
|
| 70 |
+
### ⚠️ Manual: GitHub Repository
|
| 71 |
+
|
| 72 |
+
**What you need to do manually:**
|
| 73 |
+
- Push your entire `mlops/` folder to GitHub
|
| 74 |
+
|
| 75 |
+
**Steps:**
|
| 76 |
+
```bash
|
| 77 |
+
cd /Users/ananttripathi/Desktop/mlops
|
| 78 |
+
|
| 79 |
+
# 1. Initialize git (if not done)
|
| 80 |
+
git init
|
| 81 |
+
|
| 82 |
+
# 2. Create .gitignore (to exclude large files)
|
| 83 |
+
cat > .gitignore << EOF
|
| 84 |
+
.venv/
|
| 85 |
+
__pycache__/
|
| 86 |
+
*.pyc
|
| 87 |
+
mlruns/
|
| 88 |
+
*.log
|
| 89 |
+
.DS_Store
|
| 90 |
+
EOF
|
| 91 |
+
|
| 92 |
+
# 3. Add all files
|
| 93 |
+
git add .
|
| 94 |
+
|
| 95 |
+
# 4. Commit
|
| 96 |
+
git commit -m "Initial commit: Predictive Maintenance MLOps Pipeline"
|
| 97 |
+
|
| 98 |
+
# 5. Add your GitHub repo as remote
|
| 99 |
+
git remote add origin https://github.com/ananttripathi/engine-predictive-maintenance.git
|
| 100 |
+
|
| 101 |
+
# 6. Push to GitHub
|
| 102 |
+
git push -u origin main
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
**That's the ONLY manual upload!**
|
| 106 |
+
|
| 107 |
+
---
|
| 108 |
+
|
| 109 |
+
## 📊 Summary Table
|
| 110 |
+
|
| 111 |
+
| Destination | Upload Method | What You Do |
|
| 112 |
+
|------------|---------------|-------------|
|
| 113 |
+
| **GitHub** | ⚠️ **MANUAL** | Run `git push` commands |
|
| 114 |
+
| **HF Dataset** | ✅ **AUTOMATIC** | Run `python src/data_register.py` and `python src/data_prep.py` |
|
| 115 |
+
| **HF Model** | ✅ **AUTOMATIC** | Run `python src/train.py` |
|
| 116 |
+
| **HF Space** | ✅ **AUTOMATIC** | Run `python src/deploy_to_hf.py` |
|
| 117 |
+
|
| 118 |
+
---
|
| 119 |
+
|
| 120 |
+
## 🎯 Complete Workflow
|
| 121 |
+
|
| 122 |
+
### Step 1: Manual - GitHub (Do This First)
|
| 123 |
+
```bash
|
| 124 |
+
# Push everything to GitHub
|
| 125 |
+
git init
|
| 126 |
+
git add .
|
| 127 |
+
git commit -m "Initial commit"
|
| 128 |
+
git remote add origin https://github.com/ananttripathi/engine-predictive-maintenance.git
|
| 129 |
+
git push -u origin main
|
| 130 |
+
```
|
| 131 |
+
⏱️ **Time**: 2-3 minutes
|
| 132 |
+
|
| 133 |
+
---
|
| 134 |
+
|
| 135 |
+
### Step 2: Automatic - HF Dataset
|
| 136 |
+
```bash
|
| 137 |
+
export HF_TOKEN="your_hf_token"
|
| 138 |
+
python src/data_register.py
|
| 139 |
+
python src/data_prep.py
|
| 140 |
+
```
|
| 141 |
+
⏱️ **Time**: 1-2 minutes (scripts do everything)
|
| 142 |
+
|
| 143 |
+
---
|
| 144 |
+
|
| 145 |
+
### Step 3: Automatic - HF Model
|
| 146 |
+
```bash
|
| 147 |
+
python src/train.py
|
| 148 |
+
```
|
| 149 |
+
⏱️ **Time**: 5-10 minutes (trains model + auto-uploads)
|
| 150 |
+
|
| 151 |
+
---
|
| 152 |
+
|
| 153 |
+
### Step 4: Automatic - HF Space
|
| 154 |
+
```bash
|
| 155 |
+
python src/deploy_to_hf.py
|
| 156 |
+
```
|
| 157 |
+
⏱️ **Time**: 1-2 minutes (scripts do everything)
|
| 158 |
+
|
| 159 |
+
---
|
| 160 |
+
|
| 161 |
+
## ✅ What You Need to Prepare
|
| 162 |
+
|
| 163 |
+
### Before Running Scripts:
|
| 164 |
+
|
| 165 |
+
1. **Set Environment Variable:**
|
| 166 |
+
```bash
|
| 167 |
+
export HF_TOKEN="hf_your_token_here"
|
| 168 |
+
```
|
| 169 |
+
|
| 170 |
+
2. **Update `src/config.py`** (already done):
|
| 171 |
+
- HF repos are set to `ananttripathiak/...`
|
| 172 |
+
|
| 173 |
+
3. **That's it!** Scripts handle the rest automatically.
|
| 174 |
+
|
| 175 |
+
---
|
| 176 |
+
|
| 177 |
+
## 🔍 Verification
|
| 178 |
+
|
| 179 |
+
After running scripts, verify automatic uploads:
|
| 180 |
+
|
| 181 |
+
1. **HF Dataset**: https://huggingface.co/datasets/ananttripathiak/engine-maintenance-dataset
|
| 182 |
+
- Should see: `data/engine_data.csv`, `data/train.csv`, `data/test.csv`
|
| 183 |
+
|
| 184 |
+
2. **HF Model**: https://huggingface.co/ananttripathiak/engine-maintenance-model
|
| 185 |
+
- Should see: `model.joblib`
|
| 186 |
+
|
| 187 |
+
3. **HF Space**: https://huggingface.co/spaces/ananttripathiak/engine-maintenance-space
|
| 188 |
+
- Should see: `src/app.py`, `Dockerfile`, `requirements.txt`
|
| 189 |
+
- App should be running!
|
| 190 |
+
|
| 191 |
+
---
|
| 192 |
+
|
| 193 |
+
## 💡 Key Points
|
| 194 |
+
|
| 195 |
+
✅ **Automatic (3 out of 4):**
|
| 196 |
+
- HF Dataset upload
|
| 197 |
+
- HF Model upload
|
| 198 |
+
- HF Space deployment
|
| 199 |
+
|
| 200 |
+
⚠️ **Manual (1 out of 4):**
|
| 201 |
+
- GitHub push (only this one!)
|
| 202 |
+
|
| 203 |
+
🎯 **Bottom Line:**
|
| 204 |
+
- **75% automatic** - Just run the scripts!
|
| 205 |
+
- **25% manual** - Just push to GitHub once!
|
| 206 |
+
|
| 207 |
+
---
|
| 208 |
+
|
| 209 |
+
## 🚨 Common Questions
|
| 210 |
+
|
| 211 |
+
**Q: Do I need to manually create HF repos?**
|
| 212 |
+
A: **No!** Scripts create them automatically if they don't exist.
|
| 213 |
+
|
| 214 |
+
**Q: Do I need to manually upload files to HF?**
|
| 215 |
+
A: **No!** Scripts upload everything automatically.
|
| 216 |
+
|
| 217 |
+
**Q: What if a script fails?**
|
| 218 |
+
A: Check `HF_TOKEN` is set, then re-run the script. It will continue from where it left off.
|
| 219 |
+
|
| 220 |
+
**Q: Can I skip GitHub and just use HF?**
|
| 221 |
+
A: For the project rubric, you need GitHub for the CI/CD workflow. But for HF-only, yes, scripts handle everything.
|
| 222 |
+
|
| 223 |
+
---
|
| 224 |
+
|
| 225 |
+
## 📝 Quick Checklist
|
| 226 |
+
|
| 227 |
+
- [ ] **Manual**: Push to GitHub (`git push`)
|
| 228 |
+
- [ ] **Automatic**: Run `python src/data_register.py`
|
| 229 |
+
- [ ] **Automatic**: Run `python src/data_prep.py`
|
| 230 |
+
- [ ] **Automatic**: Run `python src/train.py`
|
| 231 |
+
- [ ] **Automatic**: Run `python src/deploy_to_hf.py`
|
| 232 |
+
- [ ] Verify all uploads completed successfully
|
| 233 |
+
|
| 234 |
+
**Total Manual Work: ~5 minutes (just GitHub)**
|
| 235 |
+
**Total Automatic Work: ~10-15 minutes (scripts do everything)**
|
Notebook_AML_and_MLOps_Project (2).ipynb
ADDED
|
@@ -0,0 +1,1808 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "markdown",
|
| 5 |
+
"metadata": {
|
| 6 |
+
"id": "klg2JF-oBblG"
|
| 7 |
+
},
|
| 8 |
+
"source": [
|
| 9 |
+
"# Problem Statement"
|
| 10 |
+
]
|
| 11 |
+
},
|
| 12 |
+
{
|
| 13 |
+
"cell_type": "markdown",
|
| 14 |
+
"metadata": {
|
| 15 |
+
"id": "m0CcOjZ-BblL"
|
| 16 |
+
},
|
| 17 |
+
"source": [
|
| 18 |
+
"## **Business Context**"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "markdown",
|
| 23 |
+
"metadata": {
|
| 24 |
+
"id": "uyT6Koe7BblM"
|
| 25 |
+
},
|
| 26 |
+
"source": [
|
| 27 |
+
"\"Visit with Us,\" a leading travel company, is revolutionizing the tourism industry by leveraging data-driven strategies to optimize operations and customer engagement. While introducing a new package offering, such as the Wellness Tourism Package, the company faces challenges in targeting the right customers efficiently. The manual approach to identifying potential customers is inconsistent, time-consuming, and prone to errors, leading to missed opportunities and suboptimal campaign performance.\n",
|
| 28 |
+
"\n",
|
| 29 |
+
"To address these issues, the company aims to implement a scalable and automated system that integrates customer data, predicts potential buyers, and enhances decision-making for marketing strategies. By utilizing an MLOps pipeline, the company seeks to achieve seamless integration of data preprocessing, model development, deployment, and CI/CD practices for continuous improvement. This system will ensure efficient targeting of customers, timely updates to the predictive model, and adaptation to evolving customer behaviors, ultimately driving growth and customer satisfaction.\n"
|
| 30 |
+
]
|
| 31 |
+
},
|
| 32 |
+
{
|
| 33 |
+
"cell_type": "markdown",
|
| 34 |
+
"metadata": {
|
| 35 |
+
"id": "zm6bNQOJBblO"
|
| 36 |
+
},
|
| 37 |
+
"source": [
|
| 38 |
+
"## **Objective**"
|
| 39 |
+
]
|
| 40 |
+
},
|
| 41 |
+
{
|
| 42 |
+
"cell_type": "markdown",
|
| 43 |
+
"metadata": {
|
| 44 |
+
"id": "7PYtjk_YBblO"
|
| 45 |
+
},
|
| 46 |
+
"source": [
|
| 47 |
+
"As an MLOps Engineer at \"Visit with Us,\" your responsibility is to design and deploy an MLOps pipeline on GitHub to automate the end-to-end workflow for predicting customer purchases. The primary objective is to build a model that predicts whether a customer will purchase the newly introduced Wellness Tourism Package before contacting them. The pipeline will include data cleaning, preprocessing, transformation, model building, training, evaluation, and deployment, ensuring consistent performance and scalability. By leveraging GitHub Actions for CI/CD integration, the system will enable automated updates, streamline model deployment, and improve operational efficiency. This robust predictive solution will empower policymakers to make data-driven decisions, enhance marketing strategies, and effectively target potential customers, thereby driving customer acquisition and business growth."
|
| 48 |
+
]
|
| 49 |
+
},
|
| 50 |
+
{
|
| 51 |
+
"cell_type": "markdown",
|
| 52 |
+
"metadata": {
|
| 53 |
+
"id": "z8C11AzTBblP"
|
| 54 |
+
},
|
| 55 |
+
"source": [
|
| 56 |
+
"## **Data Description**"
|
| 57 |
+
]
|
| 58 |
+
},
|
| 59 |
+
{
|
| 60 |
+
"cell_type": "markdown",
|
| 61 |
+
"metadata": {
|
| 62 |
+
"id": "9DQx3pkaBblP"
|
| 63 |
+
},
|
| 64 |
+
"source": [
|
| 65 |
+
"The dataset contains customer and interaction data that serve as key attributes for predicting the likelihood of purchasing the Wellness Tourism Package. The detailed attributes are:\n",
|
| 66 |
+
"\n",
|
| 67 |
+
"**Customer Details**\n",
|
| 68 |
+
"- **CustomerID:** Unique identifier for each customer.\n",
|
| 69 |
+
"- **ProdTaken:** Target variable indicating whether the customer has purchased a package (0: No, 1: Yes).\n",
|
| 70 |
+
"- **Age:** Age of the customer.\n",
|
| 71 |
+
"- **TypeofContact:** The method by which the customer was contacted (Company Invited or Self Inquiry).\n",
|
| 72 |
+
"- **CityTier:** The city category based on development, population, and living standards (Tier 1 > Tier 2 > Tier 3).\n",
|
| 73 |
+
"- **Occupation:** Customer's occupation (e.g., Salaried, Freelancer).\n",
|
| 74 |
+
"- **Gender:** Gender of the customer (Male, Female).\n",
|
| 75 |
+
"- **NumberOfPersonVisiting:** Total number of people accompanying the customer on the trip.\n",
|
| 76 |
+
"- **PreferredPropertyStar:** Preferred hotel rating by the customer.\n",
|
| 77 |
+
"- **MaritalStatus:** Marital status of the customer (Single, Married, Divorced).\n",
|
| 78 |
+
"- **NumberOfTrips:** Average number of trips the customer takes annually.\n",
|
| 79 |
+
"- **Passport:** Whether the customer holds a valid passport (0: No, 1: Yes).\n",
|
| 80 |
+
"- **OwnCar:** Whether the customer owns a car (0: No, 1: Yes).\n",
|
| 81 |
+
"- **NumberOfChildrenVisiting:** Number of children below age 5 accompanying the customer.\n",
|
| 82 |
+
"- **Designation:** Customer's designation in their current organization.\n",
|
| 83 |
+
"- **MonthlyIncome:** Gross monthly income of the customer.\n",
|
| 84 |
+
"\n",
|
| 85 |
+
"**Customer Interaction Data**\n",
|
| 86 |
+
"- **PitchSatisfactionScore:** Score indicating the customer's satisfaction with the sales pitch.\n",
|
| 87 |
+
"- **ProductPitched:** The type of product pitched to the customer.\n",
|
| 88 |
+
"- **NumberOfFollowups:** Total number of follow-ups by the salesperson after the sales pitch.-\n",
|
| 89 |
+
"- **DurationOfPitch:** Duration of the sales pitch delivered to the customer.\n"
|
| 90 |
+
]
|
| 91 |
+
},
|
| 92 |
+
{
|
| 93 |
+
"cell_type": "markdown",
|
| 94 |
+
"metadata": {
|
| 95 |
+
"id": "0LbSu_p2jYfe"
|
| 96 |
+
},
|
| 97 |
+
"source": [
|
| 98 |
+
"# Model Building"
|
| 99 |
+
]
|
| 100 |
+
},
|
| 101 |
+
{
|
| 102 |
+
"cell_type": "code",
|
| 103 |
+
"execution_count": 1,
|
| 104 |
+
"metadata": {
|
| 105 |
+
"colab": {
|
| 106 |
+
"base_uri": "https://localhost:8080/"
|
| 107 |
+
},
|
| 108 |
+
"executionInfo": {
|
| 109 |
+
"elapsed": 15,
|
| 110 |
+
"status": "ok",
|
| 111 |
+
"timestamp": 1765705284123,
|
| 112 |
+
"user": {
|
| 113 |
+
"displayName": "Anant Tripathi",
|
| 114 |
+
"userId": "05588283814303116545"
|
| 115 |
+
},
|
| 116 |
+
"user_tz": -330
|
| 117 |
+
},
|
| 118 |
+
"id": "giodc4KknHID",
|
| 119 |
+
"outputId": "2caa2d21-8ef6-4259-8d79-5573ee1372d1"
|
| 120 |
+
},
|
| 121 |
+
"outputs": [
|
| 122 |
+
{
|
| 123 |
+
"name": "stdout",
|
| 124 |
+
"output_type": "stream",
|
| 125 |
+
"text": [
|
| 126 |
+
"/content\n"
|
| 127 |
+
]
|
| 128 |
+
}
|
| 129 |
+
],
|
| 130 |
+
"source": [
|
| 131 |
+
"# Create a master folder to keep all files created when executing the below code cells\n",
|
| 132 |
+
"import os\n",
|
| 133 |
+
"print(os.getcwd())\n",
|
| 134 |
+
"os.makedirs(\"tourism_project\", exist_ok=True)"
|
| 135 |
+
]
|
| 136 |
+
},
|
| 137 |
+
{
|
| 138 |
+
"cell_type": "code",
|
| 139 |
+
"execution_count": 2,
|
| 140 |
+
"metadata": {
|
| 141 |
+
"executionInfo": {
|
| 142 |
+
"elapsed": 1,
|
| 143 |
+
"status": "ok",
|
| 144 |
+
"timestamp": 1765705284135,
|
| 145 |
+
"user": {
|
| 146 |
+
"displayName": "Anant Tripathi",
|
| 147 |
+
"userId": "05588283814303116545"
|
| 148 |
+
},
|
| 149 |
+
"user_tz": -330
|
| 150 |
+
},
|
| 151 |
+
"id": "SUKPoy0EA4jj"
|
| 152 |
+
},
|
| 153 |
+
"outputs": [],
|
| 154 |
+
"source": [
|
| 155 |
+
"# Create a folder for storing the model building files\n",
|
| 156 |
+
"os.makedirs(\"tourism_project/model_building\", exist_ok=True)"
|
| 157 |
+
]
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"cell_type": "markdown",
|
| 161 |
+
"metadata": {
|
| 162 |
+
"id": "9DtS3gNDjBbR"
|
| 163 |
+
},
|
| 164 |
+
"source": [
|
| 165 |
+
"## Data Registration"
|
| 166 |
+
]
|
| 167 |
+
},
|
| 168 |
+
{
|
| 169 |
+
"cell_type": "code",
|
| 170 |
+
"execution_count": 3,
|
| 171 |
+
"metadata": {
|
| 172 |
+
"executionInfo": {
|
| 173 |
+
"elapsed": 10,
|
| 174 |
+
"status": "ok",
|
| 175 |
+
"timestamp": 1765705284146,
|
| 176 |
+
"user": {
|
| 177 |
+
"displayName": "Anant Tripathi",
|
| 178 |
+
"userId": "05588283814303116545"
|
| 179 |
+
},
|
| 180 |
+
"user_tz": -330
|
| 181 |
+
},
|
| 182 |
+
"id": "ZagOeVxJOtJ9"
|
| 183 |
+
},
|
| 184 |
+
"outputs": [],
|
| 185 |
+
"source": [
|
| 186 |
+
"os.makedirs(\"tourism_project/data\", exist_ok=True)"
|
| 187 |
+
]
|
| 188 |
+
},
|
| 189 |
+
{
|
| 190 |
+
"cell_type": "markdown",
|
| 191 |
+
"metadata": {
|
| 192 |
+
"id": "WxXiD9ZXxodF"
|
| 193 |
+
},
|
| 194 |
+
"source": [
|
| 195 |
+
"Once the **data** folder created after executing the above cell, please upload the **tourism.csv** in to the folder"
|
| 196 |
+
]
|
| 197 |
+
},
|
| 198 |
+
{
|
| 199 |
+
"cell_type": "markdown",
|
| 200 |
+
"metadata": {
|
| 201 |
+
"id": "hh2TjRG5WJ4Z"
|
| 202 |
+
},
|
| 203 |
+
"source": [
|
| 204 |
+
"## Data Preparation"
|
| 205 |
+
]
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"cell_type": "code",
|
| 209 |
+
"execution_count": 4,
|
| 210 |
+
"metadata": {
|
| 211 |
+
"colab": {
|
| 212 |
+
"base_uri": "https://localhost:8080/"
|
| 213 |
+
},
|
| 214 |
+
"executionInfo": {
|
| 215 |
+
"elapsed": 9,
|
| 216 |
+
"status": "ok",
|
| 217 |
+
"timestamp": 1765705284156,
|
| 218 |
+
"user": {
|
| 219 |
+
"displayName": "Anant Tripathi",
|
| 220 |
+
"userId": "05588283814303116545"
|
| 221 |
+
},
|
| 222 |
+
"user_tz": -330
|
| 223 |
+
},
|
| 224 |
+
"id": "EHVRGAeoOtJ-",
|
| 225 |
+
"outputId": "016d1a7d-e728-4d1e-a208-c4a08c9213a4"
|
| 226 |
+
},
|
| 227 |
+
"outputs": [
|
| 228 |
+
{
|
| 229 |
+
"name": "stdout",
|
| 230 |
+
"output_type": "stream",
|
| 231 |
+
"text": [
|
| 232 |
+
"Writing tourism_project/model_building/data_register.py\n"
|
| 233 |
+
]
|
| 234 |
+
}
|
| 235 |
+
],
|
| 236 |
+
"source": [
|
| 237 |
+
"%%writefile tourism_project/model_building/data_register.py\n",
|
| 238 |
+
"from huggingface_hub.utils import RepositoryNotFoundError, HfHubHTTPError\n",
|
| 239 |
+
"from huggingface_hub import HfApi, create_repo\n",
|
| 240 |
+
"import os\n",
|
| 241 |
+
"\n",
|
| 242 |
+
"\n",
|
| 243 |
+
"repo_id = \"ananttripathiak/tourism-dataset\"\n",
|
| 244 |
+
"repo_type = \"dataset\"\n",
|
| 245 |
+
"\n",
|
| 246 |
+
"# Initialize API client\n",
|
| 247 |
+
"api = HfApi(token=os.getenv(\"HF_TOKEN\"))\n",
|
| 248 |
+
"\n",
|
| 249 |
+
"# Step 1: Check if the space exists\n",
|
| 250 |
+
"try:\n",
|
| 251 |
+
" api.repo_info(repo_id=repo_id, repo_type=repo_type)\n",
|
| 252 |
+
" print(f\"Space '{repo_id}' already exists. Using it.\")\n",
|
| 253 |
+
"except RepositoryNotFoundError:\n",
|
| 254 |
+
" print(f\"Space '{repo_id}' not found. Creating new space...\")\n",
|
| 255 |
+
" create_repo(repo_id=repo_id, repo_type=repo_type, private=False)\n",
|
| 256 |
+
" print(f\"Space '{repo_id}' created.\")\n",
|
| 257 |
+
"\n",
|
| 258 |
+
"api.upload_folder(\n",
|
| 259 |
+
" folder_path=\"tourism_project/data\",\n",
|
| 260 |
+
" repo_id=repo_id,\n",
|
| 261 |
+
" repo_type=repo_type,\n",
|
| 262 |
+
")"
|
| 263 |
+
]
|
| 264 |
+
},
|
| 265 |
+
{
|
| 266 |
+
"cell_type": "markdown",
|
| 267 |
+
"metadata": {
|
| 268 |
+
"id": "eZZKnLkLjeM4"
|
| 269 |
+
},
|
| 270 |
+
"source": [
|
| 271 |
+
"## Model Training and Registration with Experimentation Tracking"
|
| 272 |
+
]
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"cell_type": "code",
|
| 276 |
+
"execution_count": 5,
|
| 277 |
+
"metadata": {
|
| 278 |
+
"executionInfo": {
|
| 279 |
+
"elapsed": 2,
|
| 280 |
+
"status": "ok",
|
| 281 |
+
"timestamp": 1765705284159,
|
| 282 |
+
"user": {
|
| 283 |
+
"displayName": "Anant Tripathi",
|
| 284 |
+
"userId": "05588283814303116545"
|
| 285 |
+
},
|
| 286 |
+
"user_tz": -330
|
| 287 |
+
},
|
| 288 |
+
"id": "LFmrcXT_OtJ-"
|
| 289 |
+
},
|
| 290 |
+
"outputs": [],
|
| 291 |
+
"source": [
|
| 292 |
+
"# !pip install mlflow scikit-learn huggingface_hub\n",
|
| 293 |
+
"# import sys\n",
|
| 294 |
+
"# !{sys.executable} -m pip install mlflow"
|
| 295 |
+
]
|
| 296 |
+
},
|
| 297 |
+
{
|
| 298 |
+
"cell_type": "code",
|
| 299 |
+
"execution_count": 6,
|
| 300 |
+
"metadata": {
|
| 301 |
+
"colab": {
|
| 302 |
+
"base_uri": "https://localhost:8080/"
|
| 303 |
+
},
|
| 304 |
+
"executionInfo": {
|
| 305 |
+
"elapsed": 26,
|
| 306 |
+
"status": "ok",
|
| 307 |
+
"timestamp": 1765705284186,
|
| 308 |
+
"user": {
|
| 309 |
+
"displayName": "Anant Tripathi",
|
| 310 |
+
"userId": "05588283814303116545"
|
| 311 |
+
},
|
| 312 |
+
"user_tz": -330
|
| 313 |
+
},
|
| 314 |
+
"id": "DatpH_YdOtJ-",
|
| 315 |
+
"outputId": "3f7f7b88-26e9-4e36-df88-b83e5f28fd09"
|
| 316 |
+
},
|
| 317 |
+
"outputs": [
|
| 318 |
+
{
|
| 319 |
+
"name": "stdout",
|
| 320 |
+
"output_type": "stream",
|
| 321 |
+
"text": [
|
| 322 |
+
"Writing tourism_project/model_building/train.py\n"
|
| 323 |
+
]
|
| 324 |
+
}
|
| 325 |
+
],
|
| 326 |
+
"source": [
|
| 327 |
+
"%%writefile tourism_project/model_building/train.py\n",
|
| 328 |
+
"\n",
|
| 329 |
+
"# for data manipulation\n",
|
| 330 |
+
"import pandas as pd\n",
|
| 331 |
+
"import numpy as np\n",
|
| 332 |
+
"# for data preprocessing and pipeline creation\n",
|
| 333 |
+
"from sklearn.preprocessing import StandardScaler\n",
|
| 334 |
+
"from sklearn.compose import make_column_transformer\n",
|
| 335 |
+
"from sklearn.pipeline import make_pipeline\n",
|
| 336 |
+
"# for model training, tuning, and evaluation\n",
|
| 337 |
+
"import xgboost as xgb\n",
|
| 338 |
+
"from sklearn.model_selection import GridSearchCV\n",
|
| 339 |
+
"from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix\n",
|
| 340 |
+
"# for model serialization\n",
|
| 341 |
+
"import joblib\n",
|
| 342 |
+
"# for creating a folder\n",
|
| 343 |
+
"import os\n",
|
| 344 |
+
"# for hugging face space authentication to upload files\n",
|
| 345 |
+
"from huggingface_hub import login, HfApi, create_repo\n",
|
| 346 |
+
"from huggingface_hub.utils import RepositoryNotFoundError, HfHubHTTPError\n",
|
| 347 |
+
"import mlflow\n",
|
| 348 |
+
"\n",
|
| 349 |
+
"# Set up MLflow tracking\n",
|
| 350 |
+
"mlflow.set_tracking_uri(\"http://localhost:5000\")\n",
|
| 351 |
+
"mlflow.set_experiment(\"tourism-package-prediction\")\n",
|
| 352 |
+
"\n",
|
| 353 |
+
"api = HfApi()\n",
|
| 354 |
+
"\n",
|
| 355 |
+
"# Load the preprocessed data from Hugging Face\n",
|
| 356 |
+
"Xtrain_path = \"hf://datasets/ananttripathiak/tourism-dataset/Xtrain.csv\"\n",
|
| 357 |
+
"Xtest_path = \"hf://datasets/ananttripathiak/tourism-dataset/Xtest.csv\"\n",
|
| 358 |
+
"ytrain_path = \"hf://datasets/ananttripathiak/tourism-dataset/ytrain.csv\"\n",
|
| 359 |
+
"ytest_path = \"hf://datasets/ananttripathiak/tourism-dataset/ytest.csv\"\n",
|
| 360 |
+
"\n",
|
| 361 |
+
"print(\"Loading preprocessed data...\")\n",
|
| 362 |
+
"Xtrain = pd.read_csv(Xtrain_path)\n",
|
| 363 |
+
"Xtest = pd.read_csv(Xtest_path)\n",
|
| 364 |
+
"ytrain = pd.read_csv(ytrain_path).values.ravel()\n",
|
| 365 |
+
"ytest = pd.read_csv(ytest_path).values.ravel()\n",
|
| 366 |
+
"\n",
|
| 367 |
+
"print(f\"Training set shape: {Xtrain.shape}\")\n",
|
| 368 |
+
"print(f\"Test set shape: {Xtest.shape}\")\n",
|
| 369 |
+
"\n",
|
| 370 |
+
"# Identify numeric features (all features after encoding)\n",
|
| 371 |
+
"numeric_features = Xtrain.columns.tolist()\n",
|
| 372 |
+
"\n",
|
| 373 |
+
"# Preprocessor - StandardScaler for all numeric features\n",
|
| 374 |
+
"preprocessor = make_column_transformer(\n",
|
| 375 |
+
" (StandardScaler(), numeric_features)\n",
|
| 376 |
+
")\n",
|
| 377 |
+
"\n",
|
| 378 |
+
"# Define base XGBoost Classifier\n",
|
| 379 |
+
"xgb_model = xgb.XGBClassifier(\n",
|
| 380 |
+
" random_state=42,\n",
|
| 381 |
+
" n_jobs=-1,\n",
|
| 382 |
+
" eval_metric='logloss',\n",
|
| 383 |
+
" use_label_encoder=False\n",
|
| 384 |
+
")\n",
|
| 385 |
+
"\n",
|
| 386 |
+
"# Hyperparameter grid for classification\n",
|
| 387 |
+
"param_grid = {\n",
|
| 388 |
+
" 'xgbclassifier__n_estimators': [100, 200, 300],\n",
|
| 389 |
+
" 'xgbclassifier__max_depth': [3, 5, 7],\n",
|
| 390 |
+
" 'xgbclassifier__learning_rate': [0.01, 0.05, 0.1],\n",
|
| 391 |
+
" 'xgbclassifier__subsample': [0.7, 0.8, 1.0],\n",
|
| 392 |
+
" 'xgbclassifier__colsample_bytree': [0.7, 0.8, 1.0],\n",
|
| 393 |
+
" 'xgbclassifier__scale_pos_weight': [1, 2, 3] # Handle class imbalance\n",
|
| 394 |
+
"}\n",
|
| 395 |
+
"\n",
|
| 396 |
+
"# Pipeline\n",
|
| 397 |
+
"model_pipeline = make_pipeline(preprocessor, xgb_model)\n",
|
| 398 |
+
"\n",
|
| 399 |
+
"print(\"\\nStarting MLflow experiment...\")\n",
|
| 400 |
+
"with mlflow.start_run():\n",
|
| 401 |
+
" print(\"Performing Grid Search with Cross-Validation...\")\n",
|
| 402 |
+
" # Grid Search\n",
|
| 403 |
+
" grid_search = GridSearchCV(\n",
|
| 404 |
+
" model_pipeline,\n",
|
| 405 |
+
" param_grid,\n",
|
| 406 |
+
" cv=3,\n",
|
| 407 |
+
" n_jobs=-1,\n",
|
| 408 |
+
" scoring='roc_auc',\n",
|
| 409 |
+
" verbose=1\n",
|
| 410 |
+
" )\n",
|
| 411 |
+
" grid_search.fit(Xtrain, ytrain)\n",
|
| 412 |
+
"\n",
|
| 413 |
+
" # Log parameter sets\n",
|
| 414 |
+
" results = grid_search.cv_results_\n",
|
| 415 |
+
" print(f\"\\nEvaluated {len(results['params'])} parameter combinations\")\n",
|
| 416 |
+
"\n",
|
| 417 |
+
" for i in range(len(results['params'])):\n",
|
| 418 |
+
" param_set = results['params'][i]\n",
|
| 419 |
+
" mean_score = results['mean_test_score'][i]\n",
|
| 420 |
+
"\n",
|
| 421 |
+
" with mlflow.start_run(nested=True):\n",
|
| 422 |
+
" mlflow.log_params(param_set)\n",
|
| 423 |
+
" mlflow.log_metric(\"mean_roc_auc\", mean_score)\n",
|
| 424 |
+
"\n",
|
| 425 |
+
" # Best model\n",
|
| 426 |
+
" print(f\"\\nBest parameters: {grid_search.best_params_}\")\n",
|
| 427 |
+
" mlflow.log_params(grid_search.best_params_)\n",
|
| 428 |
+
" best_model = grid_search.best_estimator_\n",
|
| 429 |
+
"\n",
|
| 430 |
+
" # Predictions\n",
|
| 431 |
+
" print(\"\\nMaking predictions...\")\n",
|
| 432 |
+
" y_pred_train = best_model.predict(Xtrain)\n",
|
| 433 |
+
" y_pred_test = best_model.predict(Xtest)\n",
|
| 434 |
+
"\n",
|
| 435 |
+
" # Probability predictions for ROC-AUC\n",
|
| 436 |
+
" y_pred_train_proba = best_model.predict_proba(Xtrain)[:, 1]\n",
|
| 437 |
+
" y_pred_test_proba = best_model.predict_proba(Xtest)[:, 1]\n",
|
| 438 |
+
"\n",
|
| 439 |
+
" # Calculate metrics\n",
|
| 440 |
+
" print(\"\\nCalculating metrics...\")\n",
|
| 441 |
+
" train_accuracy = accuracy_score(ytrain, y_pred_train)\n",
|
| 442 |
+
" test_accuracy = accuracy_score(ytest, y_pred_test)\n",
|
| 443 |
+
"\n",
|
| 444 |
+
" train_precision = precision_score(ytrain, y_pred_train, zero_division=0)\n",
|
| 445 |
+
" test_precision = precision_score(ytest, y_pred_test, zero_division=0)\n",
|
| 446 |
+
"\n",
|
| 447 |
+
" train_recall = recall_score(ytrain, y_pred_train, zero_division=0)\n",
|
| 448 |
+
" test_recall = recall_score(ytest, y_pred_test, zero_division=0)\n",
|
| 449 |
+
"\n",
|
| 450 |
+
" train_f1 = f1_score(ytrain, y_pred_train, zero_division=0)\n",
|
| 451 |
+
" test_f1 = f1_score(ytest, y_pred_test, zero_division=0)\n",
|
| 452 |
+
"\n",
|
| 453 |
+
" train_roc_auc = roc_auc_score(ytrain, y_pred_train_proba)\n",
|
| 454 |
+
" test_roc_auc = roc_auc_score(ytest, y_pred_test_proba)\n",
|
| 455 |
+
"\n",
|
| 456 |
+
" # Log metrics\n",
|
| 457 |
+
" mlflow.log_metrics({\n",
|
| 458 |
+
" \"train_accuracy\": train_accuracy,\n",
|
| 459 |
+
" \"test_accuracy\": test_accuracy,\n",
|
| 460 |
+
" \"train_precision\": train_precision,\n",
|
| 461 |
+
" \"test_precision\": test_precision,\n",
|
| 462 |
+
" \"train_recall\": train_recall,\n",
|
| 463 |
+
" \"test_recall\": test_recall,\n",
|
| 464 |
+
" \"train_f1_score\": train_f1,\n",
|
| 465 |
+
" \"test_f1_score\": test_f1,\n",
|
| 466 |
+
" \"train_roc_auc\": train_roc_auc,\n",
|
| 467 |
+
" \"test_roc_auc\": test_roc_auc\n",
|
| 468 |
+
" })\n",
|
| 469 |
+
"\n",
|
| 470 |
+
" # Print results\n",
|
| 471 |
+
" print(\"\\n\" + \"=\"*50)\n",
|
| 472 |
+
" print(\"MODEL PERFORMANCE METRICS\")\n",
|
| 473 |
+
" print(\"=\"*50)\n",
|
| 474 |
+
" print(f\"Train Accuracy: {train_accuracy:.4f} | Test Accuracy: {test_accuracy:.4f}\")\n",
|
| 475 |
+
" print(f\"Train Precision: {train_precision:.4f} | Test Precision: {test_precision:.4f}\")\n",
|
| 476 |
+
" print(f\"Train Recall: {train_recall:.4f} | Test Recall: {test_recall:.4f}\")\n",
|
| 477 |
+
" print(f\"Train F1-Score: {train_f1:.4f} | Test F1-Score: {test_f1:.4f}\")\n",
|
| 478 |
+
" print(f\"Train ROC-AUC: {train_roc_auc:.4f} | Test ROC-AUC: {test_roc_auc:.4f}\")\n",
|
| 479 |
+
" print(\"=\"*50)\n",
|
| 480 |
+
"\n",
|
| 481 |
+
" print(\"\\nTest Set Classification Report:\")\n",
|
| 482 |
+
" print(classification_report(ytest, y_pred_test, target_names=['No Purchase', 'Purchase']))\n",
|
| 483 |
+
"\n",
|
| 484 |
+
" print(\"\\nTest Set Confusion Matrix:\")\n",
|
| 485 |
+
" print(confusion_matrix(ytest, y_pred_test))\n",
|
| 486 |
+
"\n",
|
| 487 |
+
" # Save the model locally\n",
|
| 488 |
+
" model_path = \"best_tourism_model_v1.joblib\"\n",
|
| 489 |
+
" joblib.dump(best_model, model_path)\n",
|
| 490 |
+
" print(f\"\\nModel saved locally as: {model_path}\")\n",
|
| 491 |
+
"\n",
|
| 492 |
+
" # Log the model artifact\n",
|
| 493 |
+
" mlflow.log_artifact(model_path, artifact_path=\"model\")\n",
|
| 494 |
+
" print(f\"Model logged to MLflow\")\n",
|
| 495 |
+
"\n",
|
| 496 |
+
" # Upload to Hugging Face\n",
|
| 497 |
+
" repo_id = \"ananttripathiak/tourism-prediction-model\"\n",
|
| 498 |
+
" repo_type = \"model\"\n",
|
| 499 |
+
"\n",
|
| 500 |
+
" # Step 1: Check if the repository exists\n",
|
| 501 |
+
" try:\n",
|
| 502 |
+
" api.repo_info(repo_id=repo_id, repo_type=repo_type)\n",
|
| 503 |
+
" print(f\"\\nRepository '{repo_id}' already exists. Using it.\")\n",
|
| 504 |
+
" except RepositoryNotFoundError:\n",
|
| 505 |
+
" print(f\"\\nRepository '{repo_id}' not found. Creating new repository...\")\n",
|
| 506 |
+
" create_repo(repo_id=repo_id, repo_type=repo_type, private=False)\n",
|
| 507 |
+
" print(f\"Repository '{repo_id}' created.\")\n",
|
| 508 |
+
"\n",
|
| 509 |
+
" # Upload model to Hugging Face\n",
|
| 510 |
+
" api.upload_file(\n",
|
| 511 |
+
" path_or_fileobj=\"best_tourism_model_v1.joblib\",\n",
|
| 512 |
+
" path_in_repo=\"best_tourism_model_v1.joblib\",\n",
|
| 513 |
+
" repo_id=repo_id,\n",
|
| 514 |
+
" repo_type=repo_type,\n",
|
| 515 |
+
" )\n",
|
| 516 |
+
" print(f\"Model uploaded to Hugging Face: {repo_id}\")\n",
|
| 517 |
+
"\n",
|
| 518 |
+
"print(\"\\n\" + \"=\"*50)\n",
|
| 519 |
+
"print(\"MODEL TRAINING COMPLETED SUCCESSFULLY!\")\n",
|
| 520 |
+
"print(\"=\"*50)"
|
| 521 |
+
]
|
| 522 |
+
},
|
| 523 |
+
{
|
| 524 |
+
"cell_type": "code",
|
| 525 |
+
"execution_count": 7,
|
| 526 |
+
"metadata": {
|
| 527 |
+
"colab": {
|
| 528 |
+
"base_uri": "https://localhost:8080/"
|
| 529 |
+
},
|
| 530 |
+
"executionInfo": {
|
| 531 |
+
"elapsed": 203,
|
| 532 |
+
"status": "ok",
|
| 533 |
+
"timestamp": 1765705284390,
|
| 534 |
+
"user": {
|
| 535 |
+
"displayName": "Anant Tripathi",
|
| 536 |
+
"userId": "05588283814303116545"
|
| 537 |
+
},
|
| 538 |
+
"user_tz": -330
|
| 539 |
+
},
|
| 540 |
+
"id": "Vtjn_63uOtJ-",
|
| 541 |
+
"outputId": "259877f9-6be4-4ef1-cda4-c06d26720447"
|
| 542 |
+
},
|
| 543 |
+
"outputs": [
|
| 544 |
+
{
|
| 545 |
+
"name": "stdout",
|
| 546 |
+
"output_type": "stream",
|
| 547 |
+
"text": [
|
| 548 |
+
"Writing tourism_project/model_building/prep.py\n"
|
| 549 |
+
]
|
| 550 |
+
}
|
| 551 |
+
],
|
| 552 |
+
"source": [
|
| 553 |
+
"%%writefile tourism_project/model_building/prep.py\n",
|
| 554 |
+
"# for data manipulation\n",
|
| 555 |
+
"import pandas as pd\n",
|
| 556 |
+
"import numpy as np\n",
|
| 557 |
+
"# for data preprocessing and pipeline creation\n",
|
| 558 |
+
"from sklearn.model_selection import train_test_split\n",
|
| 559 |
+
"# for converting text data in to numerical representation\n",
|
| 560 |
+
"from sklearn.preprocessing import LabelEncoder\n",
|
| 561 |
+
"# for hugging face space authentication to upload files\n",
|
| 562 |
+
"from huggingface_hub import login, HfApi\n",
|
| 563 |
+
"import os\n",
|
| 564 |
+
"\n",
|
| 565 |
+
"# Define constants for the dataset and output paths\n",
|
| 566 |
+
"api = HfApi(token=os.getenv(\"HF_TOKEN\"))\n",
|
| 567 |
+
"DATASET_PATH = \"hf://datasets/ananttripathiak/tourism-dataset/tourism.csv\"\n",
|
| 568 |
+
"df = pd.read_csv(DATASET_PATH)\n",
|
| 569 |
+
"print(\"Dataset loaded successfully.\")\n",
|
| 570 |
+
"print(f\"Dataset shape: {df.shape}\")\n",
|
| 571 |
+
"\n",
|
| 572 |
+
"# Drop the unnamed index column if it exists\n",
|
| 573 |
+
"if 'Unnamed: 0' in df.columns or df.columns[0] == '':\n",
|
| 574 |
+
" df = df.iloc[:, 1:]\n",
|
| 575 |
+
"\n",
|
| 576 |
+
"# Drop CustomerID as it's a unique identifier (not useful for modeling)\n",
|
| 577 |
+
"if 'CustomerID' in df.columns:\n",
|
| 578 |
+
" df.drop(columns=['CustomerID'], inplace=True)\n",
|
| 579 |
+
"\n",
|
| 580 |
+
"# Handle missing values\n",
|
| 581 |
+
"print(\"\\nHandling missing values...\")\n",
|
| 582 |
+
"# For numerical columns, fill with median\n",
|
| 583 |
+
"numerical_cols = df.select_dtypes(include=[np.number]).columns\n",
|
| 584 |
+
"for col in numerical_cols:\n",
|
| 585 |
+
" if df[col].isnull().sum() > 0:\n",
|
| 586 |
+
" df[col].fillna(df[col].median(), inplace=True)\n",
|
| 587 |
+
"\n",
|
| 588 |
+
"# For categorical columns, fill with mode\n",
|
| 589 |
+
"categorical_cols = df.select_dtypes(include=['object']).columns\n",
|
| 590 |
+
"for col in categorical_cols:\n",
|
| 591 |
+
" if df[col].isnull().sum() > 0:\n",
|
| 592 |
+
" df[col].fillna(df[col].mode()[0], inplace=True)\n",
|
| 593 |
+
"\n",
|
| 594 |
+
"# Handle specific data quality issues (e.g., \"Fe Male\" should be \"Female\")\n",
|
| 595 |
+
"if 'Gender' in df.columns:\n",
|
| 596 |
+
" df['Gender'] = df['Gender'].str.strip().replace({'Fe Male': 'Female', 'Fe male': 'Female'})\n",
|
| 597 |
+
"\n",
|
| 598 |
+
"# Encode categorical columns\n",
|
| 599 |
+
"print(\"\\nEncoding categorical variables...\")\n",
|
| 600 |
+
"label_encoder = LabelEncoder()\n",
|
| 601 |
+
"\n",
|
| 602 |
+
"# List of categorical columns to encode\n",
|
| 603 |
+
"categorical_features = ['TypeofContact', 'Occupation', 'Gender', 'ProductPitched',\n",
|
| 604 |
+
" 'MaritalStatus', 'Designation']\n",
|
| 605 |
+
"\n",
|
| 606 |
+
"for col in categorical_features:\n",
|
| 607 |
+
" if col in df.columns:\n",
|
| 608 |
+
" df[col] = label_encoder.fit_transform(df[col].astype(str))\n",
|
| 609 |
+
"\n",
|
| 610 |
+
"# Define target variable\n",
|
| 611 |
+
"target_col = 'ProdTaken'\n",
|
| 612 |
+
"\n",
|
| 613 |
+
"# Split into X (features) and y (target)\n",
|
| 614 |
+
"X = df.drop(columns=[target_col])\n",
|
| 615 |
+
"y = df[target_col]\n",
|
| 616 |
+
"\n",
|
| 617 |
+
"print(f\"\\nFeatures shape: {X.shape}\")\n",
|
| 618 |
+
"print(f\"Target shape: {y.shape}\")\n",
|
| 619 |
+
"print(f\"Target distribution:\\n{y.value_counts()}\")\n",
|
| 620 |
+
"\n",
|
| 621 |
+
"# Perform train-test split\n",
|
| 622 |
+
"Xtrain, Xtest, ytrain, ytest = train_test_split(\n",
|
| 623 |
+
" X, y, test_size=0.2, random_state=42, stratify=y\n",
|
| 624 |
+
")\n",
|
| 625 |
+
"\n",
|
| 626 |
+
"print(f\"\\nTrain set size: {Xtrain.shape[0]}\")\n",
|
| 627 |
+
"print(f\"Test set size: {Xtest.shape[0]}\")\n",
|
| 628 |
+
"\n",
|
| 629 |
+
"# Save the datasets\n",
|
| 630 |
+
"Xtrain.to_csv(\"Xtrain.csv\", index=False)\n",
|
| 631 |
+
"Xtest.to_csv(\"Xtest.csv\", index=False)\n",
|
| 632 |
+
"ytrain.to_csv(\"ytrain.csv\", index=False)\n",
|
| 633 |
+
"ytest.to_csv(\"ytest.csv\", index=False)\n",
|
| 634 |
+
"\n",
|
| 635 |
+
"print(\"\\nDatasets saved locally.\")\n",
|
| 636 |
+
"\n",
|
| 637 |
+
"# Upload to Hugging Face\n",
|
| 638 |
+
"files = [\"Xtrain.csv\", \"Xtest.csv\", \"ytrain.csv\", \"ytest.csv\"]\n",
|
| 639 |
+
"\n",
|
| 640 |
+
"for file_path in files:\n",
|
| 641 |
+
" api.upload_file(\n",
|
| 642 |
+
" path_or_fileobj=file_path,\n",
|
| 643 |
+
" path_in_repo=file_path.split(\"/\")[-1],\n",
|
| 644 |
+
" repo_id=\"ananttripathiak/tourism-dataset\",\n",
|
| 645 |
+
" repo_type=\"dataset\",\n",
|
| 646 |
+
" )\n",
|
| 647 |
+
" print(f\"Uploaded {file_path} to Hugging Face\")\n",
|
| 648 |
+
"\n",
|
| 649 |
+
"print(\"\\nData preparation completed successfully!\")"
|
| 650 |
+
]
|
| 651 |
+
},
|
| 652 |
+
{
|
| 653 |
+
"cell_type": "markdown",
|
| 654 |
+
"metadata": {
|
| 655 |
+
"id": "0McYCZzkji5I"
|
| 656 |
+
},
|
| 657 |
+
"source": [
|
| 658 |
+
"# Deployment"
|
| 659 |
+
]
|
| 660 |
+
},
|
| 661 |
+
{
|
| 662 |
+
"cell_type": "markdown",
|
| 663 |
+
"metadata": {
|
| 664 |
+
"id": "9QrY2v77vbEZ"
|
| 665 |
+
},
|
| 666 |
+
"source": [
|
| 667 |
+
"## Dockerfile"
|
| 668 |
+
]
|
| 669 |
+
},
|
| 670 |
+
{
|
| 671 |
+
"cell_type": "code",
|
| 672 |
+
"execution_count": 8,
|
| 673 |
+
"metadata": {
|
| 674 |
+
"executionInfo": {
|
| 675 |
+
"elapsed": 2,
|
| 676 |
+
"status": "ok",
|
| 677 |
+
"timestamp": 1765705284393,
|
| 678 |
+
"user": {
|
| 679 |
+
"displayName": "Anant Tripathi",
|
| 680 |
+
"userId": "05588283814303116545"
|
| 681 |
+
},
|
| 682 |
+
"user_tz": -330
|
| 683 |
+
},
|
| 684 |
+
"id": "0-AMAI72CR-T"
|
| 685 |
+
},
|
| 686 |
+
"outputs": [],
|
| 687 |
+
"source": [
|
| 688 |
+
"os.makedirs(\"tourism_project/deployment\", exist_ok=True)"
|
| 689 |
+
]
|
| 690 |
+
},
|
| 691 |
+
{
|
| 692 |
+
"cell_type": "code",
|
| 693 |
+
"execution_count": 9,
|
| 694 |
+
"metadata": {
|
| 695 |
+
"colab": {
|
| 696 |
+
"base_uri": "https://localhost:8080/"
|
| 697 |
+
},
|
| 698 |
+
"executionInfo": {
|
| 699 |
+
"elapsed": 10,
|
| 700 |
+
"status": "ok",
|
| 701 |
+
"timestamp": 1765705284404,
|
| 702 |
+
"user": {
|
| 703 |
+
"displayName": "Anant Tripathi",
|
| 704 |
+
"userId": "05588283814303116545"
|
| 705 |
+
},
|
| 706 |
+
"user_tz": -330
|
| 707 |
+
},
|
| 708 |
+
"id": "ZTicTDnPCVZr",
|
| 709 |
+
"outputId": "1707e165-9dee-4adb-c9c4-e2e0b50ce8fb"
|
| 710 |
+
},
|
| 711 |
+
"outputs": [
|
| 712 |
+
{
|
| 713 |
+
"name": "stdout",
|
| 714 |
+
"output_type": "stream",
|
| 715 |
+
"text": [
|
| 716 |
+
"Writing tourism_project/deployment/Dockerfile\n"
|
| 717 |
+
]
|
| 718 |
+
}
|
| 719 |
+
],
|
| 720 |
+
"source": [
|
| 721 |
+
"%%writefile tourism_project/deployment/Dockerfile\n",
|
| 722 |
+
"# Use a minimal base image with Python 3.9 installed\n",
|
| 723 |
+
"FROM python:3.9\n",
|
| 724 |
+
"\n",
|
| 725 |
+
"# Set the working directory inside the container to /app\n",
|
| 726 |
+
"WORKDIR /app\n",
|
| 727 |
+
"\n",
|
| 728 |
+
"# Copy all files from the current directory on the host to the container's /app directory\n",
|
| 729 |
+
"COPY . .\n",
|
| 730 |
+
"\n",
|
| 731 |
+
"# Install Python dependencies listed in requirements.txt\n",
|
| 732 |
+
"RUN pip3 install -r requirements.txt\n",
|
| 733 |
+
"\n",
|
| 734 |
+
"RUN useradd -m -u 1000 user\n",
|
| 735 |
+
"USER user\n",
|
| 736 |
+
"ENV HOME=/home/user \\\n",
|
| 737 |
+
"\tPATH=/home/user/.local/bin:$PATH\n",
|
| 738 |
+
"\n",
|
| 739 |
+
"WORKDIR $HOME/app\n",
|
| 740 |
+
"\n",
|
| 741 |
+
"COPY --chown=user . $HOME/app\n",
|
| 742 |
+
"\n",
|
| 743 |
+
"# Define the command to run the Streamlit app on port \"8501\" and make it accessible externally\n",
|
| 744 |
+
"CMD [\"streamlit\", \"run\", \"app.py\", \"--server.port=8501\", \"--server.address=0.0.0.0\", \"--server.enableXsrfProtection=false\"]"
|
| 745 |
+
]
|
| 746 |
+
},
|
| 747 |
+
{
|
| 748 |
+
"cell_type": "markdown",
|
| 749 |
+
"metadata": {
|
| 750 |
+
"id": "LCvrklrBwNvJ"
|
| 751 |
+
},
|
| 752 |
+
"source": [
|
| 753 |
+
"## Streamlit App"
|
| 754 |
+
]
|
| 755 |
+
},
|
| 756 |
+
{
|
| 757 |
+
"cell_type": "markdown",
|
| 758 |
+
"metadata": {
|
| 759 |
+
"id": "fXWe6ObRjP6-"
|
| 760 |
+
},
|
| 761 |
+
"source": [
|
| 762 |
+
"Please ensure that the web app script is named `app.py`."
|
| 763 |
+
]
|
| 764 |
+
},
|
| 765 |
+
{
|
| 766 |
+
"cell_type": "code",
|
| 767 |
+
"execution_count": 10,
|
| 768 |
+
"metadata": {
|
| 769 |
+
"colab": {
|
| 770 |
+
"base_uri": "https://localhost:8080/"
|
| 771 |
+
},
|
| 772 |
+
"executionInfo": {
|
| 773 |
+
"elapsed": 56,
|
| 774 |
+
"status": "ok",
|
| 775 |
+
"timestamp": 1765705284459,
|
| 776 |
+
"user": {
|
| 777 |
+
"displayName": "Anant Tripathi",
|
| 778 |
+
"userId": "05588283814303116545"
|
| 779 |
+
},
|
| 780 |
+
"user_tz": -330
|
| 781 |
+
},
|
| 782 |
+
"id": "WBG-jxM89jdp",
|
| 783 |
+
"outputId": "885441f8-7cf2-4646-fc4e-705ebddf4640"
|
| 784 |
+
},
|
| 785 |
+
"outputs": [
|
| 786 |
+
{
|
| 787 |
+
"name": "stdout",
|
| 788 |
+
"output_type": "stream",
|
| 789 |
+
"text": [
|
| 790 |
+
"Writing tourism_project/deployment/app.py\n"
|
| 791 |
+
]
|
| 792 |
+
}
|
| 793 |
+
],
|
| 794 |
+
"source": [
|
| 795 |
+
"%%writefile tourism_project/deployment/app.py\n",
|
| 796 |
+
"import streamlit as st\n",
|
| 797 |
+
"import pandas as pd\n",
|
| 798 |
+
"import numpy as np\n",
|
| 799 |
+
"from huggingface_hub import hf_hub_download\n",
|
| 800 |
+
"import joblib\n",
|
| 801 |
+
"\n",
|
| 802 |
+
"# Page configuration\n",
|
| 803 |
+
"st.set_page_config(\n",
|
| 804 |
+
" page_title=\"Tourism Package Prediction\",\n",
|
| 805 |
+
" page_icon=\"✈️\",\n",
|
| 806 |
+
" layout=\"wide\"\n",
|
| 807 |
+
")\n",
|
| 808 |
+
"\n",
|
| 809 |
+
"# Download and load the trained model\n",
|
| 810 |
+
"@st.cache_resource\n",
|
| 811 |
+
"def load_model():\n",
|
| 812 |
+
" try:\n",
|
| 813 |
+
" model_path = hf_hub_download(\n",
|
| 814 |
+
" repo_id=\"ananttripathiak/tourism-prediction-model\",\n",
|
| 815 |
+
" filename=\"best_tourism_model_v1.joblib\"\n",
|
| 816 |
+
" )\n",
|
| 817 |
+
" model = joblib.load(model_path)\n",
|
| 818 |
+
" return model\n",
|
| 819 |
+
" except Exception as e:\n",
|
| 820 |
+
" st.error(f\"Error loading model: {e}\")\n",
|
| 821 |
+
" return None\n",
|
| 822 |
+
"\n",
|
| 823 |
+
"model = load_model()\n",
|
| 824 |
+
"\n",
|
| 825 |
+
"# Title and Description\n",
|
| 826 |
+
"st.title(\"✈️ Wellness Tourism Package Prediction\")\n",
|
| 827 |
+
"st.markdown(\"\"\"\n",
|
| 828 |
+
"<style>\n",
|
| 829 |
+
" .main-header {\n",
|
| 830 |
+
" font-size: 20px;\n",
|
| 831 |
+
" color: #1f77b4;\n",
|
| 832 |
+
" margin-bottom: 20px;\n",
|
| 833 |
+
" }\n",
|
| 834 |
+
" .prediction-box {\n",
|
| 835 |
+
" padding: 20px;\n",
|
| 836 |
+
" border-radius: 10px;\n",
|
| 837 |
+
" margin: 20px 0;\n",
|
| 838 |
+
" }\n",
|
| 839 |
+
" .success-box {\n",
|
| 840 |
+
" background-color: #d4edda;\n",
|
| 841 |
+
" border: 1px solid #c3e6cb;\n",
|
| 842 |
+
" }\n",
|
| 843 |
+
" .warning-box {\n",
|
| 844 |
+
" background-color: #fff3cd;\n",
|
| 845 |
+
" border: 1px solid #ffeeba;\n",
|
| 846 |
+
" }\n",
|
| 847 |
+
"</style>\n",
|
| 848 |
+
"\"\"\", unsafe_allow_html=True)\n",
|
| 849 |
+
"\n",
|
| 850 |
+
"st.markdown(\"\"\"\n",
|
| 851 |
+
"This application predicts the likelihood of a customer purchasing the **Wellness Tourism Package**\n",
|
| 852 |
+
"based on their profile and interaction data. Enter customer details below to get a prediction.\n",
|
| 853 |
+
"\"\"\")\n",
|
| 854 |
+
"\n",
|
| 855 |
+
"# Create two columns for input\n",
|
| 856 |
+
"col1, col2 = st.columns(2)\n",
|
| 857 |
+
"\n",
|
| 858 |
+
"with col1:\n",
|
| 859 |
+
" st.subheader(\"📋 Customer Demographics\")\n",
|
| 860 |
+
"\n",
|
| 861 |
+
" age = st.number_input(\"Age\", min_value=18, max_value=100, value=35, step=1)\n",
|
| 862 |
+
"\n",
|
| 863 |
+
" type_of_contact = st.selectbox(\n",
|
| 864 |
+
" \"Type of Contact\",\n",
|
| 865 |
+
" options=[\"Self Enquiry\", \"Company Invited\"]\n",
|
| 866 |
+
" )\n",
|
| 867 |
+
"\n",
|
| 868 |
+
" city_tier = st.selectbox(\n",
|
| 869 |
+
" \"City Tier\",\n",
|
| 870 |
+
" options=[1, 2, 3],\n",
|
| 871 |
+
" help=\"Tier 1: Metro cities, Tier 2: Mid-sized cities, Tier 3: Smaller cities\"\n",
|
| 872 |
+
" )\n",
|
| 873 |
+
"\n",
|
| 874 |
+
" occupation = st.selectbox(\n",
|
| 875 |
+
" \"Occupation\",\n",
|
| 876 |
+
" options=[\"Salaried\", \"Small Business\", \"Free Lancer\", \"Large Business\"]\n",
|
| 877 |
+
" )\n",
|
| 878 |
+
"\n",
|
| 879 |
+
" gender = st.selectbox(\"Gender\", options=[\"Male\", \"Female\"])\n",
|
| 880 |
+
"\n",
|
| 881 |
+
" marital_status = st.selectbox(\n",
|
| 882 |
+
" \"Marital Status\",\n",
|
| 883 |
+
" options=[\"Single\", \"Married\", \"Divorced\", \"Unmarried\"]\n",
|
| 884 |
+
" )\n",
|
| 885 |
+
"\n",
|
| 886 |
+
" designation = st.selectbox(\n",
|
| 887 |
+
" \"Designation\",\n",
|
| 888 |
+
" options=[\"Executive\", \"Manager\", \"Senior Manager\", \"AVP\", \"VP\"]\n",
|
| 889 |
+
" )\n",
|
| 890 |
+
"\n",
|
| 891 |
+
" monthly_income = st.number_input(\n",
|
| 892 |
+
" \"Monthly Income (₹)\",\n",
|
| 893 |
+
" min_value=0.0,\n",
|
| 894 |
+
" max_value=200000.0,\n",
|
| 895 |
+
" value=25000.0,\n",
|
| 896 |
+
" step=1000.0\n",
|
| 897 |
+
" )\n",
|
| 898 |
+
"\n",
|
| 899 |
+
"with col2:\n",
|
| 900 |
+
" st.subheader(\"🎯 Customer Interaction & Preferences\")\n",
|
| 901 |
+
"\n",
|
| 902 |
+
" duration_of_pitch = st.number_input(\n",
|
| 903 |
+
" \"Duration of Pitch (minutes)\",\n",
|
| 904 |
+
" min_value=0.0,\n",
|
| 905 |
+
" max_value=60.0,\n",
|
| 906 |
+
" value=15.0,\n",
|
| 907 |
+
" step=0.5\n",
|
| 908 |
+
" )\n",
|
| 909 |
+
"\n",
|
| 910 |
+
" number_of_persons_visiting = st.number_input(\n",
|
| 911 |
+
" \"Number of Persons Visiting\",\n",
|
| 912 |
+
" min_value=1,\n",
|
| 913 |
+
" max_value=10,\n",
|
| 914 |
+
" value=2,\n",
|
| 915 |
+
" step=1\n",
|
| 916 |
+
" )\n",
|
| 917 |
+
"\n",
|
| 918 |
+
" number_of_followups = st.number_input(\n",
|
| 919 |
+
" \"Number of Follow-ups\",\n",
|
| 920 |
+
" min_value=0.0,\n",
|
| 921 |
+
" max_value=10.0,\n",
|
| 922 |
+
" value=3.0,\n",
|
| 923 |
+
" step=1.0\n",
|
| 924 |
+
" )\n",
|
| 925 |
+
"\n",
|
| 926 |
+
" product_pitched = st.selectbox(\n",
|
| 927 |
+
" \"Product Pitched\",\n",
|
| 928 |
+
" options=[\"Basic\", \"Standard\", \"Deluxe\", \"Super Deluxe\", \"King\"]\n",
|
| 929 |
+
" )\n",
|
| 930 |
+
"\n",
|
| 931 |
+
" preferred_property_star = st.selectbox(\n",
|
| 932 |
+
" \"Preferred Property Star Rating\",\n",
|
| 933 |
+
" options=[3.0, 4.0, 5.0]\n",
|
| 934 |
+
" )\n",
|
| 935 |
+
"\n",
|
| 936 |
+
" number_of_trips = st.number_input(\n",
|
| 937 |
+
" \"Number of Trips (per year)\",\n",
|
| 938 |
+
" min_value=0.0,\n",
|
| 939 |
+
" max_value=20.0,\n",
|
| 940 |
+
" value=3.0,\n",
|
| 941 |
+
" step=1.0\n",
|
| 942 |
+
" )\n",
|
| 943 |
+
"\n",
|
| 944 |
+
" passport = st.selectbox(\"Has Passport?\", options=[\"Yes\", \"No\"])\n",
|
| 945 |
+
"\n",
|
| 946 |
+
" pitch_satisfaction_score = st.slider(\n",
|
| 947 |
+
" \"Pitch Satisfaction Score\",\n",
|
| 948 |
+
" min_value=1,\n",
|
| 949 |
+
" max_value=5,\n",
|
| 950 |
+
" value=3,\n",
|
| 951 |
+
" step=1\n",
|
| 952 |
+
" )\n",
|
| 953 |
+
"\n",
|
| 954 |
+
" own_car = st.selectbox(\"Owns Car?\", options=[\"Yes\", \"No\"])\n",
|
| 955 |
+
"\n",
|
| 956 |
+
" number_of_children_visiting = st.number_input(\n",
|
| 957 |
+
" \"Number of Children Visiting\",\n",
|
| 958 |
+
" min_value=0.0,\n",
|
| 959 |
+
" max_value=5.0,\n",
|
| 960 |
+
" value=0.0,\n",
|
| 961 |
+
" step=1.0\n",
|
| 962 |
+
" )\n",
|
| 963 |
+
"\n",
|
| 964 |
+
"# Encoding mapping (based on LabelEncoder used during training)\n",
|
| 965 |
+
"# Note: These mappings should match the exact encoding used during training\n",
|
| 966 |
+
"type_of_contact_map = {\"Company Invited\": 0, \"Self Enquiry\": 1}\n",
|
| 967 |
+
"occupation_map = {\"Free Lancer\": 0, \"Large Business\": 1, \"Salaried\": 2, \"Small Business\": 3}\n",
|
| 968 |
+
"gender_map = {\"Female\": 0, \"Male\": 1}\n",
|
| 969 |
+
"product_pitched_map = {\"Basic\": 0, \"Deluxe\": 1, \"King\": 2, \"Standard\": 3, \"Super Deluxe\": 4}\n",
|
| 970 |
+
"marital_status_map = {\"Divorced\": 0, \"Married\": 1, \"Single\": 2, \"Unmarried\": 3}\n",
|
| 971 |
+
"designation_map = {\"AVP\": 0, \"Executive\": 1, \"Manager\": 2, \"Senior Manager\": 3, \"VP\": 4}\n",
|
| 972 |
+
"\n",
|
| 973 |
+
"# Convert Yes/No to 0/1\n",
|
| 974 |
+
"passport_val = 1 if passport == \"Yes\" else 0\n",
|
| 975 |
+
"own_car_val = 1 if own_car == \"Yes\" else 0\n",
|
| 976 |
+
"\n",
|
| 977 |
+
"# Prepare input data\n",
|
| 978 |
+
"input_data = pd.DataFrame([{\n",
|
| 979 |
+
" 'Age': age,\n",
|
| 980 |
+
" 'TypeofContact': type_of_contact_map[type_of_contact],\n",
|
| 981 |
+
" 'CityTier': city_tier,\n",
|
| 982 |
+
" 'DurationOfPitch': duration_of_pitch,\n",
|
| 983 |
+
" 'Occupation': occupation_map[occupation],\n",
|
| 984 |
+
" 'Gender': gender_map[gender],\n",
|
| 985 |
+
" 'NumberOfPersonVisiting': number_of_persons_visiting,\n",
|
| 986 |
+
" 'NumberOfFollowups': number_of_followups,\n",
|
| 987 |
+
" 'ProductPitched': product_pitched_map[product_pitched],\n",
|
| 988 |
+
" 'PreferredPropertyStar': preferred_property_star,\n",
|
| 989 |
+
" 'MaritalStatus': marital_status_map[marital_status],\n",
|
| 990 |
+
" 'NumberOfTrips': number_of_trips,\n",
|
| 991 |
+
" 'Passport': passport_val,\n",
|
| 992 |
+
" 'PitchSatisfactionScore': pitch_satisfaction_score,\n",
|
| 993 |
+
" 'OwnCar': own_car_val,\n",
|
| 994 |
+
" 'NumberOfChildrenVisiting': number_of_children_visiting,\n",
|
| 995 |
+
" 'Designation': designation_map[designation],\n",
|
| 996 |
+
" 'MonthlyIncome': monthly_income\n",
|
| 997 |
+
"}])\n",
|
| 998 |
+
"\n",
|
| 999 |
+
"# Predict button\n",
|
| 1000 |
+
"st.markdown(\"---\")\n",
|
| 1001 |
+
"if st.button(\"🔮 Predict Purchase Likelihood\", use_container_width=True):\n",
|
| 1002 |
+
" if model is not None:\n",
|
| 1003 |
+
" try:\n",
|
| 1004 |
+
" # Make prediction\n",
|
| 1005 |
+
" prediction = model.predict(input_data)[0]\n",
|
| 1006 |
+
" prediction_proba = model.predict_proba(input_data)[0]\n",
|
| 1007 |
+
"\n",
|
| 1008 |
+
" # Display results\n",
|
| 1009 |
+
" st.markdown(\"### 📊 Prediction Results\")\n",
|
| 1010 |
+
"\n",
|
| 1011 |
+
" if prediction == 1:\n",
|
| 1012 |
+
" st.markdown(\n",
|
| 1013 |
+
" f'<div class=\"prediction-box success-box\">'\n",
|
| 1014 |
+
" f'<h2 style=\"color: #155724;\">✅ High Likelihood of Purchase!</h2>'\n",
|
| 1015 |
+
" f'<p style=\"font-size: 18px;\">This customer is <b>likely to purchase</b> the Wellness Tourism Package.</p>'\n",
|
| 1016 |
+
" f'<p style=\"font-size: 16px;\">Confidence: <b>{prediction_proba[1]*100:.2f}%</b></p>'\n",
|
| 1017 |
+
" f'</div>',\n",
|
| 1018 |
+
" unsafe_allow_html=True\n",
|
| 1019 |
+
" )\n",
|
| 1020 |
+
" st.success(\"💡 **Recommendation:** Prioritize follow-up with this customer!\")\n",
|
| 1021 |
+
" else:\n",
|
| 1022 |
+
" st.markdown(\n",
|
| 1023 |
+
" f'<div class=\"prediction-box warning-box\">'\n",
|
| 1024 |
+
" f'<h2 style=\"color: #856404;\">⚠️ Low Likelihood of Purchase</h2>'\n",
|
| 1025 |
+
" f'<p style=\"font-size: 18px;\">This customer is <b>unlikely to purchase</b> the Wellness Tourism Package.</p>'\n",
|
| 1026 |
+
" f'<p style=\"font-size: 16px;\">Confidence: <b>{prediction_proba[0]*100:.2f}%</b></p>'\n",
|
| 1027 |
+
" f'</div>',\n",
|
| 1028 |
+
" unsafe_allow_html=True\n",
|
| 1029 |
+
" )\n",
|
| 1030 |
+
" st.info(\"💡 **Recommendation:** Consider alternative packages or additional engagement strategies.\")\n",
|
| 1031 |
+
"\n",
|
| 1032 |
+
" # Show probability breakdown\n",
|
| 1033 |
+
" col_prob1, col_prob2 = st.columns(2)\n",
|
| 1034 |
+
" with col_prob1:\n",
|
| 1035 |
+
" st.metric(\"Probability of No Purchase\", f\"{prediction_proba[0]*100:.2f}%\")\n",
|
| 1036 |
+
" with col_prob2:\n",
|
| 1037 |
+
" st.metric(\"Probability of Purchase\", f\"{prediction_proba[1]*100:.2f}%\")\n",
|
| 1038 |
+
"\n",
|
| 1039 |
+
" except Exception as e:\n",
|
| 1040 |
+
" st.error(f\"Error making prediction: {e}\")\n",
|
| 1041 |
+
" else:\n",
|
| 1042 |
+
" st.error(\"Model not loaded. Please check the model repository.\")\n",
|
| 1043 |
+
"\n",
|
| 1044 |
+
"# Footer\n",
|
| 1045 |
+
"st.markdown(\"---\")\n",
|
| 1046 |
+
"st.markdown(\"\"\"\n",
|
| 1047 |
+
"<div style='text-align: center; color: #666; padding: 20px;'>\n",
|
| 1048 |
+
" <p>🏢 Visit with Us - Wellness Tourism Package Prediction System</p>\n",
|
| 1049 |
+
" <p>Built with ❤️ using Streamlit and XGBoost</p>\n",
|
| 1050 |
+
"</div>\n",
|
| 1051 |
+
"\"\"\", unsafe_allow_html=True)"
|
| 1052 |
+
]
|
| 1053 |
+
},
|
| 1054 |
+
{
|
| 1055 |
+
"cell_type": "markdown",
|
| 1056 |
+
"metadata": {
|
| 1057 |
+
"id": "07cYzWcIwTL-"
|
| 1058 |
+
},
|
| 1059 |
+
"source": [
|
| 1060 |
+
"## Dependency Handling"
|
| 1061 |
+
]
|
| 1062 |
+
},
|
| 1063 |
+
{
|
| 1064 |
+
"cell_type": "markdown",
|
| 1065 |
+
"metadata": {
|
| 1066 |
+
"id": "JEgfHL64jU7o"
|
| 1067 |
+
},
|
| 1068 |
+
"source": [
|
| 1069 |
+
"Please ensure that the dependency handling file is named `requirements.txt`."
|
| 1070 |
+
]
|
| 1071 |
+
},
|
| 1072 |
+
{
|
| 1073 |
+
"cell_type": "code",
|
| 1074 |
+
"execution_count": 11,
|
| 1075 |
+
"metadata": {
|
| 1076 |
+
"colab": {
|
| 1077 |
+
"base_uri": "https://localhost:8080/"
|
| 1078 |
+
},
|
| 1079 |
+
"executionInfo": {
|
| 1080 |
+
"elapsed": 4,
|
| 1081 |
+
"status": "ok",
|
| 1082 |
+
"timestamp": 1765705284460,
|
| 1083 |
+
"user": {
|
| 1084 |
+
"displayName": "Anant Tripathi",
|
| 1085 |
+
"userId": "05588283814303116545"
|
| 1086 |
+
},
|
| 1087 |
+
"user_tz": -330
|
| 1088 |
+
},
|
| 1089 |
+
"id": "nvdmy7Wd9lda",
|
| 1090 |
+
"outputId": "16b57c41-8e07-49cc-d6c5-1f15240b59fe"
|
| 1091 |
+
},
|
| 1092 |
+
"outputs": [
|
| 1093 |
+
{
|
| 1094 |
+
"name": "stdout",
|
| 1095 |
+
"output_type": "stream",
|
| 1096 |
+
"text": [
|
| 1097 |
+
"Writing tourism_project/deployment/requirements.txt\n"
|
| 1098 |
+
]
|
| 1099 |
+
}
|
| 1100 |
+
],
|
| 1101 |
+
"source": [
|
| 1102 |
+
"%%writefile tourism_project/deployment/requirements.txt\n",
|
| 1103 |
+
"pandas==2.2.2\n",
|
| 1104 |
+
"numpy==1.26.4\n",
|
| 1105 |
+
"huggingface_hub==0.32.6\n",
|
| 1106 |
+
"streamlit==1.43.2\n",
|
| 1107 |
+
"joblib==1.5.1\n",
|
| 1108 |
+
"scikit-learn==1.6.0\n",
|
| 1109 |
+
"xgboost==2.1.4"
|
| 1110 |
+
]
|
| 1111 |
+
},
|
| 1112 |
+
{
|
| 1113 |
+
"cell_type": "markdown",
|
| 1114 |
+
"metadata": {
|
| 1115 |
+
"id": "V4ynzpKNwWS_"
|
| 1116 |
+
},
|
| 1117 |
+
"source": [
|
| 1118 |
+
"# Hosting"
|
| 1119 |
+
]
|
| 1120 |
+
},
|
| 1121 |
+
{
|
| 1122 |
+
"cell_type": "code",
|
| 1123 |
+
"execution_count": 12,
|
| 1124 |
+
"metadata": {
|
| 1125 |
+
"executionInfo": {
|
| 1126 |
+
"elapsed": 1,
|
| 1127 |
+
"status": "ok",
|
| 1128 |
+
"timestamp": 1765705284460,
|
| 1129 |
+
"user": {
|
| 1130 |
+
"displayName": "Anant Tripathi",
|
| 1131 |
+
"userId": "05588283814303116545"
|
| 1132 |
+
},
|
| 1133 |
+
"user_tz": -330
|
| 1134 |
+
},
|
| 1135 |
+
"id": "7p5sBvTg9nCW"
|
| 1136 |
+
},
|
| 1137 |
+
"outputs": [],
|
| 1138 |
+
"source": [
|
| 1139 |
+
"os.makedirs(\"tourism_project/hosting\", exist_ok=True)"
|
| 1140 |
+
]
|
| 1141 |
+
},
|
| 1142 |
+
{
|
| 1143 |
+
"cell_type": "code",
|
| 1144 |
+
"execution_count": 13,
|
| 1145 |
+
"metadata": {
|
| 1146 |
+
"colab": {
|
| 1147 |
+
"base_uri": "https://localhost:8080/"
|
| 1148 |
+
},
|
| 1149 |
+
"executionInfo": {
|
| 1150 |
+
"elapsed": 2,
|
| 1151 |
+
"status": "ok",
|
| 1152 |
+
"timestamp": 1765705284463,
|
| 1153 |
+
"user": {
|
| 1154 |
+
"displayName": "Anant Tripathi",
|
| 1155 |
+
"userId": "05588283814303116545"
|
| 1156 |
+
},
|
| 1157 |
+
"user_tz": -330
|
| 1158 |
+
},
|
| 1159 |
+
"id": "QlpgAQoXOtKA",
|
| 1160 |
+
"outputId": "2611020e-c3cf-4dea-af77-ccb122c85e65"
|
| 1161 |
+
},
|
| 1162 |
+
"outputs": [
|
| 1163 |
+
{
|
| 1164 |
+
"name": "stdout",
|
| 1165 |
+
"output_type": "stream",
|
| 1166 |
+
"text": [
|
| 1167 |
+
"Writing tourism_project/hosting/hosting.py\n"
|
| 1168 |
+
]
|
| 1169 |
+
}
|
| 1170 |
+
],
|
| 1171 |
+
"source": [
|
| 1172 |
+
"%%writefile tourism_project/hosting/hosting.py\n",
|
| 1173 |
+
"from huggingface_hub import HfApi\n",
|
| 1174 |
+
"import os\n",
|
| 1175 |
+
"\n",
|
| 1176 |
+
"api = HfApi(token=os.getenv(\"HF_TOKEN\"))\n",
|
| 1177 |
+
"api.upload_folder(\n",
|
| 1178 |
+
" folder_path=\"tourism_project/deployment\",\n",
|
| 1179 |
+
" repo_id=\"ananttripathiak/wellness-tourism-prediction\",\n",
|
| 1180 |
+
" repo_type=\"space\",\n",
|
| 1181 |
+
" path_in_repo=\"\",\n",
|
| 1182 |
+
")\n",
|
| 1183 |
+
"\n",
|
| 1184 |
+
"print(\"Deployment files successfully uploaded to Hugging Face Space!\")"
|
| 1185 |
+
]
|
| 1186 |
+
},
|
| 1187 |
+
{
|
| 1188 |
+
"cell_type": "markdown",
|
| 1189 |
+
"metadata": {
|
| 1190 |
+
"id": "PuCgAW2hktli"
|
| 1191 |
+
},
|
| 1192 |
+
"source": [
|
| 1193 |
+
"# MLOps Pipeline with Github Actions Workflow"
|
| 1194 |
+
]
|
| 1195 |
+
},
|
| 1196 |
+
{
|
| 1197 |
+
"cell_type": "markdown",
|
| 1198 |
+
"metadata": {
|
| 1199 |
+
"id": "L5BZr5i8PKVN"
|
| 1200 |
+
},
|
| 1201 |
+
"source": [
|
| 1202 |
+
"**Note:**\n",
|
| 1203 |
+
"\n",
|
| 1204 |
+
"1. Before running the file below, make sure to add the HF_TOKEN to your GitHub secrets to enable authentication between GitHub and Hugging Face.\n",
|
| 1205 |
+
"2. The below code is for a sample YAML file that can be updated as required to meet the requirements of this project."
|
| 1206 |
+
]
|
| 1207 |
+
},
|
| 1208 |
+
{
|
| 1209 |
+
"cell_type": "code",
|
| 1210 |
+
"execution_count": 14,
|
| 1211 |
+
"metadata": {
|
| 1212 |
+
"executionInfo": {
|
| 1213 |
+
"elapsed": 10,
|
| 1214 |
+
"status": "ok",
|
| 1215 |
+
"timestamp": 1765705284474,
|
| 1216 |
+
"user": {
|
| 1217 |
+
"displayName": "Anant Tripathi",
|
| 1218 |
+
"userId": "05588283814303116545"
|
| 1219 |
+
},
|
| 1220 |
+
"user_tz": -330
|
| 1221 |
+
},
|
| 1222 |
+
"id": "H2kXyghROtKA"
|
| 1223 |
+
},
|
| 1224 |
+
"outputs": [],
|
| 1225 |
+
"source": [
|
| 1226 |
+
"os.makedirs(\"tourism_project/.github/workflows\", exist_ok=True)"
|
| 1227 |
+
]
|
| 1228 |
+
},
|
| 1229 |
+
{
|
| 1230 |
+
"cell_type": "code",
|
| 1231 |
+
"execution_count": 15,
|
| 1232 |
+
"metadata": {
|
| 1233 |
+
"colab": {
|
| 1234 |
+
"base_uri": "https://localhost:8080/"
|
| 1235 |
+
},
|
| 1236 |
+
"executionInfo": {
|
| 1237 |
+
"elapsed": 5,
|
| 1238 |
+
"status": "ok",
|
| 1239 |
+
"timestamp": 1765705284479,
|
| 1240 |
+
"user": {
|
| 1241 |
+
"displayName": "Anant Tripathi",
|
| 1242 |
+
"userId": "05588283814303116545"
|
| 1243 |
+
},
|
| 1244 |
+
"user_tz": -330
|
| 1245 |
+
},
|
| 1246 |
+
"id": "M5J4Kq2ROtKA",
|
| 1247 |
+
"outputId": "a9c5ad71-d5b5-41a0-823e-8739942162fb"
|
| 1248 |
+
},
|
| 1249 |
+
"outputs": [
|
| 1250 |
+
{
|
| 1251 |
+
"name": "stdout",
|
| 1252 |
+
"output_type": "stream",
|
| 1253 |
+
"text": [
|
| 1254 |
+
"Writing tourism_project/.github/workflows/pipeline.yml\n"
|
| 1255 |
+
]
|
| 1256 |
+
}
|
| 1257 |
+
],
|
| 1258 |
+
"source": [
|
| 1259 |
+
"%%writefile tourism_project/.github/workflows/pipeline.yml\n",
|
| 1260 |
+
"name: Tourism Package Prediction Pipeline\n",
|
| 1261 |
+
"\n",
|
| 1262 |
+
"on:\n",
|
| 1263 |
+
" push:\n",
|
| 1264 |
+
" branches:\n",
|
| 1265 |
+
" - main\n",
|
| 1266 |
+
"\n",
|
| 1267 |
+
"jobs:\n",
|
| 1268 |
+
"\n",
|
| 1269 |
+
" register-dataset:\n",
|
| 1270 |
+
" runs-on: ubuntu-latest\n",
|
| 1271 |
+
" steps:\n",
|
| 1272 |
+
" - uses: actions/checkout@v3\n",
|
| 1273 |
+
" - name: Install Dependencies\n",
|
| 1274 |
+
" run: pip install -r tourism_project/requirements.txt\n",
|
| 1275 |
+
" - name: Upload Dataset to Hugging Face Hub\n",
|
| 1276 |
+
" env:\n",
|
| 1277 |
+
" HF_TOKEN: ${{ secrets.HF_TOKEN }}\n",
|
| 1278 |
+
" run: python tourism_project/model_building/data_register.py\n",
|
| 1279 |
+
"\n",
|
| 1280 |
+
" data-prep:\n",
|
| 1281 |
+
" needs: register-dataset\n",
|
| 1282 |
+
" runs-on: ubuntu-latest\n",
|
| 1283 |
+
" steps:\n",
|
| 1284 |
+
" - uses: actions/checkout@v3\n",
|
| 1285 |
+
" - name: Install Dependencies\n",
|
| 1286 |
+
" run: pip install -r tourism_project/requirements.txt\n",
|
| 1287 |
+
" - name: Run Data Preparation\n",
|
| 1288 |
+
" env:\n",
|
| 1289 |
+
" HF_TOKEN: ${{ secrets.HF_TOKEN }}\n",
|
| 1290 |
+
" run: python tourism_project/model_building/prep.py\n",
|
| 1291 |
+
"\n",
|
| 1292 |
+
" model-traning:\n",
|
| 1293 |
+
" needs: data-prep\n",
|
| 1294 |
+
" runs-on: ubuntu-latest\n",
|
| 1295 |
+
" steps:\n",
|
| 1296 |
+
" - uses: actions/checkout@v3\n",
|
| 1297 |
+
" - name: Install Dependencies\n",
|
| 1298 |
+
" run: pip install -r tourism_project/requirements.txt\n",
|
| 1299 |
+
" - name: Start MLflow Server\n",
|
| 1300 |
+
" run: |\n",
|
| 1301 |
+
" nohup mlflow ui --host 0.0.0.0 --port 5000 &\n",
|
| 1302 |
+
" sleep 5\n",
|
| 1303 |
+
" - name: Model Building\n",
|
| 1304 |
+
" env:\n",
|
| 1305 |
+
" HF_TOKEN: ${{ secrets.HF_TOKEN }}\n",
|
| 1306 |
+
" run: python tourism_project/model_building/train.py\n",
|
| 1307 |
+
"\n",
|
| 1308 |
+
" deploy-hosting:\n",
|
| 1309 |
+
" runs-on: ubuntu-latest\n",
|
| 1310 |
+
" needs: [model-traning,data-prep,register-dataset]\n",
|
| 1311 |
+
" steps:\n",
|
| 1312 |
+
" - uses: actions/checkout@v3\n",
|
| 1313 |
+
" - name: Install Dependencies\n",
|
| 1314 |
+
" run: pip install -r tourism_project/requirements.txt\n",
|
| 1315 |
+
" - name: Push files to Frontend Hugging Face Space\n",
|
| 1316 |
+
" env:\n",
|
| 1317 |
+
" HF_TOKEN: ${{ secrets.HF_TOKEN }}\n",
|
| 1318 |
+
" run: python tourism_project/hosting/hosting.py\n"
|
| 1319 |
+
]
|
| 1320 |
+
},
|
| 1321 |
+
{
|
| 1322 |
+
"cell_type": "markdown",
|
| 1323 |
+
"metadata": {
|
| 1324 |
+
"id": "J029tYPq4Rmq"
|
| 1325 |
+
},
|
| 1326 |
+
"source": [
|
| 1327 |
+
"```\n",
|
| 1328 |
+
"name: Tourism Project Pipeline\n",
|
| 1329 |
+
"\n",
|
| 1330 |
+
"on:\n",
|
| 1331 |
+
" push:\n",
|
| 1332 |
+
" branches:\n",
|
| 1333 |
+
" - main # Automatically triggers on push to the main branch\n",
|
| 1334 |
+
"\n",
|
| 1335 |
+
"jobs:\n",
|
| 1336 |
+
"\n",
|
| 1337 |
+
" register-dataset:\n",
|
| 1338 |
+
" runs-on: ubuntu-latest\n",
|
| 1339 |
+
" steps:\n",
|
| 1340 |
+
" - uses: actions/checkout@v3\n",
|
| 1341 |
+
" - name: Install Dependencies\n",
|
| 1342 |
+
" run: <add_code_here>\n",
|
| 1343 |
+
" - name: Upload Dataset to Hugging Face Hub\n",
|
| 1344 |
+
" env:\n",
|
| 1345 |
+
" HF_TOKEN: ${{ secrets.HF_TOKEN }}\n",
|
| 1346 |
+
" run: <add_code_here>\n",
|
| 1347 |
+
"\n",
|
| 1348 |
+
" data-prep:\n",
|
| 1349 |
+
" needs: register-dataset\n",
|
| 1350 |
+
" runs-on: ubuntu-latest\n",
|
| 1351 |
+
" steps:\n",
|
| 1352 |
+
" - uses: actions/checkout@v3\n",
|
| 1353 |
+
" - name: Install Dependencies\n",
|
| 1354 |
+
" run: <add_code_here>\n",
|
| 1355 |
+
" - name: Run Data Preparation\n",
|
| 1356 |
+
" env:\n",
|
| 1357 |
+
" HF_TOKEN: ${{ secrets.HF_TOKEN }}\n",
|
| 1358 |
+
" run: <add_code_here>\n",
|
| 1359 |
+
"\n",
|
| 1360 |
+
"\n",
|
| 1361 |
+
" model-traning:\n",
|
| 1362 |
+
" needs: data-prep\n",
|
| 1363 |
+
" runs-on: ubuntu-latest\n",
|
| 1364 |
+
" steps:\n",
|
| 1365 |
+
" - uses: actions/checkout@v3\n",
|
| 1366 |
+
" - name: Install Dependencies\n",
|
| 1367 |
+
" run: <add_code_here>\n",
|
| 1368 |
+
" - name: Start MLflow Server\n",
|
| 1369 |
+
" run: |\n",
|
| 1370 |
+
" nohup mlflow ui --host 0.0.0.0 --port 5000 & # Run MLflow UI in the background\n",
|
| 1371 |
+
"            sleep 5   # Wait for a moment to let the server start\n",
|
| 1372 |
+
" - name: Model Building\n",
|
| 1373 |
+
" env:\n",
|
| 1374 |
+
" HF_TOKEN: ${{ secrets.HF_TOKEN }}\n",
|
| 1375 |
+
" run: <add_code_here>\n",
|
| 1376 |
+
"\n",
|
| 1377 |
+
"\n",
|
| 1378 |
+
" deploy-hosting:\n",
|
| 1379 |
+
" runs-on: ubuntu-latest\n",
|
| 1380 |
+
" needs: [model-traning,data-prep,register-dataset]\n",
|
| 1381 |
+
" steps:\n",
|
| 1382 |
+
" - uses: actions/checkout@v3\n",
|
| 1383 |
+
" - name: Install Dependencies\n",
|
| 1384 |
+
" run: <add_code_here>\n",
|
| 1385 |
+
" - name: Push files to Frontend Hugging Face Space\n",
|
| 1386 |
+
" env:\n",
|
| 1387 |
+
" HF_TOKEN: ${{ secrets.HF_TOKEN }}\n",
|
| 1388 |
+
" run: <add_code_here>\n",
|
| 1389 |
+
"\n",
|
| 1390 |
+
"```"
|
| 1391 |
+
]
|
| 1392 |
+
},
|
| 1393 |
+
{
|
| 1394 |
+
"cell_type": "markdown",
|
| 1395 |
+
"metadata": {
|
| 1396 |
+
"id": "T9fgZ_Mq3zzp"
|
| 1397 |
+
},
|
| 1398 |
+
"source": [
|
| 1399 |
+
"**Note:** To use this YAML file for our use case, we need to\n",
|
| 1400 |
+
"\n",
|
| 1401 |
+
"1. Go to the GitHub repository for the project\n",
|
| 1402 |
+
"2. Create a folder named ***.github/workflows/***\n",
|
| 1403 |
+
"3. In the above folder, create a file named ***pipeline.yml***\n",
|
| 1404 |
+
"4. Copy and paste the above content for the YAML file into the ***pipeline.yml*** file"
|
| 1405 |
+
]
|
| 1406 |
+
},
|
| 1407 |
+
{
|
| 1408 |
+
"cell_type": "markdown",
|
| 1409 |
+
"metadata": {
|
| 1410 |
+
"id": "PvEUJ-t5kdxH"
|
| 1411 |
+
},
|
| 1412 |
+
"source": [
|
| 1413 |
+
"## Requirements file for the Github Actions Workflow"
|
| 1414 |
+
]
|
| 1415 |
+
},
|
| 1416 |
+
{
|
| 1417 |
+
"cell_type": "code",
|
| 1418 |
+
"execution_count": 16,
|
| 1419 |
+
"metadata": {
|
| 1420 |
+
"colab": {
|
| 1421 |
+
"base_uri": "https://localhost:8080/"
|
| 1422 |
+
},
|
| 1423 |
+
"executionInfo": {
|
| 1424 |
+
"elapsed": 2,
|
| 1425 |
+
"status": "ok",
|
| 1426 |
+
"timestamp": 1765705284481,
|
| 1427 |
+
"user": {
|
| 1428 |
+
"displayName": "Anant Tripathi",
|
| 1429 |
+
"userId": "05588283814303116545"
|
| 1430 |
+
},
|
| 1431 |
+
"user_tz": -330
|
| 1432 |
+
},
|
| 1433 |
+
"id": "nfqWcLRm-dga",
|
| 1434 |
+
"outputId": "ca9d9001-8154-47f1-809a-c2d62e51b4c2"
|
| 1435 |
+
},
|
| 1436 |
+
"outputs": [
|
| 1437 |
+
{
|
| 1438 |
+
"name": "stdout",
|
| 1439 |
+
"output_type": "stream",
|
| 1440 |
+
"text": [
|
| 1441 |
+
"Writing tourism_project/requirements.txt\n"
|
| 1442 |
+
]
|
| 1443 |
+
}
|
| 1444 |
+
],
|
| 1445 |
+
"source": [
|
| 1446 |
+
"%%writefile tourism_project/requirements.txt\n",
|
| 1447 |
+
"huggingface_hub==0.32.6\n",
|
| 1448 |
+
"datasets==3.6.0\n",
|
| 1449 |
+
"pandas==2.2.2\n",
|
| 1450 |
+
"numpy==1.26.4\n",
|
| 1451 |
+
"scikit-learn==1.6.0\n",
|
| 1452 |
+
"xgboost==2.1.4\n",
|
| 1453 |
+
"mlflow==3.0.1\n",
|
| 1454 |
+
"joblib==1.5.1"
|
| 1455 |
+
]
|
| 1456 |
+
},
|
| 1457 |
+
{
|
| 1458 |
+
"cell_type": "markdown",
|
| 1459 |
+
"metadata": {
|
| 1460 |
+
"id": "BA6mP-Ebkm3O"
|
| 1461 |
+
},
|
| 1462 |
+
"source": [
|
| 1463 |
+
"## Github Authentication and Push Files"
|
| 1464 |
+
]
|
| 1465 |
+
},
|
| 1466 |
+
{
|
| 1467 |
+
"cell_type": "markdown",
|
| 1468 |
+
"metadata": {
|
| 1469 |
+
"id": "T84Ei-g9Z2uw"
|
| 1470 |
+
},
|
| 1471 |
+
"source": [
|
| 1472 |
+
"* Before moving forward, we need to generate a secret token to push files directly from Colab to the GitHub repository.\n",
|
| 1473 |
+
"* Please follow the below instructions to create the GitHub token:\n",
|
| 1474 |
+
" - Open your GitHub profile.\n",
|
| 1475 |
+
" - Click on ***Settings***.\n",
|
| 1476 |
+
" - Go to ***Developer Settings***.\n",
|
| 1477 |
+
" - Expand the ***Personal access tokens*** section and select ***Tokens (classic)***.\n",
|
| 1478 |
+
" - Click ***Generate new token***, then choose ***Generate new token (classic)***.\n",
|
| 1479 |
+
" - Add a note and select all required scopes.\n",
|
| 1480 |
+
" - Click ***Generate token***.\n",
|
| 1481 |
+
" - Copy the generated token and store it safely in a notepad."
|
| 1482 |
+
]
|
| 1483 |
+
},
|
| 1484 |
+
{
|
| 1485 |
+
"cell_type": "code",
|
| 1486 |
+
"execution_count": 17,
|
| 1487 |
+
"metadata": {
|
| 1488 |
+
"colab": {
|
| 1489 |
+
"base_uri": "https://localhost:8080/"
|
| 1490 |
+
},
|
| 1491 |
+
"executionInfo": {
|
| 1492 |
+
"elapsed": 6682,
|
| 1493 |
+
"status": "ok",
|
| 1494 |
+
"timestamp": 1765705291163,
|
| 1495 |
+
"user": {
|
| 1496 |
+
"displayName": "Anant Tripathi",
|
| 1497 |
+
"userId": "05588283814303116545"
|
| 1498 |
+
},
|
| 1499 |
+
"user_tz": -330
|
| 1500 |
+
},
|
| 1501 |
+
"id": "KPDx4gqGh7cO",
|
| 1502 |
+
"outputId": "47eb6218-6a20-4eae-ee1d-04c771d14101"
|
| 1503 |
+
},
|
| 1504 |
+
"outputs": [
|
| 1505 |
+
{
|
| 1506 |
+
"name": "stdout",
|
| 1507 |
+
"output_type": "stream",
|
| 1508 |
+
"text": [
|
| 1509 |
+
"Reading package lists... Done\n",
|
| 1510 |
+
"Building dependency tree... Done\n",
|
| 1511 |
+
"Reading state information... Done\n",
|
| 1512 |
+
"git is already the newest version (1:2.34.1-1ubuntu1.15).\n",
|
| 1513 |
+
"0 upgraded, 0 newly installed, 0 to remove and 1 not upgraded.\n",
|
| 1514 |
+
"Cloning into 'Tourism_Project'...\n",
|
| 1515 |
+
"remote: Enumerating objects: 99, done.\u001b[K\n",
|
| 1516 |
+
"remote: Counting objects: 100% (99/99), done.\u001b[K\n",
|
| 1517 |
+
"remote: Compressing objects: 100% (89/89), done.\u001b[K\n",
|
| 1518 |
+
"remote: Total 99 (delta 31), reused 0 (delta 0), pack-reused 0 (from 0)\u001b[K\n",
|
| 1519 |
+
"Receiving objects: 100% (99/99), 111.21 KiB | 4.28 MiB/s, done.\n",
|
| 1520 |
+
"Resolving deltas: 100% (31/31), done.\n",
|
| 1521 |
+
"mv: cannot move '/content/tourism_project/' to '/content/Tourism_Project/tourism_project': Directory not empty\n"
|
| 1522 |
+
]
|
| 1523 |
+
}
|
| 1524 |
+
],
|
| 1525 |
+
"source": [
|
| 1526 |
+
"# Install Git\n",
|
| 1527 |
+
"!apt-get install git\n",
|
| 1528 |
+
"\n",
|
| 1529 |
+
"# Set your Git identity (replace with your details)\n",
|
| 1530 |
+
"!git config --global user.email \"ananttripathi1996@gmail.com\"\n",
|
| 1531 |
+
"!git config --global user.name \"ananttripathi\"\n",
|
| 1532 |
+
"\n",
|
| 1533 |
+
"# Clone your GitHub repository\n",
|
| 1534 |
+
"!git clone https://github.com/ananttripathi/Tourism_Project.git\n",
|
| 1535 |
+
"\n",
|
| 1536 |
+
"# Move your folder to the repository directory\n",
|
| 1537 |
+
"!mv /content/tourism_project/ /content/Tourism_Project"
|
| 1538 |
+
]
|
| 1539 |
+
},
|
| 1540 |
+
{
|
| 1541 |
+
"cell_type": "code",
|
| 1542 |
+
"execution_count": null,
|
| 1543 |
+
"metadata": {
|
| 1544 |
+
"colab": {
|
| 1545 |
+
"base_uri": "https://localhost:8080/"
|
| 1546 |
+
},
|
| 1547 |
+
"executionInfo": {
|
| 1548 |
+
"elapsed": 435,
|
| 1549 |
+
"status": "ok",
|
| 1550 |
+
"timestamp": 1765705291611,
|
| 1551 |
+
"user": {
|
| 1552 |
+
"displayName": "Anant Tripathi",
|
| 1553 |
+
"userId": "05588283814303116545"
|
| 1554 |
+
},
|
| 1555 |
+
"user_tz": -330
|
| 1556 |
+
},
|
| 1557 |
+
"id": "IuUahCwVigon",
|
| 1558 |
+
"outputId": "4ca2922a-9c3b-4bbe-ce2f-ef55236d9ba5"
|
| 1559 |
+
},
|
| 1560 |
+
"outputs": [
|
| 1561 |
+
{
|
| 1562 |
+
"name": "stdout",
|
| 1563 |
+
"output_type": "stream",
|
| 1564 |
+
"text": [
|
| 1565 |
+
"/content/Tourism_Project\n",
|
| 1566 |
+
"On branch main\n",
|
| 1567 |
+
"Your branch is up to date with 'origin/main'.\n",
|
| 1568 |
+
"\n",
|
| 1569 |
+
"nothing to commit, working tree clean\n",
|
| 1570 |
+
"Everything up-to-date\n"
|
| 1571 |
+
]
|
| 1572 |
+
}
|
| 1573 |
+
],
|
| 1574 |
+
"source": [
|
| 1575 |
+
"# Change directory to the cloned repository\n",
|
| 1576 |
+
"%cd Tourism_Project/\n",
|
| 1577 |
+
"\n",
|
| 1578 |
+
"# Add the new folder to Git\n",
|
| 1579 |
+
"!git add .\n",
|
| 1580 |
+
"\n",
|
| 1581 |
+
"# Commit the changes\n",
|
| 1582 |
+
"!git commit -m \"first commit\"\n",
|
| 1583 |
+
"\n",
|
| 1584 |
+
"# Push to GitHub (you'll need your GitHub credentials; use a personal access token if 2FA enabled)\n",
|
| 1585 |
+
"!git push https://ananttripathi:<YOUR_GITHUB_PAT>@github.com/ananttripathi/Tourism_Project.git  # NOTE: never commit a real token; the original cell leaked one"
|
| 1586 |
+
]
|
| 1587 |
+
},
|
| 1588 |
+
{
|
| 1589 |
+
"cell_type": "markdown",
|
| 1590 |
+
"metadata": {
|
| 1591 |
+
"id": "v-i8Jdyz-_L1"
|
| 1592 |
+
},
|
| 1593 |
+
"source": [
|
| 1594 |
+
"# Output Evaluation"
|
| 1595 |
+
]
|
| 1596 |
+
},
|
| 1597 |
+
{
|
| 1598 |
+
"cell_type": "markdown",
|
| 1599 |
+
"metadata": {
|
| 1600 |
+
"id": "FTK8Bpda_UHg"
|
| 1601 |
+
},
|
| 1602 |
+
"source": [
|
| 1603 |
+
"- GitHub (link to repository, screenshot of folder structure and executed workflow)"
|
| 1604 |
+
]
|
| 1605 |
+
},
|
| 1606 |
+
{
|
| 1607 |
+
"cell_type": "code",
|
| 1608 |
+
"execution_count": 19,
|
| 1609 |
+
"metadata": {
|
| 1610 |
+
"executionInfo": {
|
| 1611 |
+
"elapsed": 57,
|
| 1612 |
+
"status": "ok",
|
| 1613 |
+
"timestamp": 1765705291669,
|
| 1614 |
+
"user": {
|
| 1615 |
+
"displayName": "Anant Tripathi",
|
| 1616 |
+
"userId": "05588283814303116545"
|
| 1617 |
+
},
|
| 1618 |
+
"user_tz": -330
|
| 1619 |
+
},
|
| 1620 |
+
"id": "6qzzesaG_Xw8"
|
| 1621 |
+
},
|
| 1622 |
+
"outputs": [],
|
| 1623 |
+
"source": [
|
| 1624 |
+
"# TODO: Add Screenshot After Deployment\n",
|
| 1625 |
+
"#\n",
|
| 1626 |
+
"# After pushing to GitHub and running the workflow, add a screenshot showing:\n",
|
| 1627 |
+
"# 1. GitHub repository structure (showing tourism_project folder and all subfolders)\n",
|
| 1628 |
+
"# 2. GitHub Actions workflow execution (all 4 jobs completed successfully with green checkmarks)\n",
|
| 1629 |
+
"#\n",
|
| 1630 |
+
"# Screenshot should show:\n",
|
| 1631 |
+
"# ✅ register-dataset job completed\n",
|
| 1632 |
+
"# ✅ data-prep job completed\n",
|
| 1633 |
+
"# ✅ model-training job completed\n",
|
| 1634 |
+
"# ✅ deploy-hosting job completed\n",
|
| 1635 |
+
"#\n",
|
| 1636 |
+
"# You can also provide the GitHub repository URL here"
|
| 1637 |
+
]
|
| 1638 |
+
},
|
| 1639 |
+
{
|
| 1640 |
+
"cell_type": "code",
|
| 1641 |
+
"execution_count": 19,
|
| 1642 |
+
"metadata": {
|
| 1643 |
+
"executionInfo": {
|
| 1644 |
+
"elapsed": 2,
|
| 1645 |
+
"status": "ok",
|
| 1646 |
+
"timestamp": 1765705291669,
|
| 1647 |
+
"user": {
|
| 1648 |
+
"displayName": "Anant Tripathi",
|
| 1649 |
+
"userId": "05588283814303116545"
|
| 1650 |
+
},
|
| 1651 |
+
"user_tz": -330
|
| 1652 |
+
},
|
| 1653 |
+
"id": "qXS_wjMdOtKM"
|
| 1654 |
+
},
|
| 1655 |
+
"outputs": [],
|
| 1656 |
+
"source": []
|
| 1657 |
+
},
|
| 1658 |
+
{
|
| 1659 |
+
"cell_type": "markdown",
|
| 1660 |
+
"metadata": {
|
| 1661 |
+
"id": "IidBPl95OtKM"
|
| 1662 |
+
},
|
| 1663 |
+
"source": [
|
| 1664 |
+
""
|
| 1665 |
+
]
|
| 1666 |
+
},
|
| 1667 |
+
{
|
| 1668 |
+
"cell_type": "markdown",
|
| 1669 |
+
"metadata": {
|
| 1670 |
+
"id": "P8QouFeUOtKM"
|
| 1671 |
+
},
|
| 1672 |
+
"source": [
|
| 1673 |
+
""
|
| 1674 |
+
]
|
| 1675 |
+
},
|
| 1676 |
+
{
|
| 1677 |
+
"cell_type": "markdown",
|
| 1678 |
+
"metadata": {
|
| 1679 |
+
"id": "3KDN31V2_YSr"
|
| 1680 |
+
},
|
| 1681 |
+
"source": [
|
| 1682 |
+
"- Streamlit on Hugging Face (link to HF space, screenshot of Streamlit app)"
|
| 1683 |
+
]
|
| 1684 |
+
},
|
| 1685 |
+
{
|
| 1686 |
+
"cell_type": "code",
|
| 1687 |
+
"execution_count": 20,
|
| 1688 |
+
"metadata": {
|
| 1689 |
+
"executionInfo": {
|
| 1690 |
+
"elapsed": 1,
|
| 1691 |
+
"status": "ok",
|
| 1692 |
+
"timestamp": 1765705291669,
|
| 1693 |
+
"user": {
|
| 1694 |
+
"displayName": "Anant Tripathi",
|
| 1695 |
+
"userId": "05588283814303116545"
|
| 1696 |
+
},
|
| 1697 |
+
"user_tz": -330
|
| 1698 |
+
},
|
| 1699 |
+
"id": "NuIUdj3b_ZYV"
|
| 1700 |
+
},
|
| 1701 |
+
"outputs": [],
|
| 1702 |
+
"source": [
|
| 1703 |
+
"# TODO: Add Screenshot After Deployment\n",
|
| 1704 |
+
"#\n",
|
| 1705 |
+
"# After the workflow completes and app is deployed, add a screenshot showing:\n",
|
| 1706 |
+
"# 1. Live Streamlit app running on Hugging Face Spaces\n",
|
| 1707 |
+
"# 2. Making a sample prediction with the interface\n",
|
| 1708 |
+
"# 3. Prediction results displayed with confidence scores\n",
|
| 1709 |
+
"#\n",
|
| 1710 |
+
"# Also provide the Hugging Face Space URL:\n",
|
| 1711 |
+
"# Expected format: https://huggingface.co/spaces/<your-username>/wellness-tourism-prediction\n",
|
| 1712 |
+
"#\n",
|
| 1713 |
+
"# Screenshot should show:\n",
|
| 1714 |
+
"# ✅ App interface with input forms\n",
|
| 1715 |
+
"# ✅ Prediction button\n",
|
| 1716 |
+
"# ✅ Results with confidence scores\n",
|
| 1717 |
+
"# ✅ Recommendations"
|
| 1718 |
+
]
|
| 1719 |
+
},
|
| 1720 |
+
{
|
| 1721 |
+
"cell_type": "markdown",
|
| 1722 |
+
"metadata": {
|
| 1723 |
+
"id": "tp06I8AnOtKM"
|
| 1724 |
+
},
|
| 1725 |
+
"source": [
|
| 1726 |
+
""
|
| 1727 |
+
]
|
| 1728 |
+
},
|
| 1729 |
+
{
|
| 1730 |
+
"cell_type": "code",
|
| 1731 |
+
"execution_count": 20,
|
| 1732 |
+
"metadata": {
|
| 1733 |
+
"executionInfo": {
|
| 1734 |
+
"elapsed": 1,
|
| 1735 |
+
"status": "ok",
|
| 1736 |
+
"timestamp": 1765705291669,
|
| 1737 |
+
"user": {
|
| 1738 |
+
"displayName": "Anant Tripathi",
|
| 1739 |
+
"userId": "05588283814303116545"
|
| 1740 |
+
},
|
| 1741 |
+
"user_tz": -330
|
| 1742 |
+
},
|
| 1743 |
+
"id": "mK_xcFxlOtKM"
|
| 1744 |
+
},
|
| 1745 |
+
"outputs": [],
|
| 1746 |
+
"source": []
|
| 1747 |
+
},
|
| 1748 |
+
{
|
| 1749 |
+
"cell_type": "markdown",
|
| 1750 |
+
"metadata": {
|
| 1751 |
+
"id": "zWAag1sAOtKM"
|
| 1752 |
+
},
|
| 1753 |
+
"source": [
|
| 1754 |
+
""
|
| 1755 |
+
]
|
| 1756 |
+
},
|
| 1757 |
+
{
|
| 1758 |
+
"cell_type": "markdown",
|
| 1759 |
+
"metadata": {
|
| 1760 |
+
"id": "fN8j9-3nW8G9"
|
| 1761 |
+
},
|
| 1762 |
+
"source": [
|
| 1763 |
+
"<font size=6 color=\"navy\">Power Ahead!</font>\n",
|
| 1764 |
+
"___"
|
| 1765 |
+
]
|
| 1766 |
+
}
|
| 1767 |
+
],
|
| 1768 |
+
"metadata": {
|
| 1769 |
+
"colab": {
|
| 1770 |
+
"provenance": []
|
| 1771 |
+
},
|
| 1772 |
+
"kernel_info": {
|
| 1773 |
+
"name": "python310-sdkv2"
|
| 1774 |
+
},
|
| 1775 |
+
"kernelspec": {
|
| 1776 |
+
"display_name": "Python [conda env:anaconda3] *",
|
| 1777 |
+
"language": "python",
|
| 1778 |
+
"name": "conda-env-anaconda3-py"
|
| 1779 |
+
},
|
| 1780 |
+
"language_info": {
|
| 1781 |
+
"codemirror_mode": {
|
| 1782 |
+
"name": "ipython",
|
| 1783 |
+
"version": 3
|
| 1784 |
+
},
|
| 1785 |
+
"file_extension": ".py",
|
| 1786 |
+
"mimetype": "text/x-python",
|
| 1787 |
+
"name": "python",
|
| 1788 |
+
"nbconvert_exporter": "python",
|
| 1789 |
+
"pygments_lexer": "ipython3",
|
| 1790 |
+
"version": "3.11.4"
|
| 1791 |
+
},
|
| 1792 |
+
"microsoft": {
|
| 1793 |
+
"host": {
|
| 1794 |
+
"AzureML": {
|
| 1795 |
+
"notebookHasBeenCompleted": true
|
| 1796 |
+
}
|
| 1797 |
+
},
|
| 1798 |
+
"ms_spell_check": {
|
| 1799 |
+
"ms_spell_check_language": "en"
|
| 1800 |
+
}
|
| 1801 |
+
},
|
| 1802 |
+
"nteract": {
|
| 1803 |
+
"version": "nteract-front-end@1.0.0"
|
| 1804 |
+
}
|
| 1805 |
+
},
|
| 1806 |
+
"nbformat": 4,
|
| 1807 |
+
"nbformat_minor": 0
|
| 1808 |
+
}
|
README.md
CHANGED
|
@@ -1,10 +1,120 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## Predictive Maintenance – Engine Failure Classification (MLOps Project)
|
| 2 |
+
|
| 3 |
+
> 🎯 **YOLO Achievement PR** - Merging without review to earn the GitHub YOLO badge!
|
| 4 |
+
|
| 5 |
+
This project builds an end‑to‑end **predictive maintenance system** for small and large engines using sensor data (RPM, pressures, temperatures) to classify whether an engine is **healthy** or **requires maintenance**.
|
| 6 |
+
|
| 7 |
+
The work is organized to satisfy the provided **interim and final report rubrics**, including:
|
| 8 |
+
|
| 9 |
+
- **Data registration on Hugging Face**
|
| 10 |
+
- **Exploratory Data Analysis (EDA)**
|
| 11 |
+
- **Data preparation and dataset versioning**
|
| 12 |
+
- **Model building with experimentation tracking**
|
| 13 |
+
- **Model deployment with Docker + Streamlit on Hugging Face Spaces**
|
| 14 |
+
- **Automated GitHub Actions workflow**
|
| 15 |
+
|
| 16 |
+
### Repository Structure
|
| 17 |
+
|
| 18 |
+
- `data/` – Raw and processed data (local copies).
|
| 19 |
+
- `engine_data.csv` – Original engine sensor dataset.
|
| 20 |
+
- `notebooks/`
|
| 21 |
+
- EDA and experimentation notebooks (you can connect these to your report).
|
| 22 |
+
- `src/`
|
| 23 |
+
- `config.py` – Central configuration (paths, Hugging Face repo names, MLflow config).
|
| 24 |
+
- `data_prep.py` – Loads data (from Hugging Face dataset/local), cleans it, and creates train/test splits.
|
| 25 |
+
- `hf_data_utils.py` – Helper functions for uploading/downloading datasets to/from Hugging Face.
|
| 26 |
+
- `train.py` – Model training, hyperparameter tuning, MLflow logging, and best‑model saving.
|
| 27 |
+
- `hf_model_utils.py` – Register and load the best model from the Hugging Face model hub.
|
| 28 |
+
- `inference.py` – Simple Python API for making predictions from the trained model.
|
| 29 |
+
- `app.py` – Streamlit app used for deployment on Hugging Face Spaces.
|
| 30 |
+
- `deploy_to_hf.py` – Script to push deployment files (app, Dockerfile, requirements) to a Hugging Face Space.
|
| 31 |
+
- `.github/workflows/pipeline.yml` – Automated CI/CD workflow for data prep, training, and deployment.
|
| 32 |
+
- `requirements.txt` – Python dependencies (for local use, CI, Docker, and HF Spaces).
|
| 33 |
+
- `Dockerfile` – Container definition for running the Streamlit app.
|
| 34 |
+
|
| 35 |
+
### High‑Level Pipeline (aligned with template notebook)
|
| 36 |
+
|
| 37 |
+
1. **Data Registration**
|
| 38 |
+
- Script: `src/data_register.py`
|
| 39 |
+
- Behaviour similar to `data_register.py` in the reference notebook:
|
| 40 |
+
- Creates/uses a **Hugging Face dataset repo** (`HF_DATASET_REPO`),
|
| 41 |
+
- Uploads `data/engine_data.csv` as the canonical raw dataset.
|
| 42 |
+
2. **EDA**
|
| 43 |
+
- Script: `src/eda.py` (or use a separate notebook) to perform:
|
| 44 |
+
- Data overview,
|
| 45 |
+
- Univariate, bivariate, multivariate analysis,
|
| 46 |
+
- Business insights about engine health and failure patterns.
|
| 47 |
+
3. **Data Preparation**
|
| 48 |
+
- Script: `src/data_prep.py`
|
| 49 |
+
- Loads the raw data from the HF dataset (or local fallback), cleans it, creates train/test splits, and uploads `data/train.csv` and `data/test.csv` back to the dataset repo.
|
| 50 |
+
4. **Model Building + Experiment Tracking**
|
| 51 |
+
- Script: `src/train.py`
|
| 52 |
+
- Loads train/test from the HF dataset or local files, trains and tunes a Random Forest classifier, logs all tuned parameters and metrics with **MLflow**, saves the best model locally, and registers it to a **Hugging Face model hub** repo.
|
| 53 |
+
5. **Deployment & Hosting**
|
| 54 |
+
- Streamlit app: `src/app.py` (used by Docker / HF Space) loads the best model from HF (or local) and serves predictions.
|
| 55 |
+
- Containerisation: `Dockerfile` and `requirements.txt` define the runtime image and dependencies, matching the “deployment” section of the notebook.
|
| 56 |
+
- Hosting script: `src/deploy_to_hf.py` plays the role of `hosting.py` in the notebook, pushing the app, Dockerfile, and dependencies to a **Hugging Face Space**.
|
| 57 |
+
6. **GitHub Actions Workflow**
|
| 58 |
+
- Workflow file: `.github/workflows/pipeline.yml`
|
| 59 |
+
- Defines four staged jobs mirroring the notebook:
|
| 60 |
+
- `register-dataset` → runs `src/data_register.py`
|
| 61 |
+
- `data-prep` → runs `src/data_prep.py`
|
| 62 |
+
- `model-training` → runs `src/train.py`
|
| 63 |
+
- `deploy-hosting` → runs `src/deploy_to_hf.py`
|
| 64 |
+
- All jobs share dependencies via `requirements.txt` and use GitHub secrets for HF credentials and repo IDs.
|
| 65 |
+
|
| 66 |
+
### What You Need to Configure
|
| 67 |
+
|
| 68 |
+
#### 1. Hugging Face Configuration
|
| 69 |
+
|
| 70 |
+
**Update `src/config.py`** (lines 58-62) with your Hugging Face username:
|
| 71 |
+
|
| 72 |
+
```python
|
| 73 |
+
HF_DATASET_REPO = os.getenv("HF_DATASET_REPO", "your-username/engine-maintenance-dataset")
|
| 74 |
+
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "your-username/engine-maintenance-model")
|
| 75 |
+
HF_SPACE_REPO = os.getenv("HF_SPACE_REPO", "your-username/engine-maintenance-space")
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
Replace `"your-username"` with your actual Hugging Face username (e.g., `"ananttripathi"`).
|
| 79 |
+
|
| 80 |
+
**Or set environment variables:**
|
| 81 |
+
```bash
|
| 82 |
+
export HF_TOKEN="hf_your_token_here"
|
| 83 |
+
export HF_DATASET_REPO="your-username/engine-maintenance-dataset"
|
| 84 |
+
export HF_MODEL_REPO="your-username/engine-maintenance-model"
|
| 85 |
+
export HF_SPACE_REPO="your-username/engine-maintenance-space"
|
| 86 |
+
```
|
| 87 |
+
|
| 88 |
+
#### 2. GitHub Repository Configuration
|
| 89 |
+
|
| 90 |
+
**A. Create GitHub Repository:**
|
| 91 |
+
1. Create a new repository on GitHub (e.g., `engine-predictive-maintenance`)
|
| 92 |
+
2. Push this `mlops` folder to it:
|
| 93 |
+
```bash
|
| 94 |
+
git init
|
| 95 |
+
git add .
|
| 96 |
+
git commit -m "Initial commit: Predictive maintenance MLOps pipeline"
|
| 97 |
+
git remote add origin https://github.com/your-username/engine-predictive-maintenance.git
|
| 98 |
+
git push -u origin main
|
| 99 |
+
```
|
| 100 |
+
|
| 101 |
+
**B. Add GitHub Secrets:**
|
| 102 |
+
Go to your GitHub repo → **Settings** → **Secrets and variables** → **Actions** → **New repository secret**
|
| 103 |
+
|
| 104 |
+
Add these 4 secrets:
|
| 105 |
+
- `HF_TOKEN` – Your Hugging Face access token (from https://huggingface.co/settings/tokens)
|
| 106 |
+
- `HF_DATASET_REPO` – e.g., `your-username/engine-maintenance-dataset`
|
| 107 |
+
- `HF_MODEL_REPO` – e.g., `your-username/engine-maintenance-model`
|
| 108 |
+
- `HF_SPACE_REPO` – e.g., `your-username/engine-maintenance-space`
|
| 109 |
+
|
| 110 |
+
**C. Update README with GitHub Repo URL:**
|
| 111 |
+
Add this section to your README (or in your final notebook):
|
| 112 |
+
```markdown
|
| 113 |
+
## GitHub Repository
|
| 114 |
+
- **Repository URL**: https://github.com/your-username/engine-predictive-maintenance
|
| 115 |
+
- **GitHub Actions**: https://github.com/your-username/engine-predictive-maintenance/actions
|
| 116 |
+
```
|
| 117 |
+
|
| 118 |
+
Once these values are set, you can run the scripts locally and/or via GitHub Actions to produce outputs that cover all the rubric sections (data registration, EDA, data prep, modeling, deployment, and automated workflow).
|
| 119 |
+
|
| 120 |
+
**📖 For detailed setup instructions, see `CONFIGURATION_GUIDE.md`**
|
UPLOAD_GUIDE.md
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# File Upload Guide: Where Each File Goes
|
| 2 |
+
|
| 3 |
+
This guide shows exactly which files are uploaded to which location (Hugging Face or GitHub) and when.
|
| 4 |
+
|
| 5 |
+
---
|
| 6 |
+
|
| 7 |
+
## Overview: Four Upload Destinations
|
| 8 |
+
|
| 9 |
+
1. **Hugging Face Dataset Repo** (`ananttripathiak/engine-maintenance-dataset`)
|
| 10 |
+
2. **Hugging Face Model Repo** (`ananttripathiak/engine-maintenance-model`)
|
| 11 |
+
3. **Hugging Face Space** (`ananttripathiak/engine-maintenance-space`)
|
| 12 |
+
4. **GitHub Repository** (`ananttripathi/engine-predictive-maintenance`)
|
| 13 |
+
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
## 1. Hugging Face Dataset Repo
|
| 17 |
+
|
| 18 |
+
**Repo ID**: `ananttripathiak/engine-maintenance-dataset`
|
| 19 |
+
**Created by**: `src/data_register.py` and `src/data_prep.py`
|
| 20 |
+
|
| 21 |
+
### Files Uploaded:
|
| 22 |
+
|
| 23 |
+
#### A. Raw Data (via `src/data_register.py`)
|
| 24 |
+
- **File**: `data/engine_data.csv`
|
| 25 |
+
- **Uploaded to**: `data/engine_data.csv` in the dataset repo
|
| 26 |
+
- **When**: Run `python src/data_register.py`
|
| 27 |
+
|
| 28 |
+
#### B. Processed Data (via `src/data_prep.py`)
|
| 29 |
+
- **File**: `data/processed/train.csv`
|
| 30 |
+
- **Uploaded to**: `data/train.csv` in the dataset repo
|
| 31 |
+
- **When**: Run `python src/data_prep.py`
|
| 32 |
+
|
| 33 |
+
- **File**: `data/processed/test.csv`
|
| 34 |
+
- **Uploaded to**: `data/test.csv` in the dataset repo
|
| 35 |
+
- **When**: Run `python src/data_prep.py`
|
| 36 |
+
|
| 37 |
+
**Scripts that upload here:**
|
| 38 |
+
- `src/data_register.py` → uploads raw data
|
| 39 |
+
- `src/data_prep.py` → uploads train/test splits
|
| 40 |
+
|
| 41 |
+
---
|
| 42 |
+
|
| 43 |
+
## 2. Hugging Face Model Repo
|
| 44 |
+
|
| 45 |
+
**Repo ID**: `ananttripathiak/engine-maintenance-model`
|
| 46 |
+
**Created by**: `src/train.py`
|
| 47 |
+
|
| 48 |
+
### Files Uploaded:
|
| 49 |
+
|
| 50 |
+
- **File**: `models/best_model.joblib`
|
| 51 |
+
- **Uploaded to**: `model.joblib` in the model repo
|
| 52 |
+
- **When**: Run `python src/train.py` (after training completes)
|
| 53 |
+
|
| 54 |
+
**Scripts that upload here:**
|
| 55 |
+
- `src/train.py` → uploads the trained model
|
| 56 |
+
|
| 57 |
+
---
|
| 58 |
+
|
| 59 |
+
## 3. Hugging Face Space (Streamlit App)
|
| 60 |
+
|
| 61 |
+
**Repo ID**: `ananttripathiak/engine-maintenance-space`
|
| 62 |
+
**Created by**: `src/deploy_to_hf.py`
|
| 63 |
+
|
| 64 |
+
### Files Uploaded:
|
| 65 |
+
|
| 66 |
+
The `src/deploy_to_hf.py` script uploads the entire project folder **except**:
|
| 67 |
+
- `data/` (ignored - too large)
|
| 68 |
+
- `mlruns/` (ignored - MLflow tracking data)
|
| 69 |
+
- `models/` (ignored - model is in model repo)
|
| 70 |
+
- `.github/` (ignored - GitHub-specific)
|
| 71 |
+
|
| 72 |
+
**Files that ARE uploaded to Space:**
|
| 73 |
+
- `src/app.py` ← **Main Streamlit app**
|
| 74 |
+
- `src/inference.py` ← Inference utilities
|
| 75 |
+
- `src/config.py` ← Configuration
|
| 76 |
+
- `Dockerfile` ← Container definition
|
| 77 |
+
- `requirements.txt` ← Python dependencies
|
| 78 |
+
- `README.md` ← Documentation
|
| 79 |
+
- Other `src/*.py` files (if needed by app)
|
| 80 |
+
|
| 81 |
+
**Scripts that upload here:**
|
| 82 |
+
- `src/deploy_to_hf.py` → uploads deployment files
|
| 83 |
+
|
| 84 |
+
---
|
| 85 |
+
|
| 86 |
+
## 4. GitHub Repository
|
| 87 |
+
|
| 88 |
+
**Repo URL**: `https://github.com/ananttripathi/engine-predictive-maintenance`
|
| 89 |
+
**Created by**: You (manually via `git push`)
|
| 90 |
+
|
| 91 |
+
### Files Uploaded:
|
| 92 |
+
|
| 93 |
+
**Everything in the `mlops/` folder**, including:
|
| 94 |
+
- ✅ `data/` (including `engine_data.csv`, `processed/train.csv`, `processed/test.csv`)
|
| 95 |
+
- ✅ `src/` (all Python scripts)
|
| 96 |
+
- ✅ `notebooks/` (EDA notebooks, etc.)
|
| 97 |
+
- ✅ `.github/workflows/pipeline.yml` ← **GitHub Actions workflow**
|
| 98 |
+
- ✅ `requirements.txt`
|
| 99 |
+
- ✅ `Dockerfile`
|
| 100 |
+
- ✅ `README.md`
|
| 101 |
+
- ✅ `models/` (if you want to track model versions in git)
|
| 102 |
+
- ✅ `mlruns/` (MLflow tracking data - optional)
|
| 103 |
+
- ✅ All other project files
|
| 104 |
+
|
| 105 |
+
**How to upload:**
|
| 106 |
+
```bash
|
| 107 |
+
cd /Users/ananttripathi/Desktop/mlops
|
| 108 |
+
git init
|
| 109 |
+
git add .
|
| 110 |
+
git commit -m "Initial commit: Predictive maintenance MLOps pipeline"
|
| 111 |
+
git remote add origin https://github.com/ananttripathi/engine-predictive-maintenance.git
|
| 112 |
+
git push -u origin main
|
| 113 |
+
```
|
| 114 |
+
|
| 115 |
+
---
|
| 116 |
+
|
| 117 |
+
## Upload Workflow Summary
|
| 118 |
+
|
| 119 |
+
### Step-by-Step Upload Process:
|
| 120 |
+
|
| 121 |
+
1. **Data Registration** → Hugging Face Dataset Repo
|
| 122 |
+
```bash
|
| 123 |
+
python src/data_register.py
|
| 124 |
+
```
|
| 125 |
+
- Uploads: `data/engine_data.csv` → HF Dataset Repo
|
| 126 |
+
|
| 127 |
+
2. **Data Preparation** → Hugging Face Dataset Repo
|
| 128 |
+
```bash
|
| 129 |
+
python src/data_prep.py
|
| 130 |
+
```
|
| 131 |
+
- Uploads: `data/processed/train.csv` and `test.csv` → HF Dataset Repo
|
| 132 |
+
|
| 133 |
+
3. **Model Training** → Hugging Face Model Repo
|
| 134 |
+
```bash
|
| 135 |
+
python src/train.py
|
| 136 |
+
```
|
| 137 |
+
- Uploads: `models/best_model.joblib` → HF Model Repo
|
| 138 |
+
|
| 139 |
+
4. **Deploy App** → Hugging Face Space
|
| 140 |
+
```bash
|
| 141 |
+
python src/deploy_to_hf.py
|
| 142 |
+
```
|
| 143 |
+
- Uploads: `src/app.py`, `Dockerfile`, `requirements.txt`, etc. → HF Space
|
| 144 |
+
|
| 145 |
+
5. **Push to GitHub** → GitHub Repository
|
| 146 |
+
```bash
|
| 147 |
+
git add .
|
| 148 |
+
git commit -m "Complete MLOps pipeline"
|
| 149 |
+
git push origin main
|
| 150 |
+
```
|
| 151 |
+
- Uploads: Everything → GitHub Repo
|
| 152 |
+
|
| 153 |
+
---
|
| 154 |
+
|
| 155 |
+
## What Gets Uploaded Automatically vs Manually
|
| 156 |
+
|
| 157 |
+
### Automatic (via Scripts):
|
| 158 |
+
- ✅ Hugging Face Dataset Repo → `src/data_register.py` and `src/data_prep.py`
|
| 159 |
+
- ✅ Hugging Face Model Repo → `src/train.py`
|
| 160 |
+
- ✅ Hugging Face Space → `src/deploy_to_hf.py`
|
| 161 |
+
- ✅ GitHub Actions → Runs automatically when you push to GitHub
|
| 162 |
+
|
| 163 |
+
### Manual:
|
| 164 |
+
- ⚠️ **GitHub Repository** → You need to run `git push` yourself
|
| 165 |
+
|
| 166 |
+
---
|
| 167 |
+
|
| 168 |
+
## File Size Considerations
|
| 169 |
+
|
| 170 |
+
### Large Files (may be ignored):
|
| 171 |
+
- `data/engine_data.csv` → Uploaded to HF Dataset, but you might want to add to `.gitignore` for GitHub
|
| 172 |
+
- `mlruns/` → MLflow tracking data (can be large) - ignored by HF Space deploy
|
| 173 |
+
- `models/best_model.joblib` → Uploaded to HF Model Repo, but you might want to add to `.gitignore` for GitHub
|
| 174 |
+
|
| 175 |
+
### Recommended `.gitignore`:
|
| 176 |
+
```
|
| 177 |
+
# Large data files
|
| 178 |
+
data/*.csv
|
| 179 |
+
data/processed/*.csv
|
| 180 |
+
|
| 181 |
+
# MLflow tracking
|
| 182 |
+
mlruns/
|
| 183 |
+
|
| 184 |
+
# Model files (already in HF Model Repo)
|
| 185 |
+
models/*.joblib
|
| 186 |
+
|
| 187 |
+
# Python cache
|
| 188 |
+
__pycache__/
|
| 189 |
+
*.pyc
|
| 190 |
+
.venv/
|
| 191 |
+
```
|
| 192 |
+
|
| 193 |
+
---
|
| 194 |
+
|
| 195 |
+
## Quick Reference Table
|
| 196 |
+
|
| 197 |
+
| File/Folder | HF Dataset | HF Model | HF Space | GitHub |
|
| 198 |
+
|------------|------------|----------|----------|--------|
|
| 199 |
+
| `data/engine_data.csv` | ✅ | ❌ | ❌ | ⚠️ Optional |
|
| 200 |
+
| `data/processed/train.csv` | ✅ | ❌ | ❌ | ⚠️ Optional |
|
| 201 |
+
| `data/processed/test.csv` | ✅ | ❌ | ❌ | ⚠️ Optional |
|
| 202 |
+
| `models/best_model.joblib` | ❌ | ✅ | ❌ | ⚠️ Optional |
|
| 203 |
+
| `src/app.py` | ❌ | ❌ | ✅ | ✅ |
|
| 204 |
+
| `src/train.py` | ❌ | ❌ | ❌ | ✅ |
|
| 205 |
+
| `src/data_prep.py` | ❌ | ❌ | ❌ | ✅ |
|
| 206 |
+
| `Dockerfile` | ❌ | ❌ | ✅ | ✅ |
|
| 207 |
+
| `requirements.txt` | ❌ | ❌ | ✅ | ✅ |
|
| 208 |
+
| `.github/workflows/pipeline.yml` | ❌ | ❌ | ❌ | ✅ |
|
| 209 |
+
| `README.md` | ❌ | ❌ | ✅ | ✅ |
|
| 210 |
+
|
| 211 |
+
**Legend:**
|
| 212 |
+
- ✅ = Uploaded automatically or should be uploaded
|
| 213 |
+
- ❌ = Not uploaded to this location
|
| 214 |
+
- ⚠️ Optional = Can be uploaded but might want to exclude from GitHub due to size
|
| 215 |
+
|
| 216 |
+
---
|
| 217 |
+
|
| 218 |
+
## Need Help?
|
| 219 |
+
|
| 220 |
+
- **Hugging Face Dataset**: Check `src/hf_data_utils.py`
|
| 221 |
+
- **Hugging Face Model**: Check `src/hf_model_utils.py`
|
| 222 |
+
- **Hugging Face Space**: Check `src/deploy_to_hf.py`
|
| 223 |
+
- **GitHub**: Standard git commands
|
USERNAME_SUMMARY.md
ADDED
|
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Username Configuration Summary
|
| 2 |
+
|
| 3 |
+
## ✅ Updated Configuration
|
| 4 |
+
|
| 5 |
+
All files have been updated with the correct usernames:
|
| 6 |
+
|
| 7 |
+
- **Hugging Face Username**: `ananttripathiak` (with "ak" at the end)
|
| 8 |
+
- **GitHub Username**: `ananttripathi` (no "ak")
|
| 9 |
+
|
| 10 |
+
## Files Updated
|
| 11 |
+
|
| 12 |
+
### 1. `src/config.py` ✅
|
| 13 |
+
- **HF_DATASET_REPO**: `ananttripathiak/engine-maintenance-dataset`
|
| 14 |
+
- **HF_MODEL_REPO**: `ananttripathiak/engine-maintenance-model`
|
| 15 |
+
- **HF_SPACE_REPO**: `ananttripathiak/engine-maintenance-space`
|
| 16 |
+
|
| 17 |
+
### 2. `README.md` ✅
|
| 18 |
+
- Updated all Hugging Face repo examples to use `ananttripathiak`
|
| 19 |
+
- GitHub repository URLs use `ananttripathi`
|
| 20 |
+
|
| 21 |
+
### 3. `CONFIGURATION_GUIDE.md` ✅
|
| 22 |
+
- Updated examples to show `ananttripathiak` for HF repos
|
| 23 |
+
|
| 24 |
+
### 4. `GITHUB_REPO_INFO.md` ✅
|
| 25 |
+
- Updated GitHub secrets examples to use `ananttripathiak` for HF repos
|
| 26 |
+
- GitHub URLs use `ananttripathi`
|
| 27 |
+
|
| 28 |
+
## What You Need to Do
|
| 29 |
+
|
| 30 |
+
### 1. GitHub Secrets (in your GitHub repo settings)
|
| 31 |
+
|
| 32 |
+
When you add secrets to your GitHub repository, use these exact values:
|
| 33 |
+
|
| 34 |
+
- **`HF_TOKEN`**: Your Hugging Face access token
|
| 35 |
+
- **`HF_DATASET_REPO`**: `ananttripathiak/engine-maintenance-dataset`
|
| 36 |
+
- **`HF_MODEL_REPO`**: `ananttripathiak/engine-maintenance-model`
|
| 37 |
+
- **`HF_SPACE_REPO`**: `ananttripathiak/engine-maintenance-space`
|
| 38 |
+
|
| 39 |
+
### 2. Local Environment Variables (optional)
|
| 40 |
+
|
| 41 |
+
If you want to override config.py, set these:
|
| 42 |
+
|
| 43 |
+
```bash
|
| 44 |
+
export HF_TOKEN="hf_your_token_here"
|
| 45 |
+
export HF_DATASET_REPO="ananttripathiak/engine-maintenance-dataset"
|
| 46 |
+
export HF_MODEL_REPO="ananttripathiak/engine-maintenance-model"
|
| 47 |
+
export HF_SPACE_REPO="ananttripathiak/engine-maintenance-space"
|
| 48 |
+
```
|
| 49 |
+
|
| 50 |
+
### 3. GitHub Repository URLs
|
| 51 |
+
|
| 52 |
+
Your GitHub repository will be:
|
| 53 |
+
- **Repository**: `https://github.com/ananttripathi/engine-predictive-maintenance`
|
| 54 |
+
- **Actions**: `https://github.com/ananttripathi/engine-predictive-maintenance/actions`
|
| 55 |
+
|
| 56 |
+
### 4. Hugging Face Repos (will be created automatically)
|
| 57 |
+
|
| 58 |
+
After running the scripts, these will be created:
|
| 59 |
+
- **Dataset**: `https://huggingface.co/datasets/ananttripathiak/engine-maintenance-dataset`
|
| 60 |
+
- **Model**: `https://huggingface.co/ananttripathiak/engine-maintenance-model`
|
| 61 |
+
- **Space**: `https://huggingface.co/spaces/ananttripathiak/engine-maintenance-space`
|
| 62 |
+
|
| 63 |
+
## Verification
|
| 64 |
+
|
| 65 |
+
All configuration files are now correctly set. You can proceed with:
|
| 66 |
+
1. Running the scripts locally
|
| 67 |
+
2. Pushing to GitHub
|
| 68 |
+
3. Setting up GitHub secrets
|
| 69 |
+
4. Running the GitHub Actions workflow
|
WHAT_TO_UPLOAD.md
ADDED
|
@@ -0,0 +1,265 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# What to Upload: GitHub vs Hugging Face
|
| 2 |
+
|
| 3 |
+
## Quick Summary
|
| 4 |
+
|
| 5 |
+
| Destination | What to Upload | How |
|
| 6 |
+
|------------|----------------|-----|
|
| 7 |
+
| **GitHub** | **Everything** (entire project folder) | `git push` |
|
| 8 |
+
| **HF Dataset** | Raw data + train/test splits | Scripts auto-upload |
|
| 9 |
+
| **HF Model** | Trained model file | Scripts auto-upload |
|
| 10 |
+
| **HF Space** | App files (code, Dockerfile, requirements) | Scripts auto-upload |
|
| 11 |
+
|
| 12 |
+
---
|
| 13 |
+
|
| 14 |
+
## 📦 GitHub Repository
|
| 15 |
+
|
| 16 |
+
### Upload: **Everything in the `mlops/` folder**
|
| 17 |
+
|
| 18 |
+
**What to include:**
|
| 19 |
+
```
|
| 20 |
+
mlops/
|
| 21 |
+
├── data/
|
| 22 |
+
│ ├── engine_data.csv ✅ Upload
|
| 23 |
+
│ └── processed/
|
| 24 |
+
│ ├── train.csv ✅ Upload
|
| 25 |
+
│ └── test.csv ✅ Upload
|
| 26 |
+
├── src/
|
| 27 |
+
│ ├── app.py ✅ Upload
|
| 28 |
+
│ ├── config.py ✅ Upload
|
| 29 |
+
│ ├── data_prep.py ✅ Upload
|
| 30 |
+
│ ├── data_register.py ✅ Upload
|
| 31 |
+
│ ├── train.py ✅ Upload
|
| 32 |
+
│ ├── inference.py ✅ Upload
|
| 33 |
+
│ ├── deploy_to_hf.py ✅ Upload
|
| 34 |
+
│ ├── eda.py ✅ Upload
|
| 35 |
+
│ ├── hf_data_utils.py ✅ Upload
|
| 36 |
+
│ └── hf_model_utils.py ✅ Upload
|
| 37 |
+
├── notebooks/ ✅ Upload (if you have EDA notebooks)
|
| 38 |
+
├── models/
|
| 39 |
+
│ └── best_model.joblib ⚠️ Optional (large file)
|
| 40 |
+
├── .github/
|
| 41 |
+
│ └── workflows/
|
| 42 |
+
│ └── pipeline.yml ✅ Upload (IMPORTANT!)
|
| 43 |
+
├── requirements.txt ✅ Upload
|
| 44 |
+
├── Dockerfile ✅ Upload
|
| 45 |
+
├── README.md ✅ Upload
|
| 46 |
+
└── *.md files ✅ Upload (documentation)
|
| 47 |
+
|
| 48 |
+
❌ DON'T upload:
|
| 49 |
+
├── .venv/ ❌ Skip (virtual environment)
|
| 50 |
+
├── __pycache__/ ❌ Skip (Python cache)
|
| 51 |
+
├── mlruns/ ❌ Skip (MLflow tracking - can be large)
|
| 52 |
+
└── .git/ ❌ Skip (git metadata)
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
### How to Upload to GitHub:
|
| 56 |
+
|
| 57 |
+
```bash
|
| 58 |
+
cd /Users/ananttripathi/Desktop/mlops
|
| 59 |
+
|
| 60 |
+
# Initialize git (if not already done)
|
| 61 |
+
git init
|
| 62 |
+
|
| 63 |
+
# Create .gitignore to exclude large/unnecessary files
|
| 64 |
+
cat > .gitignore << EOF
|
| 65 |
+
.venv/
|
| 66 |
+
__pycache__/
|
| 67 |
+
*.pyc
|
| 68 |
+
*.pyo
|
| 69 |
+
*.pyd
|
| 70 |
+
.Python
|
| 71 |
+
mlruns/
|
| 72 |
+
*.log
|
| 73 |
+
.DS_Store
|
| 74 |
+
EOF
|
| 75 |
+
|
| 76 |
+
# Add all files
|
| 77 |
+
git add .
|
| 78 |
+
|
| 79 |
+
# Commit
|
| 80 |
+
git commit -m "Initial commit: Predictive Maintenance MLOps Pipeline"
|
| 81 |
+
|
| 82 |
+
# Add remote (replace with your repo URL)
|
| 83 |
+
git remote add origin https://github.com/ananttripathi/engine-predictive-maintenance.git
|
| 84 |
+
|
| 85 |
+
# Push
|
| 86 |
+
git push -u origin main
|
| 87 |
+
```
|
| 88 |
+
|
| 89 |
+
---
|
| 90 |
+
|
| 91 |
+
## 🤗 Hugging Face - Dataset Repo
|
| 92 |
+
|
| 93 |
+
**Repo**: `ananttripathiak/engine-maintenance-dataset`
|
| 94 |
+
|
| 95 |
+
### Upload: **Data files only**
|
| 96 |
+
|
| 97 |
+
**Files uploaded automatically by scripts:**
|
| 98 |
+
- `data/engine_data.csv` → Uploaded as `data/engine_data.csv`
|
| 99 |
+
- `data/processed/train.csv` → Uploaded as `data/train.csv`
|
| 100 |
+
- `data/processed/test.csv` → Uploaded as `data/test.csv`
|
| 101 |
+
|
| 102 |
+
### How to Upload:
|
| 103 |
+
|
| 104 |
+
**Option 1: Run the scripts (automatic)**
|
| 105 |
+
```bash
|
| 106 |
+
# Step 1: Register raw data
|
| 107 |
+
python src/data_register.py
|
| 108 |
+
|
| 109 |
+
# Step 2: Prepare and upload train/test
|
| 110 |
+
python src/data_prep.py
|
| 111 |
+
```
|
| 112 |
+
|
| 113 |
+
**Option 2: Manual upload via HF Hub**
|
| 114 |
+
- Go to https://huggingface.co/datasets/ananttripathiak/engine-maintenance-dataset
|
| 115 |
+
- Click "Add file" → Upload `data/engine_data.csv`
|
| 116 |
+
- Upload `data/processed/train.csv` and `test.csv`
|
| 117 |
+
|
| 118 |
+
---
|
| 119 |
+
|
| 120 |
+
## 🤗 Hugging Face - Model Repo
|
| 121 |
+
|
| 122 |
+
**Repo**: `ananttripathiak/engine-maintenance-model`
|
| 123 |
+
|
| 124 |
+
### Upload: **Trained model file only**
|
| 125 |
+
|
| 126 |
+
**File uploaded automatically:**
|
| 127 |
+
- `models/best_model.joblib` → Uploaded as `model.joblib`
|
| 128 |
+
|
| 129 |
+
### How to Upload:
|
| 130 |
+
|
| 131 |
+
**Option 1: Run the training script (automatic)**
|
| 132 |
+
```bash
|
| 133 |
+
python src/train.py
|
| 134 |
+
# This will:
|
| 135 |
+
# 1. Train the model
|
| 136 |
+
# 2. Save to models/best_model.joblib
|
| 137 |
+
# 3. Upload to HF Model Repo automatically
|
| 138 |
+
```
|
| 139 |
+
|
| 140 |
+
**Option 2: Manual upload via HF Hub**
|
| 141 |
+
- Go to https://huggingface.co/ananttripathiak/engine-maintenance-model
|
| 142 |
+
- Click "Add file" → Upload `models/best_model.joblib`
|
| 143 |
+
- Rename it to `model.joblib` in the repo
|
| 144 |
+
|
| 145 |
+
---
|
| 146 |
+
|
| 147 |
+
## 🤗 Hugging Face - Space (Streamlit App)
|
| 148 |
+
|
| 149 |
+
**Repo**: `ananttripathiak/engine-maintenance-space`
|
| 150 |
+
|
| 151 |
+
### Upload: **App deployment files**
|
| 152 |
+
|
| 153 |
+
**Files uploaded automatically by script:**
|
| 154 |
+
```
|
| 155 |
+
✅ src/app.py (Main Streamlit app)
|
| 156 |
+
✅ src/inference.py (Inference utilities)
|
| 157 |
+
✅ src/config.py (Configuration)
|
| 158 |
+
✅ Dockerfile (Container definition)
|
| 159 |
+
✅ requirements.txt (Dependencies)
|
| 160 |
+
✅ README.md (Documentation)
|
| 161 |
+
✅ Other src/*.py files (If needed by app)
|
| 162 |
+
|
| 163 |
+
❌ NOT uploaded (ignored):
|
| 164 |
+
├── data/ ❌ Too large
|
| 165 |
+
├── mlruns/ ❌ MLflow tracking
|
| 166 |
+
├── models/ ❌ Model is in Model Repo
|
| 167 |
+
├── .github/ ❌ GitHub-specific
|
| 168 |
+
└── .venv/ ❌ Virtual environment
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
### How to Upload:
|
| 172 |
+
|
| 173 |
+
**Option 1: Run the deployment script (automatic)**
|
| 174 |
+
```bash
|
| 175 |
+
python src/deploy_to_hf.py
|
| 176 |
+
# This will:
|
| 177 |
+
# 1. Create/update the HF Space
|
| 178 |
+
# 2. Upload all deployment files
|
| 179 |
+
# 3. Configure it as a Streamlit app
|
| 180 |
+
```
|
| 181 |
+
|
| 182 |
+
**Option 2: Manual upload via HF Hub**
|
| 183 |
+
- Go to https://huggingface.co/spaces/ananttripathiak/engine-maintenance-space
|
| 184 |
+
- Upload files one by one or use HF CLI
|
| 185 |
+
|
| 186 |
+
---
|
| 187 |
+
|
| 188 |
+
## 📋 Complete Upload Checklist
|
| 189 |
+
|
| 190 |
+
### ✅ Step 1: GitHub (Manual - Do First)
|
| 191 |
+
- [ ] Create GitHub repository
|
| 192 |
+
- [ ] Add `.gitignore` file
|
| 193 |
+
- [ ] Push entire `mlops/` folder to GitHub
|
| 194 |
+
- [ ] Add GitHub Secrets (HF_TOKEN, HF_DATASET_REPO, HF_MODEL_REPO, HF_SPACE_REPO)
|
| 195 |
+
|
| 196 |
+
### ✅ Step 2: Hugging Face Dataset (Automatic)
|
| 197 |
+
- [ ] Set `HF_TOKEN` environment variable
|
| 198 |
+
- [ ] Run `python src/data_register.py` (uploads raw data)
|
| 199 |
+
- [ ] Run `python src/data_prep.py` (uploads train/test)
|
| 200 |
+
|
| 201 |
+
### ✅ Step 3: Hugging Face Model (Automatic)
|
| 202 |
+
- [ ] Run `python src/train.py` (trains model and uploads to HF)
|
| 203 |
+
|
| 204 |
+
### ✅ Step 4: Hugging Face Space (Automatic)
|
| 205 |
+
- [ ] Run `python src/deploy_to_hf.py` (deploys app to HF Space)
|
| 206 |
+
|
| 207 |
+
### ✅ Step 5: Verify
|
| 208 |
+
- [ ] Check GitHub repo: https://github.com/ananttripathi/engine-predictive-maintenance
|
| 209 |
+
- [ ] Check HF Dataset: https://huggingface.co/datasets/ananttripathiak/engine-maintenance-dataset
|
| 210 |
+
- [ ] Check HF Model: https://huggingface.co/ananttripathiak/engine-maintenance-model
|
| 211 |
+
- [ ] Check HF Space: https://huggingface.co/spaces/ananttripathiak/engine-maintenance-space
|
| 212 |
+
|
| 213 |
+
---
|
| 214 |
+
|
| 215 |
+
## 🎯 Key Differences
|
| 216 |
+
|
| 217 |
+
| Aspect | GitHub | Hugging Face |
|
| 218 |
+
|--------|--------|--------------|
|
| 219 |
+
| **Purpose** | Code repository & version control | Data/Model/App hosting |
|
| 220 |
+
| **What** | Entire project (code, data, docs) | Specific artifacts (data/model/app) |
|
| 221 |
+
| **How** | Manual `git push` | Automatic via scripts |
|
| 222 |
+
| **Size** | Can be large (includes everything) | Optimized (only what's needed) |
|
| 223 |
+
| **Access** | Public/Private repo | Public datasets/models/spaces |
|
| 224 |
+
| **CI/CD** | GitHub Actions workflow | HF Spaces auto-deploy |
|
| 225 |
+
|
| 226 |
+
---
|
| 227 |
+
|
| 228 |
+
## 💡 Pro Tips
|
| 229 |
+
|
| 230 |
+
1. **GitHub First**: Always push to GitHub first, then run HF scripts
|
| 231 |
+
2. **Use .gitignore**: Exclude large files like `mlruns/` and `.venv/` from GitHub
|
| 232 |
+
3. **HF Scripts are Smart**: They automatically create repos if they don't exist
|
| 233 |
+
4. **Check File Sizes**: HF has file size limits, so scripts exclude large files
|
| 234 |
+
5. **GitHub Secrets**: Store HF credentials in GitHub Secrets for CI/CD
|
| 235 |
+
|
| 236 |
+
---
|
| 237 |
+
|
| 238 |
+
## 🚨 Common Mistakes to Avoid
|
| 239 |
+
|
| 240 |
+
❌ **Don't upload to GitHub:**
|
| 241 |
+
- Virtual environment (`.venv/`)
|
| 242 |
+
- Large MLflow runs (`mlruns/`)
|
| 243 |
+
- Python cache files (`__pycache__/`)
|
| 244 |
+
|
| 245 |
+
❌ **Don't upload to HF Space:**
|
| 246 |
+
- Raw data files (too large)
|
| 247 |
+
- Model files (use Model Repo instead)
|
| 248 |
+
- MLflow tracking data
|
| 249 |
+
|
| 250 |
+
✅ **Do upload to GitHub:**
|
| 251 |
+
- All source code
|
| 252 |
+
- Configuration files
|
| 253 |
+
- Documentation
|
| 254 |
+
- GitHub Actions workflow
|
| 255 |
+
|
| 256 |
+
✅ **Do upload to HF:**
|
| 257 |
+
- Only what each repo type needs (data → Dataset, model → Model, app → Space)
|
| 258 |
+
|
| 259 |
+
---
|
| 260 |
+
|
| 261 |
+
## 📞 Need Help?
|
| 262 |
+
|
| 263 |
+
- **GitHub Issues**: Check your repo settings and secrets
|
| 264 |
+
- **HF Upload Errors**: Verify `HF_TOKEN` is set correctly
|
| 265 |
+
- **File Size Issues**: Check HF file size limits (usually 10GB for datasets)
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy
|
| 2 |
+
pandas
|
| 3 |
+
scikit-learn
|
| 4 |
+
mlflow
|
| 5 |
+
huggingface_hub
|
| 6 |
+
streamlit
|
| 7 |
+
joblib
|
| 8 |
+
matplotlib
|
| 9 |
+
seaborn
|
| 10 |
+
plotly
|
| 11 |
+
|
src/app.py
ADDED
|
@@ -0,0 +1,387 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enhanced Streamlit application for engine predictive maintenance.
|
| 3 |
+
|
| 4 |
+
Intended to be run locally or deployed as a Hugging Face Space.
|
| 5 |
+
Features an interactive, modern UI with real-time predictions,
|
| 6 |
+
visualizations, and detailed insights.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
import os
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import plotly.graph_objects as go
|
| 14 |
+
import plotly.express as px
|
| 15 |
+
import streamlit as st
|
| 16 |
+
|
| 17 |
+
import config
|
| 18 |
+
from inference import predict_engine_condition
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def _get_default_source() -> str:
    """Pick the default model source for the UI.

    Returns ``"hf"`` when both a Hugging Face token and a model repo id are
    configured (so the model can be pulled from the Hub), otherwise
    ``"local"`` to load the model file from disk.
    """
    has_hub_access = bool(config.HF_TOKEN) and bool(config.HF_MODEL_REPO)
    return "hf" if has_hub_access else "local"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def create_gauge_chart(value: float, title: str, color: str) -> go.Figure:
    """Build a compact Plotly gauge indicator for a single sensor reading.

    Args:
        value: Current reading, displayed on a 0-100 gauge axis.
        title: Label rendered above the gauge.
        color: Fill color of the gauge bar.

    Returns:
        A ``go.Figure`` containing one gauge+number indicator.
    """
    gauge_spec = {
        'axis': {'range': [None, 100]},
        'bar': {'color': color},
        # Gray bands give a quick visual sense of low / mid ranges.
        'steps': [
            {'range': [0, 50], 'color': "lightgray"},
            {'range': [50, 80], 'color': "gray"}
        ],
        # Red threshold line flags readings near the top of the scale.
        'threshold': {
            'line': {'color': "red", 'width': 4},
            'thickness': 0.75,
            'value': 90
        }
    }
    indicator = go.Indicator(
        mode="gauge+number",
        value=value,
        domain={'x': [0, 1], 'y': [0, 1]},
        title={'text': title, 'font': {'size': 16}},
        gauge=gauge_spec,
    )
    fig = go.Figure(indicator)
    fig.update_layout(height=200, margin=dict(l=20, r=20, t=40, b=20))
    return fig
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def create_sensor_comparison_chart(sensor_data: dict) -> go.Figure:
    """Render a radar (polar) chart of all sensor readings.

    Each reading is projected onto a common 0-100 scale using a per-sensor
    maximum, so sensors with very different units can share one axis.

    Args:
        sensor_data: Mapping of sensor name -> current reading.

    Returns:
        A ``go.Figure`` with a single closed ``Scatterpolar`` trace.
    """
    # Per-sensor maxima used to normalize every reading to 0-100.
    # Sensors missing from this table fall back to a maximum of 100.
    scale_limits = {
        "Engine_RPM": 4000,
        "Lub_Oil_Pressure": 10,
        "Fuel_Pressure": 30,
        "Coolant_Pressure": 10,
        "Lub_Oil_Temperature": 150,
        "Coolant_Temperature": 150,
    }

    names = list(sensor_data.keys())
    scaled = [
        (sensor_data[name] / scale_limits.get(name, 100)) * 100
        for name in names
    ]
    labels = [name.replace("_", " ") for name in names]

    # Repeat the first point so the radar polygon closes on itself.
    trace = go.Scatterpolar(
        r=scaled + scaled[:1],
        theta=labels + labels[:1],
        fill='toself',
        name='Current Readings',
        line_color='#1f77b4',
    )

    fig = go.Figure()
    fig.add_trace(trace)
    fig.update_layout(
        polar=dict(
            radialaxis=dict(
                visible=True,
                range=[0, 100]
            )),
        showlegend=True,
        height=400,
        title="Sensor Readings Overview"
    )
    return fig
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
def main() -> None:
    """Run the Streamlit predictive-maintenance dashboard.

    Page flow: global page config and CSS, a sidebar (model source,
    model-availability status, about text), a two-column body with the
    sensor input form on the left and a live radar visualization on the
    right, and — once the form is submitted — the model prediction with
    a fault-risk gauge and probability metrics.
    """
    # MUST be the first Streamlit command executed in the script.
    st.set_page_config(
        page_title="Engine Predictive Maintenance",
        page_icon="🛠️",
        layout="wide",
        initial_sidebar_state="expanded"
    )

    # Custom CSS for better styling
    st.markdown("""
    <style>
    .main-header {
        font-size: 2.5rem;
        font-weight: bold;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 1rem;
    }
    .metric-card {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 1.5rem;
        border-radius: 10px;
        color: white;
        text-align: center;
        margin: 0.5rem 0;
    }
    .prediction-box {
        padding: 2rem;
        border-radius: 15px;
        margin: 1.5rem 0;
        box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
    }
    .success-box {
        background: linear-gradient(135deg, #11998e 0%, #38ef7d 100%);
        color: white;
    }
    .warning-box {
        background: linear-gradient(135deg, #ee0979 0%, #ff6a00 100%);
        color: white;
    }
    .stSlider > div > div > div {
        background-color: #1f77b4;
    }
    .stButton > button {
        width: 100%;
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white;
        font-weight: bold;
        font-size: 1.2rem;
        padding: 0.75rem;
        border-radius: 10px;
        border: none;
        transition: all 0.3s;
    }
    .stButton > button:hover {
        transform: scale(1.05);
        box-shadow: 0 5px 15px rgba(0, 0, 0, 0.3);
    }
    </style>
    """, unsafe_allow_html=True)

    # Compact header
    st.markdown('<h1 style="font-size: 2rem; text-align: center; color: #1f77b4; margin-bottom: 0.5rem;">🛠️ Engine Predictive Maintenance</h1>', unsafe_allow_html=True)
    st.markdown('<p style="text-align: center; color: #666; margin-bottom: 1rem; font-size: 0.9rem;">AI-Powered Engine Health Monitoring & Failure Prediction</p>', unsafe_allow_html=True)

    # Sidebar: model source, model status, and about text
    with st.sidebar:
        # BUG FIX: header text contained mojibake (corrupted emoji bytes).
        st.header("⚙️ Configuration")

        default_source = _get_default_source()
        # BUG FIX: options[0] is "local" and options[1] is "hf", so the
        # preselected index must be 1 when the default source is "hf"
        # (previous code inverted this and preselected the wrong option).
        source = st.radio(
            "📦 Model Source:",
            options=["local", "hf"],
            index=1 if default_source == "hf" else 0,
            format_func=lambda x: "🤖 Hugging Face Hub" if x == "hf" else "💾 Local File",
            help="Select where to load the trained model from"
        )

        st.markdown("---")

        st.header("📊 Quick Stats")
        if os.path.exists(config.BEST_MODEL_LOCAL_PATH):
            st.success("✅ Model Available")
            st.caption("Trained model found locally")
        else:
            st.warning("⚠️ Model Not Found")
            st.caption("Run training script first")

        st.markdown("---")

        st.header("ℹ️ About")
        st.markdown("""
        This application uses machine learning to predict engine failures based on:
        - Engine RPM
        - Oil & Fuel Pressures
        - Coolant Pressure
        - Temperature Readings

        **Status**: 0 = Normal | 1 = Requires Maintenance
        """)

        st.markdown("---")
        st.caption("Built with ❤️ using Streamlit & Scikit-learn")

    # Balanced layout - inputs on left, larger visualization on right
    col_input, col_viz = st.columns([1, 1.2])

    with col_input:
        # Input form; widget values are also read below in col_viz (Streamlit
        # executes the script top-to-bottom, so the names are in scope there).
        with st.form(key="engine_form", clear_on_submit=False):
            st.markdown("### 🔧 Sensor Inputs")

            # Two columns of inputs inside the form
            col_a, col_b = st.columns(2)

            with col_a:
                engine_rpm = st.number_input(
                    "⚙️ Engine RPM",
                    min_value=0.0,
                    max_value=4000.0,
                    value=800.0,
                    step=10.0,
                    help="Revolutions per minute"
                )
                lub_oil_pressure = st.number_input(
                    "🛢️ Lub Oil Pressure",
                    min_value=0.0,
                    max_value=10.0,
                    value=3.0,
                    step=0.1,
                    help="bar/kPa"
                )
                fuel_pressure = st.number_input(
                    "⛽ Fuel Pressure",
                    min_value=0.0,
                    max_value=30.0,
                    value=10.0,
                    step=0.1,
                    help="bar/kPa"
                )

            with col_b:
                coolant_pressure = st.number_input(
                    "💧 Coolant Pressure",
                    min_value=0.0,
                    max_value=10.0,
                    value=2.0,
                    step=0.1,
                    help="bar/kPa"
                )
                lub_oil_temp = st.number_input(
                    "🌡️ Lub Oil Temp",
                    min_value=0.0,
                    max_value=150.0,
                    value=80.0,
                    step=0.5,
                    help="°C"
                )
                coolant_temp = st.number_input(
                    "🌡️ Coolant Temp",
                    min_value=0.0,
                    max_value=150.0,
                    value=80.0,
                    step=0.5,
                    help="°C"
                )

            submitted = st.form_submit_button("🚀 Predict Engine Condition", use_container_width=True)

    with col_viz:
        st.markdown("### 📊 Sensor Visualization")

        # Real-time radar view of the current sensor inputs
        sensor_data = {
            "Engine_RPM": engine_rpm,
            "Lub_Oil_Pressure": lub_oil_pressure,
            "Fuel_Pressure": fuel_pressure,
            "Coolant_Pressure": coolant_pressure,
            "Lub_Oil_Temperature": lub_oil_temp,
            "Coolant_Temperature": coolant_temp,
        }

        # Enlarge the radar chart for readability in the wide column
        radar_fig = create_sensor_comparison_chart(sensor_data)
        radar_fig.update_layout(height=450, margin=dict(l=40, r=40, t=50, b=40))
        st.plotly_chart(radar_fig, use_container_width=True, config={'displayModeBar': False})

    # Prediction results (only rendered after the form is submitted)
    if submitted:
        inputs = {
            "Engine_RPM": engine_rpm,
            "Lub_Oil_Pressure": lub_oil_pressure,
            "Fuel_Pressure": fuel_pressure,
            "Coolant_Pressure": coolant_pressure,
            "Lub_Oil_Temperature": lub_oil_temp,
            "Coolant_Temperature": coolant_temp,
        }

        with st.spinner("🤖 Loading model and analyzing sensor data..."):
            try:
                result = predict_engine_condition(inputs=inputs, source=source)
            except Exception as e:
                # Surface actionable troubleshooting steps instead of a traceback
                st.error(
                    f"❌ **Prediction Failed**\n\n"
                    f"Error: {str(e)}\n\n"
                    f"**Troubleshooting:**\n"
                    f"- Ensure the model is trained: `python src/train.py`\n"
                    f"- Check model file exists: `models/best_model.joblib`\n"
                    f"- Verify HF credentials if using Hugging Face Hub"
                )
                return

        pred_label = result["prediction"]
        prob_faulty = result["probability_faulty"]
        prob_normal = 1 - prob_faulty

        # Compact results section
        st.markdown("---")

        result_col1, result_col2 = st.columns([1.5, 1])

        with result_col1:
            if pred_label == 1:
                st.markdown(
                    f'<div class="prediction-box warning-box" style="padding: 1rem;">'
                    f'<h2 style="color: white; margin: 0; font-size: 1.5rem;">🚨 MAINTENANCE REQUIRED</h2>'
                    f'<p style="font-size: 1.1rem; margin: 0.5rem 0;">Engine is <b>LIKELY FAULTY</b> - Fault Probability: <b>{prob_faulty:.1%}</b></p>'
                    f'</div>',
                    unsafe_allow_html=True
                )
                with st.expander("🔧 Recommended Actions", expanded=False):
                    st.markdown("""
                    - Schedule immediate engine inspection
                    - Verify sensor readings are accurate
                    - Review maintenance history
                    - Consult maintenance specialist
                    """)
            else:
                st.markdown(
                    f'<div class="prediction-box success-box" style="padding: 1rem;">'
                    f'<h2 style="color: white; margin: 0; font-size: 1.5rem;">✅ ENGINE HEALTHY</h2>'
                    f'<p style="font-size: 1.1rem; margin: 0.5rem 0;">Engine is <b>OPERATING NORMALLY</b> - Fault Probability: <b>{prob_faulty:.1%}</b></p>'
                    f'</div>',
                    unsafe_allow_html=True
                )
                st.success("✅ All sensors within normal ranges. Continue regular monitoring.")

        with result_col2:
            # Compact fault-probability gauge (0-100%)
            fig = go.Figure(go.Indicator(
                mode="gauge+number",
                value=prob_faulty * 100,
                domain={'x': [0, 1], 'y': [0, 1]},
                title={'text': "Fault Risk %", 'font': {'size': 16}},
                gauge={
                    'axis': {'range': [None, 100]},
                    'bar': {'color': "darkred" if pred_label == 1 else "darkgreen"},
                    'steps': [
                        {'range': [0, 30], 'color': "lightgreen"},
                        {'range': [30, 70], 'color': "yellow"},
                        {'range': [70, 100], 'color': "lightcoral"}
                    ],
                    'threshold': {
                        'line': {'color': "red", 'width': 4},
                        'thickness': 0.75,
                        'value': 70
                    }
                }
            ))
            fig.update_layout(height=200, margin=dict(l=10, r=10, t=30, b=10))
            st.plotly_chart(fig, use_container_width=True, config={'displayModeBar': False})

            # Compact metrics
            col_m1, col_m2 = st.columns(2)
            with col_m1:
                st.metric("Normal", f"{prob_normal:.0%}")
            with col_m2:
                st.metric("Fault", f"{prob_faulty:.0%}")

    # Compact footer
    st.markdown("---")
    st.markdown("""
    <div style='text-align: center; color: #666; padding: 0.5rem; font-size: 0.85rem;'>
        <p>🛠️ <b>Predictive Maintenance System</b> | Built with Streamlit, Scikit-learn & Plotly | Developed by <b>Anant Tripathi</b></p>
        <p style='font-size: 0.75rem; color: #888; margin-top: 0.25rem;'>⚠️ Use as decision-support tool, not replacement for expert diagnostics</p>
    </div>
    """, unsafe_allow_html=True)


if __name__ == "__main__":
    main()
|
src/config.py
ADDED
|
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
"""
Central configuration for the predictive maintenance project.

Update the Hugging Face repo IDs to match your own account, or
set them via environment variables.
"""

import os
from pathlib import Path

# Repository root: this file lives in src/, so the project root is one level
# up. (The previous `if Path(__file__).name == "config.py"` branch was always
# true and both branches computed the same path — removed as dead code.)
PROJECT_ROOT = Path(__file__).resolve().parents[1]

# -------------------------------------------------------------------------
# Data paths
# -------------------------------------------------------------------------
DATA_DIR = PROJECT_ROOT / "data"
RAW_DATA_FILE = DATA_DIR / "engine_data.csv"
PROCESSED_DIR = DATA_DIR / "processed"
# Created eagerly so downstream scripts can write without checking first.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

TRAIN_FILE = PROCESSED_DIR / "train.csv"
TEST_FILE = PROCESSED_DIR / "test.csv"

# -------------------------------------------------------------------------
# Target and feature configuration
# -------------------------------------------------------------------------
# Raw CSV column names (as they appear in engine_data.csv) mapped to the
# normalized names used throughout the pipeline.
RAW_COLUMN_RENAME_MAP = {
    "Engine rpm": "Engine_RPM",
    "Lub oil pressure": "Lub_Oil_Pressure",
    "Fuel pressure": "Fuel_Pressure",
    "Coolant pressure": "Coolant_Pressure",
    "lub oil temp": "Lub_Oil_Temperature",
    "Coolant temp": "Coolant_Temperature",
    "Engine Condition": "Engine_Condition",
}

TARGET_COLUMN = "Engine_Condition"

FEATURE_COLUMNS = [
    "Engine_RPM",
    "Lub_Oil_Pressure",
    "Fuel_Pressure",
    "Coolant_Pressure",
    "Lub_Oil_Temperature",
    "Coolant_Temperature",
]

RANDOM_STATE = 42
TEST_SIZE = float(os.getenv("TEST_SIZE", "0.2"))

# -------------------------------------------------------------------------
# Hugging Face configuration
# -------------------------------------------------------------------------
HF_TOKEN = os.getenv("HF_TOKEN")  # set this in your environment or GitHub secrets

# Default repo IDs can be overridden via environment variables
# Hugging Face username: ananttripathiak
# GitHub username: ananttripathi
HF_DATASET_REPO = os.getenv(
    "HF_DATASET_REPO", "ananttripathiak/engine-maintenance-dataset"
)
HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "ananttripathiak/engine-maintenance-model")
HF_SPACE_REPO = os.getenv("HF_SPACE_REPO", "ananttripathiak/engine-maintenance-space")

# -------------------------------------------------------------------------
# MLflow configuration
# -------------------------------------------------------------------------
MLFLOW_TRACKING_URI = os.getenv(
    "MLFLOW_TRACKING_URI", (PROJECT_ROOT / "mlruns").as_uri()
)
MLFLOW_EXPERIMENT_NAME = os.getenv(
    "MLFLOW_EXPERIMENT_NAME", "engine_predictive_maintenance"
)

# -------------------------------------------------------------------------
# Model artifacts
# -------------------------------------------------------------------------
MODELS_DIR = PROJECT_ROOT / "models"
MODELS_DIR.mkdir(parents=True, exist_ok=True)

BEST_MODEL_LOCAL_PATH = MODELS_DIR / "best_model.joblib"
|
src/data_prep.py
ADDED
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data preparation script for the predictive maintenance project.
|
| 3 |
+
|
| 4 |
+
Responsibilities:
|
| 5 |
+
- Load the raw engine dataset from the Hugging Face dataset repo (preferred)
|
| 6 |
+
or from the local data folder as a fallback.
|
| 7 |
+
- Clean and preprocess the data (rename columns, handle missing values,
|
| 8 |
+
drop duplicates, basic sanity checks).
|
| 9 |
+
- Split the cleaned data into train and test sets.
|
| 10 |
+
- Save train and test CSVs locally.
|
| 11 |
+
- Upload the resulting train and test CSVs back to the Hugging Face dataset repo.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from __future__ import annotations
|
| 15 |
+
|
| 16 |
+
from pathlib import Path
|
| 17 |
+
from typing import Tuple
|
| 18 |
+
|
| 19 |
+
import numpy as np
|
| 20 |
+
import pandas as pd
|
| 21 |
+
from sklearn.model_selection import train_test_split
|
| 22 |
+
|
| 23 |
+
import config
|
| 24 |
+
from hf_data_utils import download_dataset_file, upload_dataset_file
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def _load_raw_data_from_hf_or_local() -> pd.DataFrame:
    """
    Load the raw engine dataset.

    Tries the Hugging Face dataset repo first (when HF_TOKEN and
    HF_DATASET_REPO are configured), then falls back to the local CSV
    at config.RAW_DATA_FILE.

    Returns
    -------
    pd.DataFrame
        The raw dataset as read from CSV.

    Raises
    ------
    FileNotFoundError
        If the remote download fails (or is not configured) and the local
        file does not exist either.
    """
    # Preferred source: the HF dataset repo, so every pipeline stage reads
    # from the same registered copy of the data.
    if config.HF_TOKEN and config.HF_DATASET_REPO:
        try:
            remote_path = download_dataset_file(
                filename="data/engine_data.csv",
                repo_id=config.HF_DATASET_REPO,
                token=config.HF_TOKEN,
                local_dir=config.DATA_DIR,
            )
            return pd.read_csv(remote_path)
        except Exception as exc:
            # Best-effort fallback: surface the failure instead of silently
            # swallowing it, then continue with the local copy.
            print(
                f"Warning: could not load data from Hugging Face ({exc}); "
                "falling back to local file."
            )

    # Local fallback
    if not config.RAW_DATA_FILE.exists():
        raise FileNotFoundError(
            f"Raw data file not found at {config.RAW_DATA_FILE}. "
            "Ensure engine_data.csv exists or upload it to the HF dataset repo."
        )

    return pd.read_csv(config.RAW_DATA_FILE)
| 54 |
+
|
| 55 |
+
|
| 56 |
+
def _clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Normalize column names, drop extras and duplicates, impute missing
    values with the median, and cast the target to int.
    """
    renamed = df.rename(columns=config.RAW_COLUMN_RENAME_MAP)

    # Restrict to the known feature + target columns, preserving order.
    allowed = set(config.FEATURE_COLUMNS + [config.TARGET_COLUMN])
    kept = [name for name in renamed.columns if name in allowed]
    cleaned = renamed[kept].drop_duplicates().reset_index(drop=True)

    # Median imputation for any missing numeric values.
    if cleaned.isna().any().any():
        cleaned = cleaned.fillna(cleaned.median(numeric_only=True))

    # The label must be a binary integer for downstream classifiers.
    cleaned[config.TARGET_COLUMN] = cleaned[config.TARGET_COLUMN].astype(int)

    return cleaned
+
|
| 79 |
+
|
| 80 |
+
def _train_test_split(
    df: pd.DataFrame,
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Stratified split of the cleaned dataframe into (train, test) frames,
    each containing both features and the target column.
    """
    features = df[config.FEATURE_COLUMNS]
    target = df[config.TARGET_COLUMN]

    X_tr, X_te, y_tr, y_te = train_test_split(
        features,
        target,
        test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE,
        stratify=target,
    )

    # Re-attach the labels so each split is a self-contained CSV-ready frame.
    train_frame = X_tr.copy()
    train_frame[config.TARGET_COLUMN] = y_tr

    test_frame = X_te.copy()
    test_frame[config.TARGET_COLUMN] = y_te

    return train_frame, test_frame
+
|
| 105 |
+
|
| 106 |
+
def main() -> None:
    """
    Run the full data preparation pipeline:
    load -> clean -> split -> save locally -> (optionally) upload to HF.
    """
    print("Loading raw data...")
    raw_df = _load_raw_data_from_hf_or_local()
    print(f"Raw data shape: {raw_df.shape}")

    print("Cleaning data...")
    clean_df = _clean_data(raw_df)
    print(f"Clean data shape: {clean_df.shape}")

    print("Performing train/test split...")
    train_df, test_df = _train_test_split(clean_df)
    print(f"Train shape: {train_df.shape}, Test shape: {test_df.shape}")

    # Persist the splits locally.
    config.PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
    train_df.to_csv(config.TRAIN_FILE, index=False)
    test_df.to_csv(config.TEST_FILE, index=False)
    print(f"Saved train to {config.TRAIN_FILE}")
    print(f"Saved test to {config.TEST_FILE}")

    # Mirror the splits to the HF dataset repo when credentials exist.
    if config.HF_TOKEN and config.HF_DATASET_REPO:
        try:
            print("Uploading train and test splits to Hugging Face dataset repo...")
            for local_file, remote_name in (
                (config.TRAIN_FILE, "data/train.csv"),
                (config.TEST_FILE, "data/test.csv"),
            ):
                upload_dataset_file(
                    local_path=local_file,
                    repo_id=config.HF_DATASET_REPO,
                    repo_path=remote_name,
                    token=config.HF_TOKEN,
                )
            print("Upload to Hugging Face completed.")
        except Exception as e:
            print(
                f"Warning: Failed to upload train/test to Hugging Face dataset repo: {e}"
            )


if __name__ == "__main__":
    main()
|
| 154 |
+
|
src/data_register.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Data registration script for the predictive maintenance project.
|
| 3 |
+
|
| 4 |
+
This script is the analogue of the notebook's `data_register.py`:
|
| 5 |
+
- It ensures the Hugging Face dataset repo exists.
|
| 6 |
+
- It uploads the raw engine dataset from the local `data/` folder
|
| 7 |
+
into the dataset repo so it can be consumed by other stages
|
| 8 |
+
(EDA, data preparation, model training) using a consistent source.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import sys
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
try:
|
| 17 |
+
import config
|
| 18 |
+
from hf_data_utils import register_raw_engine_data_to_hf
|
| 19 |
+
except ImportError as e:
|
| 20 |
+
print(f"ERROR: Failed to import modules: {e}", file=sys.stderr)
|
| 21 |
+
print(f"Python path: {sys.path}", file=sys.stderr)
|
| 22 |
+
sys.exit(1)
|
| 23 |
+
|
| 24 |
+
|
| 25 |
+
def main() -> None:
    """
    Validate prerequisites and register the raw engine dataset on the
    Hugging Face Hub, printing diagnostics useful in CI logs.
    """

    def _check_prerequisites() -> None:
        # Fail fast with actionable messages before touching the Hub.
        if not config.RAW_DATA_FILE.exists():
            raise FileNotFoundError(
                f"Expected raw data at {config.RAW_DATA_FILE}, "
                "but the file does not exist. Make sure `engine_data.csv` "
                "is placed in the `data/` folder."
            )
        if not config.HF_TOKEN:
            raise ValueError(
                "HF_TOKEN is not set. Please set it as an environment variable "
                "or in GitHub Secrets."
            )

    try:
        # Diagnostics to make CI logs easy to debug.
        print(f"PROJECT_ROOT: {config.PROJECT_ROOT}")
        print(f"RAW_DATA_FILE: {config.RAW_DATA_FILE}")
        print(f"RAW_DATA_FILE exists: {config.RAW_DATA_FILE.exists()}")
        print(f"HF_DATASET_REPO: {config.HF_DATASET_REPO}")
        print(f"HF_TOKEN is set: {bool(config.HF_TOKEN)}")

        _check_prerequisites()

        print(f"Registering raw engine dataset from: {config.RAW_DATA_FILE}")
        print(f"Target HF dataset repo: {config.HF_DATASET_REPO}")
        register_raw_engine_data_to_hf()
        print("✅ Dataset registration to Hugging Face completed.")
    except Exception as e:
        print(f"ERROR: {type(e).__name__}: {e}", file=sys.stderr)
        import traceback
        traceback.print_exc()
        sys.exit(1)


if __name__ == "__main__":
    main()
|
| 59 |
+
|
src/deploy_to_hf.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Helper script to push the Streamlit app and deployment files
|
| 3 |
+
to a Hugging Face Space.
|
| 4 |
+
|
| 5 |
+
This script mirrors the behaviour of the notebook's `hosting.py`:
|
| 6 |
+
- Ensures the Space exists (creating it if necessary),
|
| 7 |
+
- Uploads only the files required for deployment (code, Dockerfile,
|
| 8 |
+
and requirements), excluding data, MLflow artifacts, etc.
|
| 9 |
+
"""
|
| 10 |
+
|
| 11 |
+
from __future__ import annotations
|
| 12 |
+
|
| 13 |
+
import os
|
| 14 |
+
|
| 15 |
+
from huggingface_hub import HfApi
|
| 16 |
+
|
| 17 |
+
import config
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def main() -> None:
    """
    Ensure the target Hugging Face Space exists and push only the files
    needed for deployment (code, Dockerfile, requirements).

    Raises
    ------
    ValueError
        If HF_TOKEN or HF_SPACE_REPO is not configured.
    """
    token = config.HF_TOKEN or os.getenv("HF_TOKEN")
    if not token:
        raise ValueError(
            "HF_TOKEN is not set. Please export HF_TOKEN or configure it in config.py."
        )

    space_repo = config.HF_SPACE_REPO or os.getenv("HF_SPACE_REPO")
    if not space_repo:
        raise ValueError(
            "HF_SPACE_REPO is not set. Set it as an environment variable or in config.py."
        )

    api = HfApi(token=token)

    # Create the Space if it does not exist.
    # Use "docker" SDK since we're deploying with Dockerfile.
    api.create_repo(
        repo_id=space_repo,
        repo_type="space",
        space_sdk="docker",
        exist_ok=True,
    )

    # Upload only the files required for deployment. Large local artifacts
    # (data, mlruns, models) are excluded — and so is anything that may
    # carry credentials (.env files, keys, virtualenvs), since Spaces are
    # typically public and the project's .gitignore already treats these
    # as secrets.
    api.upload_folder(
        folder_path=str(config.PROJECT_ROOT),
        path_in_repo=".",
        repo_id=space_repo,
        repo_type="space",
        ignore_patterns=[
            "data/*",
            "mlruns/*",
            "models/*",
            ".git/*",
            "__pycache__/*",
            ".github/*",
            # Never publish local environments or credentials.
            ".venv/*",
            "venv/*",
            ".env",
            "*.env",
            "*.key",
            "*.pem",
        ],
    )

    print(f"Deployment files pushed to Hugging Face Space: {space_repo}")


if __name__ == "__main__":
    main()
|
| 67 |
+
|
| 68 |
+
|
src/eda.py
ADDED
|
@@ -0,0 +1,85 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Exploratory Data Analysis (EDA) script for the predictive maintenance project.
|
| 3 |
+
|
| 4 |
+
This script covers:
|
| 5 |
+
- Data overview (shape, types, missing values, basic statistics)
|
| 6 |
+
- Univariate analysis (distributions of features, target balance)
|
| 7 |
+
- Bivariate/multivariate analysis (correlations and pairwise relationships)
|
| 8 |
+
|
| 9 |
+
Figures are saved under `notebooks/figures/` for easy inclusion in reports.
|
| 10 |
+
"""
|
| 11 |
+
|
| 12 |
+
from __future__ import annotations
|
| 13 |
+
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
|
| 16 |
+
import matplotlib.pyplot as plt
|
| 17 |
+
import seaborn as sns
|
| 18 |
+
|
| 19 |
+
import config
|
| 20 |
+
from data_prep import _clean_data, _load_raw_data_from_hf_or_local
|
| 21 |
+
|
| 22 |
+
|
| 23 |
+
FIGURES_DIR = config.PROJECT_ROOT / "notebooks" / "figures"
|
| 24 |
+
FIGURES_DIR.mkdir(parents=True, exist_ok=True)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
def run_eda() -> None:
    """
    Print a data overview and save EDA figures (target balance, feature
    histograms, correlation heatmap, pairplot) under FIGURES_DIR.
    """

    def _finalize(filename: str) -> None:
        # Shared save path: tighten layout, write the PNG, release the figure.
        plt.tight_layout()
        plt.savefig(FIGURES_DIR / filename)
        plt.close()

    # Load and clean data with the same logic the pipeline uses.
    frame = _clean_data(_load_raw_data_from_hf_or_local())

    print("=== DATA OVERVIEW ===")
    print(f"Shape: {frame.shape}")
    print("\nData types:")
    print(frame.dtypes)
    print("\nMissing values per column:")
    print(frame.isna().sum())
    print("\nSummary statistics:")
    print(frame.describe())

    # Target balance (univariate).
    plt.figure(figsize=(4, 4))
    sns.countplot(x=config.TARGET_COLUMN, data=frame)
    plt.title("Engine Condition Distribution")
    plt.xlabel("Engine Condition (0 = Normal, 1 = Faulty)")
    plt.ylabel("Count")
    _finalize("target_distribution.png")

    # Per-feature histograms (univariate).
    frame[config.FEATURE_COLUMNS].hist(bins=30, figsize=(12, 8))
    plt.suptitle("Feature Distributions", y=1.02)
    _finalize("feature_histograms.png")

    # Correlation heatmap (multivariate).
    plt.figure(figsize=(8, 6))
    corr = frame[config.FEATURE_COLUMNS + [config.TARGET_COLUMN]].corr()
    sns.heatmap(corr, annot=True, cmap="coolwarm", fmt=".2f")
    plt.title("Correlation Heatmap")
    _finalize("correlation_heatmap.png")

    # Pairwise relationships for a representative feature subset.
    subset_cols = ["Engine_RPM", "Lub_Oil_Pressure", "Fuel_Pressure", config.TARGET_COLUMN]
    sns.pairplot(
        frame[subset_cols],
        hue=config.TARGET_COLUMN,
        diag_kind="hist",
        corner=True,
    )
    plt.suptitle("Pairwise Relationships (subset of features)", y=1.02)
    _finalize("pairplot_subset.png")

    print(f"\nEDA figures saved to: {FIGURES_DIR}")


if __name__ == "__main__":
    run_eda()
|
| 85 |
+
|
src/hf_data_utils.py
ADDED
|
@@ -0,0 +1,143 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility functions for interacting with the Hugging Face Hub for DATASETS.
|
| 3 |
+
|
| 4 |
+
These helpers are used to:
|
| 5 |
+
- Register the raw engine dataset as a Hugging Face dataset repo.
|
| 6 |
+
- Upload processed train/test splits back to the dataset repo.
|
| 7 |
+
- Download files from the dataset repo for use in data preparation and modeling.
|
| 8 |
+
|
| 9 |
+
All functions expect a valid HF token to be available, typically via:
|
| 10 |
+
- The HF_TOKEN environment variable, or
|
| 11 |
+
- An explicit argument.
|
| 12 |
+
"""
|
| 13 |
+
|
| 14 |
+
from pathlib import Path
|
| 15 |
+
from typing import Optional
|
| 16 |
+
|
| 17 |
+
from huggingface_hub import HfApi, hf_hub_download
|
| 18 |
+
|
| 19 |
+
import config
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def _get_token(explicit_token: Optional[str] = None) -> str:
|
| 23 |
+
token = explicit_token or config.HF_TOKEN
|
| 24 |
+
if not token:
|
| 25 |
+
raise ValueError(
|
| 26 |
+
"Hugging Face token is not set. "
|
| 27 |
+
"Set HF_TOKEN in the environment or pass token explicitly."
|
| 28 |
+
)
|
| 29 |
+
return token
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
def create_or_get_dataset_repo(
    repo_id: str, token: Optional[str] = None, private: bool = False
) -> None:
    """
    Idempotently create the dataset repo on the Hugging Face Hub
    (``exist_ok=True`` makes repeated calls safe).
    """
    hub = HfApi(token=_get_token(token))
    hub.create_repo(
        repo_id=repo_id,
        repo_type="dataset",
        private=private,
        exist_ok=True,
    )
|
| 46 |
+
|
| 47 |
+
|
| 48 |
+
def upload_dataset_file(
    local_path: Path,
    repo_id: Optional[str] = None,
    repo_path: Optional[str] = None,
    token: Optional[str] = None,
) -> None:
    """
    Upload one local file into the Hugging Face dataset repo.

    ``repo_id`` defaults to config.HF_DATASET_REPO; ``repo_path`` (the
    destination path inside the repo, e.g. 'data/train.csv') defaults to
    the file's own name. The repo is created first if it does not exist.
    """
    resolved_token = _get_token(token)
    target_repo = repo_id or config.HF_DATASET_REPO
    destination = repo_path or local_path.name

    create_or_get_dataset_repo(repo_id=target_repo, token=resolved_token)

    HfApi(token=resolved_token).upload_file(
        path_or_fileobj=str(local_path),
        path_in_repo=destination,
        repo_id=target_repo,
        repo_type="dataset",
    )
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def download_dataset_file(
    filename: str,
    repo_id: Optional[str] = None,
    token: Optional[str] = None,
    local_dir: Optional[Path] = None,
) -> Path:
    """
    Fetch ``filename`` from the Hugging Face dataset repo and return the
    local path of the downloaded copy.

    ``repo_id`` defaults to config.HF_DATASET_REPO and ``local_dir`` to
    config.DATA_DIR; the destination directory is created when missing.
    """
    resolved_token = _get_token(token)
    target_repo = repo_id or config.HF_DATASET_REPO
    destination_dir = local_dir or config.DATA_DIR
    destination_dir.mkdir(parents=True, exist_ok=True)

    fetched = hf_hub_download(
        repo_id=target_repo,
        filename=filename,
        repo_type="dataset",
        token=resolved_token,
        local_dir=str(destination_dir),
        local_dir_use_symlinks=False,
    )
    return Path(fetched)
|
| 119 |
+
|
| 120 |
+
|
| 121 |
+
def register_raw_engine_data_to_hf(
    token: Optional[str] = None,
    repo_id: Optional[str] = None,
) -> None:
    """
    Push the local raw engine_data.csv into the dataset repo under
    'data/engine_data.csv' so all pipeline stages share one source.
    """
    target_repo = repo_id or config.HF_DATASET_REPO
    local_path = config.RAW_DATA_FILE

    if not local_path.exists():
        raise FileNotFoundError(
            f"Raw data file not found at {local_path}. "
            "Ensure engine_data.csv is present in the data/ folder."
        )

    upload_dataset_file(
        local_path=local_path,
        repo_id=target_repo,
        repo_path="data/engine_data.csv",
        token=token,
    )
|
| 143 |
+
|
src/hf_model_utils.py
ADDED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Utility functions for interacting with the Hugging Face Hub for MODELS.
|
| 3 |
+
|
| 4 |
+
Used to:
|
| 5 |
+
- Upload the best trained model to a model repo.
|
| 6 |
+
- Download the registered model for inference or deployment.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Optional
|
| 11 |
+
|
| 12 |
+
import joblib
|
| 13 |
+
from huggingface_hub import HfApi, hf_hub_download
|
| 14 |
+
|
| 15 |
+
import config
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def _get_token(explicit_token: Optional[str] = None) -> str:
|
| 19 |
+
token = explicit_token or config.HF_TOKEN
|
| 20 |
+
if not token:
|
| 21 |
+
raise ValueError(
|
| 22 |
+
"Hugging Face token is not set. "
|
| 23 |
+
"Set HF_TOKEN in the environment or pass token explicitly."
|
| 24 |
+
)
|
| 25 |
+
return token
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def create_or_get_model_repo(
    repo_id: str, token: Optional[str] = None, private: bool = False
) -> None:
    """
    Idempotently create the model repo on the Hugging Face Hub
    (``exist_ok=True`` makes repeated calls safe).
    """
    HfApi(token=_get_token(token)).create_repo(
        repo_id=repo_id,
        repo_type="model",
        private=private,
        exist_ok=True,
    )
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def upload_model(
    local_model_path: Path,
    repo_id: Optional[str] = None,
    repo_path: str = "model.joblib",
    token: Optional[str] = None,
) -> None:
    """
    Push a serialized model artifact to the Hugging Face model hub,
    creating the repo first if needed. ``repo_id`` defaults to
    config.HF_MODEL_REPO.
    """
    resolved_token = _get_token(token)
    target_repo = repo_id or config.HF_MODEL_REPO

    create_or_get_model_repo(repo_id=target_repo, token=resolved_token)

    HfApi(token=resolved_token).upload_file(
        path_or_fileobj=str(local_model_path),
        path_in_repo=repo_path,
        repo_id=target_repo,
        repo_type="model",
    )
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
def download_model(
    repo_id: Optional[str] = None,
    filename: str = "model.joblib",
    token: Optional[str] = None,
    local_dir: Optional[Path] = None,
):
    """
    Download a model artifact from the Hugging Face model hub and
    deserialize it with joblib. ``repo_id`` defaults to
    config.HF_MODEL_REPO and ``local_dir`` to config.MODELS_DIR.
    """
    resolved_token = _get_token(token)
    target_repo = repo_id or config.HF_MODEL_REPO
    destination_dir = local_dir or config.MODELS_DIR
    destination_dir.mkdir(parents=True, exist_ok=True)

    artifact_path = hf_hub_download(
        repo_id=target_repo,
        filename=filename,
        repo_type="model",
        token=resolved_token,
        local_dir=str(destination_dir),
        local_dir_use_symlinks=False,
    )
    return joblib.load(artifact_path)
|
| 91 |
+
|
src/inference.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Inference utilities for the predictive maintenance model.
|
| 3 |
+
|
| 4 |
+
These functions are used both by scripts and by the Streamlit app
|
| 5 |
+
to load a trained model (from local storage or Hugging Face) and
|
| 6 |
+
generate predictions from raw sensor values.
|
| 7 |
+
"""
|
| 8 |
+
|
| 9 |
+
from __future__ import annotations
|
| 10 |
+
|
| 11 |
+
from typing import Dict, Iterable, Optional
|
| 12 |
+
|
| 13 |
+
import joblib
|
| 14 |
+
import numpy as np
|
| 15 |
+
import pandas as pd
|
| 16 |
+
|
| 17 |
+
import config
|
| 18 |
+
from hf_model_utils import download_model
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def load_local_model() -> object:
    """
    Load the best trained model from the local models directory, raising
    FileNotFoundError with guidance when no artifact has been produced yet.
    """
    model_path = config.BEST_MODEL_LOCAL_PATH
    if model_path.exists():
        return joblib.load(model_path)
    raise FileNotFoundError(
        f"Local model not found at {config.BEST_MODEL_LOCAL_PATH}. "
        "Run train.py to create it, or configure HF model loading."
    )
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def load_hf_model() -> object:
    """Fetch and deserialize the trained model from the Hugging Face hub.

    Thin wrapper around :func:`hf_model_utils.download_model` so callers
    have a symmetric counterpart to :func:`load_local_model`.
    """
    return download_model()
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def build_input_dataframe(
    inputs: Dict[str, float],
) -> pd.DataFrame:
    """Turn a feature-name -> value mapping into a one-row DataFrame.

    Columns are emitted in the order defined by ``config.FEATURE_COLUMNS``;
    any feature missing from *inputs* defaults to 0.0 so the model always
    receives a complete row.
    """
    row: Dict[str, float] = {}
    for column in config.FEATURE_COLUMNS:
        row[column] = float(inputs.get(column, 0.0))
    return pd.DataFrame([row])
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def predict_engine_condition(
    inputs: Dict[str, float],
    model: Optional[object] = None,
    source: str = "local",
    threshold: float = 0.5,
) -> Dict[str, float]:
    """
    Predict whether the engine requires maintenance.

    Parameters
    ----------
    inputs : dict
        Keys correspond to feature names in config.FEATURE_COLUMNS;
        missing features default to 0.0.
    model : object, optional
        Pre-loaded sklearn Pipeline model. If None, it will be loaded
        from `source`.
    source : {'local', 'hf'}
        If model is None, determines where to load it from. Any value
        other than 'hf' falls back to the local model.
    threshold : float, default 0.5
        Decision threshold applied to the positive-class probability;
        the predicted label is 1 when probability >= threshold. The
        default preserves the previous hard-coded behavior.

    Returns
    -------
    dict
        Contains the predicted class label (0/1) and the probability
        of the positive class.

    Raises
    ------
    ValueError
        If `threshold` lies outside the [0, 1] interval.
    """
    if not 0.0 <= threshold <= 1.0:
        raise ValueError(f"threshold must be in [0, 1], got {threshold}")

    if model is None:
        # Lazy-load only when the caller did not supply a model;
        # anything other than 'hf' means the local artifact.
        model = load_hf_model() if source == "hf" else load_local_model()

    df = build_input_dataframe(inputs)
    # predict_proba returns shape (1, n_classes); column 1 is the
    # positive ("faulty") class for this binary problem.
    proba = model.predict_proba(df)[0, 1]
    pred = int(proba >= threshold)

    return {
        "prediction": pred,
        "probability_faulty": float(proba),
    }
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
__all__ = [
|
| 92 |
+
"load_local_model",
|
| 93 |
+
"load_hf_model",
|
| 94 |
+
"build_input_dataframe",
|
| 95 |
+
"predict_engine_condition",
|
| 96 |
+
]
|
| 97 |
+
|
src/train.py
ADDED
|
@@ -0,0 +1,202 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Model training and experimentation tracking script.
|
| 3 |
+
|
| 4 |
+
Responsibilities:
|
| 5 |
+
- Load prepared train and test datasets (preferably from Hugging Face dataset repo).
|
| 6 |
+
- Define a model pipeline (Random Forest by default) and hyperparameter search space.
|
| 7 |
+
- Run hyperparameter tuning with cross-validation.
|
| 8 |
+
- Log all tuned parameters and evaluation metrics with MLflow.
|
| 9 |
+
- Save the best model locally.
|
| 10 |
+
- Register/upload the best model to the Hugging Face model hub.
|
| 11 |
+
"""
|
| 12 |
+
|
| 13 |
+
from __future__ import annotations
|
| 14 |
+
|
| 15 |
+
from typing import Dict, Tuple
|
| 16 |
+
|
| 17 |
+
import joblib
|
| 18 |
+
import mlflow
|
| 19 |
+
import mlflow.sklearn # noqa: F401
|
| 20 |
+
import numpy as np
|
| 21 |
+
import pandas as pd
|
| 22 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 23 |
+
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
|
| 24 |
+
from sklearn.model_selection import RandomizedSearchCV
|
| 25 |
+
from sklearn.pipeline import Pipeline
|
| 26 |
+
from sklearn.preprocessing import StandardScaler
|
| 27 |
+
|
| 28 |
+
import config
|
| 29 |
+
from hf_data_utils import download_dataset_file
|
| 30 |
+
from hf_model_utils import upload_model
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def _load_train_test_from_hf_or_local() -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Load train and test data from the HF dataset repo if available,
    otherwise fall back to local CSVs created by data_prep.py.

    Returns
    -------
    tuple of pd.DataFrame
        (train_df, test_df) in that order.

    Raises
    ------
    FileNotFoundError
        If neither the HF dataset repo nor the local CSV files
        can provide the splits.
    """
    if config.HF_TOKEN and config.HF_DATASET_REPO:
        try:
            train_path = download_dataset_file(
                filename="data/train.csv",
                repo_id=config.HF_DATASET_REPO,
                token=config.HF_TOKEN,
                local_dir=config.DATA_DIR,
            )
            test_path = download_dataset_file(
                filename="data/test.csv",
                repo_id=config.HF_DATASET_REPO,
                token=config.HF_TOKEN,
                local_dir=config.DATA_DIR,
            )
            return pd.read_csv(train_path), pd.read_csv(test_path)
        except Exception as exc:
            # Keep the best-effort fallback to local files, but surface
            # the reason (auth, network, missing file) instead of
            # silently swallowing it.
            print(
                f"Warning: could not load splits from Hugging Face ({exc}); "
                "falling back to local files."
            )

    if not config.TRAIN_FILE.exists() or not config.TEST_FILE.exists():
        raise FileNotFoundError(
            "Train/test files not found locally or in the HF dataset repo. "
            "Run data_prep.py first to generate the splits."
        )

    train_df = pd.read_csv(config.TRAIN_FILE)
    test_df = pd.read_csv(config.TEST_FILE)
    return train_df, test_df
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def _build_model_and_search_space() -> Tuple[Pipeline, Dict]:
    """
    Assemble the sklearn Pipeline and its hyperparameter search space.

    The pipeline standardizes the numeric inputs and feeds them to a
    RandomForestClassifier seeded with config.RANDOM_STATE. Step names
    ('scaler', 'clf') are referenced by the search-space keys below.
    """
    pipeline = Pipeline(
        steps=[
            ("scaler", StandardScaler()),
            ("clf", RandomForestClassifier(random_state=config.RANDOM_STATE)),
        ]
    )

    # NOTE: 'auto' for max_features is deprecated in recent sklearn
    # releases, so only currently valid options are listed.
    search_space = {
        "clf__n_estimators": [100, 200, 300, 400],
        "clf__max_depth": [None, 5, 10, 20],
        "clf__min_samples_split": [2, 5, 10],
        "clf__min_samples_leaf": [1, 2, 4],
        "clf__max_features": ["sqrt", "log2", None],
        "clf__bootstrap": [True, False],
    }

    return pipeline, search_space
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _evaluate_model(
    model: Pipeline, X_test: pd.DataFrame, y_test: pd.Series
) -> Dict[str, float]:
    """
    Score the fitted model on the held-out test set.

    Returns a dict of accuracy, precision, recall and f1. zero_division=0
    makes degenerate predictions score 0.0 rather than raising.
    """
    predictions = model.predict(X_test)
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "precision": precision_score(y_test, predictions, zero_division=0),
        "recall": recall_score(y_test, predictions, zero_division=0),
        "f1": f1_score(y_test, predictions, zero_division=0),
    }
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def main() -> None:
    """
    Execute the training, tuning, logging, and model registration pipeline.

    Steps:
    1. Load the prepared train/test splits (HF dataset repo or local CSVs).
    2. Build the pipeline and hyperparameter space.
    3. Run RandomizedSearchCV under a parent MLflow run, logging each
       evaluated parameter set as a nested run.
    4. Evaluate the best model on the test set and log params/metrics.
    5. Persist the best model locally and to MLflow; optionally upload
       it to the Hugging Face model hub.
    """
    print("Loading train and test data...")
    train_df, test_df = _load_train_test_from_hf_or_local()

    # Column selections come from the shared config so train/inference agree.
    X_train = train_df[config.FEATURE_COLUMNS]
    y_train = train_df[config.TARGET_COLUMN]
    X_test = test_df[config.FEATURE_COLUMNS]
    y_test = test_df[config.TARGET_COLUMN]

    print("Building model and hyperparameter search space...")
    pipeline, param_distributions = _build_model_and_search_space()

    # 20 random draws, 5-fold CV, optimized for F1 (binary classification);
    # n_jobs=-1 uses all cores. random_state makes the draw reproducible.
    search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=20,
        cv=5,
        scoring="f1",
        n_jobs=-1,
        verbose=1,
        random_state=config.RANDOM_STATE,
    )

    # Configure MLflow tracking destination and experiment name from config.
    mlflow.set_tracking_uri(config.MLFLOW_TRACKING_URI)
    mlflow.set_experiment(config.MLFLOW_EXPERIMENT_NAME)

    print("Starting hyperparameter tuning with MLflow tracking...")
    with mlflow.start_run(run_name="RandomForest_random_search"):
        search.fit(X_train, y_train)

        best_model: Pipeline = search.best_estimator_
        best_params = search.best_params_

        # Log all evaluated parameter combinations as nested runs,
        # similar to the reference notebook pattern.
        results = search.cv_results_
        for i in range(len(results["params"])):
            param_set = results["params"][i]
            mean_score = results["mean_test_score"][i]
            with mlflow.start_run(nested=True):
                mlflow.log_params(param_set)
                mlflow.log_metric("mean_cv_f1", float(mean_score))

        # Evaluation on the held-out test split (not seen during CV).
        metrics = _evaluate_model(best_model, X_test, y_test)

        # Log the winning parameters and test metrics on the parent run.
        mlflow.log_params(best_params)
        for name, value in metrics.items():
            mlflow.log_metric(name, float(value))

        # Save model locally and attach it to the run as a file artifact.
        config.MODELS_DIR.mkdir(parents=True, exist_ok=True)
        joblib.dump(best_model, config.BEST_MODEL_LOCAL_PATH)
        mlflow.log_artifact(str(config.BEST_MODEL_LOCAL_PATH), artifact_path="artifacts")

        # Also log the model in MLflow's model registry format.
        mlflow.sklearn.log_model(best_model, artifact_path="engine_model")

        print("Best parameters found:")
        for k, v in best_params.items():
            print(f"  {k}: {v}")

        print("Evaluation metrics on test set:")
        for k, v in metrics.items():
            print(f"  {k}: {v:.4f}")

        # Upload best model to Hugging Face model hub, if configured.
        # Upload failures are non-fatal: training artifacts already exist
        # locally and in MLflow, so only a warning is emitted.
        if config.HF_TOKEN and config.HF_MODEL_REPO:
            try:
                print("Uploading best model to Hugging Face model hub...")
                upload_model(
                    local_model_path=config.BEST_MODEL_LOCAL_PATH,
                    repo_id=config.HF_MODEL_REPO,
                    repo_path="model.joblib",
                    token=config.HF_TOKEN,
                )
                print("Model upload to Hugging Face completed.")
            except Exception as e:
                print(f"Warning: Failed to upload model to Hugging Face: {e}")
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
if __name__ == "__main__":
|
| 201 |
+
main()
|
| 202 |
+
|