Update README.md
Browse files
README.md
CHANGED
|
@@ -38,7 +38,6 @@ This repository contains trained machine learning models for predicting pectin q
|
|
| 38 |
| Support Vector Regression | support_vector_regression | 0.4832 | 6612.2360 | Machine learning model for pectin production |
|
| 39 |
| XGBoost | xgboost | 0.9203 | 1074.2310 | XGBoost model with excellent performance on tabular data |
|
| 40 |
|
| 41 |
-
|
| 42 |
### Best Model Performance
|
| 43 |
- **Average R²**: 0.9427
|
| 44 |
- **Average MAE**: 868.44
|
|
@@ -47,32 +46,53 @@ This repository contains trained machine learning models for predicting pectin q
|
|
| 47 |
## 📊 Model Details
|
| 48 |
|
| 49 |
### Target Variables
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
|
| 56 |
### Feature Variables
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
## 🚀 Quick Start
|
| 65 |
|
| 66 |
### Installation
|
| 67 |
```bash
|
| 68 |
-
pip install transformers huggingface-hub scikit-learn xgboost pandas numpy joblib
|
| 69 |
```
|
| 70 |
|
| 71 |
### Basic Usage
|
| 72 |
|
| 73 |
-
|
| 74 |
-
### Using the Best Model
|
| 75 |
-
|
| 76 |
```python
|
| 77 |
from huggingface_hub import hf_hub_download
|
| 78 |
import joblib
|
|
@@ -80,6 +100,9 @@ import pandas as pd
|
|
| 80 |
import numpy as np
|
| 81 |
import pickle
|
| 82 |
|
|
|
|
|
|
|
|
|
|
| 83 |
# Download model and supporting files
|
| 84 |
model_path = hf_hub_download(
|
| 85 |
repo_id="arabovs-ai-lab/PectinProductionModels",
|
|
@@ -105,8 +128,14 @@ scaler = joblib.load(scaler_path)
|
|
| 105 |
with open(encoder_path, 'rb') as f:
|
| 106 |
label_encoder = pickle.load(f)
|
| 107 |
|
| 108 |
-
# Prepare input data
|
| 109 |
-
input_data = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
# Create DataFrame
|
| 112 |
df = pd.DataFrame([input_data])
|
|
@@ -114,7 +143,7 @@ df = pd.DataFrame([input_data])
|
|
| 114 |
# Preprocess: encode sample type
|
| 115 |
df['sample_encoded'] = label_encoder.transform([input_data['sample']])[0]
|
| 116 |
|
| 117 |
-
# Create method_encoded feature
|
| 118 |
df['method_encoded'] = 1 if input_data['time_min'] <= 15 else 0
|
| 119 |
|
| 120 |
# Select features in correct order
|
|
@@ -129,7 +158,8 @@ predictions = model.predict(X_scaled)
|
|
| 129 |
|
| 130 |
# Create results dictionary
|
| 131 |
results = {}
|
| 132 |
-
|
|
|
|
| 133 |
results[target] = predictions[0, i]
|
| 134 |
|
| 135 |
print("Prediction results:")
|
|
@@ -137,34 +167,124 @@ for target, value in results.items():
|
|
| 137 |
print(f" {target}: {value:.4f}")
|
| 138 |
```
|
| 139 |
|
| 140 |
-
|
|
|
|
|
|
|
| 141 |
|
| 142 |
```python
|
| 143 |
import pandas as pd
|
|
|
|
| 144 |
from huggingface_hub import hf_hub_download
|
| 145 |
import joblib
|
| 146 |
import pickle
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
class PectinPredictor:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
def __init__(self, repo_id="arabovs-ai-lab/PectinProductionModels"):
|
|
|
|
| 150 |
self.repo_id = repo_id
|
| 151 |
self.model = None
|
| 152 |
self.scaler = None
|
| 153 |
self.label_encoder = None
|
|
|
|
| 154 |
self.feature_columns = ['time_min', 'temperature_c', 'pressure_atm', 'ph', 'sample_encoded', 'method_encoded']
|
|
|
|
| 155 |
self.target_columns = ['pectin_yield', 'galacturonic_acid', 'molecular_weight', 'esterification_degree']
|
| 156 |
|
| 157 |
-
def load_from_hub(self):
|
| 158 |
-
"""
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
model_path = hf_hub_download(
|
| 161 |
repo_id=self.repo_id,
|
| 162 |
-
filename="
|
| 163 |
repo_type="model"
|
| 164 |
)
|
| 165 |
-
self.model = joblib.load(model_path)
|
| 166 |
|
| 167 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
scaler_path = hf_hub_download(
|
| 169 |
repo_id=self.repo_id,
|
| 170 |
filename="scaler.pkl",
|
|
@@ -172,7 +292,7 @@ class PectinPredictor:
|
|
| 172 |
)
|
| 173 |
self.scaler = joblib.load(scaler_path)
|
| 174 |
|
| 175 |
-
# Download label encoder
|
| 176 |
encoder_path = hf_hub_download(
|
| 177 |
repo_id=self.repo_id,
|
| 178 |
filename="label_encoder.pkl",
|
|
@@ -180,77 +300,280 @@ class PectinPredictor:
|
|
| 180 |
)
|
| 181 |
with open(encoder_path, 'rb') as f:
|
| 182 |
self.label_encoder = pickle.load(f)
|
| 183 |
-
|
| 184 |
-
def
|
| 185 |
-
"""
|
| 186 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 187 |
processed_df = input_df.copy()
|
| 188 |
|
| 189 |
-
#
|
| 190 |
processed_df['sample_encoded'] = self.label_encoder.transform(processed_df['sample'])
|
| 191 |
|
| 192 |
-
# Create
|
| 193 |
processed_df['method_encoded'] = np.where(processed_df['time_min'] <= 15, 1, 0)
|
| 194 |
|
| 195 |
-
# Select and
|
| 196 |
X = processed_df[self.feature_columns]
|
| 197 |
X_scaled = self.scaler.transform(X)
|
| 198 |
|
| 199 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 200 |
predictions = self.model.predict(X_scaled)
|
| 201 |
|
| 202 |
-
#
|
| 203 |
result_df = input_df.copy()
|
| 204 |
for i, target in enumerate(self.target_columns):
|
| 205 |
result_df[f'predicted_{target}'] = predictions[:, i]
|
| 206 |
|
| 207 |
return result_df
|
| 208 |
|
| 209 |
-
|
| 210 |
-
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
# Load your data
|
| 214 |
-
# df = pd.read_excel("your_data.xlsx")
|
| 215 |
-
# results = predictor.predict_batch(df)
|
| 216 |
-
```
|
| 217 |
-
|
| 218 |
-
### Comparing Different Models
|
| 219 |
-
|
| 220 |
-
```python
|
| 221 |
-
from huggingface_hub import hf_hub_download
|
| 222 |
-
import joblib
|
| 223 |
-
|
| 224 |
-
def compare_models(input_data, repo_id="arabovs-ai-lab/PectinProductionModels"):
|
| 225 |
-
"""Compare predictions from different models."""
|
| 226 |
-
models_to_compare = [
|
| 227 |
-
"best_model/model.pkl",
|
| 228 |
-
"gradient_boosting/model.pkl",
|
| 229 |
-
"random_forest/model.pkl",
|
| 230 |
-
"xgboost/model.pkl"
|
| 231 |
-
]
|
| 232 |
-
|
| 233 |
-
results = {}
|
| 234 |
-
|
| 235 |
-
for model_path in models_to_compare:
|
| 236 |
-
model_name = model_path.split('/')[0]
|
| 237 |
|
| 238 |
-
|
| 239 |
-
|
| 240 |
-
|
| 241 |
-
|
| 242 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
)
|
| 244 |
|
| 245 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 246 |
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
-
|
| 252 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
## 📁 Repository Structure
|
| 256 |
|
|
@@ -270,7 +593,7 @@ arabovs-ai-lab/PectinProductionModels/
|
|
| 270 |
├── k_neighbors/ # K-Neighbors model
|
| 271 |
├── multilayer_perceptron/ # MLP model
|
| 272 |
├── scaler.pkl # Feature scaler
|
| 273 |
-
├── label_encoder.pkl # Label encoder for
|
| 274 |
├── model_metadata.json # Training metadata
|
| 275 |
├── models_metadata.json # All models metadata
|
| 276 |
└── README.md # This file
|
|
@@ -279,7 +602,7 @@ arabovs-ai-lab/PectinProductionModels/
|
|
| 279 |
## 🧪 Training Information
|
| 280 |
|
| 281 |
- **Dataset**: 1000 experimental records
|
| 282 |
-
- **Features**: 6 process parameters
|
| 283 |
- **Targets**: 4 quality parameters
|
| 284 |
- **Validation**: 80/20 train-test split
|
| 285 |
- **Cross-validation**: 5-fold
|
|
@@ -287,10 +610,40 @@ arabovs-ai-lab/PectinProductionModels/
|
|
| 287 |
|
| 288 |
## 💡 Key Features
|
| 289 |
|
| 290 |
-
- **Multi-target regression**: Predicts 4 parameters simultaneously
|
| 291 |
- **Process optimization**: Helps optimize pectin production conditions
|
| 292 |
- **Quality prediction**: Estimates pectin quality from process variables
|
| 293 |
- **Multiple algorithms**: 10 different ML algorithms for comparison
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 294 |
|
| 295 |
## 📄 License
|
| 296 |
|
|
@@ -306,4 +659,4 @@ MIT License
|
|
| 306 |
- [Pectin Production Technology](https://en.wikipedia.org/wiki/Pectin)
|
| 307 |
- [Scikit-learn](https://scikit-learn.org/)
|
| 308 |
- [Hugging Face Hub](https://huggingface.co/docs/hub/)
|
| 309 |
-
|
|
|
|
| 38 |
| Support Vector Regression | support_vector_regression | 0.4832 | 6612.2360 | Machine learning model for pectin production |
|
| 39 |
| XGBoost | xgboost | 0.9203 | 1074.2310 | XGBoost model with excellent performance on tabular data |
|
| 40 |
|
|
|
|
| 41 |
### Best Model Performance
|
| 42 |
- **Average R²**: 0.9427
|
| 43 |
- **Average MAE**: 868.44
|
|
|
|
| 46 |
## 📊 Model Details
|
| 47 |
|
| 48 |
### Target Variables
|
| 49 |
+
- `pectin_yield`: Пектиновые вещества, ПВ, % - Pectin yield (%)
|
| 50 |
+
- `galacturonic_acid`: Галактуроновая кислота, ГК, % - Galacturonic acid content (%)
|
| 51 |
+
- `molecular_weight`: Молекулярная масса, Mw, Д - Molecular weight (Da)
|
| 52 |
+
- `esterification_degree`: Степень этерификации, СЭ, % - Esterification degree (%)
|
|
|
|
| 53 |
|
| 54 |
### Feature Variables
|
| 55 |
+
- `time_min`: Время процесса, t, мин - Extraction time (minutes)
|
| 56 |
+
- `temperature_c`: Температура, T, °C - Temperature (°C)
|
| 57 |
+
- `pressure_atm`: Давление, P, атм - Pressure (atm)
|
| 58 |
+
- `ph`: Кислотность, pH - pH level
|
| 59 |
+
- `sample_encoded`: Тип сырья - Raw material type (encoded)
|
| 60 |
+
- `method_encoded`: Метод экстракции - Extraction method (encoded: 1 for fast ≤15 min, 0 for slow >15 min)
|
| 61 |
+
|
| 62 |
+
**Note**: The Т:Ж parameter (solid-to-liquid ratio; соотношение твёрдое:жидкое) was excluded from model training because it was held constant at 1:20 in all experiments and therefore carries no predictive information.
|
| 63 |
+
|
| 64 |
+
## 📋 Experimental Data Examples
|
| 65 |
+
|
| 66 |
+
### Sample Experimental Data
|
| 67 |
+
|
| 68 |
+
| Exp | Sample | t, мин | T, °C | P, атм | pH | ПВ, % | ГК, % | Mw, Д | СЭ, % |
|
| 69 |
+
|-----|--------|--------|-------|--------|-----|-------|-------|-------|-------|
|
| 70 |
+
| 1 | ЯП(М) | 7 | 120 | 2.08 | 2.0 | 25.864 | 52.706 | 103773.64 | 71.17 |
|
| 71 |
+
| 2 | ЯП(М) | 7 | 120 | 1.74 | 2.08 | 24.83 | 51.645 | 103098.49 | 70.015 |
|
| 72 |
+
| 3 | Абр. | 5 | 130 | 2.09 | 1.74 | 14.755 | 67.55 | 127235.35 | 82.813 |
|
| 73 |
+
| 4 | ЯП(М) | 7 | 120 | 2.05 | 2.0 | 26.353 | 53.804 | 105994.85 | 65.415 |
|
| 74 |
+
|
| 75 |
+
### Raw Material Types
|
| 76 |
+
|
| 77 |
+
| Code | Full Name | Type |
|
| 78 |
+
|------|-----------|------|
|
| 79 |
+
| Абр. | Абрикосовый (Apricot) | Fruit |
|
| 80 |
+
| Рв. | Ревень (Rhubarb) | Vegetable |
|
| 81 |
+
| Айв. | Айвы (Quince) | Fruit |
|
| 82 |
+
| Ткв. | Тыквенный (Pumpkin) | Vegetable |
|
| 83 |
+
| КрП | Корзинка подсолнечника (Sunflower head) | Plant |
|
| 84 |
+
| ЯП(Ф) | Яблочный пектин Файзобод (Apple Faizobod) | Fruit |
|
| 85 |
+
| ЯП(М) | Яблочный пектин Муминобод (Apple Muminobod) | Fruit |
|
| 86 |
|
| 87 |
## 🚀 Quick Start
|
| 88 |
|
| 89 |
### Installation
|
| 90 |
```bash
|
| 91 |
+
pip install transformers huggingface-hub scikit-learn xgboost pandas numpy joblib tabulate openpyxl
|
| 92 |
```
|
| 93 |
|
| 94 |
### Basic Usage
|
| 95 |
|
|
|
|
|
|
|
|
|
|
| 96 |
```python
|
| 97 |
from huggingface_hub import hf_hub_download
|
| 98 |
import joblib
|
|
|
|
| 100 |
import numpy as np
|
| 101 |
import pickle
|
| 102 |
|
| 103 |
+
import warnings
|
| 104 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
|
| 105 |
+
|
| 106 |
# Download model and supporting files
|
| 107 |
model_path = hf_hub_download(
|
| 108 |
repo_id="arabovs-ai-lab/PectinProductionModels",
|
|
|
|
| 128 |
with open(encoder_path, 'rb') as f:
|
| 129 |
label_encoder = pickle.load(f)
|
| 130 |
|
| 131 |
+
# Prepare input data (Т:Ж parameter is not required as it was constant)
|
| 132 |
+
input_data = {
|
| 133 |
+
'sample': 'Айв.',
|
| 134 |
+
'time_min': 5,
|
| 135 |
+
'temperature_c': 120,
|
| 136 |
+
'pressure_atm': 1.0,
|
| 137 |
+
'ph': 2.5
|
| 138 |
+
}
|
| 139 |
|
| 140 |
# Create DataFrame
|
| 141 |
df = pd.DataFrame([input_data])
|
|
|
|
| 143 |
# Preprocess: encode sample type
|
| 144 |
df['sample_encoded'] = label_encoder.transform([input_data['sample']])[0]
|
| 145 |
|
| 146 |
+
# Create method_encoded feature based on extraction time
|
| 147 |
df['method_encoded'] = 1 if input_data['time_min'] <= 15 else 0
|
| 148 |
|
| 149 |
# Select features in correct order
|
|
|
|
| 158 |
|
| 159 |
# Create results dictionary
|
| 160 |
results = {}
|
| 161 |
+
target_names = ['pectin_yield', 'galacturonic_acid', 'molecular_weight', 'esterification_degree']
|
| 162 |
+
for i, target in enumerate(target_names):
|
| 163 |
results[target] = predictions[0, i]
|
| 164 |
|
| 165 |
print("Prediction results:")
|
|
|
|
| 167 |
print(f" {target}: {value:.4f}")
|
| 168 |
```
|
| 169 |
|
| 170 |
+
## 🔬 Advanced Model Comparison System
|
| 171 |
+
|
| 172 |
+
For a comprehensive comparison of all available models, use the `PectinPredictor` class:
|
| 173 |
|
| 174 |
```python
|
| 175 |
import pandas as pd
|
| 176 |
+
import numpy as np
|
| 177 |
from huggingface_hub import hf_hub_download
|
| 178 |
import joblib
|
| 179 |
import pickle
|
| 180 |
+
import warnings
|
| 181 |
+
from sklearn.exceptions import InconsistentVersionWarning
|
| 182 |
+
from tabulate import tabulate
|
| 183 |
+
|
| 184 |
+
# Suppress sklearn version compatibility warnings
|
| 185 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
|
| 186 |
+
warnings.filterwarnings("ignore", category=UserWarning, module="xgboost")
|
| 187 |
|
| 188 |
class PectinPredictor:
|
| 189 |
+
"""
|
| 190 |
+
A machine learning model for predicting pectin production parameters
|
| 191 |
+
from experimental conditions using pre-trained models from Hugging Face Hub.
|
| 192 |
+
"""
|
| 193 |
+
|
| 194 |
+
# Available models with descriptions and metadata
|
| 195 |
+
AVAILABLE_MODELS = {
|
| 196 |
+
"best_model": {
|
| 197 |
+
"subfolder": "best_model",
|
| 198 |
+
"description": "🎯 Best overall model (Gradient Boosting) - optimal performance",
|
| 199 |
+
"color": "#FF6B6B"
|
| 200 |
+
},
|
| 201 |
+
"gradient_boosting": {
|
| 202 |
+
"subfolder": "gradient_boosting",
|
| 203 |
+
"description": "📈 Gradient Boosting - best for multi-task regression",
|
| 204 |
+
"color": "#4ECDC4"
|
| 205 |
+
},
|
| 206 |
+
"random_forest": {
|
| 207 |
+
"subfolder": "random_forest",
|
| 208 |
+
"description": "🌲 Random Forest - reliable and stable",
|
| 209 |
+
"color": "#45B7D1"
|
| 210 |
+
},
|
| 211 |
+
"xgboost": {
|
| 212 |
+
"subfolder": "xgboost",
|
| 213 |
+
"description": "⚡ XGBoost - high performance on tabular data",
|
| 214 |
+
"color": "#96CEB4"
|
| 215 |
+
},
|
| 216 |
+
"linear_regression": {
|
| 217 |
+
"subfolder": "linear_regression",
|
| 218 |
+
"description": "📊 Linear Regression - basic linear model",
|
| 219 |
+
"color": "#FECA57"
|
| 220 |
+
},
|
| 221 |
+
"extra_trees": {
|
| 222 |
+
"subfolder": "extra_trees",
|
| 223 |
+
"description": "🌳 Extra Trees - extreme random forests",
|
| 224 |
+
"color": "#FF9FF3"
|
| 225 |
+
},
|
| 226 |
+
"k_neighbors": {
|
| 227 |
+
"subfolder": "k-neighbors",
|
| 228 |
+
"description": "📏 K-Neighbors - nearest neighbors method",
|
| 229 |
+
"color": "#54A0FF"
|
| 230 |
+
},
|
| 231 |
+
"lasso_regression": {
|
| 232 |
+
"subfolder": "lasso_regression",
|
| 233 |
+
"description": "🎯 Lasso Regression - L1 regularization",
|
| 234 |
+
"color": "#5F27CD"
|
| 235 |
+
},
|
| 236 |
+
"multilayer_perceptron": {
|
| 237 |
+
"subfolder": "multilayer_perceptron",
|
| 238 |
+
"description": "🧠 Neural Network MLP - multilayer perceptron",
|
| 239 |
+
"color": "#00D2D3"
|
| 240 |
+
},
|
| 241 |
+
"ridge_regression": {
|
| 242 |
+
"subfolder": "ridge_regression",
|
| 243 |
+
"description": "🏔️ Ridge Regression - L2 regularization",
|
| 244 |
+
"color": "#FF9F43"
|
| 245 |
+
},
|
| 246 |
+
"support_vector_regression": {
|
| 247 |
+
"subfolder": "support_vector_regression",
|
| 248 |
+
"description": "🔗 Support Vector Regression - support vector method",
|
| 249 |
+
"color": "#A3CB38"
|
| 250 |
+
}
|
| 251 |
+
}
|
| 252 |
+
|
| 253 |
def __init__(self, repo_id="arabovs-ai-lab/PectinProductionModels"):
|
| 254 |
+
"""Initialize the predictor with model repository ID."""
|
| 255 |
self.repo_id = repo_id
|
| 256 |
self.model = None
|
| 257 |
self.scaler = None
|
| 258 |
self.label_encoder = None
|
| 259 |
+
# Model input features (after preprocessing)
|
| 260 |
self.feature_columns = ['time_min', 'temperature_c', 'pressure_atm', 'ph', 'sample_encoded', 'method_encoded']
|
| 261 |
+
# Model output targets (pectin characteristics)
|
| 262 |
self.target_columns = ['pectin_yield', 'galacturonic_acid', 'molecular_weight', 'esterification_degree']
|
| 263 |
|
| 264 |
+
def load_from_hub(self, model_type="best_model"):
|
| 265 |
+
"""
|
| 266 |
+
Load model, scaler, and label encoder from Hugging Face Hub repository.
|
| 267 |
+
|
| 268 |
+
Args:
|
| 269 |
+
model_type: Key from AVAILABLE_MODELS to load specific model
|
| 270 |
+
"""
|
| 271 |
+
if model_type not in self.AVAILABLE_MODELS:
|
| 272 |
+
raise ValueError(f"Model type '{model_type}' not found. Available: {list(self.AVAILABLE_MODELS.keys())}")
|
| 273 |
+
|
| 274 |
+
model_info = self.AVAILABLE_MODELS[model_type]
|
| 275 |
+
|
| 276 |
+
# Download and load the specified model
|
| 277 |
model_path = hf_hub_download(
|
| 278 |
repo_id=self.repo_id,
|
| 279 |
+
filename=f"{model_info['subfolder']}/model.pkl",
|
| 280 |
repo_type="model"
|
| 281 |
)
|
|
|
|
| 282 |
|
| 283 |
+
with warnings.catch_warnings():
|
| 284 |
+
warnings.filterwarnings("ignore", category=UserWarning)
|
| 285 |
+
self.model = joblib.load(model_path)
|
| 286 |
+
|
| 287 |
+
# Download and load the feature scaler for data normalization
|
| 288 |
scaler_path = hf_hub_download(
|
| 289 |
repo_id=self.repo_id,
|
| 290 |
filename="scaler.pkl",
|
|
|
|
| 292 |
)
|
| 293 |
self.scaler = joblib.load(scaler_path)
|
| 294 |
|
| 295 |
+
# Download and load the label encoder for sample type conversion
|
| 296 |
encoder_path = hf_hub_download(
|
| 297 |
repo_id=self.repo_id,
|
| 298 |
filename="label_encoder.pkl",
|
|
|
|
| 300 |
)
|
| 301 |
with open(encoder_path, 'rb') as f:
|
| 302 |
self.label_encoder = pickle.load(f)
|
| 303 |
+
|
| 304 |
+
def prepare_dataframe(self, df):
|
| 305 |
+
"""
|
| 306 |
+
Rename DataFrame columns from Russian to English to match model expectations.
|
| 307 |
+
"""
|
| 308 |
+
column_mapping = {
|
| 309 |
+
'Образец \nпектина': 'sample',
|
| 310 |
+
't, мин': 'time_min',
|
| 311 |
+
'T, °C': 'temperature_c',
|
| 312 |
+
'P, атм': 'pressure_atm',
|
| 313 |
+
'pH': 'ph'
|
| 314 |
+
}
|
| 315 |
+
return df.rename(columns=column_mapping)
|
| 316 |
+
|
| 317 |
+
def preprocess_input(self, input_df):
|
| 318 |
+
"""
|
| 319 |
+
Preprocess input data for model prediction.
|
| 320 |
+
Applies feature engineering, encoding, and scaling.
|
| 321 |
+
"""
|
| 322 |
processed_df = input_df.copy()
|
| 323 |
|
| 324 |
+
# Convert sample names to numeric codes using trained label encoder
|
| 325 |
processed_df['sample_encoded'] = self.label_encoder.transform(processed_df['sample'])
|
| 326 |
|
| 327 |
+
# Create binary feature indicating extraction method based on time
|
| 328 |
processed_df['method_encoded'] = np.where(processed_df['time_min'] <= 15, 1, 0)
|
| 329 |
|
| 330 |
+
# Select features in correct order and apply scaling
|
| 331 |
X = processed_df[self.feature_columns]
|
| 332 |
X_scaled = self.scaler.transform(X)
|
| 333 |
|
| 334 |
+
return X_scaled
|
| 335 |
+
|
| 336 |
+
def predict_batch(self, input_df, model_type="best_model"):
|
| 337 |
+
"""
|
| 338 |
+
Generate predictions for multiple experimental conditions.
|
| 339 |
+
|
| 340 |
+
Args:
|
| 341 |
+
input_df: DataFrame containing experimental parameters
|
| 342 |
+
model_type: Which model to use for prediction
|
| 343 |
+
|
| 344 |
+
Returns:
|
| 345 |
+
Original DataFrame augmented with prediction columns
|
| 346 |
+
"""
|
| 347 |
+
# Load specified model if not already loaded or different from current
|
| 348 |
+
if self.model is None or model_type != getattr(self, '_current_model', None):
|
| 349 |
+
self.load_from_hub(model_type)
|
| 350 |
+
self._current_model = model_type
|
| 351 |
+
|
| 352 |
+
# Preprocess input data
|
| 353 |
+
X_scaled = self.preprocess_input(input_df)
|
| 354 |
+
|
| 355 |
+
# Generate predictions using the trained model
|
| 356 |
predictions = self.model.predict(X_scaled)
|
| 357 |
|
| 358 |
+
# Combine original data with predictions
|
| 359 |
result_df = input_df.copy()
|
| 360 |
for i, target in enumerate(self.target_columns):
|
| 361 |
result_df[f'predicted_{target}'] = predictions[:, i]
|
| 362 |
|
| 363 |
return result_df
|
| 364 |
|
| 365 |
+
def compare_all_models(self, input_data):
|
| 366 |
+
"""
|
| 367 |
+
Compare predictions from ALL available machine learning models.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 368 |
|
| 369 |
+
Args:
|
| 370 |
+
input_data: DataFrame or dictionary with input features
|
| 371 |
+
|
| 372 |
+
Returns:
|
| 373 |
+
DataFrame with predictions from each model for easy comparison
|
| 374 |
+
"""
|
| 375 |
+
# Convert single input to DataFrame if needed
|
| 376 |
+
if isinstance(input_data, dict):
|
| 377 |
+
input_df = pd.DataFrame([input_data])
|
| 378 |
+
else:
|
| 379 |
+
input_df = input_data.copy()
|
| 380 |
+
|
| 381 |
+
# Preprocess input data once for all models
|
| 382 |
+
X_scaled = self.preprocess_input(input_df)
|
| 383 |
+
|
| 384 |
+
comparison_results = []
|
| 385 |
+
|
| 386 |
+
for model_name, model_info in self.AVAILABLE_MODELS.items():
|
| 387 |
+
try:
|
| 388 |
+
# Download and load model
|
| 389 |
+
model_path = hf_hub_download(
|
| 390 |
+
repo_id=self.repo_id,
|
| 391 |
+
filename=f"{model_info['subfolder']}/model.pkl",
|
| 392 |
+
repo_type="model"
|
| 393 |
+
)
|
| 394 |
+
|
| 395 |
+
# Load model with suppressed warnings
|
| 396 |
+
with warnings.catch_warnings():
|
| 397 |
+
warnings.filterwarnings("ignore", category=UserWarning)
|
| 398 |
+
model = joblib.load(model_path)
|
| 399 |
+
|
| 400 |
+
# Generate predictions
|
| 401 |
+
predictions = model.predict(X_scaled)
|
| 402 |
+
|
| 403 |
+
# Extract predictions for this sample
|
| 404 |
+
result = {
|
| 405 |
+
'model': model_name,
|
| 406 |
+
'description': model_info['description']
|
| 407 |
+
}
|
| 408 |
+
|
| 409 |
+
for i, target in enumerate(self.target_columns):
|
| 410 |
+
if len(predictions.shape) > 1:
|
| 411 |
+
result[target] = predictions[0, i]
|
| 412 |
+
else:
|
| 413 |
+
result[target] = predictions[i]
|
| 414 |
+
|
| 415 |
+
comparison_results.append(result)
|
| 416 |
+
|
| 417 |
+
except Exception as e:
|
| 418 |
+
print(f"⚠️ Could not load model {model_name}: {e}")
|
| 419 |
+
continue
|
| 420 |
+
|
| 421 |
+
return pd.DataFrame(comparison_results)
|
| 422 |
+
|
| 423 |
+
def create_comparison_tables(self, comparison_df):
|
| 424 |
+
"""
|
| 425 |
+
Create formatted comparison tables for easy analysis.
|
| 426 |
+
|
| 427 |
+
Args:
|
| 428 |
+
comparison_df: DataFrame from compare_all_models()
|
| 429 |
+
|
| 430 |
+
Returns:
|
| 431 |
+
Dictionary with different formatted tables
|
| 432 |
+
"""
|
| 433 |
+
tables = {}
|
| 434 |
+
|
| 435 |
+
# Table 1: Detailed comparison with all metrics
|
| 436 |
+
detailed_table = comparison_df.copy()
|
| 437 |
+
detailed_table = detailed_table.round(4)
|
| 438 |
+
tables['detailed'] = tabulate(
|
| 439 |
+
detailed_table,
|
| 440 |
+
headers='keys',
|
| 441 |
+
tablefmt='grid',
|
| 442 |
+
showindex=False
|
| 443 |
)
|
| 444 |
|
| 445 |
+
# Table 2: Summary statistics
|
| 446 |
+
summary_data = []
|
| 447 |
+
for target in self.target_columns:
|
| 448 |
+
values = comparison_df[target]
|
| 449 |
+
summary_data.append({
|
| 450 |
+
'Target': target,
|
| 451 |
+
'Mean': values.mean(),
|
| 452 |
+
'Std': values.std(),
|
| 453 |
+
'Min': values.min(),
|
| 454 |
+
'Max': values.max(),
|
| 455 |
+
'Range': values.max() - values.min()
|
| 456 |
+
})
|
| 457 |
|
| 458 |
+
summary_df = pd.DataFrame(summary_data).round(4)
|
| 459 |
+
tables['summary'] = tabulate(
|
| 460 |
+
summary_df,
|
| 461 |
+
headers='keys',
|
| 462 |
+
tablefmt='grid',
|
| 463 |
+
showindex=False
|
| 464 |
+
)
|
| 465 |
|
| 466 |
+
# Table 3: Ranked by pectin yield (most important metric)
|
| 467 |
+
ranked_df = comparison_df.sort_values('pectin_yield', ascending=False).round(4)
|
| 468 |
+
tables['ranked'] = tabulate(
|
| 469 |
+
ranked_df,
|
| 470 |
+
headers='keys',
|
| 471 |
+
tablefmt='grid',
|
| 472 |
+
showindex=False
|
| 473 |
+
)
|
| 474 |
+
|
| 475 |
+
return tables
|
| 476 |
+
|
| 477 |
+
def calculate_prediction_metrics(self, df_with_predictions):
|
| 478 |
+
"""
|
| 479 |
+
Calculate basic metrics to evaluate prediction quality against actual values.
|
| 480 |
+
"""
|
| 481 |
+
metrics = {}
|
| 482 |
+
|
| 483 |
+
for target in self.target_columns:
|
| 484 |
+
actual_col = None
|
| 485 |
+
# Find the actual value column
|
| 486 |
+
if target == 'pectin_yield':
|
| 487 |
+
actual_col = 'ПВ, %'
|
| 488 |
+
elif target == 'galacturonic_acid':
|
| 489 |
+
actual_col = 'ГК, %'
|
| 490 |
+
elif target == 'molecular_weight':
|
| 491 |
+
actual_col = 'Mw, Д'
|
| 492 |
+
elif target == 'esterification_degree':
|
| 493 |
+
actual_col = 'СЭ, %'
|
| 494 |
+
|
| 495 |
+
if actual_col and actual_col in df_with_predictions.columns:
|
| 496 |
+
actual = df_with_predictions[actual_col]
|
| 497 |
+
predicted = df_with_predictions[f'predicted_{target}']
|
| 498 |
+
|
| 499 |
+
# Calculate metrics
|
| 500 |
+
rmse = np.sqrt(np.mean((actual - predicted) ** 2))
|
| 501 |
+
mae = np.mean(np.abs(actual - predicted))
|
| 502 |
+
|
| 503 |
+
metrics[target] = {
|
| 504 |
+
'RMSE': rmse,
|
| 505 |
+
'MAE': mae,
|
| 506 |
+
'correlation': np.corrcoef(actual, predicted)[0, 1]
|
| 507 |
+
}
|
| 508 |
+
|
| 509 |
+
return metrics
|
| 510 |
|
| 511 |
+
# Example usage
|
| 512 |
+
if __name__ == "__main__":
|
| 513 |
+
# Initialize predictor
|
| 514 |
+
predictor = PectinPredictor()
|
| 515 |
+
|
| 516 |
+
# Load experimental data
|
| 517 |
+
df = pd.read_excel("/content/ShortExperiments_DataSet.xlsx")
|
| 518 |
+
df_renamed = predictor.prepare_dataframe(df)
|
| 519 |
+
|
| 520 |
+
print("🔬 PECTIN PRODUCTION MODEL COMPARISON SYSTEM")
|
| 521 |
+
print("=" * 60)
|
| 522 |
+
|
| 523 |
+
# 1. Batch prediction with best model
|
| 524 |
+
print("\n1. BATCH PREDICTIONS WITH BEST MODEL:")
|
| 525 |
+
print("-" * 40)
|
| 526 |
+
|
| 527 |
+
results = predictor.predict_batch(df_renamed, model_type="best_model")
|
| 528 |
+
print(f"✅ Processed {len(results)} experiments")
|
| 529 |
+
|
| 530 |
+
# Calculate prediction quality metrics
|
| 531 |
+
metrics = predictor.calculate_prediction_metrics(results)
|
| 532 |
+
print("\n📊 PREDICTION QUALITY METRICS:")
|
| 533 |
+
for target, metric in metrics.items():
|
| 534 |
+
print(f" {target}:")
|
| 535 |
+
print(f" RMSE: {metric['RMSE']:.4f}")
|
| 536 |
+
print(f" MAE: {metric['MAE']:.4f}")
|
| 537 |
+
print(f" Correlation: {metric['correlation']:.4f}")
|
| 538 |
+
|
| 539 |
+
# 2. Compare all models for a single experiment
|
| 540 |
+
print("\n2. COMPARING ALL MODELS FOR SINGLE EXPERIMENT:")
|
| 541 |
+
print("-" * 50)
|
| 542 |
+
|
| 543 |
+
single_experiment = {
|
| 544 |
+
'sample': 'ЯП(М)',
|
| 545 |
+
'time_min': 7,
|
| 546 |
+
'temperature_c': 120,
|
| 547 |
+
'pressure_atm': 2.08,
|
| 548 |
+
'ph': 2.0
|
| 549 |
+
}
|
| 550 |
+
|
| 551 |
+
print(f"🔍 Input parameters: {single_experiment}")
|
| 552 |
+
|
| 553 |
+
# Compare all models
|
| 554 |
+
comparison_df = predictor.compare_all_models(single_experiment)
|
| 555 |
+
|
| 556 |
+
# Create and display comparison tables
|
| 557 |
+
tables = predictor.create_comparison_tables(comparison_df)
|
| 558 |
+
|
| 559 |
+
print("\n📋 DETAILED MODEL COMPARISON:")
|
| 560 |
+
print(tables['detailed'])
|
| 561 |
+
|
| 562 |
+
print("\n📈 PREDICTION SUMMARY STATISTICS:")
|
| 563 |
+
print(tables['summary'])
|
| 564 |
+
|
| 565 |
+
print("\n🏆 MODELS RANKED BY PECTIN YIELD:")
|
| 566 |
+
print(tables['ranked'])
|
| 567 |
+
|
| 568 |
+
# 3. Show available models
|
| 569 |
+
print("\n3. AVAILABLE MODELS:")
|
| 570 |
+
print("-" * 20)
|
| 571 |
+
for model_name, info in predictor.AVAILABLE_MODELS.items():
|
| 572 |
+
print(f" • {model_name}: {info['description']}")
|
| 573 |
+
|
| 574 |
+
print(f"\n🎯 Total models available: {len(predictor.AVAILABLE_MODELS)}")
|
| 575 |
+
print(f"✅ Successfully loaded: {len(comparison_df)}")
|
| 576 |
+
```
|
| 577 |
|
| 578 |
## 📁 Repository Structure
|
| 579 |
|
|
|
|
| 593 |
├── k_neighbors/ # K-Neighbors model
|
| 594 |
├── multilayer_perceptron/ # MLP model
|
| 595 |
├── scaler.pkl # Feature scaler
|
| 596 |
+
├── label_encoder.pkl # Label encoder for sample types
|
| 597 |
├── model_metadata.json # Training metadata
|
| 598 |
├── models_metadata.json # All models metadata
|
| 599 |
└── README.md # This file
|
|
|
|
| 602 |
## 🧪 Training Information
|
| 603 |
|
| 604 |
- **Dataset**: 1000 experimental records
|
| 605 |
+
- **Features**: 6 process parameters (excluding the constant Т:Ж solid-to-liquid ratio)
|
| 606 |
- **Targets**: 4 quality parameters
|
| 607 |
- **Validation**: 80/20 train-test split
|
| 608 |
- **Cross-validation**: 5-fold
|
|
|
|
| 610 |
|
| 611 |
## 💡 Key Features
|
| 612 |
|
| 613 |
+
- **Multi-target regression**: Predicts 4 pectin quality parameters simultaneously
|
| 614 |
- **Process optimization**: Helps optimize pectin production conditions
|
| 615 |
- **Quality prediction**: Estimates pectin quality from process variables
|
| 616 |
- **Multiple algorithms**: 10 different ML algorithms for comparison
|
| 617 |
+
- **Industrial focus**: Specifically designed for pectin production technology
|
| 618 |
+
|
| 619 |
+
## ⚠️ Important Notes
|
| 620 |
+
|
| 621 |
+
### Data Requirements:
|
| 622 |
+
- **Supported samples**: 7 types as listed above
|
| 623 |
+
- **Parameter ranges**:
|
| 624 |
+
- Time: 5-180 minutes
|
| 625 |
+
- Temperature: 60-160°C
|
| 626 |
+
- Pressure: 1.0-5.0 atm
|
| 627 |
+
- pH: 1.5-4.0
|
| 628 |
+
|
| 629 |
+
### Limitations:
|
| 630 |
+
- Models trained on specific raw materials listed above
|
| 631 |
+
- Accuracy may decrease outside trained parameter ranges
|
| 632 |
+
- Retraining required for new types of raw materials
|
| 633 |
+
|
| 634 |
+
## 📜 Citation
|
| 635 |
+
|
| 636 |
+
If you use this model in your research, please cite it as:
|
| 637 |
+
|
| 638 |
+
```bibtex
|
| 639 |
+
@misc{PectinProductionModels2025,
|
| 640 |
+
title = {Pectin Production Models: Machine Learning for Predicting Pectin Quality Parameters},
|
| 641 |
+
author = {Arabovs AI Lab},
|
| 642 |
+
year = {2025},
|
| 643 |
+
publisher = {Hugging Face},
|
| 644 |
+
url = {https://huggingface.co/arabovs-ai-lab/PectinProductionModels}
|
| 645 |
+
}
|
| 646 |
+
```
|
| 647 |
|
| 648 |
## 📄 License
|
| 649 |
|
|
|
|
| 659 |
- [Pectin Production Technology](https://en.wikipedia.org/wiki/Pectin)
|
| 660 |
- [Scikit-learn](https://scikit-learn.org/)
|
| 661 |
- [Hugging Face Hub](https://huggingface.co/docs/hub/)
|
| 662 |
+
```
|