Spaces:
Running
Running
Add data structures and AutoML implementations
Browse files- src/mlpipeline/automl/__init__.py +4 -0
- src/mlpipeline/automl/autogluon_trainer.py +53 -0
- src/mlpipeline/automl/automl_factory.py +32 -0
- src/mlpipeline/automl/flaml_trainer.py +67 -0
- src/mlpipeline/automl/pycaret_trainer.py +91 -0
- src/mlpipeline/config/__init__.py +1 -0
- src/mlpipeline/config/configuration.py +99 -0
- src/mlpipeline/entity/__init__.py +18 -0
- src/mlpipeline/entity/artifact_entity.py +48 -0
- src/mlpipeline/entity/config_entity.py +62 -0
src/mlpipeline/automl/__init__.py
CHANGED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mlpipeline.automl.autogluon_trainer import AutoGluonTrainer
|
| 2 |
+
from mlpipeline.automl.flaml_trainer import FLAMLTrainer
|
| 3 |
+
from mlpipeline.automl.pycaret_trainer import PyCaretTrainer
|
| 4 |
+
from mlpipeline.automl.automl_factory import AutoMLFactory
|
src/mlpipeline/automl/autogluon_trainer.py
CHANGED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from typing import Dict, Any, Optional
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from autogluon.tabular import TabularPredictor
|
| 5 |
+
|
| 6 |
+
from mlpipeline.logging.logger import get_logger
|
| 7 |
+
|
| 8 |
+
logger = get_logger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class AutoGluonTrainer:
    """Wrapper that trains, loads and queries an AutoGluon ``TabularPredictor``.

    Optional settings are read from ``config`` with ``dict.get``; the keys
    consulted are ``eval_metric``, ``verbosity``, ``time_limit``, ``presets``,
    ``num_bag_folds`` and ``num_stack_levels``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Keep ``config``; ``predictor`` stays ``None`` until ``train``/``load``."""
        self.predictor: Optional[TabularPredictor] = None
        self.config = config

    def train(self, train_data: pd.DataFrame, target_column: str, model_path: Path) -> Dict[str, float]:
        """Fit a predictor on ``train_data`` and report the top leaderboard row.

        Args:
            train_data: Frame handed to ``TabularPredictor.fit`` unchanged.
            target_column: Column name passed as the predictor's ``label``.
            model_path: Location passed (as a string) as the predictor's ``path``.

        Returns:
            ``{'score': <score_val>, 'score_test': <score_test or 0.0>}`` taken
            from the first row of the leaderboard.
        """
        logger.info("Starting AutoGluon training")

        settings = self.config
        constructor_kwargs = {
            'label': target_column,
            'path': str(model_path),
            'eval_metric': settings.get('eval_metric'),
            'verbosity': settings.get('verbosity', 2),
        }
        predictor = TabularPredictor(**constructor_kwargs)
        # Published on the instance before fitting, so it is set even if fit() raises.
        self.predictor = predictor

        fit_kwargs = {
            'time_limit': settings.get('time_limit', 600),
            'presets': settings.get('presets', 'medium_quality'),
            'num_bag_folds': settings.get('num_bag_folds', 5),
            'num_stack_levels': settings.get('num_stack_levels', 1),
        }
        predictor.fit(train_data=train_data, **fit_kwargs)

        top_row = predictor.leaderboard(silent=True).iloc[0]
        validation_score = float(top_row['score_val'])
        # 'score_test' is looked up with a 0.0 fallback for when the row lacks it.
        test_score = float(top_row.get('score_test', 0.0))

        logger.info(f"AutoGluon training completed. Best score: {validation_score}")
        return {'score': validation_score, 'score_test': test_score}

    def predict(self, data: pd.DataFrame) -> pd.Series:
        """Return the predictor's output for ``data``.

        Raises:
            ValueError: If neither ``train`` nor ``load`` has set a predictor.
        """
        predictor = self.predictor
        if predictor is None:
            raise ValueError("Model not trained. Call train() first.")
        return predictor.predict(data)

    def load(self, model_path: Path):
        """Load a predictor from ``model_path`` and return ``self`` for chaining."""
        logger.info(f"Loading AutoGluon model from {model_path}")
        location = str(model_path)
        self.predictor = TabularPredictor.load(location)
        return self
|
src/mlpipeline/automl/automl_factory.py
CHANGED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Union
|
| 2 |
+
from mlpipeline.automl.autogluon_trainer import AutoGluonTrainer
|
| 3 |
+
from mlpipeline.automl.flaml_trainer import FLAMLTrainer
|
| 4 |
+
from mlpipeline.automl.pycaret_trainer import PyCaretTrainer
|
| 5 |
+
from mlpipeline.logging.logger import get_logger
|
| 6 |
+
|
| 7 |
+
logger = get_logger(__name__)
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class AutoMLFactory:
    """Factory that maps a library name to the matching trainer class."""

    @staticmethod
    def create_trainer(library: str, config: dict) -> Union[AutoGluonTrainer, FLAMLTrainer, PyCaretTrainer]:
        """Instantiate the trainer registered under ``library``.

        Args:
            library: Library name; compared case-insensitively.
            config: Passed unchanged to the trainer's constructor.

        Raises:
            ValueError: If ``library`` is not one of the supported names.
        """
        library = library.lower()

        # Lambdas keep construction lazy: only the selected trainer is touched.
        registry = {
            'autogluon': ("Creating AutoGluon trainer", lambda: AutoGluonTrainer(config)),
            'flaml': ("Creating FLAML trainer", lambda: FLAMLTrainer(config)),
            'pycaret': ("Creating PyCaret trainer", lambda: PyCaretTrainer(config)),
        }

        entry = registry.get(library)
        if entry is None:
            raise ValueError(f"Unknown AutoML library: {library}. Choose from: autogluon, flaml, pycaret")

        announcement, build = entry
        logger.info(announcement)
        return build()

    @staticmethod
    def get_available_libraries():
        """Return a new list of the library names ``create_trainer`` accepts."""
        supported = ['autogluon', 'flaml', 'pycaret']
        return supported
|
src/mlpipeline/automl/flaml_trainer.py
CHANGED
|
@@ -0,0 +1,67 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from typing import Dict, Any, Optional
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
from flaml import AutoML
|
| 6 |
+
from sklearn.metrics import accuracy_score, r2_score
|
| 7 |
+
|
| 8 |
+
from mlpipeline.logging.logger import get_logger
|
| 9 |
+
from mlpipeline.utils.common import save_pickle, load_pickle
|
| 10 |
+
|
| 11 |
+
logger = get_logger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class FLAMLTrainer:
    """Wrapper that trains, pickles and queries a FLAML ``AutoML`` object.

    ``config['task']`` (default ``'classification'``) selects both the FLAML
    task and the metric reported by ``train``. Other keys read with defaults:
    ``time_budget``, ``metric``, ``estimator_list``, ``n_jobs``, ``verbose``
    and ``early_stop``.
    """

    def __init__(self, config: Dict[str, Any]):
        """Keep ``config`` and resolve the task; no model exists yet."""
        self.automl: Optional[AutoML] = None
        self.config = config
        self.task = config.get('task', 'classification')

    def train(self, train_data: pd.DataFrame, target_column: str, model_path: Path) -> Dict[str, float]:
        """Fit FLAML on ``train_data`` and save the fitted object.

        Args:
            train_data: Frame containing the features and ``target_column``.
            target_column: Name of the label column; dropped from the features.
            model_path: Directory-like path; the model is saved as
                ``model_path / 'model.pkl'`` through ``save_pickle``.

        Returns:
            A dict with ``'accuracy'`` (classification) or ``'r2_score'``
            (any other task), computed on the training data itself, plus
            ``'best_loss'`` from the fitted ``AutoML`` object.
        """
        logger.info("Starting FLAML training")

        labels = train_data[target_column]
        features = train_data.drop(columns=[target_column])

        options = self.config
        searcher = AutoML()
        self.automl = searcher
        searcher.fit(
            X_train=features,
            y_train=labels,
            time_budget=options.get('time_budget', 600),
            metric=options.get('metric', 'auto'),
            task=self.task,
            estimator_list=options.get('estimator_list', ['lgbm', 'xgboost', 'rf']),
            n_jobs=options.get('n_jobs', -1),
            verbose=options.get('verbose', 1),
            early_stop=options.get('early_stop', True),
        )

        # Predictions are made on the training rows, so this is a training
        # score rather than a held-out estimate.
        fitted_predictions = searcher.predict(features)
        metric_name, scorer = (
            ('accuracy', accuracy_score)
            if self.task == 'classification'
            else ('r2_score', r2_score)
        )
        score = scorer(labels, fitted_predictions)

        save_pickle(model_path / 'model.pkl', searcher)

        metrics = {metric_name: float(score), 'best_loss': float(searcher.best_loss)}

        logger.info(f"FLAML training completed. Best {metric_name}: {score}")
        return metrics

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Return the fitted model's predictions for ``data``.

        Raises:
            ValueError: If neither ``train`` nor ``load`` has set a model.
        """
        searcher = self.automl
        if searcher is None:
            raise ValueError("Model not trained. Call train() first.")
        return searcher.predict(data)

    def load(self, model_path: Path):
        """Restore the model from ``model_path / 'model.pkl'`` and return ``self``."""
        logger.info(f"Loading FLAML model from {model_path}")
        # NOTE(review): load_pickle presumably unpickles the file; only load
        # model files from trusted locations — confirm against utils.common.
        pickle_file = model_path / 'model.pkl'
        self.automl = load_pickle(pickle_file)
        return self
|
src/mlpipeline/automl/pycaret_trainer.py
CHANGED
|
@@ -0,0 +1,91 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from typing import Dict, Any, Optional
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from pycaret.classification import setup as classification_setup, compare_models as classification_compare, finalize_model as classification_finalize, save_model as classification_save, load_model as classification_load
|
| 5 |
+
from pycaret.regression import setup as regression_setup, compare_models as regression_compare, finalize_model as regression_finalize, save_model as regression_save, load_model as regression_load
|
| 6 |
+
|
| 7 |
+
from mlpipeline.logging.logger import get_logger
|
| 8 |
+
|
| 9 |
+
logger = get_logger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class PyCaretTrainer:
    """Train, persist and serve a model selected with PyCaret.

    The task (classification vs. regression) is inferred from the target
    column in ``train``. Optional settings read from ``config``:
    ``session_id``, ``fold``, ``verbose``, ``use_gpu``, ``n_select`` and a
    nested ``tuning`` mapping with ``enabled``, ``n_iter`` and ``optimize``.
    ``load`` additionally consults ``config['task']`` when the task is not
    yet known.
    """

    # Task names treated as classification / regression when read from config.
    _CLASSIFICATION_TASKS = ('classification', 'binary', 'multiclass')
    _REGRESSION_TASKS = ('regression',)

    def __init__(self, config: Dict[str, Any]):
        """Store ``config``; no model is available until ``train`` or ``load``."""
        self.config = config
        self.model: Optional[Any] = None
        # True / False once the task is known, None until then.
        self.is_classification = None

    @staticmethod
    def _first_model(candidates):
        """Return a single estimator from ``compare_models`` output.

        ``compare_models`` returns a list when ``n_select`` is greater than 1;
        ``tune_model`` and ``finalize_model`` expect one estimator.

        Raises:
            ValueError: If the list of candidates is empty.
        """
        if isinstance(candidates, (list, tuple)):
            if not candidates:
                raise ValueError("PyCaret compare_models returned no models")
            return candidates[0]
        return candidates

    @staticmethod
    def _extract_score(results, metric):
        """Pick one scalar score out of a PyCaret score grid.

        Handles both grid layouts: the cross-validation grid (fold rows plus
        ``'Mean'``/``'Std'`` index labels, one column per metric) and the
        comparison grid (one row per model, best first). Returns ``0.0`` when
        no usable value is present.
        """
        if results is None or results.empty:
            return 0.0
        # Layout the original code expected: a literal 'Mean' column.
        if 'Mean' in results.columns:
            return float(results.iloc[0]['Mean'])
        if metric not in results.columns:
            numeric = results.select_dtypes(include='number')
            if numeric.empty:
                return 0.0
            metric = numeric.columns[0]
        row = results.loc['Mean'] if 'Mean' in results.index else results.iloc[0]
        return float(row[metric])

    @staticmethod
    def _select_prediction_column(predictions):
        """Return the predicted-label column of a ``predict_model`` frame.

        For classification the last column holds the prediction score, not the
        label, so the label column is selected by name. Falls back to the last
        column when no known label column exists.
        """
        for column in ('prediction_label', 'Label'):
            if column in predictions.columns:
                return predictions[column]
        return predictions.iloc[:, -1]

    def train(self, train_data: pd.DataFrame, target_column: str, model_path: Path) -> Dict[str, float]:
        """Select, optionally tune, finalize and save a model.

        Args:
            train_data: Frame containing the features and ``target_column``.
            target_column: Name of the label column.
            model_path: Directory in which the model is saved as ``model``.

        Returns:
            ``{'score': <value>}`` taken from the most recent PyCaret score
            grid for the optimised metric (``0.0`` if unavailable).

        Raises:
            ValueError: If model comparison yields no models.
        """
        logger.info("Starting PyCaret training")

        target = train_data[target_column]
        # Heuristic: string-typed or low-cardinality targets are treated as classes.
        if target.dtype == 'object' or target.nunique() < 20:
            from pycaret.classification import pull as pull_fn, tune_model as tune_fn
            self.is_classification = True
            setup_fn = classification_setup
            compare_fn = classification_compare
            finalize_fn = classification_finalize
            save_fn = classification_save
            default_metric = 'Accuracy'
        else:
            from pycaret.regression import pull as pull_fn, tune_model as tune_fn
            self.is_classification = False
            setup_fn = regression_setup
            compare_fn = regression_compare
            finalize_fn = regression_finalize
            save_fn = regression_save
            # 'Accuracy' is not a regression metric; use PyCaret's regression default.
            default_metric = 'R2'

        setup_fn(
            data=train_data,
            target=target_column,
            session_id=self.config.get('session_id', 42),
            fold=self.config.get('fold', 5),
            verbose=self.config.get('verbose', False),
            use_gpu=self.config.get('use_gpu', False),
        )

        candidates = compare_fn(
            n_select=self.config.get('n_select', 5),
            verbose=self.config.get('verbose', False),
        )
        best_model = self._first_model(candidates)

        # ``tuning: null`` in the config must not crash the nested lookups.
        tuning_cfg = self.config.get('tuning') or {}
        metric = tuning_cfg.get('optimize', default_metric)

        if tuning_cfg.get('enabled', True):
            best_model = tune_fn(
                best_model,
                n_iter=tuning_cfg.get('n_iter', 10),
                optimize=metric,
            )

        # Read the score grid now, while it still belongs to the compare/tune
        # step (NOTE(review): finalize_model may replace it — confirm).
        results = pull_fn()

        self.model = finalize_fn(best_model)

        # The model is written as a file inside model_path, so it must exist.
        model_path.mkdir(parents=True, exist_ok=True)
        save_fn(self.model, str(model_path / 'model'))

        metrics = {
            'score': self._extract_score(results, metric),
        }

        logger.info(f"PyCaret training completed. Score: {metrics['score']}")
        return metrics

    def predict(self, data: pd.DataFrame) -> pd.Series:
        """Return predicted labels/values for ``data``.

        Raises:
            ValueError: If neither ``train`` nor ``load`` has set a model.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        if self.is_classification:
            from pycaret.classification import predict_model as predict_fn
        else:
            from pycaret.regression import predict_model as predict_fn

        predictions = predict_fn(self.model, data=data)
        return self._select_prediction_column(predictions)

    def load(self, model_path: Path):
        """Load the model saved under ``model_path / 'model'`` and return ``self``.

        When the task was not established by an earlier ``train`` call, it is
        taken from ``config['task']`` if that names a known task; otherwise
        the regression code path is used, as before.
        """
        logger.info(f"Loading PyCaret model from {model_path}")

        if self.is_classification is None:
            task = self.config.get('task')
            task_name = str(task).lower() if task is not None else None
            if task_name in self._CLASSIFICATION_TASKS:
                self.is_classification = True
            elif task_name in self._REGRESSION_TASKS:
                self.is_classification = False
            else:
                logger.warning(
                    "PyCaret task type unknown at load time; using regression "
                    "handlers. Set config['task'] to make this explicit."
                )

        load_fn = classification_load if self.is_classification else regression_load
        self.model = load_fn(str(model_path / 'model'))
        return self
|
src/mlpipeline/config/__init__.py
CHANGED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
from mlpipeline.config.configuration import ConfigurationManager
|
src/mlpipeline/config/configuration.py
CHANGED
|
@@ -0,0 +1,99 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from mlpipeline.constants import CONFIG_FILE_PATH
|
| 3 |
+
from mlpipeline.utils.common import read_yaml, create_directories
|
| 4 |
+
from mlpipeline.entity.config_entity import (
|
| 5 |
+
DataIngestionConfig,
|
| 6 |
+
DataValidationConfig,
|
| 7 |
+
DataTransformationConfig,
|
| 8 |
+
FeatureEngineeringConfig,
|
| 9 |
+
ModelTrainerConfig,
|
| 10 |
+
ModelEvaluationConfig,
|
| 11 |
+
ModelPusherConfig,
|
| 12 |
+
)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class ConfigurationManager:
    """Builds the typed per-stage config entities from the YAML configuration.

    Every ``get_*_config`` method reads one section of the loaded config,
    passes that section's ``root_dir`` to ``create_directories`` and returns
    the matching entity with path-like values wrapped in ``Path``.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        """Load the configuration and hand ``artifacts_root`` to ``create_directories``."""
        loaded = read_yaml(config_filepath)
        self.config = loaded
        create_directories([loaded.artifacts_root])

    def _stage_section(self, name):
        """Return config section ``name`` after passing its ``root_dir`` to ``create_directories``."""
        section = getattr(self.config, name)
        create_directories([section.root_dir])
        return section

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the entity for the ``data_ingestion`` section."""
        section = self._stage_section('data_ingestion')
        return DataIngestionConfig(
            root_dir=Path(section.root_dir),
            source_url=section.source_url,
            local_data_file=Path(section.local_data_file),
            unzip_dir=Path(section.unzip_dir),
        )

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the entity for the ``data_validation`` section."""
        section = self._stage_section('data_validation')
        return DataValidationConfig(
            root_dir=Path(section.root_dir),
            data_dir=Path(section.data_dir),
            status_file=Path(section.status_file),
            schema_file=Path(section.schema_file),
        )

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the entity for the ``data_transformation`` section."""
        section = self._stage_section('data_transformation')
        return DataTransformationConfig(
            root_dir=Path(section.root_dir),
            data_path=Path(section.data_path),
            train_path=Path(section.train_path),
            test_path=Path(section.test_path),
            test_size=section.test_size,
            random_state=section.random_state,
        )

    def get_feature_engineering_config(self) -> FeatureEngineeringConfig:
        """Build the entity for the ``feature_engineering`` section."""
        section = self._stage_section('feature_engineering')
        return FeatureEngineeringConfig(
            root_dir=Path(section.root_dir),
            train_path=Path(section.train_path),
            test_path=Path(section.test_path),
            output_train_path=Path(section.output_train_path),
            output_test_path=Path(section.output_test_path),
        )

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the entity for the ``model_trainer`` section."""
        section = self._stage_section('model_trainer')
        return ModelTrainerConfig(
            root_dir=Path(section.root_dir),
            train_data_path=Path(section.train_data_path),
            test_data_path=Path(section.test_data_path),
            model_path=Path(section.model_path),
            target_column=section.target_column,
        )

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the entity for the ``model_evaluation`` section."""
        section = self._stage_section('model_evaluation')
        return ModelEvaluationConfig(
            root_dir=Path(section.root_dir),
            model_path=Path(section.model_path),
            test_data_path=Path(section.test_data_path),
            metrics_file=Path(section.metrics_file),
            target_column=section.target_column,
        )

    def get_model_pusher_config(self) -> ModelPusherConfig:
        """Build the entity for the ``model_pusher`` section."""
        section = self._stage_section('model_pusher')
        return ModelPusherConfig(
            root_dir=Path(section.root_dir),
            model_path=Path(section.model_path),
            model_registry_path=Path(section.model_registry_path),
        )
|
src/mlpipeline/entity/__init__.py
CHANGED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mlpipeline.entity.config_entity import (
|
| 2 |
+
DataIngestionConfig,
|
| 3 |
+
DataValidationConfig,
|
| 4 |
+
DataTransformationConfig,
|
| 5 |
+
FeatureEngineeringConfig,
|
| 6 |
+
ModelTrainerConfig,
|
| 7 |
+
ModelEvaluationConfig,
|
| 8 |
+
ModelPusherConfig,
|
| 9 |
+
)
|
| 10 |
+
|
| 11 |
+
from mlpipeline.entity.artifact_entity import (
|
| 12 |
+
DataIngestionArtifact,
|
| 13 |
+
DataValidationArtifact,
|
| 14 |
+
DataTransformationArtifact,
|
| 15 |
+
FeatureEngineeringArtifact,
|
| 16 |
+
ModelTrainerArtifact,
|
| 17 |
+
ModelEvaluationArtifact,
|
| 18 |
+
)
|
src/mlpipeline/entity/artifact_entity.py
CHANGED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
from typing import Dict, Any
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
@dataclass
class DataIngestionArtifact:
    """Result record for the data-ingestion stage (mutable dataclass)."""

    data_file_path: Path  # presumably the location of the ingested data file — confirm with producer
    is_ingested: bool  # success flag for the stage
    message: str  # free-form status text
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class DataValidationArtifact:
    """Result record for the data-validation stage (mutable dataclass)."""

    validation_status: bool  # outcome flag of the validation
    message: str  # free-form status text
    schema_file_path: Path  # presumably the schema file the data was checked against — confirm
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
@dataclass
class DataTransformationArtifact:
    """Result record for the data-transformation stage (mutable dataclass)."""

    train_file_path: Path  # path of the training split file
    test_file_path: Path  # path of the test split file
    is_transformed: bool  # success flag for the stage
    message: str  # free-form status text
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class FeatureEngineeringArtifact:
    """Result record for the feature-engineering stage (mutable dataclass)."""

    train_features_path: Path  # path of the training feature file
    test_features_path: Path  # path of the test feature file
    is_engineered: bool  # success flag for the stage
    message: str  # free-form status text
|
| 34 |
+
|
| 35 |
+
|
| 36 |
+
@dataclass
class ModelTrainerArtifact:
    """Result record for the model-training stage (mutable dataclass)."""

    model_path: Path  # where the trained model lives
    is_trained: bool  # success flag for the stage
    message: str  # free-form status text
    train_metrics: Dict[str, float]  # metric name -> value; presumably the dict returned by a trainer's train() — confirm
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class ModelEvaluationArtifact:
    """Result record for the model-evaluation stage (mutable dataclass)."""

    is_model_accepted: bool  # acceptance decision; the criteria are not visible here
    evaluation_metrics: Dict[str, float]  # metric name -> value
    message: str  # free-form status text
|
src/mlpipeline/entity/config_entity.py
CHANGED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass(frozen=True)
class DataIngestionConfig:
    """Settings for the data-ingestion stage.

    Frozen dataclass: fields cannot be reassigned after construction.
    """

    root_dir: Path  # stage directory
    source_url: str  # presumably the download location of the raw data — confirm with the ingestion component
    local_data_file: Path  # local file path for the data
    unzip_dir: Path  # presumably the extraction target directory — confirm
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass(frozen=True)
class DataValidationConfig:
    """Settings for the data-validation stage.

    Frozen dataclass: fields cannot be reassigned after construction.
    """

    root_dir: Path  # stage directory
    data_dir: Path  # directory holding the data to validate
    status_file: Path  # presumably where the validation outcome is written — confirm
    schema_file: Path  # schema definition file
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
@dataclass(frozen=True)
class DataTransformationConfig:
    """Settings for the data-transformation stage.

    Frozen dataclass: fields cannot be reassigned after construction.
    """

    root_dir: Path  # stage directory
    data_path: Path  # input data file
    train_path: Path  # training split file
    test_path: Path  # test split file
    test_size: float  # presumably the test-split proportion — confirm how the splitter interprets it
    random_state: int  # presumably the seed for the split — confirm
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass(frozen=True)
class FeatureEngineeringConfig:
    """Settings for the feature-engineering stage.

    Frozen dataclass: fields cannot be reassigned after construction.
    """

    root_dir: Path  # stage directory
    train_path: Path  # input training data
    test_path: Path  # input test data
    output_train_path: Path  # output path for the training data
    output_test_path: Path  # output path for the test data
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Settings for the model-training stage.

    Frozen dataclass: fields cannot be reassigned after construction.
    """

    root_dir: Path  # stage directory
    train_data_path: Path  # training data file
    test_data_path: Path  # test data file
    model_path: Path  # model location; NOTE(review): the trainers treat their model_path argument as a directory — confirm this value is one
    target_column: str  # name of the label column
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Settings for the model-evaluation stage.

    Frozen dataclass: fields cannot be reassigned after construction.
    """

    root_dir: Path  # stage directory
    model_path: Path  # location of the model to evaluate
    test_data_path: Path  # test data file
    metrics_file: Path  # presumably where evaluation metrics are written — confirm
    target_column: str  # name of the label column
|
| 56 |
+
|
| 57 |
+
|
| 58 |
+
@dataclass(frozen=True)
class ModelPusherConfig:
    """Settings for the model-pusher stage.

    Frozen dataclass: fields cannot be reassigned after construction.
    """

    root_dir: Path  # stage directory
    model_path: Path  # location of the model to publish
    model_registry_path: Path  # presumably the destination registry location — confirm
|