Spaces:
Running
Running
Add pipeline stages implementation
Browse files- config/config.yaml +6 -6
- requirements.txt +2 -1
- setup.py +28 -0
- src/mlpipeline/automl/autogluon_trainer.py +1 -1
- src/mlpipeline/automl/flaml_trainer.py +1 -1
- src/mlpipeline/components/__init__.py +9 -0
- src/mlpipeline/components/automl_trainer.py +51 -0
- src/mlpipeline/components/data_ingestion.py +62 -0
- src/mlpipeline/components/data_transformation.py +42 -0
- src/mlpipeline/components/data_validation.py +45 -0
- src/mlpipeline/components/feature_engineering.py +128 -0
- src/mlpipeline/components/model_evaluation.py +53 -0
- src/mlpipeline/components/model_pusher.py +40 -0
- src/mlpipeline/config/configuration.py +5 -1
- src/mlpipeline/entity/__init__.py +1 -0
- src/mlpipeline/entity/artifact_entity.py +7 -0
- src/mlpipeline/entity/config_entity.py +2 -0
- src/mlpipeline/exception/__init__.py +2 -0
- src/mlpipeline/exception/exception.py +8 -0
- src/mlpipeline/pipelines/__init__.py +9 -0
- src/mlpipeline/pipelines/data_ingestion_pipeline.py +23 -0
- src/mlpipeline/pipelines/data_transformation_pipeline.py +23 -0
- src/mlpipeline/pipelines/data_validation_pipeline.py +23 -0
- src/mlpipeline/pipelines/feature_engineering_pipeline.py +23 -0
- src/mlpipeline/pipelines/model_evaluation_pipeline.py +23 -0
- src/mlpipeline/pipelines/model_pusher_pipeline.py +23 -0
- src/mlpipeline/pipelines/model_trainer_pipeline.py +23 -0
- src/mlpipeline/utils/common.py +13 -0
config/config.yaml
CHANGED
|
@@ -2,19 +2,19 @@ artifacts_root: artifacts
|
|
| 2 |
|
| 3 |
data_ingestion:
|
| 4 |
root_dir: artifacts/data_ingestion
|
| 5 |
-
source_url:
|
| 6 |
-
local_data_file: artifacts/data_ingestion/
|
| 7 |
unzip_dir: artifacts/data_ingestion
|
| 8 |
|
| 9 |
data_validation:
|
| 10 |
root_dir: artifacts/data_validation
|
| 11 |
-
data_dir: artifacts/data_ingestion/
|
| 12 |
status_file: artifacts/data_validation/status.txt
|
| 13 |
schema_file: config/schema.yaml
|
| 14 |
|
| 15 |
data_transformation:
|
| 16 |
root_dir: artifacts/data_transformation
|
| 17 |
-
data_path: artifacts/data_ingestion/
|
| 18 |
train_path: artifacts/data_transformation/train.csv
|
| 19 |
test_path: artifacts/data_transformation/test.csv
|
| 20 |
test_size: 0.2
|
|
@@ -32,14 +32,14 @@ model_trainer:
|
|
| 32 |
train_data_path: artifacts/feature_engineering/train_features.csv
|
| 33 |
test_data_path: artifacts/feature_engineering/test_features.csv
|
| 34 |
model_path: artifacts/model_trainer/model
|
| 35 |
-
target_column:
|
| 36 |
|
| 37 |
model_evaluation:
|
| 38 |
root_dir: artifacts/model_evaluation
|
| 39 |
model_path: artifacts/model_trainer/model
|
| 40 |
test_data_path: artifacts/feature_engineering/test_features.csv
|
| 41 |
metrics_file: artifacts/model_evaluation/metrics.json
|
| 42 |
-
target_column:
|
| 43 |
|
| 44 |
model_pusher:
|
| 45 |
root_dir: artifacts/model_pusher
|
|
|
|
| 2 |
|
| 3 |
data_ingestion:
|
| 4 |
root_dir: artifacts/data_ingestion
|
| 5 |
+
source_url: kaggle://playground-series-s6e2
|
| 6 |
+
local_data_file: artifacts/data_ingestion/train_raw.csv
|
| 7 |
unzip_dir: artifacts/data_ingestion
|
| 8 |
|
| 9 |
data_validation:
|
| 10 |
root_dir: artifacts/data_validation
|
| 11 |
+
data_dir: artifacts/data_ingestion/train_raw.csv
|
| 12 |
status_file: artifacts/data_validation/status.txt
|
| 13 |
schema_file: config/schema.yaml
|
| 14 |
|
| 15 |
data_transformation:
|
| 16 |
root_dir: artifacts/data_transformation
|
| 17 |
+
data_path: artifacts/data_ingestion/train_raw.csv
|
| 18 |
train_path: artifacts/data_transformation/train.csv
|
| 19 |
test_path: artifacts/data_transformation/test.csv
|
| 20 |
test_size: 0.2
|
|
|
|
| 32 |
train_data_path: artifacts/feature_engineering/train_features.csv
|
| 33 |
test_data_path: artifacts/feature_engineering/test_features.csv
|
| 34 |
model_path: artifacts/model_trainer/model
|
| 35 |
+
target_column: Heart Disease
|
| 36 |
|
| 37 |
model_evaluation:
|
| 38 |
root_dir: artifacts/model_evaluation
|
| 39 |
model_path: artifacts/model_trainer/model
|
| 40 |
test_data_path: artifacts/feature_engineering/test_features.csv
|
| 41 |
metrics_file: artifacts/model_evaluation/metrics.json
|
| 42 |
+
target_column: Heart Disease
|
| 43 |
|
| 44 |
model_pusher:
|
| 45 |
root_dir: artifacts/model_pusher
|
requirements.txt
CHANGED
|
@@ -27,4 +27,5 @@ prometheus-client
|
|
| 27 |
python-json-logger
|
| 28 |
|
| 29 |
httpx
|
| 30 |
-
requests
|
|
|
|
|
|
| 27 |
python-json-logger
|
| 28 |
|
| 29 |
httpx
|
| 30 |
+
requests
|
| 31 |
+
kaggle
|
setup.py
CHANGED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from setuptools import setup, find_packages
|
| 2 |
+
|
| 3 |
+
setup(
|
| 4 |
+
name="mlpipeline",
|
| 5 |
+
version="0.1.0",
|
| 6 |
+
author="AutoML Team",
|
| 7 |
+
packages=find_packages(where="src"),
|
| 8 |
+
package_dir={"": "src"},
|
| 9 |
+
python_requires=">=3.11",
|
| 10 |
+
install_requires=[
|
| 11 |
+
"fastapi",
|
| 12 |
+
"uvicorn[standard]",
|
| 13 |
+
"pydantic",
|
| 14 |
+
"pandas",
|
| 15 |
+
"numpy",
|
| 16 |
+
"scikit-learn",
|
| 17 |
+
"autogluon.tabular",
|
| 18 |
+
"flaml",
|
| 19 |
+
"pycaret",
|
| 20 |
+
"mlflow",
|
| 21 |
+
"dvc",
|
| 22 |
+
"evidently",
|
| 23 |
+
"pyyaml",
|
| 24 |
+
"python-box",
|
| 25 |
+
"ensure",
|
| 26 |
+
"kaggle",
|
| 27 |
+
],
|
| 28 |
+
)
|
src/mlpipeline/automl/autogluon_trainer.py
CHANGED
|
@@ -47,7 +47,7 @@ class AutoGluonTrainer:
|
|
| 47 |
raise ValueError("Model not trained. Call train() first.")
|
| 48 |
return self.predictor.predict(data)
|
| 49 |
|
| 50 |
-
def
|
| 51 |
logger.info(f"Loading AutoGluon model from {model_path}")
|
| 52 |
self.predictor = TabularPredictor.load(str(model_path))
|
| 53 |
return self
|
|
|
|
| 47 |
raise ValueError("Model not trained. Call train() first.")
|
| 48 |
return self.predictor.predict(data)
|
| 49 |
|
| 50 |
+
def load_model(self, model_path: Path):
|
| 51 |
logger.info(f"Loading AutoGluon model from {model_path}")
|
| 52 |
self.predictor = TabularPredictor.load(str(model_path))
|
| 53 |
return self
|
src/mlpipeline/automl/flaml_trainer.py
CHANGED
|
@@ -2,7 +2,7 @@ from pathlib import Path
|
|
| 2 |
from typing import Dict, Any, Optional
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
| 5 |
-
from flaml import AutoML
|
| 6 |
from sklearn.metrics import accuracy_score, r2_score
|
| 7 |
|
| 8 |
from mlpipeline.logging.logger import get_logger
|
|
|
|
| 2 |
from typing import Dict, Any, Optional
|
| 3 |
import pandas as pd
|
| 4 |
import numpy as np
|
| 5 |
+
from flaml.automl.automl import AutoML
|
| 6 |
from sklearn.metrics import accuracy_score, r2_score
|
| 7 |
|
| 8 |
from mlpipeline.logging.logger import get_logger
|
src/mlpipeline/components/__init__.py
CHANGED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
__all__ = [
|
| 2 |
+
"DataIngestion",
|
| 3 |
+
"DataValidation",
|
| 4 |
+
"DataTransformation",
|
| 5 |
+
"FeatureEngineering",
|
| 6 |
+
"AutoMLTrainer",
|
| 7 |
+
"ModelEvaluation",
|
| 8 |
+
"ModelPusher",
|
| 9 |
+
]
|
src/mlpipeline/components/automl_trainer.py
CHANGED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from mlpipeline.entity import ModelTrainerConfig, ModelTrainerArtifact
|
| 3 |
+
from mlpipeline.automl import AutoMLFactory
|
| 4 |
+
from mlpipeline.logging.logger import get_logger
|
| 5 |
+
from mlpipeline.exception import ModelTrainingException
|
| 6 |
+
from mlpipeline.constants import AUTOML_CONFIG_FILE_PATH
|
| 7 |
+
from mlpipeline.utils.common import read_yaml
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
import sys
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
logger = get_logger(__name__)
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
class AutoMLTrainer:
    """Train a model with the AutoML library named in the trainer config."""

    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self) -> ModelTrainerArtifact:
        """Fit the configured AutoML trainer on the training CSV.

        Reads ``config.train_data_path``, looks up the section of the AutoML
        YAML file keyed by ``config.automl_library``, builds a trainer via
        ``AutoMLFactory.create_trainer`` and trains it, saving to
        ``config.model_path``.

        Returns:
            ModelTrainerArtifact carrying the model path and the metrics
            returned by the trainer.

        Raises:
            ModelTrainingException: wraps any error raised while reading the
                data, building the trainer or training.
        """
        try:
            logger.info("Starting model training")

            train_df = pd.read_csv(self.config.train_data_path)

            # Fail early with a readable message instead of a bare KeyError
            # from deep inside the trainer / DataFrame.drop.
            target_column = self.config.target_column
            if target_column not in train_df.columns:
                raise ValueError(
                    f"Target column '{target_column}' not found in "
                    f"{self.config.train_data_path}"
                )

            library = self.config.automl_library
            automl_config = read_yaml(Path(AUTOML_CONFIG_FILE_PATH))
            library_config = automl_config[library]

            trainer = AutoMLFactory.create_trainer(library, library_config)

            os.makedirs(self.config.root_dir, exist_ok=True)

            if library == 'autogluon':
                # The AutoGluon trainer receives the whole frame plus the
                # label column name.
                metrics = trainer.train(train_df, target_column, self.config.model_path)
            else:
                # Every other trainer receives features and label separately.
                X_train = train_df.drop(columns=[target_column])
                y_train = train_df[target_column]
                metrics = trainer.train(X_train, y_train, self.config.model_path)

            logger.info(f"Model trained with metrics: {metrics}")

            return ModelTrainerArtifact(
                model_path=self.config.model_path,
                train_metrics=metrics,
                is_trained=True,
                message=f"Model trained successfully with score: {metrics.get('score', 0.0):.4f}"
            )
        except Exception as e:
            # Chain explicitly so the root cause stays attached to the
            # pipeline-level exception.
            raise ModelTrainingException(str(e), sys) from e
|
src/mlpipeline/components/data_ingestion.py
CHANGED
|
@@ -0,0 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import shutil
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from kaggle.api.kaggle_api_extended import KaggleApi
|
| 5 |
+
|
| 6 |
+
from mlpipeline.entity import DataIngestionConfig, DataIngestionArtifact
|
| 7 |
+
from mlpipeline.logging.logger import get_logger
|
| 8 |
+
from mlpipeline.exception import DataIngestionException
|
| 9 |
+
import sys
|
| 10 |
+
|
| 11 |
+
logger = get_logger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class DataIngestion:
    """Download a Kaggle competition archive and stage its raw CSV files."""

    # Scheme prefix of ``source_url`` values such as
    # ``kaggle://playground-series-s6e2``.
    _KAGGLE_SCHEME = "kaggle://"
    # Competition used when the config carries no ``kaggle://`` source URL.
    _DEFAULT_COMPETITION = "playground-series-s6e2"

    def __init__(self, config: DataIngestionConfig):
        self.config = config
        self.kaggle_api = KaggleApi()
        self.kaggle_api.authenticate()

    def _competition_name(self) -> str:
        """Return the competition slug from ``config.source_url``.

        Falls back to the previously hard-coded competition when the config
        has no ``source_url`` or it does not use the ``kaggle://`` scheme.
        """
        source_url = str(getattr(self.config, "source_url", "") or "")
        if source_url.startswith(self._KAGGLE_SCHEME):
            name = source_url[len(self._KAGGLE_SCHEME):].strip("/")
            if name:
                return name
        return self._DEFAULT_COMPETITION

    def download_data(self) -> DataIngestionArtifact:
        """Download, extract and copy the competition's train/test CSVs.

        The archive is downloaded into ``config.root_dir``, unpacked into
        ``config.unzip_dir`` and deleted; ``train.csv`` / ``test.csv`` are then
        copied to ``train_raw.csv`` / ``test_raw.csv`` under ``root_dir``.

        Returns:
            DataIngestionArtifact pointing at the copied training file.

        Raises:
            DataIngestionException: wraps any failure, including missing
                ``train.csv`` or ``test.csv`` after extraction.
        """
        try:
            logger.info("Starting data ingestion")

            # Coerce to Path so the ``/`` joins below work even when the
            # config stores plain strings.
            root_dir = Path(self.config.root_dir)
            unzip_dir = Path(self.config.unzip_dir)

            os.makedirs(root_dir, exist_ok=True)

            competition_name = self._competition_name()

            logger.info(f"Downloading dataset from Kaggle competition: {competition_name}")
            self.kaggle_api.competition_download_files(
                competition_name,
                path=str(root_dir)
            )

            zip_file = root_dir / f"{competition_name}.zip"

            if zip_file.exists():
                logger.info(f"Extracting {zip_file}")
                shutil.unpack_archive(zip_file, unzip_dir)
                zip_file.unlink()

            train_file = unzip_dir / "train.csv"
            test_file = unzip_dir / "test.csv"

            if not (train_file.exists() and test_file.exists()):
                raise FileNotFoundError("Train or test file not found after extraction")

            train_raw = root_dir / "train_raw.csv"
            test_raw = root_dir / "test_raw.csv"

            shutil.copy(train_file, train_raw)
            shutil.copy(test_file, test_raw)

            logger.info(f"Data saved: {train_raw}, {test_raw}")

            return DataIngestionArtifact(
                data_file_path=train_raw,
                is_ingested=True,
                message="Data ingestion completed successfully"
            )
        except Exception as e:
            raise DataIngestionException(str(e), sys) from e
|
src/mlpipeline/components/data_transformation.py
CHANGED
|
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from sklearn.model_selection import train_test_split
|
| 3 |
+
from mlpipeline.entity import DataTransformationConfig, DataTransformationArtifact
|
| 4 |
+
from mlpipeline.logging.logger import get_logger
|
| 5 |
+
from mlpipeline.exception import DataTransformationException
|
| 6 |
+
import sys
|
| 7 |
+
import os
|
| 8 |
+
|
| 9 |
+
logger = get_logger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class DataTransformation:
    """Split the ingested dataset into train and test CSV files."""

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def transform(self) -> DataTransformationArtifact:
        """Read ``config.data_path``, split it and write both parts to disk.

        The split uses ``config.test_size`` and ``config.random_state``; the
        parts are written to ``config.train_path`` and ``config.test_path``
        without the index.

        Returns:
            DataTransformationArtifact with the two output paths.

        Raises:
            DataTransformationException: wraps any failure.
        """
        try:
            logger.info("Starting data transformation")

            cfg = self.config
            raw_frame = pd.read_csv(cfg.data_path)

            train_part, test_part = train_test_split(
                raw_frame,
                test_size=cfg.test_size,
                random_state=cfg.random_state,
            )

            os.makedirs(cfg.root_dir, exist_ok=True)

            outputs = ((train_part, cfg.train_path), (test_part, cfg.test_path))
            for frame, destination in outputs:
                frame.to_csv(destination, index=False)

            logger.info(f"Train shape: {train_part.shape}, Test shape: {test_part.shape}")

            return DataTransformationArtifact(
                train_file_path=cfg.train_path,
                test_file_path=cfg.test_path,
                is_transformed=True,
                message=f"Train: {train_part.shape}, Test: {test_part.shape}",
            )
        except Exception as e:
            raise DataTransformationException(str(e), sys)
|
src/mlpipeline/components/data_validation.py
CHANGED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
from mlpipeline.entity import DataValidationConfig, DataValidationArtifact
|
| 3 |
+
from mlpipeline.logging.logger import get_logger
|
| 4 |
+
from mlpipeline.exception import DataValidationException
|
| 5 |
+
from mlpipeline.utils.common import read_yaml
|
| 6 |
+
import sys
|
| 7 |
+
|
| 8 |
+
logger = get_logger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
class DataValidation:
    """Run basic sanity checks on the ingested dataset."""

    def __init__(self, config: DataValidationConfig):
        self.config = config

    def validate_schema(self) -> DataValidationArtifact:
        """Check the dataset for emptiness and all-null columns.

        Reads the CSV at ``config.data_dir``, writes the resulting status
        message to ``config.status_file`` and returns it in the artifact.

        NOTE(review): ``config.schema_file`` is only forwarded in the returned
        artifact; no column or dtype comparison against it happens here.

        Returns:
            DataValidationArtifact with the pass/fail flag and message.

        Raises:
            DataValidationException: wraps any failure while reading the data
                or writing the status file.
        """
        try:
            import os  # local import: this module does not import os at file level

            logger.info("Starting data validation")

            df = pd.read_csv(self.config.data_dir)

            errors = []

            if df.empty:
                errors.append("Dataset is empty")
            elif df.isnull().all().any():
                # Only meaningful when rows exist: on an empty frame every
                # column is vacuously "all null", which used to add a second,
                # misleading error next to "Dataset is empty".
                errors.append("Columns with all null values found")

            validation_status = not errors
            status_message = "Validation passed" if validation_status else "; ".join(errors)

            # Make sure the status file's directory exists before writing.
            status_dir = os.path.dirname(str(self.config.status_file))
            if status_dir:
                os.makedirs(status_dir, exist_ok=True)

            with open(self.config.status_file, "w", encoding="utf-8") as f:
                f.write(status_message)

            logger.info(f"Validation status: {status_message}")

            return DataValidationArtifact(
                validation_status=validation_status,
                message=status_message,
                schema_file_path=self.config.schema_file
            )
        except Exception as e:
            raise DataValidationException(str(e), sys) from e
|
src/mlpipeline/components/feature_engineering.py
CHANGED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
| 5 |
+
from sklearn.feature_selection import VarianceThreshold
|
| 6 |
+
from mlpipeline.entity import FeatureEngineeringConfig, FeatureEngineeringArtifact
|
| 7 |
+
from mlpipeline.logging.logger import get_logger
|
| 8 |
+
from mlpipeline.exception import FeatureEngineeringException
|
| 9 |
+
from mlpipeline.utils.common import save_object
|
| 10 |
+
import sys
|
| 11 |
+
import os
|
| 12 |
+
|
| 13 |
+
logger = get_logger(__name__)
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
class FeatureEngineering:
    """Impute, encode, augment, filter and scale the train/test splits.

    Label encoders, the variance selector and the scaler are fitted on the
    training split and re-applied to the test split.
    """

    # Label column name assumed when the config does not provide one.
    _DEFAULT_TARGET = 'target'

    def __init__(self, config: "FeatureEngineeringConfig"):
        self.config = config
        self.label_encoders = {}
        self.scaler = None

    def _target_column(self):
        """Return the label column name.

        Uses ``config.target_column`` when the config defines it and falls
        back to ``'target'`` otherwise, so existing configs behave as before.
        """
        return getattr(self.config, 'target_column', None) or self._DEFAULT_TARGET

    def engineer_features(self) -> "FeatureEngineeringArtifact":
        """Run the full feature pipeline and persist its outputs.

        Reads ``config.train_path`` / ``config.test_path``, writes the
        engineered frames to ``config.output_train_path`` /
        ``config.output_test_path`` and saves the scaler and label encoders to
        ``preprocessor.pkl`` under ``config.root_dir``.

        Returns:
            FeatureEngineeringArtifact with the two output paths.

        Raises:
            FeatureEngineeringException: wraps any failure.
        """
        try:
            logger.info("Starting feature engineering")

            train_df = pd.read_csv(self.config.train_path)
            test_df = pd.read_csv(self.config.test_path)

            train_df = self._handle_missing_values(train_df)
            test_df = self._handle_missing_values(test_df)

            train_df = self._encode_categorical(train_df, is_train=True)
            test_df = self._encode_categorical(test_df, is_train=False)

            train_df = self._create_interaction_features(train_df)
            test_df = self._create_interaction_features(test_df)

            train_df = self._remove_low_variance(train_df, is_train=True)
            test_df = self._remove_low_variance(test_df, is_train=False)

            # Scale every numeric column except the label.
            target = self._target_column()
            numeric_cols = [
                col
                for col in train_df.select_dtypes(include=[np.number]).columns
                if col != target
            ]

            if numeric_cols:
                self.scaler = StandardScaler()
                train_df[numeric_cols] = self.scaler.fit_transform(train_df[numeric_cols])
                test_df[numeric_cols] = self.scaler.transform(test_df[numeric_cols])

            os.makedirs(self.config.root_dir, exist_ok=True)

            train_df.to_csv(self.config.output_train_path, index=False)
            test_df.to_csv(self.config.output_test_path, index=False)

            preprocessor_path = Path(self.config.root_dir) / "preprocessor.pkl"
            save_object(preprocessor_path, {
                'scaler': self.scaler,
                'label_encoders': self.label_encoders
            })

            logger.info(f"Feature engineering completed. Train shape: {train_df.shape}, Test shape: {test_df.shape}")

            return FeatureEngineeringArtifact(
                train_features_path=self.config.output_train_path,
                test_features_path=self.config.output_test_path,
                is_engineered=True,
                message=f"Features engineered: {train_df.shape[1]} features"
            )
        except Exception as e:
            raise FeatureEngineeringException(str(e), sys) from e

    def _handle_missing_values(self, df):
        """Fill missing values in place and return the same frame.

        Numeric columns get their median; every other column gets its most
        frequent value, or ``'missing'`` when the column has no values at all.

        NOTE: each frame is imputed from its own statistics, so the test split
        is not filled with training-set values.
        """
        for col in df.columns:
            series = df[col]
            is_numeric = (
                pd.api.types.is_numeric_dtype(series)
                and not pd.api.types.is_bool_dtype(series)
            )
            if is_numeric:
                fill_value = series.median()
            else:
                modes = series.mode()
                fill_value = modes.iloc[0] if not modes.empty else 'missing'
            # Assign back instead of ``df[col].fillna(..., inplace=True)``:
            # the chained in-place form acts on a temporary and does nothing
            # under pandas copy-on-write.
            df[col] = series.fillna(fill_value)
        return df

    def _encode_categorical(self, df, is_train=True):
        """Label-encode object columns.

        On the training split a ``LabelEncoder`` is fitted per column and
        stored in ``self.label_encoders``. On the test split the stored
        encoders are applied; values not seen during fitting map to ``-1``.
        """
        categorical_cols = df.select_dtypes(include=['object']).columns

        for col in categorical_cols:
            if is_train:
                encoder = LabelEncoder()
                self.label_encoders[col] = encoder
                df[col] = encoder.fit_transform(df[col].astype(str))
                continue

            encoder = self.label_encoders.get(col)
            if encoder is None:
                continue

            # One dict lookup per value instead of calling ``transform([x])``
            # for every row; a class's code is its position in ``classes_``.
            codes = {label: code for code, label in enumerate(encoder.classes_)}
            df[col] = df[col].astype(str).map(codes).fillna(-1).astype(int)
        return df

    def _create_interaction_features(self, df):
        """Add the product of the first two numeric non-label columns."""
        target = self._target_column()
        numeric_cols = [
            col
            for col in df.select_dtypes(include=[np.number]).columns
            if col != target
        ]

        if len(numeric_cols) >= 2:
            first, second = numeric_cols[0], numeric_cols[1]
            df[f'{first}_x_{second}'] = df[first] * df[second]

        return df

    def _remove_low_variance(self, df, is_train=True, threshold=0.01):
        """Drop feature columns whose variance is at or below ``threshold``.

        The selector is fitted on the training split and reused afterwards.
        The label column, when present, is kept and re-attached untouched.
        """
        target_name = self._target_column()
        if target_name in df.columns:
            target = df[target_name]
            features = df.drop(columns=[target_name])
        else:
            target = None
            features = df

        if is_train:
            self.variance_selector = VarianceThreshold(threshold=threshold)
            self.variance_selector.fit(features)

        selector = getattr(self, 'variance_selector', None)
        if selector is None:
            # No selector was fitted yet: leave the frame untouched.
            return df

        features_selected = pd.DataFrame(
            selector.transform(features),
            columns=features.columns[selector.get_support()],
            index=features.index
        )

        if target is not None:
            return pd.concat([features_selected, target], axis=1)
        return features_selected
|
src/mlpipeline/components/model_evaluation.py
CHANGED
|
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import json
|
| 3 |
+
from sklearn.metrics import accuracy_score, f1_score, classification_report
|
| 4 |
+
from mlpipeline.entity import ModelEvaluationConfig, ModelEvaluationArtifact
|
| 5 |
+
from autogluon.tabular import TabularPredictor
|
| 6 |
+
from mlpipeline.logging.logger import get_logger
|
| 7 |
+
from mlpipeline.exception import ModelEvaluationException
|
| 8 |
+
import sys
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
logger = get_logger(__name__)
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class ModelEvaluation:
    """Score the trained model on the engineered test split."""

    def __init__(self, config: ModelEvaluationConfig):
        self.config = config

    @staticmethod
    def _to_comparable_labels(y_true, y_pred):
        """Bring true and predicted labels into a comparable form.

        Numeric labels keep the existing ``> 0`` binarisation. Anything else
        (e.g. string class names) is compared as text, because ``> 0`` raises
        on non-numeric data.
        """
        both_numeric = (
            pd.api.types.is_numeric_dtype(y_true)
            and pd.api.types.is_numeric_dtype(y_pred)
        )
        if both_numeric:
            return (y_true > 0).astype(int), (y_pred > 0).astype(int)
        return y_true.astype(str), y_pred.astype(str)

    def evaluate(self) -> ModelEvaluationArtifact:
        """Compute accuracy and weighted F1 and write them to the metrics file.

        Loads the predictor from ``config.model_path`` with
        ``TabularPredictor.load``, predicts on ``config.test_data_path`` and
        dumps the metrics as JSON to ``config.metrics_file``.

        NOTE(review): the model is always loaded as an AutoGluon predictor,
        and ``is_model_accepted`` is always ``True``; confirm both are
        intended when another AutoML library is configured.

        Returns:
            ModelEvaluationArtifact with the computed metrics.

        Raises:
            ModelEvaluationException: wraps any failure.
        """
        try:
            logger.info("Starting model evaluation")

            test_df = pd.read_csv(self.config.test_data_path)

            target_column = self.config.target_column
            if target_column not in test_df.columns:
                raise ValueError(
                    f"Target column '{target_column}' not found in "
                    f"{self.config.test_data_path}"
                )

            predictor = TabularPredictor.load(str(self.config.model_path))

            y_test = test_df[target_column]
            # Predict from the features only; the label is not an input.
            predictions = predictor.predict(test_df.drop(columns=[target_column]))

            y_test_binary, predictions_binary = self._to_comparable_labels(y_test, predictions)

            accuracy = float(accuracy_score(y_test_binary, predictions_binary))
            f1 = float(f1_score(y_test_binary, predictions_binary, average='weighted'))

            metrics = {
                "accuracy": accuracy,
                "f1_score": f1
            }

            os.makedirs(self.config.root_dir, exist_ok=True)

            with open(self.config.metrics_file, "w", encoding="utf-8") as f:
                json.dump(metrics, f, indent=2)

            logger.info(f"Evaluation metrics: {metrics}")

            return ModelEvaluationArtifact(
                is_model_accepted=True,
                evaluation_metrics=metrics,
                message=f"Model evaluation completed with accuracy: {accuracy:.4f}"
            )
        except Exception as e:
            raise ModelEvaluationException(str(e), sys) from e
|
src/mlpipeline/components/model_pusher.py
CHANGED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import shutil
|
| 2 |
+
import os
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from mlpipeline.entity import ModelPusherConfig, ModelPusherArtifact
|
| 5 |
+
from mlpipeline.logging.logger import get_logger
|
| 6 |
+
from mlpipeline.exception import ModelPusherException
|
| 7 |
+
import sys
|
| 8 |
+
|
| 9 |
+
logger = get_logger(__name__)
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class ModelPusher:
    """Copy the trained model into the model registry directory."""

    def __init__(self, config: ModelPusherConfig):
        self.config = config

    def push_model(self) -> ModelPusherArtifact:
        """Copy ``config.model_path`` into ``config.model_registry_path``.

        A model directory replaces any existing directory of the same name in
        the registry; a single model file is copied with its metadata.

        Returns:
            ModelPusherArtifact with the destination path as a string.

        Raises:
            ModelPusherException: wraps any failure.
        """
        try:
            logger.info("Starting model pusher")

            registry_dir = self.config.model_registry_path
            os.makedirs(registry_dir, exist_ok=True)

            source = Path(self.config.model_path)
            destination = Path(registry_dir) / source.name

            if not source.is_dir():
                shutil.copy2(source, destination)
            else:
                # Clear a previous push first; copytree needs a fresh target.
                if destination.exists():
                    shutil.rmtree(destination)
                shutil.copytree(source, destination)

            logger.info(f"Model pushed to: {destination}")

            return ModelPusherArtifact(
                pushed_model_path=str(destination),
                is_pushed=True,
                message="Model pushed successfully",
            )
        except Exception as e:
            raise ModelPusherException(str(e), sys)
|
src/mlpipeline/config/configuration.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
from pathlib import Path
|
| 2 |
-
from mlpipeline.constants import CONFIG_FILE_PATH
|
| 3 |
from mlpipeline.utils.common import read_yaml, create_directories
|
| 4 |
from mlpipeline.entity.config_entity import (
|
| 5 |
DataIngestionConfig,
|
|
@@ -66,6 +66,7 @@ class ConfigurationManager:
|
|
| 66 |
|
| 67 |
def get_model_trainer_config(self) -> ModelTrainerConfig:
|
| 68 |
config = self.config.model_trainer
|
|
|
|
| 69 |
create_directories([config.root_dir])
|
| 70 |
|
| 71 |
return ModelTrainerConfig(
|
|
@@ -74,10 +75,12 @@ class ConfigurationManager:
|
|
| 74 |
test_data_path=Path(config.test_data_path),
|
| 75 |
model_path=Path(config.model_path),
|
| 76 |
target_column=config.target_column,
|
|
|
|
| 77 |
)
|
| 78 |
|
| 79 |
def get_model_evaluation_config(self) -> ModelEvaluationConfig:
|
| 80 |
config = self.config.model_evaluation
|
|
|
|
| 81 |
create_directories([config.root_dir])
|
| 82 |
|
| 83 |
return ModelEvaluationConfig(
|
|
@@ -86,6 +89,7 @@ class ConfigurationManager:
|
|
| 86 |
test_data_path=Path(config.test_data_path),
|
| 87 |
metrics_file=Path(config.metrics_file),
|
| 88 |
target_column=config.target_column,
|
|
|
|
| 89 |
)
|
| 90 |
|
| 91 |
def get_model_pusher_config(self) -> ModelPusherConfig:
|
|
|
|
| 1 |
from pathlib import Path
|
| 2 |
+
from mlpipeline.constants import CONFIG_FILE_PATH, AUTOML_CONFIG_FILE_PATH
|
| 3 |
from mlpipeline.utils.common import read_yaml, create_directories
|
| 4 |
from mlpipeline.entity.config_entity import (
|
| 5 |
DataIngestionConfig,
|
|
|
|
| 66 |
|
| 67 |
def get_model_trainer_config(self) -> ModelTrainerConfig:
|
| 68 |
config = self.config.model_trainer
|
| 69 |
+
automl_config = read_yaml(Path(AUTOML_CONFIG_FILE_PATH))
|
| 70 |
create_directories([config.root_dir])
|
| 71 |
|
| 72 |
return ModelTrainerConfig(
|
|
|
|
| 75 |
test_data_path=Path(config.test_data_path),
|
| 76 |
model_path=Path(config.model_path),
|
| 77 |
target_column=config.target_column,
|
| 78 |
+
automl_library=automl_config.automl_library,
|
| 79 |
)
|
| 80 |
|
| 81 |
def get_model_evaluation_config(self) -> ModelEvaluationConfig:
|
| 82 |
config = self.config.model_evaluation
|
| 83 |
+
automl_config = read_yaml(Path(AUTOML_CONFIG_FILE_PATH))
|
| 84 |
create_directories([config.root_dir])
|
| 85 |
|
| 86 |
return ModelEvaluationConfig(
|
|
|
|
| 89 |
test_data_path=Path(config.test_data_path),
|
| 90 |
metrics_file=Path(config.metrics_file),
|
| 91 |
target_column=config.target_column,
|
| 92 |
+
automl_library=automl_config.automl_library,
|
| 93 |
)
|
| 94 |
|
| 95 |
def get_model_pusher_config(self) -> ModelPusherConfig:
|
src/mlpipeline/entity/__init__.py
CHANGED
|
@@ -15,4 +15,5 @@ from mlpipeline.entity.artifact_entity import (
|
|
| 15 |
FeatureEngineeringArtifact,
|
| 16 |
ModelTrainerArtifact,
|
| 17 |
ModelEvaluationArtifact,
|
|
|
|
| 18 |
)
|
|
|
|
| 15 |
FeatureEngineeringArtifact,
|
| 16 |
ModelTrainerArtifact,
|
| 17 |
ModelEvaluationArtifact,
|
| 18 |
+
ModelPusherArtifact,
|
| 19 |
)
|
src/mlpipeline/entity/artifact_entity.py
CHANGED
|
@@ -45,4 +45,11 @@ class ModelTrainerArtifact:
|
|
| 45 |
class ModelEvaluationArtifact:
|
| 46 |
is_model_accepted: bool
|
| 47 |
evaluation_metrics: Dict[str, float]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
message: str
|
|
|
|
| 45 |
class ModelEvaluationArtifact:
|
| 46 |
is_model_accepted: bool
|
| 47 |
evaluation_metrics: Dict[str, float]
|
| 48 |
+
message: str
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
@dataclass
|
| 52 |
+
class ModelPusherArtifact:
|
| 53 |
+
pushed_model_path: str
|
| 54 |
+
is_pushed: bool
|
| 55 |
message: str
|
src/mlpipeline/entity/config_entity.py
CHANGED
|
@@ -44,6 +44,7 @@ class ModelTrainerConfig:
|
|
| 44 |
test_data_path: Path
|
| 45 |
model_path: Path
|
| 46 |
target_column: str
|
|
|
|
| 47 |
|
| 48 |
|
| 49 |
@dataclass(frozen=True)
|
|
@@ -53,6 +54,7 @@ class ModelEvaluationConfig:
|
|
| 53 |
test_data_path: Path
|
| 54 |
metrics_file: Path
|
| 55 |
target_column: str
|
|
|
|
| 56 |
|
| 57 |
|
| 58 |
@dataclass(frozen=True)
|
|
|
|
| 44 |
test_data_path: Path
|
| 45 |
model_path: Path
|
| 46 |
target_column: str
|
| 47 |
+
automl_library: str
|
| 48 |
|
| 49 |
|
| 50 |
@dataclass(frozen=True)
|
|
|
|
| 54 |
test_data_path: Path
|
| 55 |
metrics_file: Path
|
| 56 |
target_column: str
|
| 57 |
+
automl_library: str
|
| 58 |
|
| 59 |
|
| 60 |
@dataclass(frozen=True)
|
src/mlpipeline/exception/__init__.py
CHANGED
|
@@ -3,7 +3,9 @@ from mlpipeline.exception.exception import (
|
|
| 3 |
DataIngestionException,
|
| 4 |
DataValidationException,
|
| 5 |
DataTransformationException,
|
|
|
|
| 6 |
ModelTrainingException,
|
| 7 |
ModelEvaluationException,
|
|
|
|
| 8 |
ConfigurationException,
|
| 9 |
)
|
|
|
|
| 3 |
DataIngestionException,
|
| 4 |
DataValidationException,
|
| 5 |
DataTransformationException,
|
| 6 |
+
FeatureEngineeringException,
|
| 7 |
ModelTrainingException,
|
| 8 |
ModelEvaluationException,
|
| 9 |
+
ModelPusherException,
|
| 10 |
ConfigurationException,
|
| 11 |
)
|
src/mlpipeline/exception/exception.py
CHANGED
|
@@ -35,6 +35,10 @@ class DataTransformationException(MLPipelineException):
|
|
| 35 |
pass
|
| 36 |
|
| 37 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
class ModelTrainingException(MLPipelineException):
|
| 39 |
pass
|
| 40 |
|
|
@@ -43,5 +47,9 @@ class ModelEvaluationException(MLPipelineException):
|
|
| 43 |
pass
|
| 44 |
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
class ConfigurationException(MLPipelineException):
|
| 47 |
pass
|
|
|
|
| 35 |
pass
|
| 36 |
|
| 37 |
|
| 38 |
+
class FeatureEngineeringException(MLPipelineException):
    """MLPipelineException subclass for feature engineering errors.

    Adds no behaviour of its own; it only gives callers a distinct type to
    catch (presumably raised by the feature engineering component).
    """
|
| 40 |
+
|
| 41 |
+
|
| 42 |
class ModelTrainingException(MLPipelineException):
    """MLPipelineException subclass for model training errors.

    Adds no behaviour of its own; it only gives callers a distinct type to
    catch (presumably raised by the model trainer component).
    """
|
| 44 |
|
|
|
|
| 47 |
pass
|
| 48 |
|
| 49 |
|
| 50 |
+
class ModelPusherException(MLPipelineException):
    """MLPipelineException subclass for model pusher errors.

    Adds no behaviour of its own; it only gives callers a distinct type to
    catch (presumably raised by the model pusher component).
    """
|
| 52 |
+
|
| 53 |
+
|
| 54 |
class ConfigurationException(MLPipelineException):
    """MLPipelineException subclass for configuration errors.

    Adds no behaviour of its own; it only gives callers a distinct type to
    catch (presumably raised while loading or validating configuration).
    """
|
src/mlpipeline/pipelines/__init__.py
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Public pipeline classes, mapped to the sibling submodule that defines each.
# The names in ``__all__`` must actually be resolvable on this package,
# otherwise ``from mlpipeline.pipelines import *`` raises AttributeError and
# ``from mlpipeline.pipelines import DataIngestionPipeline`` raises ImportError.
_PIPELINE_MODULES = {
    "DataIngestionPipeline": "data_ingestion_pipeline",
    "DataValidationPipeline": "data_validation_pipeline",
    "DataTransformationPipeline": "data_transformation_pipeline",
    "FeatureEngineeringPipeline": "feature_engineering_pipeline",
    "ModelTrainerPipeline": "model_trainer_pipeline",
    "ModelEvaluationPipeline": "model_evaluation_pipeline",
    "ModelPusherPipeline": "model_pusher_pipeline",
}

__all__ = [
    "DataIngestionPipeline",
    "DataValidationPipeline",
    "DataTransformationPipeline",
    "FeatureEngineeringPipeline",
    "ModelTrainerPipeline",
    "ModelEvaluationPipeline",
    "ModelPusherPipeline",
]


def __getattr__(name):
    """Resolve a public pipeline class on first access (PEP 562).

    Importing lazily keeps ``import mlpipeline.pipelines`` cheap: a stage's
    module (and whatever it imports) is only loaded when that stage is
    requested. It also avoids pre-importing a stage module that is about to
    be executed with ``python -m``.

    Raises:
        AttributeError: If ``name`` is not one of the names in ``__all__``.
    """
    try:
        module_name = _PIPELINE_MODULES[name]
    except KeyError:
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}"
        ) from None

    # Local import so this block does not depend on the file's import header.
    from importlib import import_module

    value = getattr(import_module(f"{__name__}.{module_name}"), name)
    # Cache on the package so later lookups bypass __getattr__ entirely.
    globals()[name] = value
    return value


def __dir__():
    """List the lazily provided pipeline names alongside the real globals."""
    return sorted(set(globals()) | set(__all__))
|
src/mlpipeline/pipelines/data_ingestion_pipeline.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mlpipeline.config.configuration import ConfigurationManager
|
| 2 |
+
from mlpipeline.components.data_ingestion import DataIngestion
|
| 3 |
+
from mlpipeline.logging.logger import get_logger
|
| 4 |
+
|
| 5 |
+
logger = get_logger(__name__)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DataIngestionPipeline:
    """Runner for the data ingestion stage."""

    def __init__(self):
        # Every runner instance builds its own ConfigurationManager.
        self.config_manager = ConfigurationManager()

    def run(self):
        """Run the stage and return the artifact from ``download_data()``.

        Logs a start line, builds ``DataIngestion`` from the ingestion
        config, calls ``download_data()`` and logs the artifact's message.
        """
        logger.info("Data Ingestion Pipeline started")
        stage = DataIngestion(
            config=self.config_manager.get_data_ingestion_config()
        )
        result = stage.download_data()
        logger.info(f"Data Ingestion Pipeline completed: {result.message}")
        return result
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if __name__ == "__main__":
    # Script entry point: run only the data ingestion stage.
    DataIngestionPipeline().run()
|
src/mlpipeline/pipelines/data_transformation_pipeline.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mlpipeline.config.configuration import ConfigurationManager
|
| 2 |
+
from mlpipeline.components.data_transformation import DataTransformation
|
| 3 |
+
from mlpipeline.logging.logger import get_logger
|
| 4 |
+
|
| 5 |
+
logger = get_logger(__name__)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DataTransformationPipeline:
    """Runner for the data transformation stage."""

    def __init__(self):
        # Every runner instance builds its own ConfigurationManager.
        self.config_manager = ConfigurationManager()

    def run(self):
        """Run the stage and return the artifact from ``transform()``.

        Logs a start line, builds ``DataTransformation`` from the
        transformation config, calls ``transform()`` and logs the artifact's
        message.
        """
        logger.info("Data Transformation Pipeline started")
        stage = DataTransformation(
            config=self.config_manager.get_data_transformation_config()
        )
        result = stage.transform()
        logger.info(f"Data Transformation Pipeline completed: {result.message}")
        return result
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if __name__ == "__main__":
    # Script entry point: run only the data transformation stage.
    DataTransformationPipeline().run()
|
src/mlpipeline/pipelines/data_validation_pipeline.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mlpipeline.config.configuration import ConfigurationManager
|
| 2 |
+
from mlpipeline.components.data_validation import DataValidation
|
| 3 |
+
from mlpipeline.logging.logger import get_logger
|
| 4 |
+
|
| 5 |
+
logger = get_logger(__name__)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class DataValidationPipeline:
    """Runner for the data validation stage."""

    def __init__(self):
        # Every runner instance builds its own ConfigurationManager.
        self.config_manager = ConfigurationManager()

    def run(self):
        """Run the stage and return the artifact from ``validate_schema()``.

        Logs a start line, builds ``DataValidation`` from the validation
        config, calls ``validate_schema()`` and logs the artifact's message.
        """
        logger.info("Data Validation Pipeline started")
        stage = DataValidation(
            config=self.config_manager.get_data_validation_config()
        )
        result = stage.validate_schema()
        logger.info(f"Data Validation Pipeline completed: {result.message}")
        return result
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if __name__ == "__main__":
    # Script entry point: run only the data validation stage.
    DataValidationPipeline().run()
|
src/mlpipeline/pipelines/feature_engineering_pipeline.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mlpipeline.config.configuration import ConfigurationManager
|
| 2 |
+
from mlpipeline.components.feature_engineering import FeatureEngineering
|
| 3 |
+
from mlpipeline.logging.logger import get_logger
|
| 4 |
+
|
| 5 |
+
logger = get_logger(__name__)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class FeatureEngineeringPipeline:
    """Runner for the feature engineering stage."""

    def __init__(self):
        # Every runner instance builds its own ConfigurationManager.
        self.config_manager = ConfigurationManager()

    def run(self):
        """Run the stage and return the artifact from ``engineer_features()``.

        Logs a start line, builds ``FeatureEngineering`` from the feature
        engineering config, calls ``engineer_features()`` and logs the
        artifact's message.
        """
        logger.info("Feature Engineering Pipeline started")
        stage = FeatureEngineering(
            config=self.config_manager.get_feature_engineering_config()
        )
        result = stage.engineer_features()
        logger.info(f"Feature Engineering Pipeline completed: {result.message}")
        return result
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if __name__ == "__main__":
    # Script entry point: run only the feature engineering stage.
    FeatureEngineeringPipeline().run()
|
src/mlpipeline/pipelines/model_evaluation_pipeline.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mlpipeline.config.configuration import ConfigurationManager
|
| 2 |
+
from mlpipeline.components.model_evaluation import ModelEvaluation
|
| 3 |
+
from mlpipeline.logging.logger import get_logger
|
| 4 |
+
|
| 5 |
+
logger = get_logger(__name__)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ModelEvaluationPipeline:
    """Runner for the model evaluation stage."""

    def __init__(self):
        # Every runner instance builds its own ConfigurationManager.
        self.config_manager = ConfigurationManager()

    def run(self):
        """Run the stage and return the artifact from ``evaluate()``.

        Logs a start line, builds ``ModelEvaluation`` from the evaluation
        config, calls ``evaluate()`` and logs the artifact's
        ``evaluation_metrics``.
        """
        logger.info("Model Evaluation Pipeline started")
        stage = ModelEvaluation(
            config=self.config_manager.get_model_evaluation_config()
        )
        result = stage.evaluate()
        logger.info(f"Model Evaluation Pipeline completed with metrics: {result.evaluation_metrics}")
        return result
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if __name__ == "__main__":
    # Script entry point: run only the model evaluation stage.
    ModelEvaluationPipeline().run()
|
src/mlpipeline/pipelines/model_pusher_pipeline.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mlpipeline.config.configuration import ConfigurationManager
|
| 2 |
+
from mlpipeline.components.model_pusher import ModelPusher
|
| 3 |
+
from mlpipeline.logging.logger import get_logger
|
| 4 |
+
|
| 5 |
+
logger = get_logger(__name__)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ModelPusherPipeline:
    """Runner for the model pusher stage."""

    def __init__(self):
        # Every runner instance builds its own ConfigurationManager.
        self.config_manager = ConfigurationManager()

    def run(self):
        """Run the stage and return the artifact from ``push_model()``.

        Logs a start line, builds ``ModelPusher`` from the pusher config,
        calls ``push_model()`` and logs the artifact's message.
        """
        logger.info("Model Pusher Pipeline started")
        stage = ModelPusher(
            config=self.config_manager.get_model_pusher_config()
        )
        result = stage.push_model()
        logger.info(f"Model Pusher Pipeline completed: {result.message}")
        return result
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if __name__ == "__main__":
    # Script entry point: run only the model pusher stage.
    ModelPusherPipeline().run()
|
src/mlpipeline/pipelines/model_trainer_pipeline.py
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from mlpipeline.config.configuration import ConfigurationManager
|
| 2 |
+
from mlpipeline.components.automl_trainer import AutoMLTrainer
|
| 3 |
+
from mlpipeline.logging.logger import get_logger
|
| 4 |
+
|
| 5 |
+
logger = get_logger(__name__)
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class ModelTrainerPipeline:
    """Runner for the model training stage."""

    def __init__(self):
        # Every runner instance builds its own ConfigurationManager.
        self.config_manager = ConfigurationManager()

    def run(self):
        """Run the stage and return the artifact from ``train()``.

        Logs a start line, builds ``AutoMLTrainer`` from the trainer config,
        calls ``train()`` and logs the artifact's ``train_metrics``.
        """
        logger.info("Model Trainer Pipeline started")
        stage = AutoMLTrainer(
            config=self.config_manager.get_model_trainer_config()
        )
        result = stage.train()
        logger.info(f"Model Trainer Pipeline completed with metrics: {result.train_metrics}")
        return result
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
if __name__ == "__main__":
    # Script entry point: run only the model training stage.
    ModelTrainerPipeline().run()
|
src/mlpipeline/utils/common.py
CHANGED
|
@@ -76,6 +76,19 @@ def load_model(path: Path) -> Any:
|
|
| 76 |
return model
|
| 77 |
|
| 78 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
@ensure_annotations
|
| 80 |
def get_size(path: Path) -> str:
|
| 81 |
size_in_kb = round(os.path.getsize(path) / 1024)
|
|
|
|
| 76 |
return model
|
| 77 |
|
| 78 |
|
| 79 |
+
def save_object(path: Path, obj: Any) -> None:
    """Pickle ``obj`` to ``path``, creating missing parent directories first.

    Args:
        path: Destination file. It is opened in binary write mode, so an
            existing file at that location is overwritten.
        obj: The object to serialise with ``pickle.dump``.

    Raises:
        OSError: If the directory or file cannot be created or written.
        pickle.PicklingError: If ``obj`` cannot be pickled.
    """
    # Writing into a directory that does not exist yet would otherwise fail
    # with FileNotFoundError; exist_ok makes this a no-op when it is present.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as stream:
        pickle.dump(obj, stream)
    logger.info(f"Object saved: {path}")
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
def load_object(path: Path) -> Any:
    """Unpickle and return the object stored in the file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code embedded in
    the file, so only call this on files from a trusted source.
    """
    with open(path, "rb") as stream:
        loaded = pickle.load(stream)
    logger.info(f"Object loaded: {path}")
    return loaded
|
| 90 |
+
|
| 91 |
+
|
| 92 |
@ensure_annotations
|
| 93 |
def get_size(path: Path) -> str:
|
| 94 |
size_in_kb = round(os.path.getsize(path) / 1024)
|