Abeshith committed on
Commit
a7d80f2
·
1 Parent(s): 19d70f4

Add pipeline stages implementation

Browse files
config/config.yaml CHANGED
@@ -2,19 +2,19 @@ artifacts_root: artifacts
2
 
3
  data_ingestion:
4
  root_dir: artifacts/data_ingestion
5
- source_url: null
6
- local_data_file: artifacts/data_ingestion/data.csv
7
  unzip_dir: artifacts/data_ingestion
8
 
9
  data_validation:
10
  root_dir: artifacts/data_validation
11
- data_dir: artifacts/data_ingestion/data.csv
12
  status_file: artifacts/data_validation/status.txt
13
  schema_file: config/schema.yaml
14
 
15
  data_transformation:
16
  root_dir: artifacts/data_transformation
17
- data_path: artifacts/data_ingestion/data.csv
18
  train_path: artifacts/data_transformation/train.csv
19
  test_path: artifacts/data_transformation/test.csv
20
  test_size: 0.2
@@ -32,14 +32,14 @@ model_trainer:
32
  train_data_path: artifacts/feature_engineering/train_features.csv
33
  test_data_path: artifacts/feature_engineering/test_features.csv
34
  model_path: artifacts/model_trainer/model
35
- target_column: target
36
 
37
  model_evaluation:
38
  root_dir: artifacts/model_evaluation
39
  model_path: artifacts/model_trainer/model
40
  test_data_path: artifacts/feature_engineering/test_features.csv
41
  metrics_file: artifacts/model_evaluation/metrics.json
42
- target_column: target
43
 
44
  model_pusher:
45
  root_dir: artifacts/model_pusher
 
2
 
3
  data_ingestion:
4
  root_dir: artifacts/data_ingestion
5
+ source_url: kaggle://playground-series-s6e2
6
+ local_data_file: artifacts/data_ingestion/train_raw.csv
7
  unzip_dir: artifacts/data_ingestion
8
 
9
  data_validation:
10
  root_dir: artifacts/data_validation
11
+ data_dir: artifacts/data_ingestion/train_raw.csv
12
  status_file: artifacts/data_validation/status.txt
13
  schema_file: config/schema.yaml
14
 
15
  data_transformation:
16
  root_dir: artifacts/data_transformation
17
+ data_path: artifacts/data_ingestion/train_raw.csv
18
  train_path: artifacts/data_transformation/train.csv
19
  test_path: artifacts/data_transformation/test.csv
20
  test_size: 0.2
 
32
  train_data_path: artifacts/feature_engineering/train_features.csv
33
  test_data_path: artifacts/feature_engineering/test_features.csv
34
  model_path: artifacts/model_trainer/model
35
+ target_column: Heart Disease
36
 
37
  model_evaluation:
38
  root_dir: artifacts/model_evaluation
39
  model_path: artifacts/model_trainer/model
40
  test_data_path: artifacts/feature_engineering/test_features.csv
41
  metrics_file: artifacts/model_evaluation/metrics.json
42
+ target_column: Heart Disease
43
 
44
  model_pusher:
45
  root_dir: artifacts/model_pusher
requirements.txt CHANGED
@@ -27,4 +27,5 @@ prometheus-client
27
  python-json-logger
28
 
29
  httpx
30
- requests
 
 
27
  python-json-logger
28
 
29
  httpx
30
+ requests
31
+ kaggle
setup.py CHANGED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from setuptools import setup, find_packages
2
+
3
# Runtime dependencies installed together with the package.
INSTALL_REQUIRES = [
    "fastapi",
    "uvicorn[standard]",
    "pydantic",
    "pandas",
    "numpy",
    "scikit-learn",
    "autogluon.tabular",
    "flaml",
    "pycaret",
    "mlflow",
    "dvc",
    "evidently",
    "pyyaml",
    "python-box",
    "ensure",
    "kaggle",
]

# Package metadata; the importable packages are discovered under ``src/``.
setup(
    name="mlpipeline",
    version="0.1.0",
    author="AutoML Team",
    packages=find_packages(where="src"),
    package_dir={"": "src"},
    python_requires=">=3.11",
    install_requires=INSTALL_REQUIRES,
)
src/mlpipeline/automl/autogluon_trainer.py CHANGED
@@ -47,7 +47,7 @@ class AutoGluonTrainer:
47
  raise ValueError("Model not trained. Call train() first.")
48
  return self.predictor.predict(data)
49
 
50
- def load(self, model_path: Path):
51
  logger.info(f"Loading AutoGluon model from {model_path}")
52
  self.predictor = TabularPredictor.load(str(model_path))
53
  return self
 
47
  raise ValueError("Model not trained. Call train() first.")
48
  return self.predictor.predict(data)
49
 
50
def load_model(self, model_path: Path):
    """Load a saved AutoGluon predictor from ``model_path``.

    The loaded predictor is stored on ``self.predictor`` and the trainer
    itself is returned so calls can be chained.
    """
    logger.info(f"Loading AutoGluon model from {model_path}")
    predictor_location = str(model_path)
    self.predictor = TabularPredictor.load(predictor_location)
    return self
src/mlpipeline/automl/flaml_trainer.py CHANGED
@@ -2,7 +2,7 @@ from pathlib import Path
2
  from typing import Dict, Any, Optional
3
  import pandas as pd
4
  import numpy as np
5
- from flaml import AutoML
6
  from sklearn.metrics import accuracy_score, r2_score
7
 
8
  from mlpipeline.logging.logger import get_logger
 
2
  from typing import Dict, Any, Optional
3
  import pandas as pd
4
  import numpy as np
5
+ from flaml.automl.automl import AutoML
6
  from sklearn.metrics import accuracy_score, r2_score
7
 
8
  from mlpipeline.logging.logger import get_logger
src/mlpipeline/components/__init__.py CHANGED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
# Public component classes of this package.
#
# The names are resolved lazily (PEP 562) so that
# ``from mlpipeline.components import DataIngestion`` and star-imports work
# without importing every stage's dependencies when the package itself is
# imported.
__all__ = [
    "DataIngestion",
    "DataValidation",
    "DataTransformation",
    "FeatureEngineering",
    "AutoMLTrainer",
    "ModelEvaluation",
    "ModelPusher",
]

# Exported class name -> submodule of this package that defines it.
_EXPORTS = {
    "DataIngestion": "data_ingestion",
    "DataValidation": "data_validation",
    "DataTransformation": "data_transformation",
    "FeatureEngineering": "feature_engineering",
    "AutoMLTrainer": "automl_trainer",
    "ModelEvaluation": "model_evaluation",
    "ModelPusher": "model_pusher",
}


def __getattr__(name):
    """Import the submodule defining ``name`` on first access and return the class.

    Raises:
        AttributeError: if ``name`` is not one of the exported classes.
    """
    try:
        submodule = _EXPORTS[name]
    except KeyError:
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}"
        ) from None

    from importlib import import_module

    value = getattr(import_module(f"{__name__}.{submodule}"), name)
    # Cache on the module so later lookups bypass this hook.
    globals()[name] = value
    return value
src/mlpipeline/components/automl_trainer.py CHANGED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from mlpipeline.entity import ModelTrainerConfig, ModelTrainerArtifact
3
+ from mlpipeline.automl import AutoMLFactory
4
+ from mlpipeline.logging.logger import get_logger
5
+ from mlpipeline.exception import ModelTrainingException
6
+ from mlpipeline.constants import AUTOML_CONFIG_FILE_PATH
7
+ from mlpipeline.utils.common import read_yaml
8
+ from pathlib import Path
9
+ import sys
10
+ import os
11
+
12
+ logger = get_logger(__name__)
13
+
14
+
15
class AutoMLTrainer:
    """Train a model with the AutoML library selected in the trainer config."""

    def __init__(self, config: ModelTrainerConfig):
        self.config = config

    def train(self) -> ModelTrainerArtifact:
        """Fit the configured AutoML trainer on the training features.

        Reads the training CSV, looks up the library-specific settings in the
        AutoML config file, builds the trainer through ``AutoMLFactory`` and
        calls its ``train`` method.

        Returns:
            ModelTrainerArtifact carrying the model path and training metrics.

        Raises:
            ModelTrainingException: wraps any error raised during training.
        """
        try:
            logger.info("Starting model training")

            cfg = self.config
            train_df = pd.read_csv(cfg.train_data_path)

            library_settings = read_yaml(Path(AUTOML_CONFIG_FILE_PATH))[cfg.automl_library]
            trainer = AutoMLFactory.create_trainer(cfg.automl_library, library_settings)

            os.makedirs(cfg.root_dir, exist_ok=True)

            # The AutoGluon trainer receives the full frame plus the label
            # name; the other trainers receive features and labels separately.
            if cfg.automl_library == 'autogluon':
                fit_args = (train_df, cfg.target_column, cfg.model_path)
            else:
                fit_args = (
                    train_df.drop(columns=[cfg.target_column]),
                    train_df[cfg.target_column],
                    cfg.model_path,
                )
            metrics = trainer.train(*fit_args)

            logger.info(f"Model trained with metrics: {metrics}")

            score = metrics.get('score', 0.0)
            return ModelTrainerArtifact(
                model_path=cfg.model_path,
                train_metrics=metrics,
                is_trained=True,
                message=f"Model trained successfully with score: {score:.4f}"
            )
        except Exception as e:
            raise ModelTrainingException(str(e), sys)
src/mlpipeline/components/data_ingestion.py CHANGED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import shutil
3
+ from pathlib import Path
4
+ from kaggle.api.kaggle_api_extended import KaggleApi
5
+
6
+ from mlpipeline.entity import DataIngestionConfig, DataIngestionArtifact
7
+ from mlpipeline.logging.logger import get_logger
8
+ from mlpipeline.exception import DataIngestionException
9
+ import sys
10
+
11
+ logger = get_logger(__name__)
12
+
13
+
14
class DataIngestion:
    """Download a Kaggle competition archive and stage its train/test CSVs."""

    # Prefix of ``source_url`` values that name a Kaggle competition.
    _KAGGLE_SCHEME = "kaggle://"
    # Competition used when the config carries no usable Kaggle source URL.
    _DEFAULT_COMPETITION = "playground-series-s6e2"

    def __init__(self, config: DataIngestionConfig):
        self.config = config
        self.kaggle_api = KaggleApi()
        self.kaggle_api.authenticate()

    def _competition_name(self) -> str:
        """Return the competition slug from ``config.source_url`` or the default."""
        source_url = getattr(self.config, "source_url", None)
        if isinstance(source_url, str) and source_url.startswith(self._KAGGLE_SCHEME):
            slug = source_url[len(self._KAGGLE_SCHEME):].strip("/")
            if slug:
                return slug
        return self._DEFAULT_COMPETITION

    def download_data(self) -> DataIngestionArtifact:
        """Download, extract and copy the competition data.

        The competition archive is downloaded into ``config.root_dir``,
        unpacked into ``config.unzip_dir`` and deleted; ``train.csv`` and
        ``test.csv`` are then copied to ``train_raw.csv`` / ``test_raw.csv``
        inside ``config.root_dir``.

        Returns:
            DataIngestionArtifact pointing at the copied training file.

        Raises:
            DataIngestionException: wraps any error, including the case where
                ``train.csv`` or ``test.csv`` is missing after extraction.
        """
        try:
            logger.info("Starting data ingestion")

            # Accept both str and Path config values for the directories.
            root_dir = Path(self.config.root_dir)
            unzip_dir = Path(self.config.unzip_dir)
            os.makedirs(root_dir, exist_ok=True)

            competition_name = self._competition_name()

            logger.info(f"Downloading dataset from Kaggle competition: {competition_name}")
            self.kaggle_api.competition_download_files(
                competition_name,
                path=root_dir
            )

            zip_file = root_dir / f"{competition_name}.zip"

            if zip_file.exists():
                logger.info(f"Extracting {zip_file}")
                shutil.unpack_archive(zip_file, unzip_dir)
                zip_file.unlink()

            train_file = unzip_dir / "train.csv"
            test_file = unzip_dir / "test.csv"

            if not (train_file.exists() and test_file.exists()):
                raise FileNotFoundError("Train or test file not found after extraction")

            train_raw = root_dir / "train_raw.csv"
            test_raw = root_dir / "test_raw.csv"

            shutil.copy(train_file, train_raw)
            shutil.copy(test_file, test_raw)

            logger.info(f"Data saved: {train_raw}, {test_raw}")

            return DataIngestionArtifact(
                data_file_path=train_raw,
                is_ingested=True,
                message="Data ingestion completed successfully"
            )

        except Exception as e:
            # Keep the original error attached as the explicit cause.
            raise DataIngestionException(str(e), sys) from e
src/mlpipeline/components/data_transformation.py CHANGED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from sklearn.model_selection import train_test_split
3
+ from mlpipeline.entity import DataTransformationConfig, DataTransformationArtifact
4
+ from mlpipeline.logging.logger import get_logger
5
+ from mlpipeline.exception import DataTransformationException
6
+ import sys
7
+ import os
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
class DataTransformation:
    """Split the ingested dataset into train and test CSV files."""

    def __init__(self, config: DataTransformationConfig):
        self.config = config

    def transform(self) -> DataTransformationArtifact:
        """Read the raw CSV, split it and write both parts to disk.

        The split uses ``config.test_size`` and ``config.random_state``.

        Returns:
            DataTransformationArtifact with the paths of the written files.

        Raises:
            DataTransformationException: wraps any error raised on the way.
        """
        try:
            logger.info("Starting data transformation")

            cfg = self.config
            frame = pd.read_csv(cfg.data_path)

            train_part, test_part = train_test_split(
                frame,
                test_size=cfg.test_size,
                random_state=cfg.random_state
            )

            os.makedirs(cfg.root_dir, exist_ok=True)

            for part, destination in ((train_part, cfg.train_path), (test_part, cfg.test_path)):
                part.to_csv(destination, index=False)

            train_shape, test_shape = train_part.shape, test_part.shape
            logger.info(f"Train shape: {train_shape}, Test shape: {test_shape}")

            return DataTransformationArtifact(
                train_file_path=cfg.train_path,
                test_file_path=cfg.test_path,
                is_transformed=True,
                message=f"Train: {train_shape}, Test: {test_shape}"
            )
        except Exception as e:
            raise DataTransformationException(str(e), sys)
src/mlpipeline/components/data_validation.py CHANGED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ from mlpipeline.entity import DataValidationConfig, DataValidationArtifact
3
+ from mlpipeline.logging.logger import get_logger
4
+ from mlpipeline.exception import DataValidationException
5
+ from mlpipeline.utils.common import read_yaml
6
+ import sys
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
class DataValidation:
    """Run basic sanity checks on the ingested dataset."""

    def __init__(self, config: DataValidationConfig):
        self.config = config

    def validate_schema(self) -> DataValidationArtifact:
        """Check the dataset and record the outcome in the status file.

        Two checks are performed: the dataset must not be empty, and no
        column may consist entirely of null values. The resulting message is
        written to ``config.status_file``.

        NOTE(review): the schema file is only passed through to the artifact;
        its contents are not compared against the data here.

        Returns:
            DataValidationArtifact with the status flag and message.

        Raises:
            DataValidationException: wraps any error raised on the way.
        """
        try:
            logger.info("Starting data validation")

            frame = pd.read_csv(self.config.data_dir)

            problems = []
            if frame.empty:
                problems.append("Dataset is empty")
            if frame.isnull().all().any():
                problems.append("Columns with all null values found")

            # Validation passes exactly when no problem was recorded.
            passed = not problems
            status_message = "; ".join(problems) if problems else "Validation passed"

            with open(self.config.status_file, "w") as status_out:
                status_out.write(status_message)

            logger.info(f"Validation status: {status_message}")

            return DataValidationArtifact(
                validation_status=passed,
                message=status_message,
                schema_file_path=self.config.schema_file
            )
        except Exception as e:
            raise DataValidationException(str(e), sys)
src/mlpipeline/components/feature_engineering.py CHANGED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import numpy as np
3
+ from pathlib import Path
4
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
5
+ from sklearn.feature_selection import VarianceThreshold
6
+ from mlpipeline.entity import FeatureEngineeringConfig, FeatureEngineeringArtifact
7
+ from mlpipeline.logging.logger import get_logger
8
+ from mlpipeline.exception import FeatureEngineeringException
9
+ from mlpipeline.utils.common import save_object
10
+ import sys
11
+ import os
12
+
13
+ logger = get_logger(__name__)
14
+
15
+
16
class FeatureEngineering:
    """Impute, encode, augment, filter and scale the train/test splits.

    Encoders, the variance filter and the scaler are fitted on the training
    split and then applied to the test split. Missing values are imputed per
    split, i.e. the test split is filled with its own statistics.
    """

    # Label column assumed when the config does not expose ``target_column``.
    _DEFAULT_TARGET = 'target'

    def __init__(self, config: FeatureEngineeringConfig):
        self.config = config
        self.label_encoders = {}
        self.scaler = None

    def _target_column(self):
        """Return the label column name (``config.target_column`` if present).

        The label must be kept out of interaction features, variance
        filtering and scaling. Falling back to ``'target'`` preserves the
        previous behaviour for configs without a ``target_column`` field.
        """
        return getattr(self.config, "target_column", self._DEFAULT_TARGET)

    def engineer_features(self) -> FeatureEngineeringArtifact:
        """Run the full feature pipeline and persist its outputs.

        Writes the engineered train/test CSVs to the configured output paths
        and pickles the fitted scaler and label encoders to
        ``preprocessor.pkl`` inside ``config.root_dir``.

        Returns:
            FeatureEngineeringArtifact with the output file paths.

        Raises:
            FeatureEngineeringException: wraps any error raised on the way.
        """
        try:
            logger.info("Starting feature engineering")

            train_df = pd.read_csv(self.config.train_path)
            test_df = pd.read_csv(self.config.test_path)

            train_df = self._handle_missing_values(train_df)
            test_df = self._handle_missing_values(test_df)

            train_df = self._encode_categorical(train_df, is_train=True)
            test_df = self._encode_categorical(test_df, is_train=False)

            train_df = self._create_interaction_features(train_df)
            test_df = self._create_interaction_features(test_df)

            train_df = self._remove_low_variance(train_df, is_train=True)
            test_df = self._remove_low_variance(test_df, is_train=False)

            target_name = self._target_column()
            numeric_cols = [
                col
                for col in train_df.select_dtypes(include=[np.number]).columns
                if col != target_name
            ]

            if numeric_cols:
                self.scaler = StandardScaler()
                train_df[numeric_cols] = self.scaler.fit_transform(train_df[numeric_cols])
                test_df[numeric_cols] = self.scaler.transform(test_df[numeric_cols])

            os.makedirs(self.config.root_dir, exist_ok=True)

            train_df.to_csv(self.config.output_train_path, index=False)
            test_df.to_csv(self.config.output_test_path, index=False)

            preprocessor_path = Path(self.config.root_dir) / "preprocessor.pkl"
            save_object(preprocessor_path, {
                'scaler': self.scaler,
                'label_encoders': self.label_encoders
            })

            logger.info(f"Feature engineering completed. Train shape: {train_df.shape}, Test shape: {test_df.shape}")

            return FeatureEngineeringArtifact(
                train_features_path=self.config.output_train_path,
                test_features_path=self.config.output_test_path,
                is_engineered=True,
                message=f"Features engineered: {train_df.shape[1]} features"
            )
        except Exception as e:
            # Keep the original error attached as the explicit cause.
            raise FeatureEngineeringException(str(e), sys) from e

    def _handle_missing_values(self, df):
        """Fill NaNs: median for int64/float64 columns, mode for the rest.

        Columns whose mode cannot be computed (all values missing) are filled
        with the literal ``'missing'``. The frame is modified and returned.
        """
        for col in df.columns:
            column = df[col]
            if not column.isna().any():
                # Nothing to fill; skip the median/mode computation.
                continue

            if column.dtype in [np.float64, np.int64]:
                fill_value = column.median()
            else:
                modes = column.mode()
                fill_value = modes[0] if not modes.empty else 'missing'

            # Assign the result back instead of ``fillna(inplace=True)`` on
            # ``df[col]``: the in-place form is chained assignment and may
            # leave ``df`` untouched on newer pandas versions.
            df[col] = column.fillna(fill_value)
        return df

    def _encode_categorical(self, df, is_train=True):
        """Label-encode object columns.

        On the training split one ``LabelEncoder`` is fitted per column. On
        other splits the stored encoders are applied and labels that were not
        seen during fitting are mapped to ``-1``; columns without a stored
        encoder are left unchanged.
        """
        categorical_cols = df.select_dtypes(include=['object']).columns

        for col in categorical_cols:
            if is_train:
                self.label_encoders[col] = LabelEncoder()
                df[col] = self.label_encoders[col].fit_transform(df[col].astype(str))
            elif col in self.label_encoders:
                # A label's code is its index in ``classes_``; build the
                # lookup once instead of calling ``transform`` per row.
                codes = {
                    label: code
                    for code, label in enumerate(self.label_encoders[col].classes_)
                }
                df[col] = df[col].astype(str).map(lambda value: codes.get(value, -1))
        return df

    def _create_interaction_features(self, df):
        """Add the product of the first two numeric non-label columns."""
        target_name = self._target_column()
        numeric_cols = [
            col
            for col in df.select_dtypes(include=[np.number]).columns
            if col != target_name
        ]

        if len(numeric_cols) >= 2:
            first, second = numeric_cols[0], numeric_cols[1]
            df[f'{first}_x_{second}'] = df[first] * df[second]

        return df

    def _remove_low_variance(self, df, is_train=True, threshold=0.01):
        """Drop feature columns whose variance is at or below ``threshold``.

        The selector is fitted when ``is_train`` is true and reused
        otherwise. The label column, when present, is excluded from the
        filter and appended back to the result. If no selector has been
        fitted yet, ``df`` is returned unchanged.
        """
        target_name = self._target_column()
        if target_name in df.columns:
            target = df[target_name]
            features = df.drop(columns=[target_name])
        else:
            target = None
            features = df

        if is_train:
            self.variance_selector = VarianceThreshold(threshold=threshold)
            self.variance_selector.fit(features)

        selector = getattr(self, 'variance_selector', None)
        if selector is None:
            return df

        features_selected = pd.DataFrame(
            selector.transform(features),
            columns=features.columns[selector.get_support()],
            index=features.index
        )

        if target is not None:
            return pd.concat([features_selected, target], axis=1)
        return features_selected
src/mlpipeline/components/model_evaluation.py CHANGED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import json
3
+ from sklearn.metrics import accuracy_score, f1_score, classification_report
4
+ from mlpipeline.entity import ModelEvaluationConfig, ModelEvaluationArtifact
5
+ from autogluon.tabular import TabularPredictor
6
+ from mlpipeline.logging.logger import get_logger
7
+ from mlpipeline.exception import ModelEvaluationException
8
+ import sys
9
+ import os
10
+
11
+ logger = get_logger(__name__)
12
+
13
+
14
class ModelEvaluation:
    """Score the trained model on the engineered test split."""

    def __init__(self, config: ModelEvaluationConfig):
        self.config = config

    def evaluate(self) -> ModelEvaluationArtifact:
        """Predict on the test features and write accuracy / F1 to JSON.

        Predictions and labels are binarised with ``> 0`` before scoring; F1
        uses the ``'weighted'`` average.

        NOTE(review): the model is always loaded with ``TabularPredictor``,
        independent of ``config.automl_library``, and the artifact always
        reports ``is_model_accepted=True`` — confirm both are intended.

        Returns:
            ModelEvaluationArtifact carrying the computed metrics.

        Raises:
            ModelEvaluationException: wraps any error raised on the way.
        """
        try:
            logger.info("Starting model evaluation")

            cfg = self.config
            test_df = pd.read_csv(cfg.test_data_path)

            model_location = str(cfg.model_path)
            predictor = TabularPredictor.load(model_location)

            predictions = predictor.predict(test_df)
            y_test = test_df[cfg.target_column]

            predicted_labels = (predictions > 0).astype(int)
            true_labels = (y_test > 0).astype(int)

            metrics = {
                "accuracy": float(accuracy_score(true_labels, predicted_labels)),
                "f1_score": float(f1_score(true_labels, predicted_labels, average='weighted')),
            }

            os.makedirs(cfg.root_dir, exist_ok=True)

            with open(cfg.metrics_file, "w") as metrics_out:
                json.dump(metrics, metrics_out, indent=2)

            logger.info(f"Evaluation metrics: {metrics}")

            accuracy = metrics["accuracy"]
            return ModelEvaluationArtifact(
                is_model_accepted=True,
                evaluation_metrics=metrics,
                message=f"Model evaluation completed with accuracy: {accuracy:.4f}"
            )
        except Exception as e:
            raise ModelEvaluationException(str(e), sys)
src/mlpipeline/components/model_pusher.py CHANGED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import shutil
2
+ import os
3
+ from pathlib import Path
4
+ from mlpipeline.entity import ModelPusherConfig, ModelPusherArtifact
5
+ from mlpipeline.logging.logger import get_logger
6
+ from mlpipeline.exception import ModelPusherException
7
+ import sys
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
class ModelPusher:
    """Copy the trained model into the model registry directory."""

    def __init__(self, config: ModelPusherConfig):
        self.config = config

    def push_model(self) -> ModelPusherArtifact:
        """Copy ``config.model_path`` into ``config.model_registry_path``.

        A model stored as a directory replaces any existing copy of the same
        name in the registry; anything else is copied as a single file.

        Returns:
            ModelPusherArtifact with the destination path as a string.

        Raises:
            ModelPusherException: wraps any error raised on the way.
        """
        try:
            logger.info("Starting model pusher")

            registry_root = self.config.model_registry_path
            os.makedirs(registry_root, exist_ok=True)

            source = Path(self.config.model_path)
            destination = Path(registry_root) / source.name

            if not source.is_dir():
                shutil.copy2(source, destination)
            else:
                # copytree needs a free destination, so drop the stale copy.
                if destination.exists():
                    shutil.rmtree(destination)
                shutil.copytree(source, destination)

            logger.info(f"Model pushed to: {destination}")

            return ModelPusherArtifact(
                pushed_model_path=str(destination),
                is_pushed=True,
                message="Model pushed successfully"
            )
        except Exception as e:
            raise ModelPusherException(str(e), sys)
src/mlpipeline/config/configuration.py CHANGED
@@ -1,5 +1,5 @@
1
  from pathlib import Path
2
- from mlpipeline.constants import CONFIG_FILE_PATH
3
  from mlpipeline.utils.common import read_yaml, create_directories
4
  from mlpipeline.entity.config_entity import (
5
  DataIngestionConfig,
@@ -66,6 +66,7 @@ class ConfigurationManager:
66
 
67
  def get_model_trainer_config(self) -> ModelTrainerConfig:
68
  config = self.config.model_trainer
 
69
  create_directories([config.root_dir])
70
 
71
  return ModelTrainerConfig(
@@ -74,10 +75,12 @@ class ConfigurationManager:
74
  test_data_path=Path(config.test_data_path),
75
  model_path=Path(config.model_path),
76
  target_column=config.target_column,
 
77
  )
78
 
79
  def get_model_evaluation_config(self) -> ModelEvaluationConfig:
80
  config = self.config.model_evaluation
 
81
  create_directories([config.root_dir])
82
 
83
  return ModelEvaluationConfig(
@@ -86,6 +89,7 @@ class ConfigurationManager:
86
  test_data_path=Path(config.test_data_path),
87
  metrics_file=Path(config.metrics_file),
88
  target_column=config.target_column,
 
89
  )
90
 
91
  def get_model_pusher_config(self) -> ModelPusherConfig:
 
1
  from pathlib import Path
2
+ from mlpipeline.constants import CONFIG_FILE_PATH, AUTOML_CONFIG_FILE_PATH
3
  from mlpipeline.utils.common import read_yaml, create_directories
4
  from mlpipeline.entity.config_entity import (
5
  DataIngestionConfig,
 
66
 
67
  def get_model_trainer_config(self) -> ModelTrainerConfig:
68
  config = self.config.model_trainer
69
+ automl_config = read_yaml(Path(AUTOML_CONFIG_FILE_PATH))
70
  create_directories([config.root_dir])
71
 
72
  return ModelTrainerConfig(
 
75
  test_data_path=Path(config.test_data_path),
76
  model_path=Path(config.model_path),
77
  target_column=config.target_column,
78
+ automl_library=automl_config.automl_library,
79
  )
80
 
81
  def get_model_evaluation_config(self) -> ModelEvaluationConfig:
82
  config = self.config.model_evaluation
83
+ automl_config = read_yaml(Path(AUTOML_CONFIG_FILE_PATH))
84
  create_directories([config.root_dir])
85
 
86
  return ModelEvaluationConfig(
 
89
  test_data_path=Path(config.test_data_path),
90
  metrics_file=Path(config.metrics_file),
91
  target_column=config.target_column,
92
+ automl_library=automl_config.automl_library,
93
  )
94
 
95
  def get_model_pusher_config(self) -> ModelPusherConfig:
src/mlpipeline/entity/__init__.py CHANGED
@@ -15,4 +15,5 @@ from mlpipeline.entity.artifact_entity import (
15
  FeatureEngineeringArtifact,
16
  ModelTrainerArtifact,
17
  ModelEvaluationArtifact,
 
18
  )
 
15
  FeatureEngineeringArtifact,
16
  ModelTrainerArtifact,
17
  ModelEvaluationArtifact,
18
+ ModelPusherArtifact,
19
  )
src/mlpipeline/entity/artifact_entity.py CHANGED
@@ -45,4 +45,11 @@ class ModelTrainerArtifact:
45
  class ModelEvaluationArtifact:
46
  is_model_accepted: bool
47
  evaluation_metrics: Dict[str, float]
 
 
 
 
 
 
 
48
  message: str
 
45
  class ModelEvaluationArtifact:
46
  is_model_accepted: bool
47
  evaluation_metrics: Dict[str, float]
48
+ message: str
49
+
50
+
51
@dataclass
class ModelPusherArtifact:
    """Outcome of the model pusher stage."""

    # Location of the pushed model, stored as a string.
    pushed_model_path: str
    # Whether the push completed.
    is_pushed: bool
    # Human-readable status text.
    message: str
src/mlpipeline/entity/config_entity.py CHANGED
@@ -44,6 +44,7 @@ class ModelTrainerConfig:
44
  test_data_path: Path
45
  model_path: Path
46
  target_column: str
 
47
 
48
 
49
  @dataclass(frozen=True)
@@ -53,6 +54,7 @@ class ModelEvaluationConfig:
53
  test_data_path: Path
54
  metrics_file: Path
55
  target_column: str
 
56
 
57
 
58
  @dataclass(frozen=True)
 
44
  test_data_path: Path
45
  model_path: Path
46
  target_column: str
47
+ automl_library: str
48
 
49
 
50
  @dataclass(frozen=True)
 
54
  test_data_path: Path
55
  metrics_file: Path
56
  target_column: str
57
+ automl_library: str
58
 
59
 
60
  @dataclass(frozen=True)
src/mlpipeline/exception/__init__.py CHANGED
@@ -3,7 +3,9 @@ from mlpipeline.exception.exception import (
3
  DataIngestionException,
4
  DataValidationException,
5
  DataTransformationException,
 
6
  ModelTrainingException,
7
  ModelEvaluationException,
 
8
  ConfigurationException,
9
  )
 
3
  DataIngestionException,
4
  DataValidationException,
5
  DataTransformationException,
6
+ FeatureEngineeringException,
7
  ModelTrainingException,
8
  ModelEvaluationException,
9
+ ModelPusherException,
10
  ConfigurationException,
11
  )
src/mlpipeline/exception/exception.py CHANGED
@@ -35,6 +35,10 @@ class DataTransformationException(MLPipelineException):
35
  pass
36
 
37
 
 
 
 
 
38
  class ModelTrainingException(MLPipelineException):
39
  pass
40
 
@@ -43,5 +47,9 @@ class ModelEvaluationException(MLPipelineException):
43
  pass
44
 
45
 
 
 
 
 
46
  class ConfigurationException(MLPipelineException):
47
  pass
 
35
  pass
36
 
37
 
38
class FeatureEngineeringException(MLPipelineException):
    """Error raised by the feature engineering stage."""
40
+
41
+
42
  class ModelTrainingException(MLPipelineException):
43
  pass
44
 
 
47
  pass
48
 
49
 
50
class ModelPusherException(MLPipelineException):
    """Error raised by the model pusher stage."""
52
+
53
+
54
  class ConfigurationException(MLPipelineException):
55
  pass
src/mlpipeline/pipelines/__init__.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
# Public pipeline classes of this package.
#
# The names are resolved lazily (PEP 562) so that
# ``from mlpipeline.pipelines import DataIngestionPipeline`` and star-imports
# work without importing every stage when the package itself is imported.
__all__ = [
    "DataIngestionPipeline",
    "DataValidationPipeline",
    "DataTransformationPipeline",
    "FeatureEngineeringPipeline",
    "ModelTrainerPipeline",
    "ModelEvaluationPipeline",
    "ModelPusherPipeline",
]

# Exported class name -> submodule of this package that defines it.
_EXPORTS = {
    "DataIngestionPipeline": "data_ingestion_pipeline",
    "DataValidationPipeline": "data_validation_pipeline",
    "DataTransformationPipeline": "data_transformation_pipeline",
    "FeatureEngineeringPipeline": "feature_engineering_pipeline",
    "ModelTrainerPipeline": "model_trainer_pipeline",
    "ModelEvaluationPipeline": "model_evaluation_pipeline",
    "ModelPusherPipeline": "model_pusher_pipeline",
}


def __getattr__(name):
    """Import the submodule defining ``name`` on first access and return the class.

    Raises:
        AttributeError: if ``name`` is not one of the exported classes.
    """
    try:
        submodule = _EXPORTS[name]
    except KeyError:
        raise AttributeError(
            f"module {__name__!r} has no attribute {name!r}"
        ) from None

    from importlib import import_module

    value = getattr(import_module(f"{__name__}.{submodule}"), name)
    # Cache on the module so later lookups bypass this hook.
    globals()[name] = value
    return value
src/mlpipeline/pipelines/data_ingestion_pipeline.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mlpipeline.config.configuration import ConfigurationManager
2
+ from mlpipeline.components.data_ingestion import DataIngestion
3
+ from mlpipeline.logging.logger import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
class DataIngestionPipeline:
    """Entry point that runs the data ingestion stage."""

    def __init__(self):
        self.config_manager = ConfigurationManager()

    def run(self):
        """Build the stage config, download the data and return the artifact."""
        logger.info("Data Ingestion Pipeline started")
        stage_config = self.config_manager.get_data_ingestion_config()
        artifact = DataIngestion(config=stage_config).download_data()
        logger.info(f"Data Ingestion Pipeline completed: {artifact.message}")
        return artifact


if __name__ == "__main__":
    # Allow running this stage on its own.
    DataIngestionPipeline().run()
src/mlpipeline/pipelines/data_transformation_pipeline.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mlpipeline.config.configuration import ConfigurationManager
2
+ from mlpipeline.components.data_transformation import DataTransformation
3
+ from mlpipeline.logging.logger import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
class DataTransformationPipeline:
    """Entry point that runs the data transformation stage."""

    def __init__(self):
        self.config_manager = ConfigurationManager()

    def run(self):
        """Build the stage config, split the data and return the artifact."""
        logger.info("Data Transformation Pipeline started")
        stage_config = self.config_manager.get_data_transformation_config()
        artifact = DataTransformation(config=stage_config).transform()
        logger.info(f"Data Transformation Pipeline completed: {artifact.message}")
        return artifact


if __name__ == "__main__":
    # Allow running this stage on its own.
    DataTransformationPipeline().run()
src/mlpipeline/pipelines/data_validation_pipeline.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mlpipeline.config.configuration import ConfigurationManager
2
+ from mlpipeline.components.data_validation import DataValidation
3
+ from mlpipeline.logging.logger import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
class DataValidationPipeline:
    """Entry point that runs the data validation stage."""

    def __init__(self):
        self.config_manager = ConfigurationManager()

    def run(self):
        """Build the stage config, validate the data and return the artifact."""
        logger.info("Data Validation Pipeline started")
        stage_config = self.config_manager.get_data_validation_config()
        artifact = DataValidation(config=stage_config).validate_schema()
        logger.info(f"Data Validation Pipeline completed: {artifact.message}")
        return artifact


if __name__ == "__main__":
    # Allow running this stage on its own.
    DataValidationPipeline().run()
src/mlpipeline/pipelines/feature_engineering_pipeline.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mlpipeline.config.configuration import ConfigurationManager
2
+ from mlpipeline.components.feature_engineering import FeatureEngineering
3
+ from mlpipeline.logging.logger import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
class FeatureEngineeringPipeline:
    """Entry point that runs the feature engineering stage."""

    def __init__(self):
        self.config_manager = ConfigurationManager()

    def run(self):
        """Build the stage config, engineer the features and return the artifact."""
        logger.info("Feature Engineering Pipeline started")
        stage_config = self.config_manager.get_feature_engineering_config()
        artifact = FeatureEngineering(config=stage_config).engineer_features()
        logger.info(f"Feature Engineering Pipeline completed: {artifact.message}")
        return artifact


if __name__ == "__main__":
    # Allow running this stage on its own.
    FeatureEngineeringPipeline().run()
src/mlpipeline/pipelines/model_evaluation_pipeline.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mlpipeline.config.configuration import ConfigurationManager
2
+ from mlpipeline.components.model_evaluation import ModelEvaluation
3
+ from mlpipeline.logging.logger import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
class ModelEvaluationPipeline:
    """Entry point that runs the model evaluation stage."""

    def __init__(self):
        self.config_manager = ConfigurationManager()

    def run(self):
        """Build the stage config, evaluate the model and return the artifact."""
        logger.info("Model Evaluation Pipeline started")
        stage_config = self.config_manager.get_model_evaluation_config()
        artifact = ModelEvaluation(config=stage_config).evaluate()
        logger.info(f"Model Evaluation Pipeline completed with metrics: {artifact.evaluation_metrics}")
        return artifact


if __name__ == "__main__":
    # Allow running this stage on its own.
    ModelEvaluationPipeline().run()
src/mlpipeline/pipelines/model_pusher_pipeline.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mlpipeline.config.configuration import ConfigurationManager
2
+ from mlpipeline.components.model_pusher import ModelPusher
3
+ from mlpipeline.logging.logger import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
class ModelPusherPipeline:
    """Entry point that runs the model pusher stage."""

    def __init__(self):
        self.config_manager = ConfigurationManager()

    def run(self):
        """Build the stage config, push the model and return the artifact."""
        logger.info("Model Pusher Pipeline started")
        stage_config = self.config_manager.get_model_pusher_config()
        artifact = ModelPusher(config=stage_config).push_model()
        logger.info(f"Model Pusher Pipeline completed: {artifact.message}")
        return artifact


if __name__ == "__main__":
    # Allow running this stage on its own.
    ModelPusherPipeline().run()
src/mlpipeline/pipelines/model_trainer_pipeline.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mlpipeline.config.configuration import ConfigurationManager
2
+ from mlpipeline.components.automl_trainer import AutoMLTrainer
3
+ from mlpipeline.logging.logger import get_logger
4
+
5
+ logger = get_logger(__name__)
6
+
7
+
8
class ModelTrainerPipeline:
    """Entry point that runs the model training stage."""

    def __init__(self):
        self.config_manager = ConfigurationManager()

    def run(self):
        """Build the stage config, train the model and return the artifact."""
        logger.info("Model Trainer Pipeline started")
        stage_config = self.config_manager.get_model_trainer_config()
        artifact = AutoMLTrainer(config=stage_config).train()
        logger.info(f"Model Trainer Pipeline completed with metrics: {artifact.train_metrics}")
        return artifact


if __name__ == "__main__":
    # Allow running this stage on its own.
    ModelTrainerPipeline().run()
src/mlpipeline/utils/common.py CHANGED
@@ -76,6 +76,19 @@ def load_model(path: Path) -> Any:
76
  return model
77
 
78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  @ensure_annotations
80
  def get_size(path: Path) -> str:
81
  size_in_kb = round(os.path.getsize(path) / 1024)
 
76
  return model
77
 
78
 
79
def save_object(path: Path, obj):
    """Pickle ``obj`` to ``path``, creating missing parent directories first.

    Args:
        path: destination file.
        obj: any picklable object.
    """
    # The target directory may not exist yet when this is the first artifact
    # written to it; create it instead of failing inside ``open``.
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "wb") as f:
        pickle.dump(obj, f)
    logger.info(f"Object saved: {path}")
83
+
84
+
85
def load_object(path: Path):
    """Unpickle and return the object stored at ``path``.

    NOTE(review): unpickling can execute arbitrary code contained in the
    file, so this should only be used on artifacts the pipeline produced.
    """
    with open(path, "rb") as source:
        restored = pickle.load(source)
    logger.info(f"Object loaded: {path}")
    return restored
90
+
91
+
92
  @ensure_annotations
93
  def get_size(path: Path) -> str:
94
  size_in_kb = round(os.path.getsize(path) / 1024)