Abeshith committed on
Commit
19d70f4
·
1 Parent(s): ba003d8

Add data structures and AutoML implementations

Browse files
src/mlpipeline/automl/__init__.py CHANGED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from mlpipeline.automl.autogluon_trainer import AutoGluonTrainer
2
+ from mlpipeline.automl.flaml_trainer import FLAMLTrainer
3
+ from mlpipeline.automl.pycaret_trainer import PyCaretTrainer
4
+ from mlpipeline.automl.automl_factory import AutoMLFactory
src/mlpipeline/automl/autogluon_trainer.py CHANGED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Dict, Any, Optional
3
+ import pandas as pd
4
+ from autogluon.tabular import TabularPredictor
5
+
6
+ from mlpipeline.logging.logger import get_logger
7
+
8
+ logger = get_logger(__name__)
9
+
10
+
11
class AutoGluonTrainer:
    """Wrapper that trains, loads and queries an AutoGluon ``TabularPredictor``.

    ``config`` is a plain mapping of optional overrides. Every key is read
    with ``dict.get``, so missing keys fall back to the defaults visible in
    :meth:`train`.
    """

    def __init__(self, config: Dict[str, Any]):
        # No predictor exists until train() or load() has been called.
        self.predictor: Optional[TabularPredictor] = None
        self.config = config

    def train(self, train_data: pd.DataFrame, target_column: str, model_path: Path) -> Dict[str, float]:
        """Fit a predictor on ``train_data`` and summarise the leaderboard.

        Args:
            train_data: Frame that includes the ``target_column`` label.
            target_column: Name of the label column.
            model_path: Location handed to AutoGluon as its ``path``.

        Returns:
            ``{'score': ..., 'score_test': ...}`` taken from the first
            leaderboard row; ``score_test`` is ``0.0`` when that row has no
            such entry.
        """
        logger.info("Starting AutoGluon training")

        cfg = self.config
        predictor = TabularPredictor(
            label=target_column,
            path=str(model_path),
            eval_metric=cfg.get('eval_metric'),
            verbosity=cfg.get('verbosity', 2),
        )
        # Stored on the instance before fitting, matching the original order.
        self.predictor = predictor

        fit_options = {
            'time_limit': cfg.get('time_limit', 600),
            'presets': cfg.get('presets', 'medium_quality'),
            'num_bag_folds': cfg.get('num_bag_folds', 5),
            'num_stack_levels': cfg.get('num_stack_levels', 1),
        }
        predictor.fit(train_data=train_data, **fit_options)

        # The first leaderboard row is treated as the best model.
        top_row = predictor.leaderboard(silent=True).iloc[0]
        metrics = {
            'score': float(top_row['score_val']),
            'score_test': float(top_row.get('score_test', 0.0)),
        }

        logger.info(f"AutoGluon training completed. Best score: {metrics['score']}")
        return metrics

    def predict(self, data: pd.DataFrame) -> pd.Series:
        """Return predictions for ``data``.

        Raises:
            ValueError: If no predictor has been trained or loaded yet.
        """
        predictor = self.predictor
        if predictor is None:
            raise ValueError("Model not trained. Call train() first.")
        return predictor.predict(data)

    def load(self, model_path: Path):
        """Load a saved predictor from ``model_path`` and return ``self``."""
        logger.info(f"Loading AutoGluon model from {model_path}")
        location = str(model_path)
        self.predictor = TabularPredictor.load(location)
        return self
src/mlpipeline/automl/automl_factory.py CHANGED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Union
2
+ from mlpipeline.automl.autogluon_trainer import AutoGluonTrainer
3
+ from mlpipeline.automl.flaml_trainer import FLAMLTrainer
4
+ from mlpipeline.automl.pycaret_trainer import PyCaretTrainer
5
+ from mlpipeline.logging.logger import get_logger
6
+
7
+ logger = get_logger(__name__)
8
+
9
+
10
class AutoMLFactory:
    """Build the trainer wrapper that matches an AutoML library name."""

    @staticmethod
    def create_trainer(library: str, config: dict) -> Union[AutoGluonTrainer, FLAMLTrainer, PyCaretTrainer]:
        """Instantiate the trainer registered under ``library``.

        Args:
            library: Library name; compared case-insensitively.
            config: Settings passed unchanged to the trainer constructor.

        Raises:
            ValueError: If ``library`` is not one of the known names.
        """
        library = library.lower()

        # name -> (log message, trainer class)
        registry = {
            'autogluon': ("Creating AutoGluon trainer", AutoGluonTrainer),
            'flaml': ("Creating FLAML trainer", FLAMLTrainer),
            'pycaret': ("Creating PyCaret trainer", PyCaretTrainer),
        }

        if library not in registry:
            raise ValueError(f"Unknown AutoML library: {library}. Choose from: autogluon, flaml, pycaret")

        message, trainer_cls = registry[library]
        logger.info(message)
        return trainer_cls(config)

    @staticmethod
    def get_available_libraries():
        """Return the library names accepted by :meth:`create_trainer`."""
        names = ['autogluon', 'flaml', 'pycaret']
        return names
src/mlpipeline/automl/flaml_trainer.py CHANGED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Dict, Any, Optional
3
+ import pandas as pd
4
+ import numpy as np
5
+ from flaml import AutoML
6
+ from sklearn.metrics import accuracy_score, r2_score
7
+
8
+ from mlpipeline.logging.logger import get_logger
9
+ from mlpipeline.utils.common import save_pickle, load_pickle
10
+
11
+ logger = get_logger(__name__)
12
+
13
+
14
class FLAMLTrainer:
    """Train, persist and query a FLAML ``AutoML`` model.

    ``config`` is a plain mapping of optional overrides; missing keys fall
    back to the defaults visible in :meth:`train`. ``config['task']``
    (default ``'classification'``) selects both the FLAML task and the
    metric reported after training.
    """

    def __init__(self, config: Dict[str, Any]):
        self.config = config
        # No model exists until train() or load() has been called.
        self.automl: Optional[AutoML] = None
        self.task = config.get('task', 'classification')

    @staticmethod
    def _model_file(model_path) -> Path:
        """Return the pickle location inside ``model_path`` (``str`` or ``Path``)."""
        return Path(model_path) / 'model.pkl'

    def train(self, train_data: pd.DataFrame, target_column: str, model_path: Path) -> Dict[str, float]:
        """Fit FLAML on ``train_data`` and pickle the fitted object.

        Args:
            train_data: Frame that includes the ``target_column`` label.
            target_column: Name of the label column.
            model_path: Directory that receives ``model.pkl``; it is created
                if missing.

        Returns:
            ``{'accuracy' | 'r2_score': ..., 'best_loss': ...}``. The first
            value is computed on the training data itself, so it is an
            optimistic estimate rather than a hold-out score.
        """
        logger.info("Starting FLAML training")

        X_train = train_data.drop(columns=[target_column])
        y_train = train_data[target_column]

        self.automl = AutoML()

        settings = {
            'time_budget': self.config.get('time_budget', 600),
            'metric': self.config.get('metric', 'auto'),
            'task': self.task,
            'estimator_list': self.config.get('estimator_list', ['lgbm', 'xgboost', 'rf']),
            'n_jobs': self.config.get('n_jobs', -1),
            'verbose': self.config.get('verbose', 1),
            'early_stop': self.config.get('early_stop', True),
        }

        self.automl.fit(X_train=X_train, y_train=y_train, **settings)

        y_pred = self.automl.predict(X_train)

        if self.task == 'classification':
            score = accuracy_score(y_train, y_pred)
            metric_name = 'accuracy'
        else:
            score = r2_score(y_train, y_pred)
            metric_name = 'r2_score'

        # Accept str paths and create the directory, so a finished search is
        # not lost to a failing save.
        model_file = self._model_file(model_path)
        model_file.parent.mkdir(parents=True, exist_ok=True)
        save_pickle(model_file, self.automl)

        metrics = {
            metric_name: float(score),
            'best_loss': float(self.automl.best_loss),
        }

        logger.info(f"FLAML training completed. Best {metric_name}: {score}")
        return metrics

    def predict(self, data: pd.DataFrame) -> np.ndarray:
        """Return predictions for ``data``.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.automl is None:
            raise ValueError("Model not trained. Call train() first.")
        return self.automl.predict(data)

    def load(self, model_path: Path):
        """Load ``model.pkl`` from ``model_path`` and return ``self``."""
        logger.info(f"Loading FLAML model from {model_path}")
        # NOTE(review): load_pickle presumably unpickles the file; only point
        # this at model directories from a trusted source.
        self.automl = load_pickle(self._model_file(model_path))
        return self
src/mlpipeline/automl/pycaret_trainer.py CHANGED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Dict, Any, Optional
3
+ import pandas as pd
4
+ from pycaret.classification import setup as classification_setup, compare_models as classification_compare, finalize_model as classification_finalize, save_model as classification_save, load_model as classification_load
5
+ from pycaret.regression import setup as regression_setup, compare_models as regression_compare, finalize_model as regression_finalize, save_model as regression_save, load_model as regression_load
6
+
7
+ from mlpipeline.logging.logger import get_logger
8
+
9
+ logger = get_logger(__name__)
10
+
11
+
12
class PyCaretTrainer:
    """Train, persist and query a model through PyCaret's functional API.

    ``config`` is a plain mapping of optional overrides; missing keys fall
    back to the defaults visible in :meth:`train`. An optional
    ``config['task']`` of ``'classification'`` or ``'regression'`` fixes the
    task explicitly; without it the task is inferred from the target column
    at training time.
    """

    # A numeric target with fewer distinct values than this is treated as labels.
    _MAX_CLASS_COUNT = 20

    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.model: Optional[Any] = None
        # Unknown until train() or load() resolves it.
        self.is_classification: Optional[bool] = None

    def _resolve_task(self, target: Optional[pd.Series] = None) -> Optional[bool]:
        """Decide whether the task is classification.

        An explicit ``config['task']`` wins. Otherwise the task is inferred
        from ``target`` (non-numeric dtype or few distinct values means
        classification). With neither available, the current
        ``is_classification`` value is returned unchanged.
        """
        task = self.config.get('task')
        if task == 'classification':
            return True
        if task == 'regression':
            return False
        if target is None:
            return self.is_classification
        non_numeric = not pd.api.types.is_numeric_dtype(target)
        return bool(non_numeric or target.nunique() < self._MAX_CLASS_COUNT)

    @staticmethod
    def _extract_score(results: Optional[pd.DataFrame], metric: Optional[str] = None) -> float:
        """Read one score from a pulled PyCaret score grid.

        Handles a grid with a ``Mean`` column, a cross-validation grid with
        a ``Mean`` index row, and a leaderboard whose first row is the best
        model. Returns ``0.0`` when no numeric value can be found.
        """
        if results is None or results.empty:
            return 0.0
        if 'Mean' in results.columns:
            return float(results.iloc[0]['Mean'])
        row = results.loc['Mean'] if 'Mean' in results.index else results.iloc[0]
        if metric is not None and metric in row.index:
            return float(row[metric])
        # Unknown metric name: fall back to the first numeric cell in the row.
        numeric = pd.to_numeric(row, errors='coerce').dropna()
        return float(numeric.iloc[0]) if not numeric.empty else 0.0

    @staticmethod
    def _prediction_column(predictions: pd.DataFrame) -> pd.Series:
        """Pick the predicted-value column from a ``predict_model`` frame.

        For classifiers the last column is the score, not the label, so the
        label column is looked up by name first; the last column is only a
        fallback.
        """
        for name in ('prediction_label', 'Label'):
            if name in predictions.columns:
                return predictions[name]
        return predictions.iloc[:, -1]

    def train(self, train_data: pd.DataFrame, target_column: str, model_path: Path) -> Dict[str, float]:
        """Run setup, model comparison, optional tuning, finalisation and save.

        Args:
            train_data: Frame that includes the ``target_column`` label.
            target_column: Name of the label column.
            model_path: Directory that receives the saved ``model`` file;
                it is created if missing.

        Returns:
            ``{'score': ...}`` read from the score grid of the last
            comparison or tuning step.

        Raises:
            RuntimeError: If model comparison yields no model.
        """
        logger.info("Starting PyCaret training")

        self.is_classification = self._resolve_task(train_data[target_column])
        if self.is_classification:
            from pycaret.classification import pull as pull_fn, tune_model as tune_fn
            setup_fn, compare_fn = classification_setup, classification_compare
            finalize_fn, save_fn = classification_finalize, classification_save
            default_metric = 'Accuracy'
        else:
            from pycaret.regression import pull as pull_fn, tune_model as tune_fn
            setup_fn, compare_fn = regression_setup, regression_compare
            finalize_fn, save_fn = regression_finalize, regression_save
            default_metric = 'R2'

        setup_fn(
            data=train_data,
            target=target_column,
            session_id=self.config.get('session_id', 42),
            fold=self.config.get('fold', 5),
            verbose=self.config.get('verbose', False),
            use_gpu=self.config.get('use_gpu', False),
        )

        best_model = compare_fn(
            n_select=self.config.get('n_select', 5),
            verbose=self.config.get('verbose', False),
        )
        # With n_select > 1 a ranked list comes back; tuning and finalising
        # need a single estimator, so keep the top one.
        if isinstance(best_model, list):
            if not best_model:
                raise RuntimeError("PyCaret compare_models returned no models")
            best_model = best_model[0]

        # `or {}` tolerates an empty `tuning:` entry parsed as None.
        tuning = self.config.get('tuning') or {}
        score_metric = default_metric
        if tuning.get('enabled', True):
            # 'Accuracy' is not a regression metric, so the default depends on the task.
            score_metric = tuning.get('optimize', default_metric)
            best_model = tune_fn(
                best_model,
                n_iter=tuning.get('n_iter', 10),
                optimize=score_metric,
            )

        # Pull now, while the latest grid is still the compare/tune result.
        results = pull_fn()

        self.model = finalize_fn(best_model)

        model_dir = Path(model_path)
        model_dir.mkdir(parents=True, exist_ok=True)
        save_fn(self.model, str(model_dir / 'model'))

        metrics = {
            'score': self._extract_score(results, score_metric),
        }

        logger.info(f"PyCaret training completed. Score: {metrics['score']}")
        return metrics

    def predict(self, data: pd.DataFrame) -> pd.Series:
        """Return the predicted values for ``data``.

        Raises:
            ValueError: If no model has been trained or loaded yet.
        """
        if self.model is None:
            raise ValueError("Model not trained. Call train() first.")

        if self.is_classification:
            from pycaret.classification import predict_model as predict_fn
        else:
            from pycaret.regression import predict_model as predict_fn

        predictions = predict_fn(self.model, data=data)
        return self._prediction_column(predictions)

    def load(self, model_path: Path):
        """Load the saved ``model`` file from ``model_path`` and return ``self``."""
        logger.info(f"Loading PyCaret model from {model_path}")
        self.is_classification = self._resolve_task()
        load_fn = classification_load if self.is_classification else regression_load
        self.model = load_fn(str(Path(model_path) / 'model'))
        if self.is_classification is None:
            # Heuristic for a fresh trainer with no configured task: a model
            # exposing predict_proba is assumed to be a classifier. Anything
            # else keeps the previous regression behaviour.
            self.is_classification = hasattr(self.model, 'predict_proba')
        return self
src/mlpipeline/config/__init__.py CHANGED
@@ -0,0 +1 @@
 
 
1
+ from mlpipeline.config.configuration import ConfigurationManager
src/mlpipeline/config/configuration.py CHANGED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from mlpipeline.constants import CONFIG_FILE_PATH
3
+ from mlpipeline.utils.common import read_yaml, create_directories
4
+ from mlpipeline.entity.config_entity import (
5
+ DataIngestionConfig,
6
+ DataValidationConfig,
7
+ DataTransformationConfig,
8
+ FeatureEngineeringConfig,
9
+ ModelTrainerConfig,
10
+ ModelEvaluationConfig,
11
+ ModelPusherConfig,
12
+ )
13
+
14
+
15
class ConfigurationManager:
    """Turn the parsed pipeline configuration into per-stage config objects.

    The constructor reads the config file with ``read_yaml`` and creates the
    ``artifacts_root`` directory. Each ``get_*_config`` method creates the
    ``root_dir`` of its section before building the config entity.
    """

    def __init__(self, config_filepath=CONFIG_FILE_PATH):
        parsed = read_yaml(config_filepath)
        self.config = parsed
        create_directories([parsed.artifacts_root])

    def _stage_section(self, name):
        """Return config section ``name`` after creating its ``root_dir``."""
        section = getattr(self.config, name)
        create_directories([section.root_dir])
        return section

    def get_data_ingestion_config(self) -> DataIngestionConfig:
        """Build the config entity for the ``data_ingestion`` section."""
        stage = self._stage_section('data_ingestion')
        return DataIngestionConfig(
            root_dir=Path(stage.root_dir),
            source_url=stage.source_url,
            local_data_file=Path(stage.local_data_file),
            unzip_dir=Path(stage.unzip_dir),
        )

    def get_data_validation_config(self) -> DataValidationConfig:
        """Build the config entity for the ``data_validation`` section."""
        stage = self._stage_section('data_validation')
        return DataValidationConfig(
            root_dir=Path(stage.root_dir),
            data_dir=Path(stage.data_dir),
            status_file=Path(stage.status_file),
            schema_file=Path(stage.schema_file),
        )

    def get_data_transformation_config(self) -> DataTransformationConfig:
        """Build the config entity for the ``data_transformation`` section."""
        stage = self._stage_section('data_transformation')
        return DataTransformationConfig(
            root_dir=Path(stage.root_dir),
            data_path=Path(stage.data_path),
            train_path=Path(stage.train_path),
            test_path=Path(stage.test_path),
            test_size=stage.test_size,
            random_state=stage.random_state,
        )

    def get_feature_engineering_config(self) -> FeatureEngineeringConfig:
        """Build the config entity for the ``feature_engineering`` section."""
        stage = self._stage_section('feature_engineering')
        return FeatureEngineeringConfig(
            root_dir=Path(stage.root_dir),
            train_path=Path(stage.train_path),
            test_path=Path(stage.test_path),
            output_train_path=Path(stage.output_train_path),
            output_test_path=Path(stage.output_test_path),
        )

    def get_model_trainer_config(self) -> ModelTrainerConfig:
        """Build the config entity for the ``model_trainer`` section."""
        stage = self._stage_section('model_trainer')
        return ModelTrainerConfig(
            root_dir=Path(stage.root_dir),
            train_data_path=Path(stage.train_data_path),
            test_data_path=Path(stage.test_data_path),
            model_path=Path(stage.model_path),
            target_column=stage.target_column,
        )

    def get_model_evaluation_config(self) -> ModelEvaluationConfig:
        """Build the config entity for the ``model_evaluation`` section."""
        stage = self._stage_section('model_evaluation')
        return ModelEvaluationConfig(
            root_dir=Path(stage.root_dir),
            model_path=Path(stage.model_path),
            test_data_path=Path(stage.test_data_path),
            metrics_file=Path(stage.metrics_file),
            target_column=stage.target_column,
        )

    def get_model_pusher_config(self) -> ModelPusherConfig:
        """Build the config entity for the ``model_pusher`` section."""
        stage = self._stage_section('model_pusher')
        return ModelPusherConfig(
            root_dir=Path(stage.root_dir),
            model_path=Path(stage.model_path),
            model_registry_path=Path(stage.model_registry_path),
        )
src/mlpipeline/entity/__init__.py CHANGED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from mlpipeline.entity.config_entity import (
2
+ DataIngestionConfig,
3
+ DataValidationConfig,
4
+ DataTransformationConfig,
5
+ FeatureEngineeringConfig,
6
+ ModelTrainerConfig,
7
+ ModelEvaluationConfig,
8
+ ModelPusherConfig,
9
+ )
10
+
11
+ from mlpipeline.entity.artifact_entity import (
12
+ DataIngestionArtifact,
13
+ DataValidationArtifact,
14
+ DataTransformationArtifact,
15
+ FeatureEngineeringArtifact,
16
+ ModelTrainerArtifact,
17
+ ModelEvaluationArtifact,
18
+ )
src/mlpipeline/entity/artifact_entity.py CHANGED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+ from typing import Dict, Any
4
+
5
+
6
@dataclass
class DataIngestionArtifact:
    """Mutable record with a data file path, an ingested flag and a message."""

    data_file_path: Path
    is_ingested: bool
    message: str
11
+
12
+
13
@dataclass
class DataValidationArtifact:
    """Mutable record with a validation status, a message and a schema file path."""

    validation_status: bool
    message: str
    schema_file_path: Path
18
+
19
+
20
@dataclass
class DataTransformationArtifact:
    """Mutable record with train/test file paths, a transformed flag and a message."""

    train_file_path: Path
    test_file_path: Path
    is_transformed: bool
    message: str
26
+
27
+
28
@dataclass
class FeatureEngineeringArtifact:
    """Mutable record with train/test feature paths, an engineered flag and a message."""

    train_features_path: Path
    test_features_path: Path
    is_engineered: bool
    message: str
34
+
35
+
36
@dataclass
class ModelTrainerArtifact:
    """Mutable record with a model path, a trained flag, a message and training metrics."""

    model_path: Path
    is_trained: bool
    message: str
    train_metrics: Dict[str, float]
42
+
43
+
44
@dataclass
class ModelEvaluationArtifact:
    """Mutable record with an accepted flag, evaluation metrics and a message."""

    is_model_accepted: bool
    evaluation_metrics: Dict[str, float]
    message: str
src/mlpipeline/entity/config_entity.py CHANGED
@@ -0,0 +1,62 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from pathlib import Path
3
+
4
+
5
@dataclass(frozen=True)
class DataIngestionConfig:
    """Immutable settings record: root directory, source URL, local data file and unzip directory."""

    root_dir: Path
    source_url: str
    local_data_file: Path
    unzip_dir: Path
11
+
12
+
13
@dataclass(frozen=True)
class DataValidationConfig:
    """Immutable settings record: root directory, data directory, status file and schema file."""

    root_dir: Path
    data_dir: Path
    status_file: Path
    schema_file: Path
19
+
20
+
21
@dataclass(frozen=True)
class DataTransformationConfig:
    """Immutable settings record: input/train/test paths plus test size and random state."""

    root_dir: Path
    data_path: Path
    train_path: Path
    test_path: Path
    test_size: float
    random_state: int
29
+
30
+
31
@dataclass(frozen=True)
class FeatureEngineeringConfig:
    """Immutable settings record: root directory, train/test input paths and train/test output paths."""

    root_dir: Path
    train_path: Path
    test_path: Path
    output_train_path: Path
    output_test_path: Path
38
+
39
+
40
@dataclass(frozen=True)
class ModelTrainerConfig:
    """Immutable settings record: root directory, train/test data paths, model path and target column."""

    root_dir: Path
    train_data_path: Path
    test_data_path: Path
    model_path: Path
    target_column: str
47
+
48
+
49
@dataclass(frozen=True)
class ModelEvaluationConfig:
    """Immutable settings record: root directory, model path, test data path, metrics file and target column."""

    root_dir: Path
    model_path: Path
    test_data_path: Path
    metrics_file: Path
    target_column: str
56
+
57
+
58
@dataclass(frozen=True)
class ModelPusherConfig:
    """Immutable settings record: root directory, model path and model registry path."""

    root_dir: Path
    model_path: Path
    model_registry_path: Path