SURESHBEEKHANI committed · verified
Commit 72475ed · 1 Parent(s): 666348d

Upload 15 files

src/__pycache__/exception.cpython-312.pyc ADDED
Binary file (1.6 kB).
 
src/__pycache__/utils.cpython-312.pyc ADDED
Binary file (2.82 kB).
 
src/__init__.py ADDED
File without changes
src/components/__init__.py ADDED
File without changes
src/components/data_ingestion.py ADDED
@@ -0,0 +1,80 @@
# Import necessary libraries and modules
import os   # File and directory operations
import sys  # System-specific parameters and functions

import pandas as pd  # Data manipulation and analysis
from sklearn.model_selection import train_test_split  # Splitting data into training and testing sets
from dataclasses import dataclass  # Concise configuration classes

from src.exception import CustomException
from src.logger import logging

# Import components for data transformation and model training
from src.components.data_transformation import DataTransformation        # Data transformation class
from src.components.data_transformation import DataTransformationConfig  # Configuration for data transformation
from src.components.model_trainer import ModelTrainerConfig              # Configuration for model training
from src.components.model_trainer import ModelTrainer                    # Model training class

# Configuration for data ingestion: file paths for training, testing, and raw data
@dataclass
class DataIngestionConfig:
    train_data_path: str = os.path.join('artifacts', "train.csv")
    test_data_path: str = os.path.join('artifacts', "test.csv")
    raw_data_path: str = os.path.join('artifacts', "data.csv")

# Data Ingestion Class Definition
class DataIngestion:
    def __init__(self):
        # Initialize ingestion configuration
        self.ingestion_config = DataIngestionConfig()

    # Method to initiate data ingestion
    def initiate_data_ingestion(self):
        logging.info("Entered the data ingestion method or component")
        try:
            # Read the dataset from the specified path
            df = pd.read_csv('notebook/data/stud.csv')  # Update path to match your directory structure
            logging.info('Read the dataset as dataframe')

            # Create the artifacts directory if it does not exist
            os.makedirs(os.path.dirname(self.ingestion_config.train_data_path), exist_ok=True)

            # Save the raw data to CSV
            df.to_csv(self.ingestion_config.raw_data_path, index=False, header=True)

            logging.info("Train test split initiated")
            # Split the data into training and testing sets (80/20)
            train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

            # Save the training and testing sets to CSV
            train_set.to_csv(self.ingestion_config.train_data_path, index=False, header=True)
            test_set.to_csv(self.ingestion_config.test_data_path, index=False, header=True)

            logging.info("Ingestion of the data is completed")

            # Return the paths of the train and test data
            return (
                self.ingestion_config.train_data_path,
                self.ingestion_config.test_data_path
            )
        except Exception as e:
            raise CustomException(e, sys)  # Wrap any error in a custom exception

# Main block to execute the full pipeline: ingestion -> transformation -> training
if __name__ == "__main__":
    # Create an instance of DataIngestion and get train/test data paths
    obj = DataIngestion()
    train_data, test_data = obj.initiate_data_ingestion()

    # Perform data transformation and obtain training and testing arrays
    data_transformation = DataTransformation()
    train_arr, test_arr, _ = data_transformation.initiate_data_transformation(train_data, test_data)

    # Train the model and print the resulting score
    modeltrainer = ModelTrainer()
    print(modeltrainer.initiate_model_trainer(train_arr, test_arr))
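
A minimal usage sketch: because the module uses absolute src.* imports, it is meant to be run from the project root, and it assumes notebook/data/stud.csv exists. The printed paths follow from DataIngestionConfig.

# Sketch: run the ingestion step on its own from the project root.
from src.components.data_ingestion import DataIngestion

ingestion = DataIngestion()
train_path, test_path = ingestion.initiate_data_ingestion()
print(train_path)  # artifacts/train.csv
print(test_path)   # artifacts/test.csv
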
src/components/data_transformation.py ADDED
@@ -0,0 +1,138 @@
import os
import sys
from dataclasses import dataclass

# Libraries for data manipulation and preprocessing
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Custom modules for exception handling and logging
from src.exception import CustomException
from src.logger import logging

# Utility function to persist Python objects to disk
from src.utils import save_object

# Configuration details, such as where to save the fitted preprocessor
@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path: str = os.path.join('artifacts', "preprocessor.pkl")

# Main class responsible for data transformation
class DataTransformation:
    def __init__(self):
        # Set up the configuration, e.g. where to save the preprocessor file
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformation_object(self):
        '''
        Create a preprocessing object that transforms both numerical and categorical data.
        '''
        try:
            # Columns containing numbers vs. categories (text)
            numerical_columns = ["writing_score", "reading_score"]
            categorical_columns = [
                "gender",
                "race_ethnicity",
                "parental_level_of_education",
                "lunch",
                "test_preparation_course",
            ]

            # Pipeline for numerical columns:
            # 1. Fill missing values with the median.
            # 2. Standardize the data so features share a similar scale.
            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler())
                ]
            )

            # Pipeline for categorical columns:
            # 1. Fill missing values with the most frequent category.
            # 2. Convert text categories into numbers (OneHotEncoder).
            # 3. Scale without centering (with_mean=False keeps sparse output valid).
            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("one_hot_encoder", OneHotEncoder()),
                    ("scaler", StandardScaler(with_mean=False))
                ]
            )

            # Log information about the columns
            logging.info(f"Categorical columns: {categorical_columns}")
            logging.info(f"Numerical columns: {numerical_columns}")

            # Combine the two pipelines into a single column-wise processor
            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns)
                ]
            )

            return preprocessor

        except Exception as e:
            raise CustomException(e, sys)

    # Run the transformation process using the preprocessor built above
    def initiate_data_transformation(self, train_path, test_path):
        try:
            # Load training and testing data from CSV files
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logging.info("Reading train and test data completed.")

            logging.info("Obtaining preprocessing object.")
            preprocessing_obj = self.get_data_transformation_object()

            # The target column (what we want to predict)
            target_column_name = "math_score"
            numerical_columns = ["writing_score", "reading_score"]

            # Separate input features (all columns except the target) from the target
            input_feature_train_df = train_df.drop(columns=[target_column_name], axis=1)
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name], axis=1)
            target_feature_test_df = test_df[target_column_name]

            logging.info("Applying preprocessing object on training and testing data.")

            # Fit the preprocessor on training data, then apply it to both sets
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Append the target column to the transformed features
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            # Persist the fitted preprocessor for reuse at prediction time
            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )
            logging.info("Saved preprocessing object.")

            # Return the transformed arrays and the preprocessor's save path
            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )
        except Exception as e:
            raise CustomException(e, sys)
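
Once initiate_data_transformation has run, the fitted preprocessor can be reloaded and applied to new rows. A minimal sketch, assuming artifacts/preprocessor.pkl exists and using illustrative category values from the students-performance dataset. Note that OneHotEncoder() is fit without handle_unknown='ignore', so a category unseen during training would raise an error here.

# Sketch: reuse the saved preprocessor on one row of raw input.
import pandas as pd
from src.utils import load_object

preprocessor = load_object(file_path="artifacts/preprocessor.pkl")
sample = pd.DataFrame({
    "gender": ["female"],                                  # illustrative values
    "race_ethnicity": ["group B"],
    "parental_level_of_education": ["bachelor's degree"],
    "lunch": ["standard"],
    "test_preparation_course": ["none"],
    "reading_score": [72],
    "writing_score": [74],
})
features = preprocessor.transform(sample)  # same imputation/encoding/scaling as training
print(features.shape)
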
src/components/model_trainer.py ADDED
@@ -0,0 +1,119 @@
import os
import sys
from dataclasses import dataclass

from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object, evaluate_models

# Configuration for model training: where to save the best model
@dataclass
class ModelTrainerConfig:
    trained_model_file_path: str = os.path.join("artifacts", "model.pkl")

class ModelTrainer:
    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_trainer(self, train_array, test_array):
        try:
            logging.info("Split training and test input data")
            # The last column of each array is the target; the rest are features
            X_train, y_train, X_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1],
            )
            # Candidate regressors to compare
            models = {
                "Random Forest": RandomForestRegressor(),
                "Decision Tree": DecisionTreeRegressor(),
                "Gradient Boosting": GradientBoostingRegressor(),
                "Linear Regression": LinearRegression(),
                "XGBRegressor": XGBRegressor(),
                "CatBoosting Regressor": CatBoostRegressor(verbose=False),
                "AdaBoost Regressor": AdaBoostRegressor(),
            }
            # Hyperparameter grids per model (commented entries are optional extensions)
            params = {
                "Decision Tree": {
                    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    # 'splitter': ['best', 'random'],
                    # 'max_features': ['sqrt', 'log2'],
                },
                "Random Forest": {
                    # 'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    # 'max_features': ['sqrt', 'log2', None],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "Gradient Boosting": {
                    # 'loss': ['squared_error', 'huber', 'absolute_error', 'quantile'],
                    'learning_rate': [0.1, 0.01, 0.05, 0.001],
                    'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
                    # 'criterion': ['squared_error', 'friedman_mse'],
                    # 'max_features': ['auto', 'sqrt', 'log2'],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "Linear Regression": {},
                "XGBRegressor": {
                    'learning_rate': [0.1, 0.01, 0.05, 0.001],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "CatBoosting Regressor": {
                    'depth': [6, 8, 10],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'iterations': [30, 50, 100]
                },
                "AdaBoost Regressor": {
                    'learning_rate': [0.1, 0.01, 0.5, 0.001],
                    # 'loss': ['linear', 'square', 'exponential'],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                }
            }

            # Grid-search each model and collect its test R^2 score
            model_report: dict = evaluate_models(
                X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
                models=models, param=params
            )

            ## Get the best model score from the report
            best_model_score = max(sorted(model_report.values()))

            ## Get the best model name from the report
            best_model_name = list(model_report.keys())[
                list(model_report.values()).index(best_model_score)
            ]
            best_model = models[best_model_name]

            # Require a minimum R^2 of 0.6 before accepting any model
            if best_model_score < 0.6:
                raise CustomException("No best model found", sys)
            logging.info(f"Best model found on both training and testing dataset: {best_model_name}")

            save_object(
                file_path=self.model_trainer_config.trained_model_file_path,
                obj=best_model
            )

            predicted = best_model.predict(X_test)

            r2_square = r2_score(y_test, predicted)
            return r2_square

        except Exception as e:
            raise CustomException(e, sys)
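
The trainer assumes the target sits in the last column of each array, exactly as np.c_ arranged it in the transformation step. A minimal sketch of that slicing convention on hypothetical toy data:

# Sketch: the array layout initiate_model_trainer expects.
import numpy as np

toy = np.array([
    [0.1, 0.2, 55.0],   # two feature columns, target in the last column
    [0.3, 0.4, 70.0],
])
X, y = toy[:, :-1], toy[:, -1]
print(X.shape, y.shape)  # (2, 2) (2,)
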
src/exception.py ADDED
@@ -0,0 +1,19 @@
# src/exception.py
import sys

class CustomException(Exception):
    def __init__(self, error_message: Exception, error_detail=sys):
        # error_detail is expected to be the sys module, used to pull the active traceback
        super().__init__(error_message)
        self.error_message = CustomException.get_detailed_error_message(error_message, error_detail)

    @staticmethod
    def get_detailed_error_message(error: Exception, error_detail):
        # Extract the traceback of the exception currently being handled
        _, _, exc_tb = error_detail.exc_info()
        file_name = exc_tb.tb_frame.f_code.co_filename
        line_number = exc_tb.tb_lineno
        error_message = f"Error occurred in script: {file_name} at line {line_number}: {str(error)}"
        return error_message

    def __str__(self):
        return self.error_message
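
A minimal sketch of the exception in use: it should be constructed inside an except block, so that sys.exc_info() holds the traceback of the error being handled.

# Sketch: wrapping a caught error to get a file/line-annotated message.
import sys
from src.exception import CustomException

try:
    1 / 0
except Exception as e:
    wrapped = CustomException(e, sys)
    print(wrapped)  # Error occurred in script: <file> at line <n>: division by zero
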
src/logger.py ADDED
@@ -0,0 +1,25 @@
import logging                 # Enable logging messages
import os                      # Interact with the operating system
from datetime import datetime  # Work with date and time

# Generate a log file name with the current timestamp (format: MM_DD_YYYY_HH_MM_SS.log)
LOG_FILE = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"

# Path for the "logs" folder in the current working directory
logs_dir = os.path.join(os.getcwd(), "logs")

# Create the "logs" directory if it doesn't already exist (exist_ok=True avoids errors if it does)
os.makedirs(logs_dir, exist_ok=True)

# Full path where the log file will be saved
LOG_FILE_PATH = os.path.join(logs_dir, LOG_FILE)

# Configure the logging settings
logging.basicConfig(
    filename=LOG_FILE_PATH,  # Log file path
    format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",  # Log message format
    level=logging.INFO,      # Log INFO and above
)

# Log a message to verify that logging is working
logging.info("Logging has been configured successfully.")
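
Importing this module anywhere in the package triggers the configuration above, so other components only need a plain logging call afterwards. A minimal sketch:

# Sketch: any module that imports src.logger writes to logs/<timestamp>.log.
from src.logger import logging

logging.info("Data ingestion started")  # appended to the timestamped log file
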
src/pipeline/__init__.py ADDED
File without changes
src/pipeline/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (190 Bytes).
 
src/pipeline/__pycache__/predict_pipeline.cpython-312.pyc ADDED
Binary file (2.94 kB).
 
src/pipeline/predict_pipeline.py ADDED
@@ -0,0 +1,68 @@
import os
import sys

import pandas as pd

from src.exception import CustomException
from src.utils import load_object

class PredictPipeline:
    def __init__(self):
        pass

    def predict(self, features):
        try:
            # Paths to the trained model and fitted preprocessor saved during training
            model_path = os.path.join("artifacts", "model.pkl")
            preprocessor_path = os.path.join('artifacts', 'preprocessor.pkl')
            print("Before Loading")
            model = load_object(file_path=model_path)
            preprocessor = load_object(file_path=preprocessor_path)
            print("After Loading")
            # Apply the same scaling/encoding used at training time, then predict
            data_scaled = preprocessor.transform(features)
            preds = model.predict(data_scaled)
            return preds

        except Exception as e:
            raise CustomException(e, sys)

# Maps raw user input to a DataFrame with the columns the preprocessor expects
class CustomData:
    def __init__(self,
                 gender: str,
                 race_ethnicity: str,
                 parental_level_of_education: str,
                 lunch: str,
                 test_preparation_course: str,
                 reading_score: int,
                 writing_score: int):
        self.gender = gender
        self.race_ethnicity = race_ethnicity
        self.parental_level_of_education = parental_level_of_education
        self.lunch = lunch
        self.test_preparation_course = test_preparation_course
        self.reading_score = reading_score
        self.writing_score = writing_score

    def get_data_as_data_frame(self):
        try:
            # One-row DataFrame whose column names match the training data
            custom_data_input_dict = {
                "gender": [self.gender],
                "race_ethnicity": [self.race_ethnicity],
                "parental_level_of_education": [self.parental_level_of_education],
                "lunch": [self.lunch],
                "test_preparation_course": [self.test_preparation_course],
                "reading_score": [self.reading_score],
                "writing_score": [self.writing_score],
            }

            return pd.DataFrame(custom_data_input_dict)

        except Exception as e:
            raise CustomException(e, sys)
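
A minimal sketch of the two classes working together, assuming the training artifacts (artifacts/model.pkl and artifacts/preprocessor.pkl) are already on disk; the field values are illustrative.

# Sketch: build a one-row DataFrame from raw inputs, then predict.
from src.pipeline.predict_pipeline import CustomData, PredictPipeline

data = CustomData(
    gender="female",
    race_ethnicity="group B",
    parental_level_of_education="bachelor's degree",
    lunch="standard",
    test_preparation_course="none",
    reading_score=72,
    writing_score=74,
)
features_df = data.get_data_as_data_frame()
preds = PredictPipeline().predict(features_df)
print(preds[0])  # predicted math_score
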
src/pipeline/train_pipeline.py ADDED
File without changes
src/utils.py ADDED
@@ -0,0 +1,62 @@
import os
import sys
import pickle

from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

from src.exception import CustomException

def save_object(file_path, obj):
    """Pickle an object to the given path, creating parent directories as needed."""
    try:
        dir_path = os.path.dirname(file_path)
        os.makedirs(dir_path, exist_ok=True)

        with open(file_path, "wb") as file_obj:
            pickle.dump(obj, file_obj)

    except Exception as e:
        raise CustomException(e, sys)

def evaluate_models(X_train, y_train, X_test, y_test, models, param):
    """Grid-search each model, refit with the best parameters, and report test R^2 scores."""
    try:
        report = {}

        for model_name, model in models.items():
            para = param[model_name]

            # Search the hyperparameter grid with 3-fold cross-validation
            gs = GridSearchCV(model, para, cv=3)
            gs.fit(X_train, y_train)

            # Refit the model on the full training set with the best parameters
            model.set_params(**gs.best_params_)
            model.fit(X_train, y_train)

            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            # Train score is computed for reference; only the test score is reported
            train_model_score = r2_score(y_train, y_train_pred)
            test_model_score = r2_score(y_test, y_test_pred)

            report[model_name] = test_model_score

        return report

    except Exception as e:
        raise CustomException(e, sys)

def load_object(file_path):
    """Unpickle and return an object from the given path."""
    try:
        with open(file_path, "rb") as file_obj:
            return pickle.load(file_obj)

    except Exception as e:
        raise CustomException(e, sys)
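
A minimal round-trip sketch of the two persistence helpers, using a throwaway path:

# Sketch: save_object / load_object round-trip.
from src.utils import save_object, load_object

save_object(file_path="artifacts/demo.pkl", obj={"alpha": 1, "beta": 2})  # demo.pkl is a throwaway file
restored = load_object(file_path="artifacts/demo.pkl")
print(restored)  # {'alpha': 1, 'beta': 2}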