SURESHBEEKHANI committed · verified
Commit 72475ed · 1 Parent(s): 666348d

Upload 15 files

src/__pycache__/exception.cpython-312.pyc ADDED
Binary file (1.6 kB).
 
src/__pycache__/utils.cpython-312.pyc ADDED
Binary file (2.82 kB).
 
src/__init__.py ADDED
File without changes
src/components/__init__.py ADDED
File without changes
src/components/data_ingestion.py ADDED
@@ -0,0 +1,80 @@
# Import necessary libraries and modules
import os   # File and directory operations
import sys  # System-specific parameters and functions

import pandas as pd  # Data manipulation and analysis
from sklearn.model_selection import train_test_split  # Splitting data into training and testing sets
from dataclasses import dataclass  # Concise configuration classes

from src.exception import CustomException
from src.logger import logging

# Import components for data transformation and model training
from src.components.data_transformation import DataTransformation        # Data transformation class
from src.components.data_transformation import DataTransformationConfig  # Configuration for data transformation
from src.components.model_trainer import ModelTrainerConfig              # Configuration for model training
from src.components.model_trainer import ModelTrainer                    # Model training class

# Configuration for data ingestion: file paths for training, testing, and raw data
@dataclass
class DataIngestionConfig:
    train_data_path: str = os.path.join('artifacts', "train.csv")
    test_data_path: str = os.path.join('artifacts', "test.csv")
    raw_data_path: str = os.path.join('artifacts', "data.csv")

# Data Ingestion Class Definition
class DataIngestion:
    def __init__(self):
        # Initialize ingestion configuration
        self.ingestion_config = DataIngestionConfig()

    # Method to initiate data ingestion
    def initiate_data_ingestion(self):
        logging.info("Entered the data ingestion method or component")
        try:
            # Read the dataset from the specified path
            df = pd.read_csv('notebook/data/stud.csv')  # Update path to match your directory structure
            logging.info('Read the dataset as dataframe')

            # Create the artifacts directory if it does not exist
            os.makedirs(os.path.dirname(self.ingestion_config.train_data_path), exist_ok=True)

            # Save the raw data to CSV
            df.to_csv(self.ingestion_config.raw_data_path, index=False, header=True)

            logging.info("Train test split initiated")
            # Split the data into training and testing sets (80/20)
            train_set, test_set = train_test_split(df, test_size=0.2, random_state=42)

            # Save the training and testing sets to CSV
            train_set.to_csv(self.ingestion_config.train_data_path, index=False, header=True)
            test_set.to_csv(self.ingestion_config.test_data_path, index=False, header=True)

            logging.info("Ingestion of the data is completed")

            # Return the paths of the train and test data
            return (
                self.ingestion_config.train_data_path,
                self.ingestion_config.test_data_path
            )
        except Exception as e:
            raise CustomException(e, sys)  # Wrap any error in a custom exception

# Main block to execute the full pipeline: ingestion -> transformation -> training
if __name__ == "__main__":
    # Create an instance of DataIngestion and get train/test data paths
    obj = DataIngestion()
    train_data, test_data = obj.initiate_data_ingestion()

    # Perform data transformation and obtain training and testing arrays
    data_transformation = DataTransformation()
    train_arr, test_arr, _ = data_transformation.initiate_data_transformation(train_data, test_data)

    # Train the model and print the resulting score
    modeltrainer = ModelTrainer()
    print(modeltrainer.initiate_model_trainer(train_arr, test_arr))
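
A minimal usage sketch: because the module uses absolute src.* imports, it is meant to be run from the project root, and it assumes notebook/data/stud.csv exists. The printed paths follow from DataIngestionConfig.

# Sketch: run the ingestion step on its own from the project root.
from src.components.data_ingestion import DataIngestion

ingestion = DataIngestion()
train_path, test_path = ingestion.initiate_data_ingestion()
print(train_path)  # artifacts/train.csv
print(test_path)   # artifacts/test.csv
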
src/components/data_transformation.py ADDED
@@ -0,0 +1,138 @@
import os
import sys
from dataclasses import dataclass

# Libraries for data manipulation and preprocessing
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Custom modules for exception handling and logging
from src.exception import CustomException
from src.logger import logging

# Utility function to persist Python objects to disk
from src.utils import save_object

# Configuration details, such as where to save the fitted preprocessor
@dataclass
class DataTransformationConfig:
    preprocessor_obj_file_path: str = os.path.join('artifacts', "preprocessor.pkl")

# Main class responsible for data transformation
class DataTransformation:
    def __init__(self):
        # Set up the configuration, e.g. where to save the preprocessor file
        self.data_transformation_config = DataTransformationConfig()

    def get_data_transformation_object(self):
        '''
        Create a preprocessing object that transforms both numerical and categorical data.
        '''
        try:
            # Columns containing numbers vs. categories (text)
            numerical_columns = ["writing_score", "reading_score"]
            categorical_columns = [
                "gender",
                "race_ethnicity",
                "parental_level_of_education",
                "lunch",
                "test_preparation_course",
            ]

            # Pipeline for numerical columns:
            # 1. Fill missing values with the median.
            # 2. Standardize the data so features share a similar scale.
            num_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="median")),
                    ("scaler", StandardScaler())
                ]
            )

            # Pipeline for categorical columns:
            # 1. Fill missing values with the most frequent category.
            # 2. Convert text categories into numbers (OneHotEncoder).
            # 3. Scale without centering (with_mean=False keeps sparse output valid).
            cat_pipeline = Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    ("one_hot_encoder", OneHotEncoder()),
                    ("scaler", StandardScaler(with_mean=False))
                ]
            )

            # Log information about the columns
            logging.info(f"Categorical columns: {categorical_columns}")
            logging.info(f"Numerical columns: {numerical_columns}")

            # Combine the two pipelines into a single column-wise processor
            preprocessor = ColumnTransformer(
                [
                    ("num_pipeline", num_pipeline, numerical_columns),
                    ("cat_pipeline", cat_pipeline, categorical_columns)
                ]
            )

            return preprocessor

        except Exception as e:
            raise CustomException(e, sys)

    # Run the transformation process using the preprocessor built above
    def initiate_data_transformation(self, train_path, test_path):
        try:
            # Load training and testing data from CSV files
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)

            logging.info("Reading train and test data completed.")

            logging.info("Obtaining preprocessing object.")
            preprocessing_obj = self.get_data_transformation_object()

            # The target column (what we want to predict)
            target_column_name = "math_score"
            numerical_columns = ["writing_score", "reading_score"]

            # Separate input features (all columns except the target) from the target
            input_feature_train_df = train_df.drop(columns=[target_column_name], axis=1)
            target_feature_train_df = train_df[target_column_name]

            input_feature_test_df = test_df.drop(columns=[target_column_name], axis=1)
            target_feature_test_df = test_df[target_column_name]

            logging.info("Applying preprocessing object on training and testing data.")

            # Fit the preprocessor on training data, then apply it to both sets
            input_feature_train_arr = preprocessing_obj.fit_transform(input_feature_train_df)
            input_feature_test_arr = preprocessing_obj.transform(input_feature_test_df)

            # Append the target column to the transformed features
            train_arr = np.c_[input_feature_train_arr, np.array(target_feature_train_df)]
            test_arr = np.c_[input_feature_test_arr, np.array(target_feature_test_df)]

            # Persist the fitted preprocessor for reuse at prediction time
            save_object(
                file_path=self.data_transformation_config.preprocessor_obj_file_path,
                obj=preprocessing_obj
            )
            logging.info("Saved preprocessing object.")

            # Return the transformed arrays and the preprocessor's save path
            return (
                train_arr,
                test_arr,
                self.data_transformation_config.preprocessor_obj_file_path,
            )
        except Exception as e:
            raise CustomException(e, sys)
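
Once initiate_data_transformation has run, the fitted preprocessor can be reloaded and applied to new rows. A minimal sketch, assuming artifacts/preprocessor.pkl exists and using illustrative category values from the students-performance dataset. Note that OneHotEncoder() is fit without handle_unknown='ignore', so a category unseen during training would raise an error here.

# Sketch: reuse the saved preprocessor on one row of raw input.
import pandas as pd
from src.utils import load_object

preprocessor = load_object(file_path="artifacts/preprocessor.pkl")
sample = pd.DataFrame({
    "gender": ["female"],                                  # illustrative values
    "race_ethnicity": ["group B"],
    "parental_level_of_education": ["bachelor's degree"],
    "lunch": ["standard"],
    "test_preparation_course": ["none"],
    "reading_score": [72],
    "writing_score": [74],
})
features = preprocessor.transform(sample)  # same imputation/encoding/scaling as training
print(features.shape)
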
src/components/model_trainer.py ADDED
@@ -0,0 +1,119 @@
import os
import sys
from dataclasses import dataclass

from catboost import CatBoostRegressor
from sklearn.ensemble import (
    AdaBoostRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
)
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor

from src.exception import CustomException
from src.logger import logging
from src.utils import save_object, evaluate_models

# Configuration for model training: where to save the best model
@dataclass
class ModelTrainerConfig:
    trained_model_file_path: str = os.path.join("artifacts", "model.pkl")

class ModelTrainer:
    def __init__(self):
        self.model_trainer_config = ModelTrainerConfig()

    def initiate_model_trainer(self, train_array, test_array):
        try:
            logging.info("Split training and test input data")
            # The last column of each array is the target; the rest are features
            X_train, y_train, X_test, y_test = (
                train_array[:, :-1],
                train_array[:, -1],
                test_array[:, :-1],
                test_array[:, -1],
            )
            # Candidate regressors to compare
            models = {
                "Random Forest": RandomForestRegressor(),
                "Decision Tree": DecisionTreeRegressor(),
                "Gradient Boosting": GradientBoostingRegressor(),
                "Linear Regression": LinearRegression(),
                "XGBRegressor": XGBRegressor(),
                "CatBoosting Regressor": CatBoostRegressor(verbose=False),
                "AdaBoost Regressor": AdaBoostRegressor(),
            }
            # Hyperparameter grids per model (commented entries are optional extensions)
            params = {
                "Decision Tree": {
                    'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    # 'splitter': ['best', 'random'],
                    # 'max_features': ['sqrt', 'log2'],
                },
                "Random Forest": {
                    # 'criterion': ['squared_error', 'friedman_mse', 'absolute_error', 'poisson'],
                    # 'max_features': ['sqrt', 'log2', None],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "Gradient Boosting": {
                    # 'loss': ['squared_error', 'huber', 'absolute_error', 'quantile'],
                    'learning_rate': [0.1, 0.01, 0.05, 0.001],
                    'subsample': [0.6, 0.7, 0.75, 0.8, 0.85, 0.9],
                    # 'criterion': ['squared_error', 'friedman_mse'],
                    # 'max_features': ['auto', 'sqrt', 'log2'],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "Linear Regression": {},
                "XGBRegressor": {
                    'learning_rate': [0.1, 0.01, 0.05, 0.001],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                },
                "CatBoosting Regressor": {
                    'depth': [6, 8, 10],
                    'learning_rate': [0.01, 0.05, 0.1],
                    'iterations': [30, 50, 100]
                },
                "AdaBoost Regressor": {
                    'learning_rate': [0.1, 0.01, 0.5, 0.001],
                    # 'loss': ['linear', 'square', 'exponential'],
                    'n_estimators': [8, 16, 32, 64, 128, 256]
                }
            }

            # Grid-search each model and collect its test R^2 score
            model_report: dict = evaluate_models(
                X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
                models=models, param=params
            )

            ## Get the best model score from the report
            best_model_score = max(sorted(model_report.values()))

            ## Get the best model name from the report
            best_model_name = list(model_report.keys())[
                list(model_report.values()).index(best_model_score)
            ]
            best_model = models[best_model_name]

            # Require a minimum R^2 of 0.6 before accepting any model
            if best_model_score < 0.6:
                raise CustomException("No best model found", sys)
            logging.info(f"Best model found on both training and testing dataset: {best_model_name}")

            save_object(
                file_path=self.model_trainer_config.trained_model_file_path,
                obj=best_model
            )

            predicted = best_model.predict(X_test)

            r2_square = r2_score(y_test, predicted)
            return r2_square

        except Exception as e:
            raise CustomException(e, sys)
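
The trainer assumes the target sits in the last column of each array, exactly as np.c_ arranged it in the transformation step. A minimal sketch of that slicing convention on hypothetical toy data:

# Sketch: the array layout initiate_model_trainer expects.
import numpy as np

toy = np.array([
    [0.1, 0.2, 55.0],   # two feature columns, target in the last column
    [0.3, 0.4, 70.0],
])
X, y = toy[:, :-1], toy[:, -1]
print(X.shape, y.shape)  # (2, 2) (2,)
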
src/exception.py ADDED
@@ -0,0 +1,19 @@
# src/exception.py
import sys

class CustomException(Exception):
    def __init__(self, error_message: Exception, error_detail=sys):
        # error_detail is expected to be the sys module, used to pull the active traceback
        super().__init__(error_message)
        self.error_message = CustomException.get_detailed_error_message(error_message, error_detail)

    @staticmethod
    def get_detailed_error_message(error: Exception, error_detail):
        # Extract the traceback of the exception currently being handled
        _, _, exc_tb = error_detail.exc_info()
        file_name = exc_tb.tb_frame.f_code.co_filename
        line_number = exc_tb.tb_lineno
        error_message = f"Error occurred in script: {file_name} at line {line_number}: {str(error)}"
        return error_message

    def __str__(self):
        return self.error_message
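
A minimal sketch of the exception in use: it should be constructed inside an except block, so that sys.exc_info() holds the traceback of the error being handled.

# Sketch: wrapping a caught error to get a file/line-annotated message.
import sys
from src.exception import CustomException

try:
    1 / 0
except Exception as e:
    wrapped = CustomException(e, sys)
    print(wrapped)  # Error occurred in script: <file> at line <n>: division by zero
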
src/logger.py ADDED
@@ -0,0 +1,25 @@
import logging                 # Enable logging messages
import os                      # Interact with the operating system
from datetime import datetime  # Work with date and time

# Generate a log file name with the current timestamp (format: MM_DD_YYYY_HH_MM_SS.log)
LOG_FILE = f"{datetime.now().strftime('%m_%d_%Y_%H_%M_%S')}.log"

# Path for the "logs" folder in the current working directory
logs_dir = os.path.join(os.getcwd(), "logs")

# Create the "logs" directory if it doesn't already exist (exist_ok=True avoids errors if it does)
os.makedirs(logs_dir, exist_ok=True)

# Full path where the log file will be saved
LOG_FILE_PATH = os.path.join(logs_dir, LOG_FILE)

# Configure the logging settings
logging.basicConfig(
    filename=LOG_FILE_PATH,  # Log file path
    format="[ %(asctime)s ] %(lineno)d %(name)s - %(levelname)s - %(message)s",  # Log message format
    level=logging.INFO,      # Log INFO and above
)

# Log a message to verify that logging is working
logging.info("Logging has been configured successfully.")
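
Importing this module anywhere in the package triggers the configuration above, so other components only need a plain logging call afterwards. A minimal sketch:

# Sketch: any module that imports src.logger writes to logs/<timestamp>.log.
from src.logger import logging

logging.info("Data ingestion started")  # appended to the timestamped log file
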
src/pipeline/__init__.py ADDED
File without changes
src/pipeline/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (190 Bytes).
 
src/pipeline/__pycache__/predict_pipeline.cpython-312.pyc ADDED
Binary file (2.94 kB).
 
src/pipeline/predict_pipeline.py ADDED
@@ -0,0 +1,68 @@
import os
import sys

import pandas as pd

from src.exception import CustomException
from src.utils import load_object

class PredictPipeline:
    def __init__(self):
        pass

    def predict(self, features):
        try:
            # Paths to the trained model and fitted preprocessor saved during training
            model_path = os.path.join("artifacts", "model.pkl")
            preprocessor_path = os.path.join('artifacts', 'preprocessor.pkl')
            print("Before Loading")
            model = load_object(file_path=model_path)
            preprocessor = load_object(file_path=preprocessor_path)
            print("After Loading")
            # Apply the same scaling/encoding used at training time, then predict
            data_scaled = preprocessor.transform(features)
            preds = model.predict(data_scaled)
            return preds

        except Exception as e:
            raise CustomException(e, sys)

# Maps raw user input to a DataFrame with the columns the preprocessor expects
class CustomData:
    def __init__(self,
                 gender: str,
                 race_ethnicity: str,
                 parental_level_of_education: str,
                 lunch: str,
                 test_preparation_course: str,
                 reading_score: int,
                 writing_score: int):
        self.gender = gender
        self.race_ethnicity = race_ethnicity
        self.parental_level_of_education = parental_level_of_education
        self.lunch = lunch
        self.test_preparation_course = test_preparation_course
        self.reading_score = reading_score
        self.writing_score = writing_score

    def get_data_as_data_frame(self):
        try:
            # One-row DataFrame whose column names match the training data
            custom_data_input_dict = {
                "gender": [self.gender],
                "race_ethnicity": [self.race_ethnicity],
                "parental_level_of_education": [self.parental_level_of_education],
                "lunch": [self.lunch],
                "test_preparation_course": [self.test_preparation_course],
                "reading_score": [self.reading_score],
                "writing_score": [self.writing_score],
            }

            return pd.DataFrame(custom_data_input_dict)

        except Exception as e:
            raise CustomException(e, sys)
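
A minimal sketch of the two classes working together, assuming the training artifacts (artifacts/model.pkl and artifacts/preprocessor.pkl) are already on disk; the field values are illustrative.

# Sketch: build a one-row DataFrame from raw inputs, then predict.
from src.pipeline.predict_pipeline import CustomData, PredictPipeline

data = CustomData(
    gender="female",
    race_ethnicity="group B",
    parental_level_of_education="bachelor's degree",
    lunch="standard",
    test_preparation_course="none",
    reading_score=72,
    writing_score=74,
)
features_df = data.get_data_as_data_frame()
preds = PredictPipeline().predict(features_df)
print(preds[0])  # predicted math_score
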
src/pipeline/train_pipeline.py ADDED
File without changes
src/utils.py ADDED
@@ -0,0 +1,62 @@
import os
import sys
import pickle

from sklearn.metrics import r2_score
from sklearn.model_selection import GridSearchCV

from src.exception import CustomException

def save_object(file_path, obj):
    """Pickle an object to the given path, creating parent directories as needed."""
    try:
        dir_path = os.path.dirname(file_path)
        os.makedirs(dir_path, exist_ok=True)

        with open(file_path, "wb") as file_obj:
            pickle.dump(obj, file_obj)

    except Exception as e:
        raise CustomException(e, sys)

def evaluate_models(X_train, y_train, X_test, y_test, models, param):
    """Grid-search each model, refit with the best parameters, and report test R^2 scores."""
    try:
        report = {}

        for model_name, model in models.items():
            para = param[model_name]

            # Search the hyperparameter grid with 3-fold cross-validation
            gs = GridSearchCV(model, para, cv=3)
            gs.fit(X_train, y_train)

            # Refit the model on the full training set with the best parameters
            model.set_params(**gs.best_params_)
            model.fit(X_train, y_train)

            y_train_pred = model.predict(X_train)
            y_test_pred = model.predict(X_test)

            # Train score is computed for reference; only the test score is reported
            train_model_score = r2_score(y_train, y_train_pred)
            test_model_score = r2_score(y_test, y_test_pred)

            report[model_name] = test_model_score

        return report

    except Exception as e:
        raise CustomException(e, sys)

def load_object(file_path):
    """Unpickle and return an object from the given path."""
    try:
        with open(file_path, "rb") as file_obj:
            return pickle.load(file_obj)

    except Exception as e:
        raise CustomException(e, sys)
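
A minimal round-trip sketch of the two persistence helpers, using a throwaway path:

# Sketch: save_object / load_object round-trip.
from src.utils import save_object, load_object

save_object(file_path="artifacts/demo.pkl", obj={"alpha": 1, "beta": 2})  # demo.pkl is a throwaway file
restored = load_object(file_path="artifacts/demo.pkl")
print(restored)  # {'alpha': 1, 'beta': 2}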