Spaces:

rahul2001
/

student_performance

Build error

App Files Files Community

rahul2001 committed on Sep 13, 2023

Commit

0aa9c34

1 Parent(s): e22877e

Data transformation

Browse files

Files changed (7) hide show

EDA.ipynb +0 -0
artifact/Preprocessor.pkl +0 -0
model_training.ipynb +542 -0
requirements.txt +1 -0
src/Components/Data_ingestation.py +4 -3
src/Components/data_transformation.py +79 -3
src/utils.py +23 -0

EDA.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

artifact/Preprocessor.pkl ADDED Viewed

Binary file (3.48 kB). View file

model_training.ipynb CHANGED Viewed

	@@ -0,0 +1,542 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "1.1 Import Data and Required Packages\n",
+    "Importing Pandas, Numpy, Matplotlib, Seaborn and the Warnings library.\n",
+    "# Basic Import"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Basic Import\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt \n",
+    "import seaborn as sns\n",
+    "# Modelling\n",
+    "from sklearn.metrics import mean_squared_error, r2_score\n",
+    "from sklearn.neighbors import KNeighborsRegressor\n",
+    "from sklearn.tree import DecisionTreeRegressor\n",
+    "from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor\n",
+    "from sklearn.svm import SVR\n",
+    "from sklearn.linear_model import LinearRegression, Ridge,Lasso\n",
+    "from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error\n",
+    "from sklearn.model_selection import RandomizedSearchCV\n",
+    "from catboost import CatBoostRegressor\n",
+    "from xgboost import XGBRegressor\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df =  pd.read_csv(\"artifact/raw.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "gender => ['female' 'male']\n",
+      "\n",
+      "race_ethnicity => ['group B' 'group C' 'group A' 'group D' 'group E']\n",
+      "\n",
+      "parental_level_of_education => [\"bachelor's degree\" 'some college' \"master's degree\" \"associate's degree\"\n",
+      " 'high school' 'some high school']\n",
+      "\n",
+      "lunch => ['standard' 'free/reduced']\n",
+      "\n",
+      "test_preparation_course => ['none' 'completed']\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "for i in df.columns:\n",
+    "    if df[i].dtype == \"object\":\n",
+    "        print(\"{} =>\".format(i),df[i].unique())\n",
+    "        print(\"\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = df.drop(columns=['math_score'],axis=1)\n",
+    "y  = df[\"math_score\"]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Create Column Transformer with 2 types of transformers (one-hot encoder and standard scaler)\n",
+    "num_features = X.select_dtypes(exclude=\"object\").columns\n",
+    "cat_features = X.select_dtypes(include=\"object\").columns\n",
+    "\n",
+    "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
+    "from sklearn.compose import ColumnTransformer\n",
+    "\n",
+    "numeric_transformer = StandardScaler()\n",
+    "oh_transformer = OneHotEncoder()\n",
+    "\n",
+    "preprocessor = ColumnTransformer(\n",
+    "    [\n",
+    "        (\"OneHotEncoder\", oh_transformer, cat_features),\n",
+    "         (\"StandardScaler\", numeric_transformer, num_features),        \n",
+    "    ]\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X =  preprocessor.fit_transform(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(1000, 19)"
+      ]
+     },
+     "execution_count": 12,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "X.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "((800, 19), (200, 19))"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "# separate dataset into train and test\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)\n",
+    "X_train.shape, X_test.shape"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Create an Evaluate Function to give all metrics after model Training***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def evaluate_model(true, predicted):\n",
+    "    mae = mean_absolute_error(true, predicted)\n",
+    "    mse = mean_squared_error(true, predicted)\n",
+    "    rmse = np.sqrt(mean_squared_error(true, predicted))\n",
+    "    r2_square = r2_score(true, predicted)\n",
+    "    return mae, rmse, r2_square"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Linear Regression\n",
+      "Model performance for Training set\n",
+      "- Root Mean Squared Error: 5.3243\n",
+      "- Mean Absolute Error: 4.2671\n",
+      "- R2 Score: 0.8743\n",
+      "----------------------------------\n",
+      "Model performance for Test set\n",
+      "- Root Mean Squared Error: 5.3960\n",
+      "- Mean Absolute Error: 4.2158\n",
+      "- R2 Score: 0.8803\n",
+      "===================================\n",
+      "\n",
+      "\n",
+      "Lasso\n",
+      "Model performance for Training set\n",
+      "- Root Mean Squared Error: 6.5938\n",
+      "- Mean Absolute Error: 5.2063\n",
+      "- R2 Score: 0.8071\n",
+      "----------------------------------\n",
+      "Model performance for Test set\n",
+      "- Root Mean Squared Error: 6.5197\n",
+      "- Mean Absolute Error: 5.1579\n",
+      "- R2 Score: 0.8253\n",
+      "===================================\n",
+      "\n",
+      "\n",
+      "Ridge\n",
+      "Model performance for Training set\n",
+      "- Root Mean Squared Error: 5.3233\n",
+      "- Mean Absolute Error: 4.2650\n",
+      "- R2 Score: 0.8743\n",
+      "----------------------------------\n",
+      "Model performance for Test set\n",
+      "- Root Mean Squared Error: 5.3904\n",
+      "- Mean Absolute Error: 4.2111\n",
+      "- R2 Score: 0.8806\n",
+      "===================================\n",
+      "\n",
+      "\n",
+      "K-Neighbors Regressor\n",
+      "Model performance for Training set\n",
+      "- Root Mean Squared Error: 5.7077\n",
+      "- Mean Absolute Error: 4.5167\n",
+      "- R2 Score: 0.8555\n",
+      "----------------------------------\n",
+      "Model performance for Test set\n",
+      "- Root Mean Squared Error: 7.2530\n",
+      "- Mean Absolute Error: 5.6210\n",
+      "- R2 Score: 0.7838\n",
+      "===================================\n",
+      "\n",
+      "\n",
+      "Decision Tree\n",
+      "Model performance for Training set\n",
+      "- Root Mean Squared Error: 0.2795\n",
+      "- Mean Absolute Error: 0.0187\n",
+      "- R2 Score: 0.9997\n",
+      "----------------------------------\n",
+      "Model performance for Test set\n",
+      "- Root Mean Squared Error: 7.7785\n",
+      "- Mean Absolute Error: 6.2350\n",
+      "- R2 Score: 0.7514\n",
+      "===================================\n",
+      "\n",
+      "\n",
+      "Random Forest Regressor\n",
+      "Model performance for Training set\n",
+      "- Root Mean Squared Error: 2.2860\n",
+      "- Mean Absolute Error: 1.8215\n",
+      "- R2 Score: 0.9768\n",
+      "----------------------------------\n",
+      "Model performance for Test set\n",
+      "- Root Mean Squared Error: 5.9993\n",
+      "- Mean Absolute Error: 4.6304\n",
+      "- R2 Score: 0.8521\n",
+      "===================================\n",
+      "\n",
+      "\n",
+      "XGBRegressor\n",
+      "Model performance for Training set\n",
+      "- Root Mean Squared Error: 1.0073\n",
+      "- Mean Absolute Error: 0.6875\n",
+      "- R2 Score: 0.9955\n",
+      "----------------------------------\n",
+      "Model performance for Test set\n",
+      "- Root Mean Squared Error: 6.4733\n",
+      "- Mean Absolute Error: 5.0577\n",
+      "- R2 Score: 0.8278\n",
+      "===================================\n",
+      "\n",
+      "\n",
+      "CatBoosting Regressor\n",
+      "Model performance for Training set\n",
+      "- Root Mean Squared Error: 3.0427\n",
+      "- Mean Absolute Error: 2.4054\n",
+      "- R2 Score: 0.9589\n",
+      "----------------------------------\n",
+      "Model performance for Test set\n",
+      "- Root Mean Squared Error: 6.0086\n",
+      "- Mean Absolute Error: 4.6125\n",
+      "- R2 Score: 0.8516\n",
+      "===================================\n",
+      "\n",
+      "\n",
+      "AdaBoost Regressor\n",
+      "Model performance for Training set\n",
+      "- Root Mean Squared Error: 5.7923\n",
+      "- Mean Absolute Error: 4.7185\n",
+      "- R2 Score: 0.8512\n",
+      "----------------------------------\n",
+      "Model performance for Test set\n",
+      "- Root Mean Squared Error: 5.9460\n",
+      "- Mean Absolute Error: 4.6538\n",
+      "- R2 Score: 0.8547\n",
+      "===================================\n",
+      "\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "models = {\n",
+    "    \"Linear Regression\": LinearRegression(),\n",
+    "    \"Lasso\": Lasso(),\n",
+    "    \"Ridge\": Ridge(),\n",
+    "    \"K-Neighbors Regressor\": KNeighborsRegressor(),\n",
+    "    \"Decision Tree\": DecisionTreeRegressor(),\n",
+    "    \"Random Forest Regressor\": RandomForestRegressor(),\n",
+    "    \"XGBRegressor\": XGBRegressor(), \n",
+    "    \"CatBoosting Regressor\": CatBoostRegressor(verbose=False),\n",
+    "    \"AdaBoost Regressor\": AdaBoostRegressor()\n",
+    "}\n",
+    "model_list = []\n",
+    "r2_list =[]\n",
+    "\n",
+    "for i in range(len(list(models))):\n",
+    "    model = list(models.values())[i]\n",
+    "    model.fit(X_train, y_train) # Train model\n",
+    "\n",
+    "    # Make predictions\n",
+    "    y_train_pred = model.predict(X_train)\n",
+    "    y_test_pred = model.predict(X_test)\n",
+    "    \n",
+    "    # Evaluate Train and Test dataset\n",
+    "    model_train_mae , model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)\n",
+    "\n",
+    "    model_test_mae , model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)\n",
+    "\n",
+    "    \n",
+    "    print(list(models.keys())[i])\n",
+    "    model_list.append(list(models.keys())[i])\n",
+    "    \n",
+    "    print('Model performance for Training set')\n",
+    "    print(\"- Root Mean Squared Error: {:.4f}\".format(model_train_rmse))\n",
+    "    print(\"- Mean Absolute Error: {:.4f}\".format(model_train_mae))\n",
+    "    print(\"- R2 Score: {:.4f}\".format(model_train_r2))\n",
+    "\n",
+    "    print('----------------------------------')\n",
+    "    \n",
+    "    print('Model performance for Test set')\n",
+    "    print(\"- Root Mean Squared Error: {:.4f}\".format(model_test_rmse))\n",
+    "    print(\"- Mean Absolute Error: {:.4f}\".format(model_test_mae))\n",
+    "    print(\"- R2 Score: {:.4f}\".format(model_test_r2))\n",
+    "    r2_list.append(model_test_r2)\n",
+    "    \n",
+    "    print('='*35)\n",
+    "    print('\\n')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "***Results***"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Model Name</th>\n",
+       "      <th>R2_Score</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>Ridge</td>\n",
+       "      <td>0.880593</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>Linear Regression</td>\n",
+       "      <td>0.880345</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>8</th>\n",
+       "      <td>AdaBoost Regressor</td>\n",
+       "      <td>0.854710</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5</th>\n",
+       "      <td>Random Forest Regressor</td>\n",
+       "      <td>0.852094</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>7</th>\n",
+       "      <td>CatBoosting Regressor</td>\n",
+       "      <td>0.851632</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>6</th>\n",
+       "      <td>XGBRegressor</td>\n",
+       "      <td>0.827797</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Lasso</td>\n",
+       "      <td>0.825320</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>K-Neighbors Regressor</td>\n",
+       "      <td>0.783813</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Decision Tree</td>\n",
+       "      <td>0.751354</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                Model Name  R2_Score\n",
+       "2                    Ridge  0.880593\n",
+       "0        Linear Regression  0.880345\n",
+       "8       AdaBoost Regressor  0.854710\n",
+       "5  Random Forest Regressor  0.852094\n",
+       "7    CatBoosting Regressor  0.851632\n",
+       "6             XGBRegressor  0.827797\n",
+       "1                    Lasso  0.825320\n",
+       "3    K-Neighbors Regressor  0.783813\n",
+       "4            Decision Tree  0.751354"
+      ]
+     },
+     "execution_count": 18,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "pd.DataFrame(list(zip(model_list, r2_list)), columns=['Model Name', 'R2_Score']).sort_values(by=[\"R2_Score\"],ascending=False)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "ml-project",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.4"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

requirements.txt CHANGED Viewed

@@ -5,4 +5,5 @@ matplotlib
 scikit-learn
 catboost
 xgboost
 -e .

 scikit-learn
 catboost
 xgboost
+dill
 -e .

src/Components/Data_ingestation.py CHANGED Viewed

@@ -6,8 +6,7 @@ from src.logger import logging
 import pandas as pd
 from sklearn.model_selection import train_test_split
 from dataclasses import dataclass
 @dataclass
 class Data_ingestion_config:
     train_data_path: str =  os.path.join("artifact","train.csv")
@@ -46,8 +45,10 @@ class Data_ingestion:
 if __name__ == "__main__":
     obj =  Data_ingestion()
-    obj.intiate_data_ingestion()

 import pandas as pd
 from sklearn.model_selection import train_test_split
 from dataclasses import dataclass
+from data_transformation import Data_transformation
 @dataclass
 class Data_ingestion_config:
     train_data_path: str =  os.path.join("artifact","train.csv")
 if __name__ == "__main__":
     obj =  Data_ingestion()
+    train_data,test_data = obj.intiate_data_ingestion()
+    data_trans =  Data_transformation()
+    data_trans.initiate_data_transformation(train_data,test_data)

src/Components/data_transformation.py CHANGED Viewed

@@ -11,21 +11,97 @@ from sklearn.preprocessing import OneHotEncoder,StandardScaler
 from src.exception import CustomException
 from src.logger import logging
-from Data_ingestation import Data_ingestion
 @dataclass
 class Data_transformation_config:
-    Preprpcessor_obj_file = os.path.join("artifact","Preprocessor.pkl")
 class Data_transformation:
     def __init__(self) -> None:
         self.data_transformation_config =  Data_transformation_config()
     def get_data_transformer_object(self):
         try:
-            pass
         except Exception as e:
             raise CustomException(e,sys)

 from src.exception import CustomException
 from src.logger import logging
+from src.utils import save_object
 @dataclass
 class Data_transformation_config:
+    Preprocessor_obj_file = os.path.join("artifact","Preprocessor.pkl")
 class Data_transformation:
     def __init__(self) -> None:
         self.data_transformation_config =  Data_transformation_config()
     def get_data_transformer_object(self):
         try:
+            numerical_columns = ["writing_score","reading_score"]
+            categorical_columns = [
+                "gender",
+                "race_ethnicity",
+                "parental_level_of_education",
+                "lunch",
+                "test_preparation_course",
+            ]
+            num_pipeline = Pipeline(
+                steps = [
+                    ("imputer",SimpleImputer(strategy="median")),
+                    ("scaler",StandardScaler())
+                ]
+                )
+            cat_pipeline = Pipeline(
+                steps = [
+                    ("imputer",SimpleImputer(strategy= "most_frequent")),
+                    ("one_hot_encoder",OneHotEncoder()),
+                    ("scaler",StandardScaler(with_mean = False))
+                ]
+            )
+            logging.info(f"Categorical Columns:{categorical_columns}")
+            logging.info(f"Numerical Columns:{numerical_columns}")
+            preprocessor = ColumnTransformer(
+                [
+                    ("num_pipeline",num_pipeline,numerical_columns),
+                    ("cat_pipeline",cat_pipeline,categorical_columns)
+                ]
+            )
+            return preprocessor
         except Exception as e:
             raise CustomException(e,sys)
+    def initiate_data_transformation(self,train_path,test_path):
+        try:
+            train_df = pd.read_csv(train_path)
+            test_df = pd.read_csv(test_path)
+            logging.info("Read train and test data completed")
+            logging.info("Obtaining preprocessing object")
+            preprocessor_obj = self.get_data_transformer_object()
+            target_column_name = "math_score"
+            numerical_columns = ["writing_score","reading_score"]
+            input_feature_train_df = train_df.drop(columns = [target_column_name],axis = 1)
+            target_feature_train_df = train_df[target_column_name]
+            input_feature_test_df = test_df.drop(columns = [target_column_name],axis = 1)
+            target_feature_test_df = test_df[target_column_name]
+            logging.info(
+                f"Applying preprocessing object on training dataframe and testing dataframe.")
+            input_feature_train_arr = preprocessor_obj.fit_transform(input_feature_train_df)
+            input_feature_test_arr =  preprocessor_obj.transform(input_feature_test_df)
+            train_arr = np.c_[input_feature_train_arr,np.array(target_feature_train_df)]
+            test_arr = np.c_[input_feature_test_arr,np.array(target_feature_test_df)]
+            logging.info(f"Saved preprocessing object.")
+            save_object(
+                file_path = self.data_transformation_config.Preprocessor_obj_file,
+                obj = preprocessor_obj
+            )
+            return (
+                train_arr,
+                test_arr,
+                self.data_transformation_config.Preprocessor_obj_file
+            )
+        except Exception as e:
+            raise CustomException(e,sys)

src/utils.py ADDED Viewed

	@@ -0,0 +1,23 @@

+import os
+import sys
+import numpy as np
+import pandas as pd
+import dill
+import pickle
+from sklearn.metrics import r2_score
+from sklearn.model_selection import GridSearchCV
+from src.exception import CustomException
+def save_object(file_path , obj):
+        try:
+            dir_path = os.path.dirname(file_path)
+            os.makedirs(dir_path,exist_ok= True)
+            with open(file_path,"wb") as file_obj:
+                  pickle.dump(obj,file_obj)
+        except Exception as e:
+              raise CustomException(e,sys)