{ "metadata": { "kernelspec": { "language": "python", "display_name": "Python 3", "name": "python3" }, "language_info": { "pygments_lexer": "ipython3", "nbconvert_exporter": "python", "version": "3.6.4", "file_extension": ".py", "codemirror_mode": { "name": "ipython", "version": 3 }, "name": "python", "mimetype": "text/x-python" }, "kaggle": { "accelerator": "none", "dataSources": [ { "sourceId": 12374924, "sourceType": "datasetVersion", "datasetId": 7802809 } ], "isInternetEnabled": true, "language": "python", "sourceType": "notebook", "isGpuEnabled": false }, "colab": { "provenance": [] } }, "nbformat_minor": 0, "nbformat": 4, "cells": [ { "source": [ "# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,\n", "# THEN FEEL FREE TO DELETE THIS CELL.\n", "# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON\n", "# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR\n", "# NOTEBOOK.\n", "import kagglehub\n", "gmudit_employer_data_path = kagglehub.dataset_download('gmudit/employer-data')\n", "\n", "print('Data source import complete.')\n" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "wlTJKTg-OKTm", "outputId": "4cc02ed2-f1ec-40cf-a9fe-e53ed3b9fb1d" }, "cell_type": "code", "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "Downloading from https://www.kaggle.com/api/v1/datasets/download/gmudit/employer-data?dataset_version_number=1...\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "100%|██████████| 163k/163k [00:00<00:00, 496kB/s]" ] }, { "output_type": "stream", "name": "stdout", "text": [ "Extracting files...\n", "Data source import complete.\n" ] }, { "output_type": "stream", "name": "stderr", "text": [ "\n" ] } ], "execution_count": null }, { "cell_type": "code", "source": [ "gmudit_employer_data_path" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 35 }, "id": "BoE83HlnOQW_", "outputId": "ef513373-19a6-4ad4-d77a-380c9135b3f8" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "'/root/.cache/kagglehub/datasets/gmudit/employer-data/versions/1'" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "string" } }, "metadata": {}, "execution_count": 2 } ] }, { "cell_type": "code", "source": [ "# This Python 3 environment comes with many helpful analytics libraries installed\n", "# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python\n", "# For example, here's several helpful packages to load\n", "\n", "import numpy as np # linear algebra\n", "import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)\n", "\n", "# Input data files are available in the read-only \"../input/\" directory\n", "# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory\n", "\n", "import os\n", "for dirname, _, filenames in os.walk('/root/.cache/kagglehub/datasets/gmudit/employer-data/versions/1'):\n", " for filename in filenames:\n", " print(os.path.join(dirname, filename))\n", "\n", "# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using \"Save & Run All\"\n", "# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session" ], "metadata": { "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5", "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19", "trusted": true, "colab": { "base_uri": "https://localhost:8080/" }, "id": "X1ZiOTGIOKTo", "outputId": "539ddc40-9e2e-4fb4-8a72-d1da9c0f4e65" }, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "/root/.cache/kagglehub/datasets/gmudit/employer-data/versions/1/Employers_data.csv\n" ] } ], "execution_count": null }, { "cell_type": "code", "source": [ "emp = pd.read_csv('/root/.cache/kagglehub/datasets/gmudit/employer-data/versions/1/Employers_data.csv')\n", "emp.head(5)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 206 }, "id": "-0PQPjiLOTAk", "outputId": "e637a311-45c8-4cc5-d62a-78091b5ef329" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ " Employee_ID Name Age Gender Department Job_Title \\\n", "0 1 Merle Ingram 24 Female Engineering Engineer \n", "1 2 John Mayes 56 Male Sales Executive \n", "2 3 Carlos Wille 21 Male Engineering Intern \n", "3 4 Michael Bryant 30 Male Finance Analyst \n", "4 5 Paula Douglas 25 Female HR Analyst \n", "\n", " Experience_Years Education_Level Location Salary \n", "0 1 Master Austin 90000 \n", "1 33 Master Seattle 195000 \n", "2 1 Bachelor New York 35000 \n", "3 9 Bachelor New York 75000 \n", "4 2 Master Seattle 70000 " ], "text/html": [ "\n", "
\n", "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Employee_IDNameAgeGenderDepartmentJob_TitleExperience_YearsEducation_LevelLocationSalary
01Merle Ingram24FemaleEngineeringEngineer1MasterAustin90000
12John Mayes56MaleSalesExecutive33MasterSeattle195000
23Carlos Wille21MaleEngineeringIntern1BachelorNew York35000
34Michael Bryant30MaleFinanceAnalyst9BachelorNew York75000
45Paula Douglas25FemaleHRAnalyst2MasterSeattle70000
\n", "
\n", "
\n", "\n", "
\n", " \n", "\n", " \n", "\n", " \n", "
\n", "\n", "\n", "
\n", " \n", "\n", "\n", "\n", " \n", "
\n", "\n", "
\n", "
\n" ], "application/vnd.google.colaboratory.intrinsic+json": { "type": "dataframe", "variable_name": "emp", "summary": "{\n \"name\": \"emp\",\n \"rows\": 10000,\n \"fields\": [\n {\n \"column\": \"Employee_ID\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 2886,\n \"min\": 1,\n \"max\": 10000,\n \"num_unique_values\": 10000,\n \"samples\": [\n 6253,\n 4685,\n 1732\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Name\",\n \"properties\": {\n \"dtype\": \"string\",\n \"num_unique_values\": 9868,\n \"samples\": [\n \"Cristal Rodriguez\",\n \"Stephanie Ellis\",\n \"Sean Green\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Age\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 10,\n \"min\": 21,\n \"max\": 60,\n \"num_unique_values\": 40,\n \"samples\": [\n 22,\n 28,\n 31\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Gender\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 2,\n \"samples\": [\n \"Male\",\n \"Female\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Department\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 6,\n \"samples\": [\n \"Engineering\",\n \"Sales\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Job_Title\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Executive\",\n \"Manager\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Experience_Years\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 9,\n \"min\": 0,\n \"max\": 37,\n \"num_unique_values\": 38,\n \"samples\": [\n 27,\n 34\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Education_Level\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 3,\n \"samples\": [\n \"Master\",\n \"Bachelor\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Location\",\n \"properties\": {\n \"dtype\": \"category\",\n \"num_unique_values\": 5,\n \"samples\": [\n \"Seattle\",\n \"Chicago\"\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n },\n {\n \"column\": \"Salary\",\n \"properties\": {\n \"dtype\": \"number\",\n \"std\": 46066,\n \"min\": 25000,\n \"max\": 215000,\n \"num_unique_values\": 39,\n \"samples\": [\n 100000,\n 50000\n ],\n \"semantic_type\": \"\",\n \"description\": \"\"\n }\n }\n ]\n}" } }, "metadata": {}, "execution_count": 4 } ] }, { "cell_type": "code", "source": [ "emp.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "nlba6h9XOYqU", "outputId": "9e811967-31ff-4440-f16d-d2a8b1db819d" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 10000 entries, 0 to 9999\n", "Data columns (total 10 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Employee_ID 10000 non-null int64 \n", " 1 Name 10000 non-null object\n", " 2 Age 10000 non-null int64 \n", " 3 Gender 10000 non-null object\n", " 4 Department 10000 non-null object\n", " 5 Job_Title 10000 non-null object\n", " 6 Experience_Years 10000 non-null int64 \n", " 7 Education_Level 10000 non-null object\n", " 8 Location 10000 non-null object\n", " 9 Salary 10000 non-null int64 \n", "dtypes: int64(4), object(6)\n", "memory usage: 781.4+ KB\n" ] } ] }, { "cell_type": "code", "source": [ "emp.drop(columns=['Employee_ID'], inplace=True)" ], "metadata": { "id": "pJHzWx3hOZzR" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "emp.duplicated().sum()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "8oLiKUWzOdB-", "outputId": "bda192bd-e3c2-494f-aae3-eabd57d35c97" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "np.int64(0)" ] }, "metadata": {}, "execution_count": 7 } ] }, { "cell_type": "code", "source": [ "emp.isna().sum()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 366 }, "id": "WuNxStUKOfNs", "outputId": "e78a25f6-d38f-41b1-bbea-bf763084f4e7" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Name 0\n", "Age 0\n", "Gender 0\n", "Department 0\n", "Job_Title 0\n", "Experience_Years 0\n", "Education_Level 0\n", "Location 0\n", "Salary 0\n", "dtype: int64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
0
Name0
Age0
Gender0
Department0
Job_Title0
Experience_Years0
Education_Level0
Location0
Salary0
\n", "

" ] }, "metadata": {}, "execution_count": 8 } ] }, { "cell_type": "code", "source": [ "emp['Salary'].value_counts(normalize=True)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 1000 }, "id": "dwH_xoFaOgc0", "outputId": "30275cf0-700b-4c47-bb88-ae5b4926f927" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Salary\n", "70000 0.0848\n", "65000 0.0734\n", "75000 0.0595\n", "130000 0.0447\n", "135000 0.0440\n", "125000 0.0439\n", "140000 0.0429\n", "145000 0.0424\n", "150000 0.0404\n", "120000 0.0385\n", "80000 0.0304\n", "60000 0.0298\n", "115000 0.0262\n", "35000 0.0262\n", "100000 0.0233\n", "95000 0.0224\n", "180000 0.0219\n", "190000 0.0215\n", "110000 0.0214\n", "200000 0.0213\n", "90000 0.0211\n", "105000 0.0205\n", "175000 0.0202\n", "155000 0.0201\n", "165000 0.0201\n", "195000 0.0199\n", "185000 0.0198\n", "170000 0.0185\n", "40000 0.0175\n", "85000 0.0150\n", "205000 0.0136\n", "160000 0.0120\n", "30000 0.0110\n", "55000 0.0045\n", "210000 0.0034\n", "45000 0.0021\n", "25000 0.0010\n", "50000 0.0005\n", "215000 0.0003\n", "Name: proportion, dtype: float64" ], "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
proportion
Salary
700000.0848
650000.0734
750000.0595
1300000.0447
1350000.0440
1250000.0439
1400000.0429
1450000.0424
1500000.0404
1200000.0385
800000.0304
600000.0298
1150000.0262
350000.0262
1000000.0233
950000.0224
1800000.0219
1900000.0215
1100000.0214
2000000.0213
900000.0211
1050000.0205
1750000.0202
1550000.0201
1650000.0201
1950000.0199
1850000.0198
1700000.0185
400000.0175
850000.0150
2050000.0136
1600000.0120
300000.0110
550000.0045
2100000.0034
450000.0021
250000.0010
500000.0005
2150000.0003
\n", "

" ] }, "metadata": {}, "execution_count": 9 } ] }, { "cell_type": "code", "source": [ "def remove_outliers(df, col):\n", " q1 = df[col].quantile(0.25)\n", " q3 = df[col].quantile(0.75)\n", " iqr = q3 - q1\n", " lower_bound = q1 - 1.5 * iqr\n", " upper_bound = q3 + 1.5 * iqr\n", " df = df[(df[col] >= lower_bound) & (df[col] <= upper_bound)]\n", " return df" ], "metadata": { "id": "WuEAGhA8Ok9M" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "for col in emp.select_dtypes(include='number').columns:\n", " emp = remove_outliers(emp, col)" ], "metadata": { "id": "0x13-JbHOuVt" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "emp.info()" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "h5YMeQQHOx7c", "outputId": "446e21d6-3c99-43bd-ed98-d441422c0bc8" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "\n", "RangeIndex: 10000 entries, 0 to 9999\n", "Data columns (total 9 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 Name 10000 non-null object\n", " 1 Age 10000 non-null int64 \n", " 2 Gender 10000 non-null object\n", " 3 Department 10000 non-null object\n", " 4 Job_Title 10000 non-null object\n", " 5 Experience_Years 10000 non-null int64 \n", " 6 Education_Level 10000 non-null object\n", " 7 Location 10000 non-null object\n", " 8 Salary 10000 non-null int64 \n", "dtypes: int64(3), object(6)\n", "memory usage: 703.3+ KB\n" ] } ] }, { "cell_type": "code", "source": [ "x = emp.drop('Salary',axis=1)\n", "y = emp['Salary']\n" ], "metadata": { "id": "xTcMhQ0POy6s" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "from sklearn.preprocessing import OneHotEncoder , StandardScaler\n", "from sklearn.impute import SimpleImputer , KNNImputer\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.linear_model import LinearRegression\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import LinearRegression , LogisticRegression , Ridge , Lasso\n", "from sklearn.tree import DecisionTreeRegressor\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.svm import SVR\n", "from sklearn.neighbors import KNeighborsRegressor\n", "from sklearn.ensemble import GradientBoostingRegressor\n", "from sklearn.ensemble import AdaBoostRegressor\n", "from sklearn.ensemble import ExtraTreesRegressor\n", "from sklearn.ensemble import BaggingRegressor\n", "from sklearn.metrics import mean_absolute_error , mean_squared_error , r2_score" ], "metadata": { "id": "nQ3UAVv4O8JE" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "num_cols = x.select_dtypes(include='number').columns\n", "cat_cols = x.select_dtypes(exclude='number').columns" ], "metadata": { "id": "31fPLr4pO_sl" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "num_pipe = Pipeline([\n", " ('imputer', SimpleImputer(strategy='mean')),\n", " ('scaler', StandardScaler())\n", "])\n", "\n", "cat_pipe = Pipeline([\n", " ('imputer', SimpleImputer(strategy='most_frequent')),\n", " ('encoder', OneHotEncoder(handle_unknown='ignore'))\n", "])" ], "metadata": { "id": "r6LDyCuXPBue" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "preprocessor = ColumnTransformer([\n", " ('num', num_pipe, num_cols),\n", " ('cat', cat_pipe, cat_cols)\n", "])" ], "metadata": { "id": "_JKOzdm5PD-F" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "len(emp['Salary'].unique())" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "WHlIlJ6iPFSN", "outputId": "0332b149-4804-4501-c796-a97656a47630" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "39" ] }, "metadata": {}, "execution_count": 18 } ] }, { "cell_type": "code", "source": [ "from imblearn.under_sampling import RandomUnderSampler" ], "metadata": { "id": "1NddaS9XPJH8" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "x_train , x_test , y_train , y_test = train_test_split(x, y, test_size=0.2, random_state=42)" ], "metadata": { "id": "caqkkkrHPKzo" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "rus = RandomUnderSampler(random_state=42)\n", "x_res, y_res = rus.fit_resample(x_train, y_train)" ], "metadata": { "id": "pKbHzncNPL5k" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "x_tr , x_val , y_tr , y_val = train_test_split(x_res, y_res, test_size=0.2, random_state=42)" ], "metadata": { "id": "747i6RGtPM96" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "x_tr = preprocessor.fit_transform(x_tr)\n", "x_val = preprocessor.transform(x_val)" ], "metadata": { "id": "2gAm84ZCPOLn" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "lin = LinearRegression()\n", "lin.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "QeO54dsjPPe9", "outputId": "c30165c2-fc63-4175-fd5e-a3c690192839" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LinearRegression()" ], "text/html": [ "
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 24 } ] }, { "cell_type": "code", "source": [ "lin.score(x_val, y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "hYx0xiQFPQsF", "outputId": "37eef3d7-918b-45ee-eb1e-8cd58a6711da" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9687384646766936" ] }, "metadata": {}, "execution_count": 25 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, lin.predict(x_val)))\n", "print(mean_squared_error(y_val, lin.predict(x_val)))\n", "print(r2_score(y_val, lin.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Ogy0XxjSPRtl", "outputId": "0b069cb5-b17c-495d-ed89-8f45bffce32e" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "9526.033881598178\n", "110511359.09847379\n", "0.9687384646766936\n" ] } ] }, { "cell_type": "code", "source": [ "log = LogisticRegression()\n", "log.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "av-6WoiuPTE2", "outputId": "0e866e16-ddf4-4d70-a221-3fa651fde57e" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "LogisticRegression()" ], "text/html": [ "
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 27 } ] }, { "cell_type": "code", "source": [ "log.score(x_val, y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Xwz6iq0_PUbd", "outputId": "17494660-c9c2-4d21-be02-d876df033555" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.0" ] }, "metadata": {}, "execution_count": 28 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, log.predict(x_val)))\n", "print(mean_squared_error(y_val, log.predict(x_val)))\n", "print(r2_score(y_val, log.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "HZyy59jFPVit", "outputId": "879c37d5-aeb5-4b35-a51b-48d0b15bf767" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "10312.5\n", "139062500.0\n", "0.9606618967374789\n" ] } ] }, { "cell_type": "code", "source": [ "rid = Ridge()\n", "rid.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "DTYj5AkgPWsG", "outputId": "0eaa31a9-be47-4672-c615-90c156df8447" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Ridge()" ], "text/html": [ "
Ridge()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 30 } ] }, { "cell_type": "code", "source": [ "rid.score(x_val, y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "jlJKbJKFPYFd", "outputId": "30c075e0-2758-4c64-95bf-65df4fd5fe45" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9630515659674406" ] }, "metadata": {}, "execution_count": 31 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, rid.predict(x_val)))\n", "print(mean_squared_error(y_val, rid.predict(x_val)))\n", "print(r2_score(y_val, rid.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "Zc68MC-qPY4k", "outputId": "1ab021da-4d09-42ad-b8f7-b8573f4c24c9" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "10036.11337423515\n", "130614879.25240436\n", "0.9630515659674406\n" ] } ] }, { "cell_type": "code", "source": [ "la = Lasso()\n", "la.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 135 }, "id": "uWpUU8lgPZ1_", "outputId": "8c7a05f9-cde1-4424-d0b3-c893ff7c2d91" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stderr", "text": [ "/usr/local/lib/python3.12/dist-packages/sklearn/linear_model/_coordinate_descent.py:656: ConvergenceWarning: Objective did not converge. You might want to increase the number of iterations. Duality gap: 33822632.26302413, tolerance: 19010685.48387097\n", " model = cd_fast.sparse_enet_coordinate_descent(\n" ] }, { "output_type": "execute_result", "data": { "text/plain": [ "Lasso()" ], "text/html": [ "
Lasso()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 33 } ] }, { "cell_type": "code", "source": [ "la.score(x_val, y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "2XAtByXwPbAU", "outputId": "30672bdb-c6dc-4277-ee9a-0d535b269735" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9523458161782297" ] }, "metadata": {}, "execution_count": 34 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, la.predict(x_val)))\n", "print(mean_squared_error(y_val, la.predict(x_val)))\n", "print(r2_score(y_val, la.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "P3fMltECPcmE", "outputId": "b674d50d-a815-4c90-b658-9a08d25812d3" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "10259.659971565385\n", "168460332.04729128\n", "0.9523458161782297\n" ] } ] }, { "cell_type": "code", "source": [ "kn = KNeighborsRegressor(n_neighbors=10)\n", "kn.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "ln6Y5na7Pdit", "outputId": "52f59070-ba21-4974-d434-b1623076db6d" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "KNeighborsRegressor(n_neighbors=10)" ], "text/html": [ "
KNeighborsRegressor(n_neighbors=10)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 36 } ] }, { "cell_type": "code", "source": [ "kn.score(x_val, y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "-2f5lWAoPeqQ", "outputId": "66287c65-a03a-4a4a-c071-142ab33cfb8b" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9354059504406199" ] }, "metadata": {}, "execution_count": 37 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, kn.predict(x_val)))\n", "print(mean_squared_error(y_val, kn.predict(x_val)))\n", "print(r2_score(y_val, kn.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "41FIdGo7Pfpt", "outputId": "256ad7e8-72bf-49f3-db7b-5b08cb889655" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "11437.5\n", "228343750.0\n", "0.9354059504406199\n" ] } ] }, { "cell_type": "code", "source": [ "dtc = DecisionTreeRegressor()\n", "dtc.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "EQiuZ2aKPguE", "outputId": "24a10f63-2a0c-443b-cadc-d4a550f50ab0" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "DecisionTreeRegressor()" ], "text/html": [ "
DecisionTreeRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 39 } ] }, { "cell_type": "code", "source": [ "dtc.score(x_val,y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "qfxTPjXhPkz9", "outputId": "454686a3-2b9f-431c-db91-1dcb428191bc" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.8850796983342082" ] }, "metadata": {}, "execution_count": 41 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, dtc.predict(x_val)))\n", "print(mean_squared_error(y_val, dtc.predict(x_val)))\n", "print(r2_score(y_val, dtc.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "sH2iqVH2PiFG", "outputId": "62eadce3-a829-4582-c421-4daedf237d6f" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "13125.0\n", "406250000.0\n", "0.8850796983342082\n" ] } ] }, { "cell_type": "code", "source": [ "rfr = RandomForestRegressor()\n", "rfr.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "kBmUO8kPPi_F", "outputId": "608b0567-1c63-4487-a7f5-c3ecf8770901" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "RandomForestRegressor()" ], "text/html": [ "
RandomForestRegressor()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 42 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, rfr.predict(x_val)))\n", "print(mean_squared_error(y_val, rfr.predict(x_val)))\n", "print(r2_score(y_val, rfr.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "5naRwgftPpkr", "outputId": "b6c9ad56-605e-43be-9a36-413082747cc1" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "8003.125\n", "143430781.25\n", "0.9594261940937595\n" ] } ] }, { "cell_type": "code", "source": [ "rfr.score(x_val, y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "uutqLRiKPqgl", "outputId": "7946b631-32b5-4919-a2b7-665ccb83b758" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9594261940937595" ] }, "metadata": {}, "execution_count": 44 } ] }, { "cell_type": "code", "source": [ "lin_svr = SVR(kernel='linear')\n", "lin_svr.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "TtO-S2rdX4p-", "outputId": "a345e4a5-16c5-44d2-b80d-3eed39b9e411" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "SVR(kernel='linear')" ], "text/html": [ "
SVR(kernel='linear')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 45 } ] }, { "cell_type": "code", "source": [ "lin_svr.score(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "JE1fCx0eYEkH", "outputId": "d476e05d-f7fd-46b8-a93b-f74d80bfed50" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.0037633278401071335" ] }, "metadata": {}, "execution_count": 46 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, lin_svr.predict(x_val)))\n", "print(mean_squared_error(y_val, lin_svr.predict(x_val)))\n", "print(r2_score(y_val, lin_svr.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "PpWQPRz-YFZf", "outputId": "92abfd6e-f1af-4e3b-9469-78868a57b7ec" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "52391.57187596244\n", "3563485085.649429\n", "-0.008041307136941622\n" ] } ] }, { "cell_type": "code", "source": [ "poly_svr = SVR(kernel='poly')\n", "poly_svr.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "X_wOzaj4YHIv", "outputId": "1a33bf6f-f9d1-43e6-dfe8-186c5cfcfac1" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "SVR(kernel='poly')" ], "text/html": [ "
SVR(kernel='poly')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 48 } ] }, { "cell_type": "code", "source": [ "poly_svr.score(x_val, y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "SbyXKB9nYJKH", "outputId": "ea779b23-ae5c-47f1-b0e5-10197ad6427b" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "-0.011895723165550098" ] }, "metadata": {}, "execution_count": 49 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, poly_svr.predict(x_val)))\n", "print(mean_squared_error(y_val, poly_svr.predict(x_val)))\n", "print(r2_score(y_val, poly_svr.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "bwsWQqrtYLaf", "outputId": "86f09146-e84f-42b0-ff1b-fb425cf18782" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "52493.49175641051\n", "3577110672.155249\n", "-0.011895723165550098\n" ] } ] }, { "cell_type": "code", "source": [ "rbf_svr = SVR(kernel='rbf')\n", "rbf_svr.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "lgPG5jXZYMrn", "outputId": "6f0dc663-53fb-4cca-8f92-03af6aeb8ff5" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "SVR()" ], "text/html": [ "
SVR()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 51 } ] }, { "cell_type": "code", "source": [ "rbf_svr.score(x_val, y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "cbN8vROYYQtf", "outputId": "a95b969f-2aba-4594-924c-9e5c1b268ad3" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "-0.01197346602359639" ] }, "metadata": {}, "execution_count": 52 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, rbf_svr.predict(x_val)))\n", "print(mean_squared_error(y_val, rbf_svr.predict(x_val)))\n", "print(r2_score(y_val, rbf_svr.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "3Qn7IRxqYSNv", "outputId": "3e637cff-ed8a-40aa-8f8a-1e571ecee44c" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "52494.214614453245\n", "3577385497.713688\n", "-0.01197346602359639\n" ] } ] }, { "cell_type": "code", "source": [ "from xgboost import XGBRegressor" ], "metadata": { "id": "KXBAus0dYWBc" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "xgr = XGBRegressor()\n", "xgr.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 254 }, "id": "3Dn0EAE-YYfv", "outputId": "b6c1fef6-d369-48a2-e7e3-389d813ab956" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "XGBRegressor(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, device=None, early_stopping_rounds=None,\n", " enable_categorical=False, eval_metric=None, feature_types=None,\n", " feature_weights=None, gamma=None, grow_policy=None,\n", " importance_type=None, interaction_constraints=None,\n", " learning_rate=None, max_bin=None, max_cat_threshold=None,\n", " max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n", " max_leaves=None, min_child_weight=None, missing=nan,\n", " monotone_constraints=None, multi_strategy=None, n_estimators=None,\n", " n_jobs=None, num_parallel_tree=None, ...)" ], "text/html": [ "
XGBRegressor(base_score=None, booster=None, callbacks=None,\n",
              "             colsample_bylevel=None, colsample_bynode=None,\n",
              "             colsample_bytree=None, device=None, early_stopping_rounds=None,\n",
              "             enable_categorical=False, eval_metric=None, feature_types=None,\n",
              "             feature_weights=None, gamma=None, grow_policy=None,\n",
              "             importance_type=None, interaction_constraints=None,\n",
              "             learning_rate=None, max_bin=None, max_cat_threshold=None,\n",
              "             max_cat_to_onehot=None, max_delta_step=None, max_depth=None,\n",
              "             max_leaves=None, min_child_weight=None, missing=nan,\n",
              "             monotone_constraints=None, multi_strategy=None, n_estimators=None,\n",
              "             n_jobs=None, num_parallel_tree=None, ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 55 } ] }, { "cell_type": "code", "source": [ "xgr.score(x_val , y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "CoPg3XQgYZrv", "outputId": "121ec9f9-9b01-47d3-83a9-ddddf13448cf" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9270912408828735" ] }, "metadata": {}, "execution_count": 56 } ] }, { "cell_type": "code", "source": [ "from sklearn.model_selection import GridSearchCV" ], "metadata": { "id": "eKEHa0x0QLDI" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "rid_searchcv = GridSearchCV(rid, param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100]}, cv=5)\n", "rid_searchcv.fit(x_tr, y_tr)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 166 }, "id": "Yes0iROsP_c7", "outputId": "bbe89cbc-c43c-40ed-f371-bd3e093369c3" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "GridSearchCV(cv=5, estimator=Ridge(),\n", " param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100]})" ], "text/html": [ "
GridSearchCV(cv=5, estimator=Ridge(),\n",
              "             param_grid={'alpha': [0.001, 0.01, 0.1, 1, 10, 100]})
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 59 } ] }, { "cell_type": "code", "source": [ "rid_searchcv.best_estimator_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 80 }, "id": "anUT_ZC2QKF9", "outputId": "22ee339d-6452-43de-e30d-ac5a4c89e8d7" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "Ridge(alpha=0.001)" ], "text/html": [ "
Ridge(alpha=0.001)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ] }, "metadata": {}, "execution_count": 60 } ] }, { "cell_type": "code", "source": [ "rid_searchcv.best_params_" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "M9VEbs2IQPAE", "outputId": "aa7ca11f-b728-4bd5-f8de-9d6cfa7597f4" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "{'alpha': 0.001}" ] }, "metadata": {}, "execution_count": 61 } ] }, { "cell_type": "code", "source": [ "rid_searchcv.score(x_val,y_val)" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "i9sl7XNbQQ4M", "outputId": "81978c1b-3d79-432f-9970-53888b05ff1e" }, "execution_count": null, "outputs": [ { "output_type": "execute_result", "data": { "text/plain": [ "0.9687184268641414" ] }, "metadata": {}, "execution_count": 62 } ] }, { "cell_type": "code", "source": [ "print(mean_absolute_error(y_val, rid_searchcv.predict(x_val)))\n", "print(mean_squared_error(y_val, rid_searchcv.predict(x_val)))\n", "print(r2_score(y_val, rid_searchcv.predict(x_val)))" ], "metadata": { "colab": { "base_uri": "https://localhost:8080/" }, "id": "ptZztDxOQVk0", "outputId": "ffb19e3e-5983-4590-81c7-22e3d483f8de" }, "execution_count": null, "outputs": [ { "output_type": "stream", "name": "stdout", "text": [ "9528.283094826926\n", "110582193.93993585\n", "0.9687184268641414\n" ] } ] }, { "cell_type": "code", "source": [ "import pickle as pkl" ], "metadata": { "id": "OmkiiLe7ZZt3" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "pkl.dump(rid_searchcv, open('rid.pkl', 'wb'))\n", "pkl.dump(preprocessor, open('prep.pkl', 'wb'))" ], "metadata": { "id": "YT_t965gr5AM" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [ "emp.to_csv('emp.csv')" ], "metadata": { "id": "TUE56B_hQfAH" }, "execution_count": null, "outputs": [] }, { "cell_type": "code", "source": [], "metadata": { "id": "9g6TY7S-Q0Ll" }, "execution_count": null, "outputs": [] } ] }