{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "

----- PIPELINE NOTEBOOK: Rossmann sales regression (OneHotEncoder + StandardScaler + XGBRegressor) -----

" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import numpy as np\n", "import pandas as pd\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.preprocessing import OneHotEncoder\n", "from xgboost import XGBRegressor\n", "\n", "from sklearn.compose import ColumnTransformer\n", "\n", "from sklearn import set_config" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Unnamed: 0StoreDayOfWeekDateSalesCustomersPromoStateHolidaySchoolHolidayStoreTypeAssortmentCompetitionDistanceCompetitionOpenSinceMonthCompetitionOpenSinceYearPromo2Promo2SinceWeekPromo2SinceYearPromoInterval
00152015-07-315263555101Large Storebasic1270920080000
11252015-07-316064625101Small Shopbasic5701120071132010Jan,Apr,Jul,Oct
22352015-07-318314821101Small Shopbasic141301220061142011Jan,Apr,Jul,Oct
33452015-07-31139951498101Large Storeextended620920090000
44552015-07-314822559101Small Shopbasic29910420150000
\n", "
" ], "text/plain": [ " Unnamed: 0 Store DayOfWeek Date Sales Customers Promo \\\n", "0 0 1 5 2015-07-31 5263 555 1 \n", "1 1 2 5 2015-07-31 6064 625 1 \n", "2 2 3 5 2015-07-31 8314 821 1 \n", "3 3 4 5 2015-07-31 13995 1498 1 \n", "4 4 5 5 2015-07-31 4822 559 1 \n", "\n", " StateHoliday SchoolHoliday StoreType Assortment CompetitionDistance \\\n", "0 0 1 Large Store basic 1270 \n", "1 0 1 Small Shop basic 570 \n", "2 0 1 Small Shop basic 14130 \n", "3 0 1 Large Store extended 620 \n", "4 0 1 Small Shop basic 29910 \n", "\n", " CompetitionOpenSinceMonth CompetitionOpenSinceYear Promo2 \\\n", "0 9 2008 0 \n", "1 11 2007 1 \n", "2 12 2006 1 \n", "3 9 2009 0 \n", "4 4 2015 0 \n", "\n", " Promo2SinceWeek Promo2SinceYear PromoInterval \n", "0 0 0 0 \n", "1 13 2010 Jan,Apr,Jul,Oct \n", "2 14 2011 Jan,Apr,Jul,Oct \n", "3 0 0 0 \n", "4 0 0 0 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv(r\"../Dataset/Rossmann_Cleaned_data.csv\")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PromoIntervalStoreTypeAssortmentStateHolidayStoreCustomersPromoSchoolHolidayCompetitionDistanceCompetitionOpenSinceMonthCompetitionOpenSinceYearSales
00Large Storebasic01555111270920085263
1Jan,Apr,Jul,OctSmall Shopbasic02625115701120076064
2Jan,Apr,Jul,OctSmall Shopbasic0382111141301220068314
30Large Storeextended041498116209200913995
40Small Shopbasic055591129910420154822
\n", "
" ], "text/plain": [ " PromoInterval StoreType Assortment StateHoliday Store Customers \\\n", "0 0 Large Store basic 0 1 555 \n", "1 Jan,Apr,Jul,Oct Small Shop basic 0 2 625 \n", "2 Jan,Apr,Jul,Oct Small Shop basic 0 3 821 \n", "3 0 Large Store extended 0 4 1498 \n", "4 0 Small Shop basic 0 5 559 \n", "\n", " Promo SchoolHoliday CompetitionDistance CompetitionOpenSinceMonth \\\n", "0 1 1 1270 9 \n", "1 1 1 570 11 \n", "2 1 1 14130 12 \n", "3 1 1 620 9 \n", "4 1 1 29910 4 \n", "\n", " CompetitionOpenSinceYear Sales \n", "0 2008 5263 \n", "1 2007 6064 \n", "2 2006 8314 \n", "3 2009 13995 \n", "4 2015 4822 " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = df[[\"PromoInterval\",\"StoreType\",\"Assortment\",\"StateHoliday\",\"Store\",\"Customers\",\"Promo\",\"SchoolHoliday\",\"CompetitionDistance\",\"CompetitionOpenSinceMonth\",\"CompetitionOpenSinceYear\",\"Sales\"]]\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "8\n", "7388\n" ] } ], "source": [ "print(df[\"Customers\"].min())\n", "print(df[\"Customers\"].max())" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 844338 entries, 0 to 844337\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 PromoInterval 844338 non-null object\n", " 1 StoreType 844338 non-null object\n", " 2 Assortment 844338 non-null object\n", " 3 StateHoliday 844338 non-null int64 \n", " 4 Store 844338 non-null int64 \n", " 5 Customers 844338 non-null int64 \n", " 6 Promo 844338 non-null int64 \n", " 7 SchoolHoliday 844338 non-null int64 \n", " 8 CompetitionDistance 844338 non-null int64 \n", " 9 CompetitionOpenSinceMonth 844338 non-null int64 \n", " 10 CompetitionOpenSinceYear 844338 non-null int64 \n", " 11 Sales 844338 
non-null int64 \n", "dtypes: int64(9), object(3)\n", "memory usage: 77.3+ MB\n" ] } ], "source": [ "df.info()" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(844338, 12)" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Unique values in PromoInterval: ['0' 'Jan,Apr,Jul,Oct' 'Feb,May,Aug,Nov' 'Mar,Jun,Sept,Dec']\n", "Unique values in StoreType: ['Large Store' 'Small Shop' 'Hypermarket' 'Medium Store']\n", "Unique values in Assortment: ['basic' 'extended' 'extra']\n", "Unique values in StateHoliday: [0 1]\n", "Unique values in Store: [ 1 2 3 ... 1115 876 292]\n", "Unique values in Customers: [ 555 625 821 ... 3900 36 4065]\n", "Unique values in Promo: [1 0]\n", "Unique values in SchoolHoliday: [1 0]\n", "Unique values in CompetitionDistance: [ 1270 570 14130 620 29910 310 24000 7520 2030 3160 960 1070\n", " 1300 4110 3270 50 13840 3240 2340 550 1040 4060 4590 430\n", " 2300 60 1200 2170 40 9800 2910 1320 2240 7660 540 4230\n", " 1090 260 180 1180 290 4880 9710 270 1060 18010 6260 10570\n", " 450 30360 7170 720 6620 420 7340 2840 5540 350 2050 3700\n", " 22560 410 250 1130 4840 17500 2200 1650 330 22440 19960 3510\n", " 3320 7910 2370 22390 2710 11810 1870 480 560 10690 2380 2410\n", " 240 16690 14620 1890 8780 8980 15140 17930 2440 150 5210 390\n", " 6190 1390 1930 2190 3300 46590 7890 1630 20930 4510 5740 680\n", " 3450 3580 2100 2290 3570 58260 16760 1410 760 3370 1350 2000\n", " 2460 900 920 5190 1730 25360 1700 1540 2930 16570 280 8050\n", " 8540 2090 2610 31830 4360 1780 16240 16420 3050 2020 2950 11840\n", " 8530 17110 2970 5340 1480 1160 3720 100 140 12540 980 2640\n", " 110 13090 4130 3770 1250 1710 5800 12610 9670 3560 1860 19360\n", " 850 5760 1470 1100 2770 520 16970 220 3850 4210 6360 20260\n", " 5140 490 
5630 380 6870 300 11680 970 15050 4030 8650 190\n", " 3150 640 1640 1000 13530 2920 7930 10180 10800 17410 6680 3840\n", " 13570 4370 5710 1420 320 610 1110 780 6880 710 1310 4660\n", " 70 340 3520 22330 4630 80 27190 210 15340 1140 4580 360\n", " 4520 1450 16180 8480 3640 2960 7840 9260 2320 18640 6970 1220\n", " 2260 1290 1460 2740 800 6540 4150 2325 9580 19840 38630 120\n", " 15430 1950 2470 5100 18660 8740 11300 14160 38710 9000 3140 32330\n", " 8140 8400 13140 10070 3130 370 670 1840 4040 90 10600 1590\n", " 2280 8080 15770 18650 8090 9360 16490 1490 8880 5290 1500 9720\n", " 8970 2060 2890 2040 4490 13620 6470 5870 8250 1970 11120 1150\n", " 15710 160 2140 6630 1800 26130 130 6690 1600 460 2120 4820\n", " 10850 3620 23130 5360 9200 5830 4970 1080 8240 5890 1560 840\n", " 8460 4460 6210 6910 4650 1620 3530 2880 16350 12870 810 30030\n", " 13020 910 3900 2530 500 11400 1510 3970 5780 1850 75860 26450\n", " 3390 34050 1790 44320 4160 10890 3110 20390 5260 5300 5030 14810\n", " 8300 770 1940 7470 2550 2310 14300 2180 14960 660 4680 1740\n", " 1260 5470 2780 1610 990 13080 820 9070 1280 4740 8260 590\n", " 400 11260 20 22490 3330 2510 6900 18610 7160 40860 20620 12920\n", " 18160 5950 4700 600 650 7280 5020 580 8990 3760 2330 4260\n", " 3040 3000 3910 1910 1210 700 1010 4270 1340 2110 9230 1190\n", " 4400 2270 12700 20970 170 7250 1360 440 15720 3340 2540 33060\n", " 17340 8220 10950 10310 18370 2070 2490 730 8940 9910 5440 30\n", " 4080 6920 1170 10740 510 1690 2870 3350 11640 27530 9790 10170\n", " 7780 8040 530 230 7420 2130 14570 200 6930 7860 1680 2700\n", " 17080 15170 3250 4140 2850 20050 18760 15040 3030 3780 830 8550\n", " 7830 2900 11470 4870 12070 3200 8190 15320 3590 5650 5900 17540\n", " 40540 13990 15270 35280 860 1920 5980 6400 11900 4380 6710 1370\n", " 17650 4330 45740 3410 8670 13130 19780 2390 32240 26490 25430 9820\n", " 2630 20640 16990 630 5390 15490 3210 1530 9770 17280 5090 7180\n", " 9560 48330 1760 24770 3870 18620 12770 9640 2590 24530 
16210 17570\n", " 7980 3290 6320 5070 3470 2720 14600 6890 27650 8860 5000 1120\n", " 940 14040 4770 3440 3020 6270 21770 740 21370 1020 9680 21810\n", " 10620 3860 29190 4570 7550 12430 19700 4450 18670 19370 18540 3920\n", " 3170 7290 1980 12480 3100 7240 18710 2620 6420 470 5150 15700\n", " 5460 22350 2810 2820 6860 18020 1670 2220 1430 870 6300 19830\n", " 9430 23620 9630 4180 3890 4420 21930 2480 3460 6560 5840 2230\n", " 19640 6480 4610 6330 1520 3740 1990 36410 7680 13750 27150 17290\n", " 26990 29070 3750 13170 5080 13190 5350 3230 3380 3430 8110 6250\n", " 12020 5010 18050 5380 16680 11540 2210 4300 5220 9990 10450 690\n", " 1830 5330 1400 3490 1900 1880 21790]\n", "Unique values in CompetitionOpenSinceMonth: [ 9 11 12 4 10 8 3 6 5 1 2 7]\n", "Unique values in CompetitionOpenSinceYear: [2008 2007 2006 2009 2015 2013 2014 2000 2011 2010 2005 1999 2003 2012\n", " 2004 2002 1961 1995 2001 1990 1994 1900 1998]\n", "Unique values in Sales: [ 5263 6064 8314 ... 660 17815 23303]\n" ] } ], "source": [ "def print_unique_values(dataframe):\n", " for column in dataframe.columns:\n", " unique_values = dataframe[column].unique()\n", " print(f\"Unique values in {column}: {unique_values}\")\n", "\n", "# Example usage:\n", "print_unique_values(df)\n" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "X = df.drop(columns = [\"Sales\"])\n", "y = df[\"Sales\"]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Train Test Split" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "((633253, 11), (211085, 11), (633253,), (211085,))" ] }, "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from sklearn.model_selection import train_test_split\n", "X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)\n", "\n", "# Checking the shape after splitting\n", "X_train.shape, X_test.shape, y_train.shape, 
y_test.shape" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PromoIntervalStoreTypeAssortmentStateHolidayStoreCustomersPromoSchoolHolidayCompetitionDistanceCompetitionOpenSinceMonthCompetitionOpenSinceYear
795018Jan,Apr,Jul,OctSmall Shopbasic0650636101420102012
463276Jan,Apr,Jul,OctSmall Shopbasic072261002200122009
2683520Medium Storeextra0733356710860101999
673080Small Shopextended0796791107180112012
4824580Small Shopextended030148000451032015
....................................
259178Feb,May,Aug,NovSmall Shopbasic010132170063022015
365838Jan,Apr,Jul,OctSmall Shopextended011139410960112011
1319320Small Shopbasic03767960016082012
6711550Hypermarketextended076885001996032006
121958Feb,May,Aug,NovSmall Shopbasic044668410340102000
\n", "

633253 rows × 11 columns

\n", "
" ], "text/plain": [ " PromoInterval StoreType Assortment StateHoliday Store \\\n", "795018 Jan,Apr,Jul,Oct Small Shop basic 0 650 \n", "463276 Jan,Apr,Jul,Oct Small Shop basic 0 72 \n", "268352 0 Medium Store extra 0 733 \n", "67308 0 Small Shop extended 0 796 \n", "482458 0 Small Shop extended 0 301 \n", "... ... ... ... ... ... \n", "259178 Feb,May,Aug,Nov Small Shop basic 0 1013 \n", "365838 Jan,Apr,Jul,Oct Small Shop extended 0 11 \n", "131932 0 Small Shop basic 0 376 \n", "671155 0 Hypermarket extended 0 76 \n", "121958 Feb,May,Aug,Nov Small Shop basic 0 446 \n", "\n", " Customers Promo SchoolHoliday CompetitionDistance \\\n", "795018 636 1 0 1420 \n", "463276 261 0 0 2200 \n", "268352 3567 1 0 860 \n", "67308 791 1 0 7180 \n", "482458 480 0 0 4510 \n", "... ... ... ... ... \n", "259178 217 0 0 630 \n", "365838 1394 1 0 960 \n", "131932 796 0 0 160 \n", "671155 885 0 0 19960 \n", "121958 684 1 0 340 \n", "\n", " CompetitionOpenSinceMonth CompetitionOpenSinceYear \n", "795018 10 2012 \n", "463276 12 2009 \n", "268352 10 1999 \n", "67308 11 2012 \n", "482458 3 2015 \n", "... ... ... \n", "259178 2 2015 \n", "365838 11 2011 \n", "131932 8 2012 \n", "671155 3 2006 \n", "121958 10 2000 \n", "\n", "[633253 rows x 11 columns]" ] }, "execution_count": 18, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "PromoInterval\n", "0 423292\n", "Jan,Apr,Jul,Oct 242397\n", "Feb,May,Aug,Nov 97998\n", "Mar,Jun,Sept,Dec 80651\n", "Name: count, dtype: int64" ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df[\"PromoInterval\"].value_counts()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# Pipeline" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
Pipeline(steps=[('encoding',\n",
       "                 ColumnTransformer(remainder='passthrough',\n",
       "                                   transformers=[('ohe',\n",
       "                                                  OneHotEncoder(handle_unknown='ignore'),\n",
       "                                                  ['PromoInterval', 'StoreType',\n",
       "                                                   'Assortment'])])),\n",
       "                ('scaler', StandardScaler()),\n",
       "                ('model',\n",
       "                 XGBRegressor(base_score=None, booster=None, callbacks=None,\n",
       "                              colsample_bylevel=None, colsample_bynode=None,\n",
       "                              colsample_bytree=None, device=None,...\n",
       "                              feature_types=None, gamma=None, grow_policy=None,\n",
       "                              importance_type=None,\n",
       "                              interaction_constraints=None, learning_rate=0.1,\n",
       "                              max_bin=None, max_cat_threshold=None,\n",
       "                              max_cat_to_onehot=None, max_delta_step=None,\n",
       "                              max_depth=13, max_leaves=None,\n",
       "                              min_child_weight=None, missing=nan,\n",
       "                              monotone_constraints=None, multi_strategy=None,\n",
       "                              n_estimators=None, n_jobs=None,\n",
       "                              num_parallel_tree=None, random_state=None, ...))])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" ], "text/plain": [ "Pipeline(steps=[('encoding',\n", " ColumnTransformer(remainder='passthrough',\n", " transformers=[('ohe',\n", " OneHotEncoder(handle_unknown='ignore'),\n", " ['PromoInterval', 'StoreType',\n", " 'Assortment'])])),\n", " ('scaler', StandardScaler()),\n", " ('model',\n", " XGBRegressor(base_score=None, booster=None, callbacks=None,\n", " colsample_bylevel=None, colsample_bynode=None,\n", " colsample_bytree=None, device=None,...\n", " feature_types=None, gamma=None, grow_policy=None,\n", " importance_type=None,\n", " interaction_constraints=None, learning_rate=0.1,\n", " max_bin=None, max_cat_threshold=None,\n", " max_cat_to_onehot=None, max_delta_step=None,\n", " max_depth=13, max_leaves=None,\n", " min_child_weight=None, missing=nan,\n", " monotone_constraints=None, multi_strategy=None,\n", " n_estimators=None, n_jobs=None,\n", " num_parallel_tree=None, random_state=None, ...))])" ] }, "execution_count": 20, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Define the ColumnTransformer\n", "ohe_col = [\"PromoInterval\", \"StoreType\", \"Assortment\"]\n", "\n", "ct_encoding = ColumnTransformer(\n", " transformers=[\n", " (\"ohe\", OneHotEncoder(handle_unknown=\"ignore\"), ohe_col)\n", " ],\n", " remainder=\"passthrough\"\n", ")\n", "\n", "\n", "# Define the XGBRegressor model\n", "model = XGBRegressor(learning_rate=0.1, max_depth=13)\n", "\n", "# Define the pipeline\n", "pipe = Pipeline(steps=[\n", " (\"encoding\", ct_encoding),\n", " (\"scaler\", StandardScaler()),\n", " (\"model\", model)\n", "])\n", "\n", "# Now you can fit your pipeline with your data\n", "pipe.fit(X_train, y_train)\n" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([5674.2217, 7922.6377, 9180.126 , ..., 7287.449 , 3228.0945,\n", " 4453.9897], dtype=float32)" ] }, "execution_count": 21, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_pred = pipe.predict(X_test)\n", 
"y_pred" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "43879 5934\n", "562681 7800\n", "239643 9111\n", "689976 7831\n", "397240 10046\n", " ... \n", "512864 13692\n", "750784 6958\n", "192729 6785\n", "755727 2925\n", "604917 4178\n", "Name: Sales, Length: 211085, dtype: int64" ] }, "execution_count": 22, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_test" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Index(['PromoInterval', 'StoreType', 'Assortment', 'StateHoliday', 'Store',\n", " 'Customers', 'Promo', 'SchoolHoliday', 'CompetitionDistance',\n", " 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'],\n", " dtype='object')" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.columns" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PromoIntervalStoreTypeAssortmentStateHolidayStoreCustomersPromoSchoolHolidayCompetitionDistanceCompetitionOpenSinceMonthCompetitionOpenSinceYear
795018Jan,Apr,Jul,OctSmall Shopbasic0650636101420102012
463276Jan,Apr,Jul,OctSmall Shopbasic072261002200122009
2683520Medium Storeextra0733356710860101999
673080Small Shopextended0796791107180112012
4824580Small Shopextended030148000451032015
\n", "
" ], "text/plain": [ " PromoInterval StoreType Assortment StateHoliday Store \\\n", "795018 Jan,Apr,Jul,Oct Small Shop basic 0 650 \n", "463276 Jan,Apr,Jul,Oct Small Shop basic 0 72 \n", "268352 0 Medium Store extra 0 733 \n", "67308 0 Small Shop extended 0 796 \n", "482458 0 Small Shop extended 0 301 \n", "\n", " Customers Promo SchoolHoliday CompetitionDistance \\\n", "795018 636 1 0 1420 \n", "463276 261 0 0 2200 \n", "268352 3567 1 0 860 \n", "67308 791 1 0 7180 \n", "482458 480 0 0 4510 \n", "\n", " CompetitionOpenSinceMonth CompetitionOpenSinceYear \n", "795018 10 2012 \n", "463276 12 2009 \n", "268352 10 1999 \n", "67308 11 2012 \n", "482458 3 2015 " ] }, "execution_count": 24, "metadata": {}, "output_type": "execute_result" } ], "source": [ "X_train.head()" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PromoIntervalStoreTypeAssortmentStateHolidayStoreCustomersPromoSchoolHolidayCompetitionDistanceCompetitionOpenSinceMonthCompetitionOpenSinceYear
0Jan,Apr,Jul,OctSmall Shopbasic0650636101420102012
\n", "
" ], "text/plain": [ " PromoInterval StoreType Assortment StateHoliday Store Customers Promo \\\n", "0 Jan,Apr,Jul,Oct Small Shop basic 0 650 636 1 \n", "\n", " SchoolHoliday CompetitionDistance CompetitionOpenSinceMonth \\\n", "0 0 1420 10 \n", "\n", " CompetitionOpenSinceYear \n", "0 2012 " ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# 795018\n", "temp_df = pd.DataFrame(data = [[\"Jan,Apr,Jul,Oct\",\"Small Shop\",\"basic\",\"0\",\"650\",\"636\",\"1\",\"0\",\"1420\",\"10\",\"2012\"]], columns = X_test.columns)\n", "temp_df" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([6357.158], dtype=float32)" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe.predict(temp_df)" ] }, { "cell_type": "code", "execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Record at index 795018:\n", "PromoInterval Jan,Apr,Jul,Oct\n", "StoreType Small Shop\n", "Assortment basic\n", "StateHoliday 0\n", "Store 650\n", "Customers 636\n", "Promo 1\n", "SchoolHoliday 0\n", "CompetitionDistance 1420\n", "CompetitionOpenSinceMonth 10\n", "CompetitionOpenSinceYear 2012\n", "Sales 6322\n", "Name: 795018, dtype: object\n" ] } ], "source": [ "# Assuming your DataFrame is named df\n", "record = df.iloc[795018]\n", "\n", "print(\"Record at index 795018:\")\n", "print(record)\n" ] }, { "cell_type": "code", "execution_count": 30, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Unique values in PromoInterval: ['0' 'Jan,Apr,Jul,Oct' 'Feb,May,Aug,Nov' 'Mar,Jun,Sept,Dec']\n", "Unique values in StoreType: ['Large Store' 'Small Shop' 'Hypermarket' 'Medium Store']\n", "Unique values in Assortment: ['basic' 'extended' 'extra']\n", "Unique values in StateHoliday: [0 1]\n", "Unique values in Store: [ 1 2 3 ... 
1115 876 292]\n", "Unique values in Customers: [ 555 625 821 ... 3900 36 4065]\n", "Unique values in Promo: [1 0]\n", "Unique values in SchoolHoliday: [1 0]\n", "Unique values in CompetitionDistance: [ 1270 570 14130 620 29910 310 24000 7520 2030 3160 960 1070\n", " 1300 4110 3270 50 13840 3240 2340 550 1040 4060 4590 430\n", " 2300 60 1200 2170 40 9800 2910 1320 2240 7660 540 4230\n", " 1090 260 180 1180 290 4880 9710 270 1060 18010 6260 10570\n", " 450 30360 7170 720 6620 420 7340 2840 5540 350 2050 3700\n", " 22560 410 250 1130 4840 17500 2200 1650 330 22440 19960 3510\n", " 3320 7910 2370 22390 2710 11810 1870 480 560 10690 2380 2410\n", " 240 16690 14620 1890 8780 8980 15140 17930 2440 150 5210 390\n", " 6190 1390 1930 2190 3300 46590 7890 1630 20930 4510 5740 680\n", " 3450 3580 2100 2290 3570 58260 16760 1410 760 3370 1350 2000\n", " 2460 900 920 5190 1730 25360 1700 1540 2930 16570 280 8050\n", " 8540 2090 2610 31830 4360 1780 16240 16420 3050 2020 2950 11840\n", " 8530 17110 2970 5340 1480 1160 3720 100 140 12540 980 2640\n", " 110 13090 4130 3770 1250 1710 5800 12610 9670 3560 1860 19360\n", " 850 5760 1470 1100 2770 520 16970 220 3850 4210 6360 20260\n", " 5140 490 5630 380 6870 300 11680 970 15050 4030 8650 190\n", " 3150 640 1640 1000 13530 2920 7930 10180 10800 17410 6680 3840\n", " 13570 4370 5710 1420 320 610 1110 780 6880 710 1310 4660\n", " 70 340 3520 22330 4630 80 27190 210 15340 1140 4580 360\n", " 4520 1450 16180 8480 3640 2960 7840 9260 2320 18640 6970 1220\n", " 2260 1290 1460 2740 800 6540 4150 2325 9580 19840 38630 120\n", " 15430 1950 2470 5100 18660 8740 11300 14160 38710 9000 3140 32330\n", " 8140 8400 13140 10070 3130 370 670 1840 4040 90 10600 1590\n", " 2280 8080 15770 18650 8090 9360 16490 1490 8880 5290 1500 9720\n", " 8970 2060 2890 2040 4490 13620 6470 5870 8250 1970 11120 1150\n", " 15710 160 2140 6630 1800 26130 130 6690 1600 460 2120 4820\n", " 10850 3620 23130 5360 9200 5830 4970 1080 8240 5890 1560 840\n", " 8460 4460 6210 
6910 4650 1620 3530 2880 16350 12870 810 30030\n", " 13020 910 3900 2530 500 11400 1510 3970 5780 1850 75860 26450\n", " 3390 34050 1790 44320 4160 10890 3110 20390 5260 5300 5030 14810\n", " 8300 770 1940 7470 2550 2310 14300 2180 14960 660 4680 1740\n", " 1260 5470 2780 1610 990 13080 820 9070 1280 4740 8260 590\n", " 400 11260 20 22490 3330 2510 6900 18610 7160 40860 20620 12920\n", " 18160 5950 4700 600 650 7280 5020 580 8990 3760 2330 4260\n", " 3040 3000 3910 1910 1210 700 1010 4270 1340 2110 9230 1190\n", " 4400 2270 12700 20970 170 7250 1360 440 15720 3340 2540 33060\n", " 17340 8220 10950 10310 18370 2070 2490 730 8940 9910 5440 30\n", " 4080 6920 1170 10740 510 1690 2870 3350 11640 27530 9790 10170\n", " 7780 8040 530 230 7420 2130 14570 200 6930 7860 1680 2700\n", " 17080 15170 3250 4140 2850 20050 18760 15040 3030 3780 830 8550\n", " 7830 2900 11470 4870 12070 3200 8190 15320 3590 5650 5900 17540\n", " 40540 13990 15270 35280 860 1920 5980 6400 11900 4380 6710 1370\n", " 17650 4330 45740 3410 8670 13130 19780 2390 32240 26490 25430 9820\n", " 2630 20640 16990 630 5390 15490 3210 1530 9770 17280 5090 7180\n", " 9560 48330 1760 24770 3870 18620 12770 9640 2590 24530 16210 17570\n", " 7980 3290 6320 5070 3470 2720 14600 6890 27650 8860 5000 1120\n", " 940 14040 4770 3440 3020 6270 21770 740 21370 1020 9680 21810\n", " 10620 3860 29190 4570 7550 12430 19700 4450 18670 19370 18540 3920\n", " 3170 7290 1980 12480 3100 7240 18710 2620 6420 470 5150 15700\n", " 5460 22350 2810 2820 6860 18020 1670 2220 1430 870 6300 19830\n", " 9430 23620 9630 4180 3890 4420 21930 2480 3460 6560 5840 2230\n", " 19640 6480 4610 6330 1520 3740 1990 36410 7680 13750 27150 17290\n", " 26990 29070 3750 13170 5080 13190 5350 3230 3380 3430 8110 6250\n", " 12020 5010 18050 5380 16680 11540 2210 4300 5220 9990 10450 690\n", " 1830 5330 1400 3490 1900 1880 21790]\n", "Unique values in CompetitionOpenSinceMonth: [ 9 11 12 4 10 8 3 6 5 1 2 7]\n", "Unique values in 
CompetitionOpenSinceYear: [2008 2007 2006 2009 2015 2013 2014 2000 2011 2010 2005 1999 2003 2012\n", " 2004 2002 1961 1995 2001 1990 1994 1900 1998]\n", "Unique values in Sales: [ 5263 6064 8314 ... 660 17815 23303]\n" ] } ], "source": [ "def print_unique_values(dataframe):\n", " for column in dataframe.columns:\n", " unique_values = dataframe[column].unique()\n", " print(f\"Unique values in {column}: {unique_values}\")\n", "\n", "# Example usage:\n", "print_unique_values(df)\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Save the Model" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['model2.pkl']" ] }, "execution_count": 32, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import joblib\n", "\n", "# NOTE(review): the saved output shows this dump wrote 'model2.pkl' to the working directory, but the next cell loads '../models/model2.pkl' -- confirm the file was moved there.\n", "# joblib.dump(pipe, 'model2.pkl')" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [], "source": [ "model1 = joblib.load(\"../models/model2.pkl\")" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([6357.158], dtype=float32)" ] }, "execution_count": 34, "metadata": {}, "output_type": "execute_result" } ], "source": [ "model1.predict(temp_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# ..." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "
\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 2 }