<h1> ----- PIPELINE NOTEBOOK ----- </h1>

In [2]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

In [3]:
# Read the cleaned Rossmann sales data and preview its first rows.
data_path = r"../Dataset/Rossmann_Cleaned_data.csv"
df = pd.read_csv(data_path)
df.head()

Unnamed: 0.1,Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,0,1,5,2015-07-31,5263,555,1,0,1,Large Store,basic,1270,9,2008,0,0,0,0
1,1,2,5,2015-07-31,6064,625,1,0,1,Small Shop,basic,570,11,2007,1,13,2010,"Jan,Apr,Jul,Oct"
2,2,3,5,2015-07-31,8314,821,1,0,1,Small Shop,basic,14130,12,2006,1,14,2011,"Jan,Apr,Jul,Oct"
3,3,4,5,2015-07-31,13995,1498,1,0,1,Large Store,extended,620,9,2009,0,0,0,0
4,4,5,5,2015-07-31,4822,559,1,0,1,Small Shop,basic,29910,4,2015,0,0,0,0


In [4]:
# Keep only the columns used for modelling; "Sales" is the prediction target.
selected_columns = [
    "PromoInterval",
    "StoreType",
    "Assortment",
    "StateHoliday",
    "Store",
    "Customers",
    "Promo",
    "SchoolHoliday",
    "CompetitionDistance",
    "CompetitionOpenSinceMonth",
    "CompetitionOpenSinceYear",
    "Sales",
]
df = df[selected_columns]
df.head()

Unnamed: 0,PromoInterval,StoreType,Assortment,StateHoliday,Store,Customers,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Sales
0,0,Large Store,basic,0,1,555,1,1,1270,9,2008,5263
1,"Jan,Apr,Jul,Oct",Small Shop,basic,0,2,625,1,1,570,11,2007,6064
2,"Jan,Apr,Jul,Oct",Small Shop,basic,0,3,821,1,1,14130,12,2006,8314
3,0,Large Store,extended,0,4,1498,1,1,620,9,2009,13995
4,0,Small Shop,basic,0,5,559,1,1,29910,4,2015,4822


In [10]:
# Smallest and largest customer count in the data, one value per line.
print(df["Customers"].min(), df["Customers"].max(), sep="\n")

8
7388


In [11]:
# Summarise each column's dtype and non-null count, plus the frame's memory use.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 844338 entries, 0 to 844337
Data columns (total 12 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   PromoInterval              844338 non-null  object
 1   StoreType                  844338 non-null  object
 2   Assortment                 844338 non-null  object
 3   StateHoliday               844338 non-null  int64 
 4   Store                      844338 non-null  int64 
 5   Customers                  844338 non-null  int64 
 6   Promo                      844338 non-null  int64 
 7   SchoolHoliday              844338 non-null  int64 
 8   CompetitionDistance        844338 non-null  int64 
 9   CompetitionOpenSinceMonth  844338 non-null  int64 
 10  CompetitionOpenSinceYear   844338 non-null  int64 
 11  Sales                      844338 non-null  int64 
dtypes: int64(9), object(3)
memory usage: 77.3+ MB


In [14]:
# (rows, columns) of the modelling frame.
df.shape

(844338, 12)

In [15]:
def print_unique_values(dataframe):
    """Print the distinct values of every column in ``dataframe``.

    Columns are visited in their existing order and one line is written to
    standard output per column, formatted as
    ``Unique values in <column>: <distinct values>``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame whose columns should be summarised.

    Returns
    -------
    None
    """
    for name in dataframe.columns:
        distinct = dataframe[name].unique()
        print(f"Unique values in {name}: {distinct}")

# Example usage: list the distinct values of every column in the modelling frame.
print_unique_values(df)


Unique values in PromoInterval: ['0' 'Jan,Apr,Jul,Oct' 'Feb,May,Aug,Nov' 'Mar,Jun,Sept,Dec']
Unique values in StoreType: ['Large Store' 'Small Shop' 'Hypermarket' 'Medium Store']
Unique values in Assortment: ['basic' 'extended' 'extra']
Unique values in StateHoliday: [0 1]
Unique values in Store: [   1    2    3 ... 1115  876  292]
Unique values in Customers: [ 555  625  821 ... 3900   36 4065]
Unique values in Promo: [1 0]
Unique values in SchoolHoliday: [1 0]
Unique values in CompetitionDistance: [ 1270   570 14130   620 29910   310 24000  7520  2030  3160   960  1070
  1300  4110  3270    50 13840  3240  2340   550  1040  4060  4590   430
  2300    60  1200  2170    40  9800  2910  1320  2240  7660   540  4230
  1090   260   180  1180   290  4880  9710   270  1060 18010  6260 10570
   450 30360  7170   720  6620   420  7340  2840  5540   350  2050  3700
 22560   410   250  1130  4840 17500  2200  1650   330 22440 19960  3510
  3320  7910  2370 22390  2710 11810  1870   480   560 106

In [16]:
# Separate the regression target ("Sales") from the predictor columns.
y = df["Sales"]
X = df.drop("Sales", axis="columns")

## Train Test Split

In [17]:
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs. `train_test_split` is imported with the
# other dependencies at the top of the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Checking the shapes after splitting
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((633253, 11), (211085, 11), (633253,), (211085,))

In [18]:
# Display the training features (pandas shows a truncated preview for large frames).
X_train

Unnamed: 0,PromoInterval,StoreType,Assortment,StateHoliday,Store,Customers,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear
795018,"Jan,Apr,Jul,Oct",Small Shop,basic,0,650,636,1,0,1420,10,2012
463276,"Jan,Apr,Jul,Oct",Small Shop,basic,0,72,261,0,0,2200,12,2009
268352,0,Medium Store,extra,0,733,3567,1,0,860,10,1999
67308,0,Small Shop,extended,0,796,791,1,0,7180,11,2012
482458,0,Small Shop,extended,0,301,480,0,0,4510,3,2015
...,...,...,...,...,...,...,...,...,...,...,...
259178,"Feb,May,Aug,Nov",Small Shop,basic,0,1013,217,0,0,630,2,2015
365838,"Jan,Apr,Jul,Oct",Small Shop,extended,0,11,1394,1,0,960,11,2011
131932,0,Small Shop,basic,0,376,796,0,0,160,8,2012
671155,0,Hypermarket,extended,0,76,885,0,0,19960,3,2006


In [19]:
# Row count per PromoInterval category.
df["PromoInterval"].value_counts()

PromoInterval
0                   423292
Jan,Apr,Jul,Oct     242397
Feb,May,Aug,Nov      97998
Mar,Jun,Sept,Dec     80651
Name: count, dtype: int64

# Pipeline

In [20]:
# Categorical columns that get one-hot encoded; every other column is passed
# through unchanged. Categories not seen during fit are ignored at transform time.
ohe_col = ["PromoInterval", "StoreType", "Assortment"]

ct_encoding = ColumnTransformer(
    [("ohe", OneHotEncoder(handle_unknown="ignore"), ohe_col)],
    remainder="passthrough",
)

# Gradient-boosted regressor used to predict Sales.
model = XGBRegressor(max_depth=13, learning_rate=0.1)

# Full pipeline: encode -> scale -> regress.
pipeline_steps = [
    ("encoding", ct_encoding),
    ("scaler", StandardScaler()),
    ("model", model),
]
pipe = Pipeline(pipeline_steps)

# Fit every step on the training split; the fitted pipeline is the cell output.
pipe.fit(X_train, y_train)


In [21]:
# Predict Sales for the held-out test features and display the resulting array.
y_pred = pipe.predict(X_test)
y_pred

array([5674.2217, 7922.6377, 9180.126 , ..., 7287.449 , 3228.0945,
       4453.9897], dtype=float32)

In [22]:
# Actual Sales values of the test split, for eyeballing against y_pred.
y_test

43879      5934
562681     7800
239643     9111
689976     7831
397240    10046
          ...  
512864    13692
750784     6958
192729     6785
755727     2925
604917     4178
Name: Sales, Length: 211085, dtype: int64

In [23]:
# Feature column names, in the order the pipeline was fitted on.
X_train.columns

Index(['PromoInterval', 'StoreType', 'Assortment', 'StateHoliday', 'Store',
       'Customers', 'Promo', 'SchoolHoliday', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear'],
      dtype='object')

In [24]:
# First training rows; the first one (index 795018) is reused below as a sample input.
X_train.head()

Unnamed: 0,PromoInterval,StoreType,Assortment,StateHoliday,Store,Customers,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear
795018,"Jan,Apr,Jul,Oct",Small Shop,basic,0,650,636,1,0,1420,10,2012
463276,"Jan,Apr,Jul,Oct",Small Shop,basic,0,72,261,0,0,2200,12,2009
268352,0,Medium Store,extra,0,733,3567,1,0,860,10,1999
67308,0,Small Shop,extended,0,796,791,1,0,7180,11,2012
482458,0,Small Shop,extended,0,301,480,0,0,4510,3,2015


In [26]:
# Hand-built single-row sample mirroring training row 795018 (the first row shown
# by X_train.head()). Because that row is part of the training split, predicting
# on it is an in-sample sanity check, not an evaluation.
#
# Numeric features are given as real integers so the frame has the same dtypes
# the pipeline was fitted on, instead of relying on numeric strings being
# coerced to floats somewhere inside scikit-learn. Keys are listed in the same
# order as the training feature columns.
temp_df = pd.DataFrame(
    [
        {
            "PromoInterval": "Jan,Apr,Jul,Oct",
            "StoreType": "Small Shop",
            "Assortment": "basic",
            "StateHoliday": 0,
            "Store": 650,
            "Customers": 636,
            "Promo": 1,
            "SchoolHoliday": 0,
            "CompetitionDistance": 1420,
            "CompetitionOpenSinceMonth": 10,
            "CompetitionOpenSinceYear": 2012,
        }
    ]
)
temp_df

Unnamed: 0,PromoInterval,StoreType,Assortment,StateHoliday,Store,Customers,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear
0,"Jan,Apr,Jul,Oct",Small Shop,basic,0,650,636,1,0,1420,10,2012


In [27]:
# Predict Sales for the single hand-built sample row.
pipe.predict(temp_df)

array([6357.158], dtype=float32)

In [31]:
# Fetch the full source row (including the actual Sales value) behind the sample.
# df.info() reports a default RangeIndex, so position 795018 is also label 795018.
record = df.iloc[795018]

print("Record at index 795018:", record, sep="\n")


Record at index 795018:
PromoInterval                Jan,Apr,Jul,Oct
StoreType                         Small Shop
Assortment                             basic
StateHoliday                               0
Store                                    650
Customers                                636
Promo                                      1
SchoolHoliday                              0
CompetitionDistance                     1420
CompetitionOpenSinceMonth                 10
CompetitionOpenSinceYear                2012
Sales                                   6322
Name: 795018, dtype: object


In [30]:
# Print the distinct values of every column again, reusing the
# print_unique_values helper defined earlier in the notebook rather than
# redefining an identical copy of it in this cell.
print_unique_values(df)


Unique values in PromoInterval: ['0' 'Jan,Apr,Jul,Oct' 'Feb,May,Aug,Nov' 'Mar,Jun,Sept,Dec']
Unique values in StoreType: ['Large Store' 'Small Shop' 'Hypermarket' 'Medium Store']
Unique values in Assortment: ['basic' 'extended' 'extra']
Unique values in StateHoliday: [0 1]
Unique values in Store: [   1    2    3 ... 1115  876  292]
Unique values in Customers: [ 555  625  821 ... 3900   36 4065]
Unique values in Promo: [1 0]
Unique values in SchoolHoliday: [1 0]
Unique values in CompetitionDistance: [ 1270   570 14130   620 29910   310 24000  7520  2030  3160   960  1070
  1300  4110  3270    50 13840  3240  2340   550  1040  4060  4590   430
  2300    60  1200  2170    40  9800  2910  1320  2240  7660   540  4230
  1090   260   180  1180   290  4880  9710   270  1060 18010  6260 10570
   450 30360  7170   720  6620   420  7340  2840  5540   350  2050  3700
 22560   410   250  1130  4840 17500  2200  1650   330 22440 19960  3510
  3320  7910  2370 22390  2710 11810  1870   480   560 106

## Save The Model 

In [32]:
# Persist the fitted pipeline to the exact path the following cell loads from,
# creating the target directory if it does not exist yet. This way a fresh
# "Restart & Run All" saves the model before trying to load it, instead of
# depending on a file that was written and moved by hand.
model_path = Path("../models/model2.pkl")
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(pipe, model_path)

['model2.pkl']

In [33]:
# Reload the persisted pipeline from disk. joblib.load unpickles the file, which
# can execute arbitrary code, so only load artefacts from a trusted source.
model1 = joblib.load("../models/model2.pkl")

In [34]:
# Run the reloaded pipeline on the same one-row sample passed to pipe.predict earlier.
model1.predict(temp_df)

array([6357.158], dtype=float32)

# ...

<hr>
