Varun6299 committed on
Commit
86359e6
·
verified ·
1 Parent(s): b118c1b

Upload folder using huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +189 -0
app.py CHANGED
@@ -6,6 +6,194 @@ from sklearn.compose import ColumnTransformer
6
  import traceback
7
  import numpy as np
8
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
 
11
  # Initialize the Flask application
@@ -14,6 +202,7 @@ superKart_sales_predictor_api = Flask("SuperKart Sales Predictor")
14
  # # Load the trained machine learning model
15
  model = joblib.load("final_xgboost_pipeline.joblib")
16
 
 
17
  # Define a route for the home page
18
  @superKart_sales_predictor_api.get('/')
19
  def home():
 
6
  import traceback
7
  import numpy as np
8
  import os
9
+ from typing import Iterable, Optional, Any
10
+ from sklearn.base import BaseEstimator, TransformerMixin
11
+ from sklearn.pipeline import Pipeline
12
+ from sklearn.compose import ColumnTransformer
13
+ from sklearn.preprocessing import OneHotEncoder
14
+
15
+
16
class ManualProductTypeMapper(BaseEstimator, TransformerMixin):
    """
    Map a product-type column onto a controlled set of allowed categories.

    Every value of ``product_col`` that is not in ``keep_set`` (unwanted, rare,
    unknown or missing) is replaced with the literal string 'Others'.

    Design notes (sklearn compatibility):
    - ``__init__`` stores its arguments untouched so ``sklearn.clone`` can
      rebuild the estimator for CV / hyper-parameter search.
    - Validation and conversion of the arguments happen in ``fit()``; the
      result is stored in ``keep_set_`` (trailing underscore = fitted state).
    - ``fit_transform()`` appends one synthetic 'Others' row when the training
      data contains none, so a downstream OneHotEncoder always learns that
      category.

    Parameters
    ----------
    product_col : str, default='Product_Type'
        Name of the DataFrame column holding the product type labels.

    keep_set : Iterable[str] or None, default=None
        Product type values to preserve as-is (list / set / tuple). Must be a
        non-empty iterable of values by the time ``fit()`` is called; a bare
        string is rejected because it would be split into characters.

    Attributes
    ----------
    keep_set_ : set
        Validated copy of ``keep_set``, created by ``fit()``.

    Notes
    -----
    The target ``y`` is never used.

    NOTE(review): the synthetic row added by ``fit_transform()`` makes the
    output one row longer than the input while ``y`` is left unchanged. Inside
    an sklearn ``Pipeline`` that is expected to break the following step's
    ``fit(X, y)`` with a length mismatch whenever the row is actually added --
    confirm against the training code, or give the OneHotEncoder explicit
    ``categories`` instead.
    """

    def __init__(self, product_col: str = 'Product_Type', keep_set: Optional[Iterable[str]] = None):
        # Store constructor arguments exactly as given: sklearn.clone compares
        # instance attributes with the constructor parameters.
        self.product_col = product_col
        self.keep_set = keep_set

    def fit(self, X: pd.DataFrame, y: Optional[Any] = None):
        """
        Validate the input and build the fitted keep-set.

        Parameters
        ----------
        X : pd.DataFrame
            Training data; must contain ``product_col``.
        y : ignored
            Accepted only for sklearn ``fit(X, y)`` API compatibility.

        Returns
        -------
        self : ManualProductTypeMapper
            The fitted transformer.

        Raises
        ------
        ValueError
            If ``X`` is not a DataFrame, ``product_col`` is missing, or
            ``keep_set`` is None, a bare string, or empty.
        """
        if not isinstance(X, pd.DataFrame):
            raise ValueError("fit expects X to be a pandas DataFrame")
        if self.product_col not in X.columns:
            raise ValueError(f"product_col '{self.product_col}' not found in X during fit")

        if self.keep_set is None:
            raise ValueError("ManualProductTypeMapper requires a non-empty keep_set (pass an iterable of values)")
        # A bare string is iterable, so set("Dairy") would silently become
        # {'D', 'a', 'i', 'r', 'y'} and every real label would map to 'Others'.
        if isinstance(self.keep_set, (str, bytes)):
            raise ValueError("keep_set must be an iterable of values, not a single string")

        # Defensive copy as a set: O(1) membership and immune to later
        # mutation of the caller's object.
        keep_values = set(self.keep_set)
        if not keep_values:
            raise ValueError("ManualProductTypeMapper requires a non-empty keep_set (pass an iterable of values)")
        self.keep_set_ = keep_values

        # Nothing is learned from the data itself.
        return self

    def transform(self, X: pd.DataFrame) -> pd.DataFrame:
        """
        Replace values of ``product_col`` that are not in ``keep_set_`` with 'Others'.

        Parameters
        ----------
        X : pd.DataFrame
            Data to transform; must contain ``product_col``.

        Returns
        -------
        X_transformed : pd.DataFrame
            Copy of ``X`` with the remapped column (object dtype). The input
            frame is not modified.

        Raises
        ------
        ValueError
            If called before ``fit()``, if ``X`` is not a DataFrame, or if
            ``product_col`` is missing.
        """
        if not hasattr(self, 'keep_set_'):
            raise ValueError("transform called before fit(). Call fit(X) first.")

        if not isinstance(X, pd.DataFrame):
            raise ValueError("transform expects a pandas DataFrame")
        if self.product_col not in X.columns:
            raise ValueError(f"product_col '{self.product_col}' not found in X during transform")

        X_out = X.copy()
        labels = X_out[self.product_col]
        keep_mask = labels.isin(self.keep_set_)
        # Cast to object first so 'Others' can be written even when the column
        # is categorical and lacks that category. astype returns a new Series,
        # so the caller's data stays untouched.
        mapped = labels.astype(object)
        mapped.loc[~keep_mask] = 'Others'
        X_out[self.product_col] = mapped
        return X_out

    @staticmethod
    def _safe_default(column: pd.Series) -> Any:
        """Return a filler for one column: NaN if empty, median if numeric, else the mode."""
        values = column.dropna()
        if values.empty:
            return np.nan
        if pd.api.types.is_numeric_dtype(values.dtype):
            return float(values.median())
        # Object, categorical, string, datetime, ...: most frequent value.
        return values.mode().iloc[0]

    def fit_transform(self, X: pd.DataFrame, y: Optional[Any] = None, **fit_params) -> pd.DataFrame:
        """
        Fit on ``X`` and return the transformed frame in one step.

        If no training row maps to 'Others', one synthetic row with
        ``product_col == 'Others'`` is appended so that a downstream
        OneHotEncoder learns the 'Others' category at fit time. The other
        columns of that row are filled with the column median (numeric) or
        mode (everything else), or NaN when the column has no non-null value.

        Parameters
        ----------
        X : pd.DataFrame
            Training data; must contain ``product_col``.
        y : ignored
            Accepted only for sklearn API compatibility.
        **fit_params
            Accepted for API compatibility; not used.

        Returns
        -------
        X_with_dummy : pd.DataFrame
            Transformed data. When the synthetic row is added the frame has
            one extra row and a fresh 0..n index.
        """
        X_trans = self.fit(X, y).transform(X)

        # 'Others' already occurs naturally: nothing to add.
        if (X_trans[self.product_col] == 'Others').any():
            return X_trans

        synthetic = {
            col: 'Others' if col == self.product_col else self._safe_default(X_trans[col])
            for col in X_trans.columns
        }
        synthetic_df = pd.DataFrame([synthetic], columns=X_trans.columns)
        return pd.concat([X_trans, synthetic_df], ignore_index=True)
169
+
170
+
171
+ # ------------------ Hard-coded keep list (edit as needed) ------------------
172
KEEP_PRODUCT_TYPES = {
    'Baking Goods',
    'Canned',
    'Dairy',
    'Frozen Foods',
    'Fruits and Vegetables',
    'Health and Hygiene',
    'Household',
    'Meat',
    'Snack Foods',
    'Soft Drinks',
}

# ------------------ Preprocessing steps ------------------

# Step 1: custom transformer that folds every Product_Type outside the
# hard-coded keep list into 'Others'.
mapper = ManualProductTypeMapper(product_col='Product_Type', keep_set=KEEP_PRODUCT_TYPES)

# Step 2: one-hot encode the categorical columns.
#   - 'ohe_cat' is only the reference name of the transformer entry.
#   - handle_unknown='ignore' avoids errors on categories unseen at fit time.
#   - sparse_output=False returns a dense numpy array (easier to turn back
#     into a DataFrame later).
#   - remainder='passthrough' leaves every column not in cat_cols unchanged.
# NOTE(review): `cat_cols` is not defined in the visible part of this file --
# confirm it is assigned before this statement runs, otherwise importing the
# module raises NameError.
col_transformer = ColumnTransformer(
    transformers=[
        (
            'ohe_cat',
            OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first'),
            cat_cols,
        ),
    ],
    remainder='passthrough',
)
197
 
198
 
199
  # Initialize the Flask application
 
202
  # # Load the trained machine learning model
203
  model = joblib.load("final_xgboost_pipeline.joblib")
204
 
205
+
206
  # Define a route for the home page
207
  @superKart_sales_predictor_api.get('/')
208
  def home():