chkp-talexm committed on
Commit
14e9f1b
·
1 Parent(s): e285cce
Files changed (1) hide show
  1. app.py +49 -171
app.py CHANGED
@@ -1,11 +1,8 @@
1
-
2
- import os, shutil
3
-
4
  import streamlit as st
5
  import pandas as pd
6
- import numpy as np
7
  import joblib
8
- import os
9
  from huggingface_hub import hf_hub_download
10
  from sklearn.preprocessing import LabelEncoder, StandardScaler
11
  from catboost import Pool
@@ -16,14 +13,14 @@ MODEL_DIR = "models"
16
  os.makedirs(MODEL_DIR, exist_ok=True)
17
 
18
  # Model Filenames
19
- CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
20
- XGB_MODEL_FILENAME = "models/xgb_model.pkl"
21
- RF_MODEL_FILENAME = "models/rf_model.pkl"
22
 
23
  # Local Paths
24
- CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
25
- XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.pkl")
26
- RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
27
 
28
  # Define Features
29
  CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
@@ -37,21 +34,11 @@ NUMERICAL_COLUMNS = [
37
 
38
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
39
 
40
- from sklearn.preprocessing import LabelEncoder, StandardScaler
41
- from catboost import Pool
42
-
43
 
44
  def preprocess_input(input_df, expected_feature_order):
45
- """
46
- Ensure preprocessing is correct:
47
- - Removes duplicate columns
48
- - Computes aggregations using only test data
49
- - Ensures categorical variables are properly encoded
50
- - Normalizes numerical features
51
- - Adds `is_click` column with 0 for compatibility
52
- - Orders columns as expected by the model
53
- """
54
- # Drop the DateTime column if it exists
55
  if "DateTime" in input_df.columns:
56
  input_df.drop(columns=["DateTime"], inplace=True)
57
 
@@ -65,7 +52,6 @@ def preprocess_input(input_df, expected_feature_order):
65
  "webpage_id": "nunique"
66
  }).reset_index()
67
 
68
- # Fix renaming: Remove missing columns
69
  age_sex_product_agg.columns = ["age_level", "gender", "product",
70
  "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"]
71
 
@@ -77,7 +63,6 @@ def preprocess_input(input_df, expected_feature_order):
77
  "webpage_id": "nunique"
78
  }).reset_index()
79
 
80
- # Fix renaming: Remove missing columns
81
  city_age_product_agg.columns = ["city_development_index", "age_level", "product",
82
  "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"]
83
 
@@ -96,52 +81,17 @@ def preprocess_input(input_df, expected_feature_order):
96
  # **Add `is_click` column with 0 for compatibility**
97
  if "is_click" not in input_df.columns:
98
  print("Adding `is_click` column with all values set to 0.")
99
- input_df["is_click"] = 0 # Model will ignore this for prediction
100
-
101
- # Feature List (Now includes `is_click`)
102
- features = ["age_level", "gender", "product", "campaign_id", "webpage_id",
103
- "product_category_1", "product_category_2", "user_group_id",
104
- "user_depth", "city_development_index", "var_1",
105
- "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
106
- "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
107
- "click_sum_age_sex_prod", "click_count_age_sex_prod",
108
- "click_sum_city_age_prod", "click_count_city_age_prod",
109
- "is_click"] # Included for compatibility
110
-
111
- categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]
112
-
113
- # ===========================
114
- # ENCODE CATEGORICAL FEATURES
115
- # ===========================
116
-
117
- label_encoders = {}
118
- for col in categorical_columns:
119
- le = LabelEncoder()
120
- input_df[col] = le.fit_transform(input_df[col].astype(str)) # Apply transformation correctly
121
- label_encoders[col] = le # Store encoder for reference
122
-
123
- # Normalize numerical features
124
- numerical_columns = [col for col in features if col not in categorical_columns]
125
- scaler = StandardScaler()
126
- input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])
127
 
128
  # ===========================
129
  # ENFORCE FEATURE ORDER
130
  # ===========================
131
- missing_features = set(expected_feature_order) - set(input_df.columns)
132
- extra_features = set(input_df.columns) - set(expected_feature_order)
133
-
134
- # Add missing features with default values
135
- for col in missing_features:
136
- print(f"Warning: Missing feature {col}. Filling with 0.")
137
- input_df[col] = 0
138
-
139
- # Drop unexpected features
140
- if extra_features:
141
- print(f"Warning: Dropping unexpected features: {extra_features}")
142
- input_df = input_df.drop(columns=list(extra_features))
143
 
144
- # Reorder columns to match the model's expected input
145
  input_df = input_df[expected_feature_order]
146
 
147
  return input_df
@@ -151,7 +101,6 @@ def download_model(filename, local_path):
151
  """Download model from Hugging Face and move it to the correct location."""
152
  temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)
153
 
154
- # Ensure correct file placement
155
  if temp_path != local_path:
156
  shutil.move(temp_path, local_path)
157
 
@@ -163,7 +112,6 @@ def load_models():
163
  try:
164
  print("πŸ”„ Checking and downloading models...")
165
 
166
- # Ensure models are downloaded and placed correctly
167
  if not os.path.exists(CATBOOST_MODEL_PATH):
168
  print("πŸš€ Downloading CatBoost model...")
169
  download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
@@ -176,7 +124,6 @@ def load_models():
176
  print("πŸš€ Downloading RandomForest model...")
177
  download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
178
 
179
- # βœ… Load models
180
  print("πŸ“¦ Loading models...")
181
  catboost_model = joblib.load(CATBOOST_MODEL_PATH)
182
  xgb_model = joblib.load(XGB_MODEL_PATH)
@@ -189,11 +136,19 @@ def load_models():
189
  print(f"❌ Error loading models: {e}")
190
  return None, None, None
191
 
 
192
  # Streamlit UI
193
  st.title("Is_Click Predictor - ML Model Inference")
194
  st.info("Upload a CSV file, and the trained models will predict click probability.")
195
 
196
- catboost, xgb, rf = load_models()
 
 
 
 
 
 
 
197
 
198
  # Upload File
199
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
@@ -202,124 +157,47 @@ if uploaded_file:
202
  st.success("File uploaded successfully!")
203
 
204
  # βœ… Compute aggregations & preprocess
205
- input_df = preprocess_input(input_df)
206
 
207
  # βœ… Make Predictions
208
  st.subheader("Predictions in Progress...")
209
- from catboost import Pool
210
 
211
  # Define categorical features (MUST MATCH what was used during training)
212
  cat_features = ["gender", "product", "campaign_id", "webpage_id"]
213
 
214
- # Convert categorical features to strings (MUST be string, not float)
215
  for col in cat_features:
216
  input_df[col] = input_df[col].astype(str)
217
 
218
- expected_feature_order = catboost.feature_names_
219
- print("Expected Feature Order:", expected_feature_order)
 
 
220
 
221
- # Ensure input_df has the correct column order
222
- input_df = input_df[expected_feature_order]
 
 
223
 
224
- input_pool = Pool(input_df, cat_features=cat_features)
225
- catboost_preds = catboost.predict(input_pool)
226
- catboost_probs = catboost.predict_proba(input_df)[:, 1]
227
- label_encoders = {} # Store encoders to ensure consistency
 
 
 
 
 
 
228
 
229
- for col in cat_features:
230
- le = LabelEncoder()
231
- input_df[col] = input_df[col].astype(str) # Ensure it's a string
232
- le.fit(input_df[col]) # Fit only on input_df (since training is done)
233
- label_encoders[col] = le # Save encoder for reference
234
- input_df[col] = le.transform(input_df[col])
235
-
236
- # List of features used during training for XGBoost
237
- xgb_training_features = [
238
- "age_level", "gender", "product", "campaign_id", "webpage_id",
239
- "product_category_1", "product_category_2", "user_group_id",
240
- "user_depth", "city_development_index", "var_1",
241
- "click_sum_age_sex_prod", "click_count_age_sex_prod",
242
- "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
243
- "click_sum_city_age_prod", "click_count_city_age_prod",
244
- "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
245
- ]
246
-
247
- xgb_preds = xgb.predict(input_df[xgb_training_features])
248
-
249
- # # πŸ”₯ List of features RandomForest was trained with
250
- # rf_training_features = [
251
- # "age_level", "gender", "product", "campaign_id", "webpage_id",
252
- # "product_category_1", "product_category_2", "user_group_id",
253
- # "user_depth", "city_development_index", "var_1",
254
- # "click_sum_age_sex_prod", "click_count_age_sex_prod",
255
- # "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
256
- # "click_sum_city_age_prod", "click_count_city_age_prod",
257
- # "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
258
- # ]
259
- #
260
- # # βœ… Ensure all training features exist in `input_df`
261
- # for col in rf_training_features:
262
- # if col not in input_df.columns:
263
- # input_df[col] = 0 # Default missing columns to 0
264
- #
265
- # # Get intersection of trained features and current input_df columns
266
- # common_features = list(set(rf.feature_names_in_) & set(input_df.columns))
267
- #
268
- # # Select only the matching features
269
- # input_df_rf = input_df[common_features]
270
- #
271
- # # Predict without needing to add missing features
272
- # rf_preds = rf.predict(input_df_rf)
273
- #
274
- #
275
- # print("RF Model Trained Features:", rf.feature_names_in_)
276
- # print("Input Data Features:", input_df_rf.columns.tolist())
277
- #
278
- # # Debugging: Check for missing or extra features
279
- # missing_features = set(rf.feature_names_in_) - set(input_df_rf.columns)
280
- # extra_features = set(input_df_rf.columns) - set(rf.feature_names_in_)
281
- #
282
- # print("Missing Features in Input:", missing_features)
283
- # print("Extra Features in Input:", extra_features)
284
- # # βœ… Make Predictions with RandomForest
285
- # rf_preds = rf.predict(input_df_rf)
286
-
287
- xgb_probs = xgb.predict_proba(input_df)[:, 1]
288
- #rf_probs = rf.predict_proba(input_df)[:, 1]
289
- #test
290
  # Combine results
291
  predictions_df = pd.DataFrame({
292
  "CatBoost": catboost_preds,
293
  "XGBoost": xgb_preds,
294
- # "RandomForest": rf_preds
295
  })
296
- # Apply "at least one model predicts 1" rule
297
- predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
298
-
299
- # Generate probability file
300
- probabilities_df = pd.DataFrame({
301
- "CatBoost_Prob": catboost_probs,
302
- "XGBoost_Prob": xgb_probs,
303
- # "RandomForest_Prob": rf_probs
304
- })
305
-
306
- # Save results
307
- binary_predictions_path = "binary_predictions.csv"
308
- filtered_predictions_path = "filtered_predictions.csv"
309
- probabilities_path = "model_probabilities.csv"
310
 
311
- predictions_df.to_csv(binary_predictions_path, index=False)
312
- predictions_df[predictions_df["is_click_predicted"] == 1].to_csv(filtered_predictions_path, index=False)
313
- probabilities_df.to_csv(probabilities_path, index=False)
314
 
315
  st.success("Predictions completed! Download results below.")
316
-
317
- # Download Buttons
318
- with open(binary_predictions_path, "rb") as f:
319
- st.download_button("Download Binary Predictions (0/1)", f, file_name="binary_predictions.csv")
320
-
321
- with open(filtered_predictions_path, "rb") as f:
322
- st.download_button("Download Clicked Predictions (Only 1s)", f, file_name="filtered_predictions.csv")
323
-
324
- with open(probabilities_path, "rb") as f:
325
- st.download_button("Download Probability Predictions", f, file_name="model_probabilities.csv")
 
1
+ import os
2
+ import shutil
 
3
  import streamlit as st
4
  import pandas as pd
 
5
  import joblib
 
6
  from huggingface_hub import hf_hub_download
7
  from sklearn.preprocessing import LabelEncoder, StandardScaler
8
  from catboost import Pool
 
13
  os.makedirs(MODEL_DIR, exist_ok=True)
14
 
15
  # Model Filenames
16
+ CATBOOST_MODEL_FILENAME = "catboost_model.pkl"
17
+ XGB_MODEL_FILENAME = "xgb_model.pkl"
18
+ RF_MODEL_FILENAME = "rf_model.pkl"
19
 
20
  # Local Paths
21
+ CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, CATBOOST_MODEL_FILENAME)
22
+ XGB_MODEL_PATH = os.path.join(MODEL_DIR, XGB_MODEL_FILENAME)
23
+ RF_MODEL_PATH = os.path.join(MODEL_DIR, RF_MODEL_FILENAME)
24
 
25
  # Define Features
26
  CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
 
34
 
35
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
36
 
 
 
 
37
 
38
  def preprocess_input(input_df, expected_feature_order):
39
+ """ Preprocess input data before making predictions """
40
+
41
+ # Drop DateTime if present
 
 
 
 
 
 
 
42
  if "DateTime" in input_df.columns:
43
  input_df.drop(columns=["DateTime"], inplace=True)
44
 
 
52
  "webpage_id": "nunique"
53
  }).reset_index()
54
 
 
55
  age_sex_product_agg.columns = ["age_level", "gender", "product",
56
  "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"]
57
 
 
63
  "webpage_id": "nunique"
64
  }).reset_index()
65
 
 
66
  city_age_product_agg.columns = ["city_development_index", "age_level", "product",
67
  "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"]
68
 
 
81
  # **Add `is_click` column with 0 for compatibility**
82
  if "is_click" not in input_df.columns:
83
  print("Adding `is_click` column with all values set to 0.")
84
+ input_df["is_click"] = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  # ===========================
87
  # ENFORCE FEATURE ORDER
88
  # ===========================
89
+ for col in expected_feature_order:
90
+ if col not in input_df.columns:
91
+ print(f"Warning: Missing feature {col}. Filling with 0.")
92
+ input_df[col] = 0
 
 
 
 
 
 
 
 
93
 
94
+ # Reorder columns
95
  input_df = input_df[expected_feature_order]
96
 
97
  return input_df
 
101
  """Download model from Hugging Face and move it to the correct location."""
102
  temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)
103
 
 
104
  if temp_path != local_path:
105
  shutil.move(temp_path, local_path)
106
 
 
112
  try:
113
  print("πŸ”„ Checking and downloading models...")
114
 
 
115
  if not os.path.exists(CATBOOST_MODEL_PATH):
116
  print("πŸš€ Downloading CatBoost model...")
117
  download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
 
124
  print("πŸš€ Downloading RandomForest model...")
125
  download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
126
 
 
127
  print("πŸ“¦ Loading models...")
128
  catboost_model = joblib.load(CATBOOST_MODEL_PATH)
129
  xgb_model = joblib.load(XGB_MODEL_PATH)
 
136
  print(f"❌ Error loading models: {e}")
137
  return None, None, None
138
 
139
+
140
  # Streamlit UI
141
  st.title("Is_Click Predictor - ML Model Inference")
142
  st.info("Upload a CSV file, and the trained models will predict click probability.")
143
 
144
+ catboost_model, xgb_model, rf_model = load_models()
145
+
146
+ if not catboost_model:
147
+ st.error("❌ Error: Failed to load models. Please check your Hugging Face repo.")
148
+ st.stop()
149
+
150
+ expected_feature_order = catboost_model.feature_names_
151
+ print("Expected Feature Order:", expected_feature_order)
152
 
153
  # Upload File
154
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
 
157
  st.success("File uploaded successfully!")
158
 
159
  # βœ… Compute aggregations & preprocess
160
+ input_df = preprocess_input(input_df, expected_feature_order)
161
 
162
  # βœ… Make Predictions
163
  st.subheader("Predictions in Progress...")
 
164
 
165
  # Define categorical features (MUST MATCH what was used during training)
166
  cat_features = ["gender", "product", "campaign_id", "webpage_id"]
167
 
168
+ # Convert categorical features to string type
169
  for col in cat_features:
170
  input_df[col] = input_df[col].astype(str)
171
 
172
+ # Create CatBoost pool
173
+ input_pool = Pool(input_df, cat_features=cat_features)
174
+ catboost_preds = catboost_model.predict(input_pool)
175
+ catboost_probs = catboost_model.predict_proba(input_pool)[:, 1]
176
 
177
+ # Ensure all required columns exist for XGBoost
178
+ for col in xgb_model.feature_names_in_:
179
+ if col not in input_df.columns:
180
+ input_df[col] = 0
181
 
182
+ xgb_preds = xgb_model.predict(input_df[xgb_model.feature_names_in_])
183
+ xgb_probs = xgb_model.predict_proba(input_df[xgb_model.feature_names_in_])[:, 1]
184
+
185
+ # Ensure all required columns exist for RandomForest
186
+ for col in rf_model.feature_names_in_:
187
+ if col not in input_df.columns:
188
+ input_df[col] = 0
189
+
190
+ rf_preds = rf_model.predict(input_df[rf_model.feature_names_in_])
191
+ rf_probs = rf_model.predict_proba(input_df[rf_model.feature_names_in_])[:, 1]
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  # Combine results
194
  predictions_df = pd.DataFrame({
195
  "CatBoost": catboost_preds,
196
  "XGBoost": xgb_preds,
197
+ "RandomForest": rf_preds
198
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
+ predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
 
 
201
 
202
  st.success("Predictions completed! Download results below.")
203
+ st.dataframe(predictions_df)