chkp-talexm committed on
Commit
8a5806f
Β·
1 Parent(s): 615184d
Files changed (1) hide show
  1. app.py +219 -55
app.py CHANGED
@@ -1,10 +1,13 @@
1
- import os
2
- import shutil
 
3
  import streamlit as st
4
  import pandas as pd
5
  import numpy as np
6
  import joblib
 
7
  from huggingface_hub import hf_hub_download
 
8
  from catboost import Pool
9
 
10
  # Hugging Face Model Repo
@@ -34,10 +37,21 @@ NUMERICAL_COLUMNS = [
34
 
35
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
36
 
37
- def preprocess_input(input_df, expected_feature_order):
38
- """Prepares test data to match the training format"""
39
 
40
- # Drop DateTime column if present
 
 
 
 
 
 
 
 
 
 
 
41
  if "DateTime" in input_df.columns:
42
  input_df.drop(columns=["DateTime"], inplace=True)
43
 
@@ -45,32 +59,124 @@ def preprocess_input(input_df, expected_feature_order):
45
  input_df = input_df.loc[:, ~input_df.columns.duplicated()]
46
  input_df.fillna(0, inplace=True)
47
 
48
- # Ensure missing columns exist
49
- for col in expected_feature_order:
50
- if col not in input_df.columns:
51
- print(f"⚠️ Warning: Missing feature {col}. Filling with 0.")
52
- input_df[col] = 0
 
 
 
 
 
 
53
 
54
- # Reorder columns before prediction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  input_df = input_df[expected_feature_order]
56
 
57
  return input_df
58
 
59
 
 
 
 
 
 
 
 
 
 
 
 
60
  def load_models():
61
- """Downloads and loads models from Hugging Face."""
62
  try:
63
  print("πŸ”„ Checking and downloading models...")
64
 
 
65
  if not os.path.exists(CATBOOST_MODEL_PATH):
 
66
  download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
67
 
68
  if not os.path.exists(XGB_MODEL_PATH):
 
69
  download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH)
70
 
71
  if not os.path.exists(RF_MODEL_PATH):
 
72
  download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
73
 
 
74
  print("πŸ“¦ Loading models...")
75
  catboost_model = joblib.load(CATBOOST_MODEL_PATH)
76
  xgb_model = joblib.load(XGB_MODEL_PATH)
@@ -83,20 +189,14 @@ def load_models():
83
  print(f"❌ Error loading models: {e}")
84
  return None, None, None
85
 
86
-
87
  # Streamlit UI
88
  st.title("Is_Click Predictor - ML Model Inference")
89
  st.info("Upload a CSV file, and the trained models will predict click probability.")
90
 
91
  catboost, xgb, rf = load_models()
92
 
93
- if not catboost:
94
- st.error("❌ Error: Failed to load models. Please check your Hugging Face repo.")
95
- st.stop()
96
-
97
  expected_feature_order = catboost.feature_names_
98
  print("Expected Feature Order:", expected_feature_order)
99
-
100
  # Upload File
101
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
102
  if uploaded_file:
@@ -106,58 +206,122 @@ if uploaded_file:
106
  # βœ… Compute aggregations & preprocess
107
  input_df = preprocess_input(input_df, expected_feature_order)
108
 
109
- # βœ… Debugging: Check probability distribution before prediction
110
- print("πŸ” Checking feature distributions before prediction...")
111
- print(input_df.describe())
112
-
113
  # βœ… Make Predictions
114
  st.subheader("Predictions in Progress...")
 
115
 
116
- # Create CatBoost pool
117
- cat_features = CATEGORICAL_COLUMNS
118
- input_pool = Pool(input_df, cat_features=cat_features)
119
-
120
- catboost_probs = catboost.predict_proba(input_pool)[:, 1]
121
-
122
- # βœ… Adjust decision threshold
123
- THRESHOLD = 0.6 # Reduce false positives
124
- catboost_preds = (catboost_probs >= THRESHOLD).astype(int)
125
-
126
- # Ensure all required columns exist for XGBoost
127
- for col in xgb.feature_names_in_:
128
- if col not in input_df.columns:
129
- input_df[col] = 0
130
-
131
- xgb_probs = xgb.predict_proba(input_df[xgb.feature_names_in_])[:, 1]
132
- xgb_preds = (xgb_probs >= THRESHOLD).astype(int)
133
 
134
- # Ensure all required columns exist for RandomForest
135
- for col in rf.feature_names_in_:
136
- if col not in input_df.columns:
137
- input_df[col] = 0
138
 
139
- rf_probs = rf.predict_proba(input_df[rf.feature_names_in_])[:, 1]
140
- rf_preds = (rf_probs >= THRESHOLD).astype(int)
141
 
142
- # βœ… Fix: Debug probability distributions to verify realistic predictions
143
- print("πŸ” Probability distributions:")
144
- print("CatBoost:", pd.Series(catboost_probs).describe())
145
- print("XGBoost:", pd.Series(xgb_probs).describe())
146
- print("RandomForest:", pd.Series(rf_probs).describe())
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  # Combine results
149
  predictions_df = pd.DataFrame({
150
  "CatBoost": catboost_preds,
151
  "XGBoost": xgb_preds,
152
- "RandomForest": rf_preds
153
  })
154
-
155
  # Apply "at least one model predicts 1" rule
156
  predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
157
 
 
 
 
 
 
 
 
158
  # Save results
159
- predictions_df.to_csv("binary_predictions.csv", index=False)
160
- predictions_df[predictions_df["is_click_predicted"] == 1].to_csv("filtered_predictions.csv", index=False)
 
 
 
 
 
161
 
162
  st.success("Predictions completed! Download results below.")
163
- st.dataframe(predictions_df)
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os, shutil
3
+
4
  import streamlit as st
5
  import pandas as pd
6
  import numpy as np
7
  import joblib
8
+ import os
9
  from huggingface_hub import hf_hub_download
10
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
11
  from catboost import Pool
12
 
13
  # Hugging Face Model Repo
 
37
 
38
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
39
 
40
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
41
+ from catboost import Pool
42
 
43
+
44
def preprocess_input(input_df, expected_feature_order):
    """Prepare an inference DataFrame to match the model's training format.

    Steps:
      - Drops the DateTime column and duplicate columns.
      - Computes group-aggregation features (unique campaign/webpage counts).
      - Creates the click-aggregation columns (label-derived, so 0 at inference).
      - Label-encodes categorical columns and standardizes numeric ones.
      - Adds `is_click` = 0 for schema compatibility.
      - Adds missing expected columns (0), drops extras, and reorders.

    Args:
        input_df: Raw uploaded data (pandas DataFrame). Not mutated.
        expected_feature_order: Column order the trained model expects
            (e.g. ``model.feature_names_``).

    Returns:
        A new DataFrame whose columns exactly match ``expected_feature_order``.

    NOTE(review): the LabelEncoder and StandardScaler are fit on the
    *inference* data rather than loaded from training artifacts, so codes
    and scales can drift from what the models saw in training — confirm
    against the training pipeline.
    """
    # Work on a copy so the caller's DataFrame is not mutated in place.
    input_df = input_df.copy()

    # Drop the DateTime column if present — the models were not trained on it.
    if "DateTime" in input_df.columns:
        input_df.drop(columns=["DateTime"], inplace=True)

    # Remove duplicate columns and fill missing values.
    input_df = input_df.loc[:, ~input_df.columns.duplicated()]
    input_df.fillna(0, inplace=True)

    # Aggregate by age & gender vs product.
    age_sex_product_agg = input_df.groupby(["age_level", "gender", "product"]).agg({
        "campaign_id": "nunique",
        "webpage_id": "nunique",
    }).reset_index()
    age_sex_product_agg.columns = [
        "age_level", "gender", "product",
        "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
    ]
    input_df = input_df.merge(age_sex_product_agg,
                              on=["age_level", "gender", "product"], how="left")

    # Aggregate by city, age, product.
    city_age_product_agg = input_df.groupby(
        ["city_development_index", "age_level", "product"]).agg({
        "campaign_id": "nunique",
        "webpage_id": "nunique",
    }).reset_index()
    city_age_product_agg.columns = [
        "city_development_index", "age_level", "product",
        "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
    ]
    input_df = input_df.merge(
        city_age_product_agg,
        on=["city_development_index", "age_level", "product"], how="left")
    input_df.fillna(0, inplace=True)

    # Click-aggregation features require the label, which test data lacks;
    # create them with a default of 0 so the schema matches training.
    missing_columns = ["click_sum_age_sex_prod", "click_count_age_sex_prod",
                       "click_sum_city_age_prod", "click_count_city_age_prod"]
    for col in missing_columns:
        if col not in input_df.columns:
            print(f"Warning: Missing column {col}. Filling with 0.")
            input_df[col] = 0

    # Add `is_click` with all zeros for compatibility (ignored for prediction).
    if "is_click" not in input_df.columns:
        print("Adding `is_click` column with all values set to 0.")
        input_df["is_click"] = 0

    # Full feature list (includes `is_click` for compatibility).
    features = ["age_level", "gender", "product", "campaign_id", "webpage_id",
                "product_category_1", "product_category_2", "user_group_id",
                "user_depth", "city_development_index", "var_1",
                "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
                "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
                "click_sum_age_sex_prod", "click_count_age_sex_prod",
                "click_sum_city_age_prod", "click_count_city_age_prod",
                "is_click"]

    categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]

    # Encode categorical features as integer codes. (The per-column encoder
    # dict the original kept was never read, so it is dropped.)
    for col in categorical_columns:
        input_df[col] = LabelEncoder().fit_transform(input_df[col].astype(str))

    # Normalize numerical features.
    numerical_columns = [col for col in features if col not in categorical_columns]
    scaler = StandardScaler()
    input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])

    # Enforce the model's expected feature set and order.
    missing_features = set(expected_feature_order) - set(input_df.columns)
    extra_features = set(input_df.columns) - set(expected_feature_order)

    for col in missing_features:
        print(f"Warning: Missing feature {col}. Filling with 0.")
        input_df[col] = 0

    if extra_features:
        print(f"Warning: Dropping unexpected features: {extra_features}")
        input_df = input_df.drop(columns=list(extra_features))

    # Reorder columns to match the model's expected input.
    return input_df[expected_feature_order]
148
 
149
 
150
def download_model(filename, local_path):
    """Fetch *filename* from the Hugging Face model repo.

    Ensures the file ends up exactly at *local_path* and returns that path.
    """
    downloaded = hf_hub_download(repo_id=MODEL_REPO, filename=filename,
                                 local_dir=MODEL_DIR)
    # hf_hub_download may place the file elsewhere under MODEL_DIR;
    # relocate it to the exact path the loader expects.
    if downloaded != local_path:
        shutil.move(downloaded, local_path)
    return local_path
159
+
160
+
161
  def load_models():
162
+ """Download and load models from Hugging Face."""
163
  try:
164
  print("πŸ”„ Checking and downloading models...")
165
 
166
+ # Ensure models are downloaded and placed correctly
167
  if not os.path.exists(CATBOOST_MODEL_PATH):
168
+ print("πŸš€ Downloading CatBoost model...")
169
  download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
170
 
171
  if not os.path.exists(XGB_MODEL_PATH):
172
+ print("πŸš€ Downloading XGBoost model...")
173
  download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH)
174
 
175
  if not os.path.exists(RF_MODEL_PATH):
176
+ print("πŸš€ Downloading RandomForest model...")
177
  download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
178
 
179
+ # βœ… Load models
180
  print("πŸ“¦ Loading models...")
181
  catboost_model = joblib.load(CATBOOST_MODEL_PATH)
182
  xgb_model = joblib.load(XGB_MODEL_PATH)
 
189
  print(f"❌ Error loading models: {e}")
190
  return None, None, None
191
 
 
192
# Streamlit UI
st.title("Is_Click Predictor - ML Model Inference")
st.info("Upload a CSV file, and the trained models will predict click probability.")

catboost, xgb, rf = load_models()

# Guard: load_models() returns (None, None, None) on failure, so the
# attribute access below would raise AttributeError without this check.
if catboost is None:
    st.error("❌ Error: Failed to load models. Please check your Hugging Face repo.")
    st.stop()

expected_feature_order = catboost.feature_names_
print("Expected Feature Order:", expected_feature_order)
 
200
  # Upload File
201
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
202
  if uploaded_file:
 
206
  # βœ… Compute aggregations & preprocess
207
  input_df = preprocess_input(input_df, expected_feature_order)
208
 
 
 
 
 
209
  # βœ… Make Predictions
210
  st.subheader("Predictions in Progress...")
211
+ from catboost import Pool
212
 
213
+ # Define categorical features (MUST MATCH what was used during training)
214
+ cat_features = ["gender", "product", "campaign_id", "webpage_id"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
+ # Convert categorical features to strings (MUST be string, not float)
217
+ for col in cat_features:
218
+ input_df[col] = input_df[col].astype(str)
 
219
 
220
+ expected_feature_order = catboost.feature_names_
221
+ print("Expected Feature Order:", expected_feature_order)
222
 
223
+ # Ensure input_df has the correct column order
224
+ input_df = input_df[expected_feature_order]
 
 
 
225
 
226
+ input_pool = Pool(input_df, cat_features=cat_features)
227
+ catboost_preds = catboost.predict(input_pool)
228
+ catboost_probs = catboost.predict_proba(input_df)[:, 1]
229
+ label_encoders = {} # Store encoders to ensure consistency
230
+
231
+ for col in cat_features:
232
+ le = LabelEncoder()
233
+ input_df[col] = input_df[col].astype(str) # Ensure it's a string
234
+ le.fit(input_df[col]) # Fit only on input_df (since training is done)
235
+ label_encoders[col] = le # Save encoder for reference
236
+ input_df[col] = le.transform(input_df[col])
237
+
238
+ # List of features used during training for XGBoost
239
+ xgb_training_features = [
240
+ "age_level", "gender", "product", "campaign_id", "webpage_id",
241
+ "product_category_1", "product_category_2", "user_group_id",
242
+ "user_depth", "city_development_index", "var_1",
243
+ "click_sum_age_sex_prod", "click_count_age_sex_prod",
244
+ "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
245
+ "click_sum_city_age_prod", "click_count_city_age_prod",
246
+ "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
247
+ ]
248
+
249
+ xgb_preds = xgb.predict(input_df[xgb_training_features])
250
+
251
+ # # πŸ”₯ List of features RandomForest was trained with
252
+ # rf_training_features = [
253
+ # "age_level", "gender", "product", "campaign_id", "webpage_id",
254
+ # "product_category_1", "product_category_2", "user_group_id",
255
+ # "user_depth", "city_development_index", "var_1",
256
+ # "click_sum_age_sex_prod", "click_count_age_sex_prod",
257
+ # "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
258
+ # "click_sum_city_age_prod", "click_count_city_age_prod",
259
+ # "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
260
+ # ]
261
+ #
262
+ # # βœ… Ensure all training features exist in `input_df`
263
+ # for col in rf_training_features:
264
+ # if col not in input_df.columns:
265
+ # input_df[col] = 0 # Default missing columns to 0
266
+ #
267
+ # # Get intersection of trained features and current input_df columns
268
+ # common_features = list(set(rf.feature_names_in_) & set(input_df.columns))
269
+ #
270
+ # # Select only the matching features
271
+ # input_df_rf = input_df[common_features]
272
+ #
273
+ # # Predict without needing to add missing features
274
+ # rf_preds = rf.predict(input_df_rf)
275
+ #
276
+ #
277
+ # print("RF Model Trained Features:", rf.feature_names_in_)
278
+ # print("Input Data Features:", input_df_rf.columns.tolist())
279
+ #
280
+ # # Debugging: Check for missing or extra features
281
+ # missing_features = set(rf.feature_names_in_) - set(input_df_rf.columns)
282
+ # extra_features = set(input_df_rf.columns) - set(rf.feature_names_in_)
283
+ #
284
+ # print("Missing Features in Input:", missing_features)
285
+ # print("Extra Features in Input:", extra_features)
286
+ # # βœ… Make Predictions with RandomForest
287
+ # rf_preds = rf.predict(input_df_rf)
288
+
289
+ xgb_probs = xgb.predict_proba(input_df)[:, 1]
290
+ #rf_probs = rf.predict_proba(input_df)[:, 1]
291
+ #test
292
  # Combine results
293
  predictions_df = pd.DataFrame({
294
  "CatBoost": catboost_preds,
295
  "XGBoost": xgb_preds,
296
+ # "RandomForest": rf_preds
297
  })
 
298
  # Apply "at least one model predicts 1" rule
299
  predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
300
 
301
+ # Generate probability file
302
+ probabilities_df = pd.DataFrame({
303
+ "CatBoost_Prob": catboost_probs,
304
+ "XGBoost_Prob": xgb_probs,
305
+ # "RandomForest_Prob": rf_probs
306
+ })
307
+
308
  # Save results
309
+ binary_predictions_path = "binary_predictions.csv"
310
+ filtered_predictions_path = "filtered_predictions.csv"
311
+ probabilities_path = "model_probabilities.csv"
312
+
313
+ predictions_df.to_csv(binary_predictions_path, index=False)
314
+ predictions_df[predictions_df["is_click_predicted"] == 1].to_csv(filtered_predictions_path, index=False)
315
+ probabilities_df.to_csv(probabilities_path, index=False)
316
 
317
  st.success("Predictions completed! Download results below.")
318
+
319
+ # Download Buttons
320
+ with open(binary_predictions_path, "rb") as f:
321
+ st.download_button("Download Binary Predictions (0/1)", f, file_name="binary_predictions.csv")
322
+
323
+ with open(filtered_predictions_path, "rb") as f:
324
+ st.download_button("Download Clicked Predictions (Only 1s)", f, file_name="filtered_predictions.csv")
325
+
326
+ with open(probabilities_path, "rb") as f:
327
+ st.download_button("Download Probability Predictions", f, file_name="model_probabilities.csv")