chkp-talexm committed on
Commit
617b96b
·
1 Parent(s): 8a5806f
Files changed (1) hide show
  1. app.py +56 -230
app.py CHANGED
@@ -1,13 +1,9 @@
1
-
2
- import os, shutil
3
-
4
  import streamlit as st
5
  import pandas as pd
6
- import numpy as np
7
  import joblib
8
- import os
9
  from huggingface_hub import hf_hub_download
10
- from sklearn.preprocessing import LabelEncoder, StandardScaler
11
  from catboost import Pool
12
 
13
  # Hugging Face Model Repo
@@ -16,14 +12,14 @@ MODEL_DIR = "models"
16
  os.makedirs(MODEL_DIR, exist_ok=True)
17
 
18
  # Model Filenames
19
- CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
20
- XGB_MODEL_FILENAME = "models/xgb_model.pkl"
21
- RF_MODEL_FILENAME = "models/rf_model.pkl"
22
 
23
  # Local Paths
24
- CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
25
- XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.pkl")
26
- RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
27
 
28
  # Define Features
29
  CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
@@ -37,121 +33,12 @@ NUMERICAL_COLUMNS = [
37
 
38
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
39
 
40
- from sklearn.preprocessing import LabelEncoder, StandardScaler
41
- from catboost import Pool
42
-
43
-
44
- def preprocess_input(input_df, expected_feature_order):
45
- """
46
- Ensure preprocessing is correct:
47
- - Removes duplicate columns
48
- - Computes aggregations using only test data
49
- - Ensures categorical variables are properly encoded
50
- - Normalizes numerical features
51
- - Adds `is_click` column with 0 for compatibility
52
- - Orders columns as expected by the model
53
- """
54
- # Drop the DateTime column if it exists
55
- if "DateTime" in input_df.columns:
56
- input_df.drop(columns=["DateTime"], inplace=True)
57
-
58
- # Remove duplicate columns
59
- input_df = input_df.loc[:, ~input_df.columns.duplicated()]
60
- input_df.fillna(0, inplace=True)
61
-
62
- # Aggregate by age & gender vs product
63
- age_sex_product_agg = input_df.groupby(["age_level", "gender", "product"]).agg({
64
- "campaign_id": "nunique",
65
- "webpage_id": "nunique"
66
- }).reset_index()
67
-
68
- # Fix renaming: Remove missing columns
69
- age_sex_product_agg.columns = ["age_level", "gender", "product",
70
- "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"]
71
-
72
- input_df = input_df.merge(age_sex_product_agg, on=["age_level", "gender", "product"], how="left")
73
-
74
- # Aggregate by city, age, product
75
- city_age_product_agg = input_df.groupby(["city_development_index", "age_level", "product"]).agg({
76
- "campaign_id": "nunique",
77
- "webpage_id": "nunique"
78
- }).reset_index()
79
-
80
- # Fix renaming: Remove missing columns
81
- city_age_product_agg.columns = ["city_development_index", "age_level", "product",
82
- "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"]
83
-
84
- input_df = input_df.merge(city_age_product_agg, on=["city_development_index", "age_level", "product"], how="left")
85
- input_df.fillna(0, inplace=True)
86
-
87
- # **Ensure missing columns exist (Important Fix)**
88
- missing_columns = ["click_sum_age_sex_prod", "click_count_age_sex_prod",
89
- "click_sum_city_age_prod", "click_count_city_age_prod"]
90
-
91
- for col in missing_columns:
92
- if col not in input_df.columns:
93
- print(f"Warning: Missing column {col}. Filling with 0.")
94
- input_df[col] = 0 # Fill missing columns with default values
95
-
96
- # **Add `is_click` column with 0 for compatibility**
97
- if "is_click" not in input_df.columns:
98
- print("Adding `is_click` column with all values set to 0.")
99
- input_df["is_click"] = 0 # Model will ignore this for prediction
100
-
101
- # Feature List (Now includes `is_click`)
102
- features = ["age_level", "gender", "product", "campaign_id", "webpage_id",
103
- "product_category_1", "product_category_2", "user_group_id",
104
- "user_depth", "city_development_index", "var_1",
105
- "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
106
- "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
107
- "click_sum_age_sex_prod", "click_count_age_sex_prod",
108
- "click_sum_city_age_prod", "click_count_city_age_prod",
109
- "is_click"] # Included for compatibility
110
-
111
- categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]
112
-
113
- # ===========================
114
- # ENCODE CATEGORICAL FEATURES
115
- # ===========================
116
-
117
- label_encoders = {}
118
- for col in categorical_columns:
119
- le = LabelEncoder()
120
- input_df[col] = le.fit_transform(input_df[col].astype(str)) # Apply transformation correctly
121
- label_encoders[col] = le # Store encoder for reference
122
-
123
- # Normalize numerical features
124
- numerical_columns = [col for col in features if col not in categorical_columns]
125
- scaler = StandardScaler()
126
- input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])
127
-
128
- # ===========================
129
- # ENFORCE FEATURE ORDER
130
- # ===========================
131
- missing_features = set(expected_feature_order) - set(input_df.columns)
132
- extra_features = set(input_df.columns) - set(expected_feature_order)
133
-
134
- # Add missing features with default values
135
- for col in missing_features:
136
- print(f"Warning: Missing feature {col}. Filling with 0.")
137
- input_df[col] = 0
138
-
139
- # Drop unexpected features
140
- if extra_features:
141
- print(f"Warning: Dropping unexpected features: {extra_features}")
142
- input_df = input_df.drop(columns=list(extra_features))
143
-
144
- # Reorder columns to match the model's expected input
145
- input_df = input_df[expected_feature_order]
146
-
147
- return input_df
148
-
149
 
150
  def download_model(filename, local_path):
151
  """Download model from Hugging Face and move it to the correct location."""
 
152
  temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)
153
 
154
- # Ensure correct file placement
155
  if temp_path != local_path:
156
  shutil.move(temp_path, local_path)
157
 
@@ -163,20 +50,15 @@ def load_models():
163
  try:
164
  print("🔄 Checking and downloading models...")
165
 
166
- # Ensure models are downloaded and placed correctly
167
  if not os.path.exists(CATBOOST_MODEL_PATH):
168
- print("🚀 Downloading CatBoost model...")
169
  download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
170
 
171
  if not os.path.exists(XGB_MODEL_PATH):
172
- print("🚀 Downloading XGBoost model...")
173
  download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH)
174
 
175
  if not os.path.exists(RF_MODEL_PATH):
176
- print("🚀 Downloading RandomForest model...")
177
  download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
178
 
179
- # ✅ Load models
180
  print("📦 Loading models...")
181
  catboost_model = joblib.load(CATBOOST_MODEL_PATH)
182
  xgb_model = joblib.load(XGB_MODEL_PATH)
@@ -189,139 +71,83 @@ def load_models():
189
  print(f"❌ Error loading models: {e}")
190
  return None, None, None
191
 
 
192
  # Streamlit UI
193
  st.title("Is_Click Predictor - ML Model Inference")
194
  st.info("Upload a CSV file, and the trained models will predict click probability.")
195
 
196
  catboost, xgb, rf = load_models()
197
 
 
 
 
 
198
  expected_feature_order = catboost.feature_names_
199
  print("Expected Feature Order:", expected_feature_order)
 
200
  # Upload File
201
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
202
  if uploaded_file:
203
  input_df = pd.read_csv(uploaded_file)
204
  st.success("File uploaded successfully!")
205
 
206
- # ✅ Compute aggregations & preprocess
207
- input_df = preprocess_input(input_df, expected_feature_order)
 
 
 
 
 
 
208
 
209
  # ✅ Make Predictions
210
  st.subheader("Predictions in Progress...")
211
- from catboost import Pool
212
 
213
- # Define categorical features (MUST MATCH what was used during training)
214
- cat_features = ["gender", "product", "campaign_id", "webpage_id"]
 
215
 
216
- # Convert categorical features to strings (MUST be string, not float)
217
- for col in cat_features:
218
- input_df[col] = input_df[col].astype(str)
219
 
220
- expected_feature_order = catboost.feature_names_
221
- print("Expected Feature Order:", expected_feature_order)
 
222
 
223
- # Ensure input_df has the correct column order
224
- input_df = input_df[expected_feature_order]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
225
 
226
- input_pool = Pool(input_df, cat_features=cat_features)
227
- catboost_preds = catboost.predict(input_pool)
228
- catboost_probs = catboost.predict_proba(input_df)[:, 1]
229
- label_encoders = {} # Store encoders to ensure consistency
230
-
231
- for col in cat_features:
232
- le = LabelEncoder()
233
- input_df[col] = input_df[col].astype(str) # Ensure it's a string
234
- le.fit(input_df[col]) # Fit only on input_df (since training is done)
235
- label_encoders[col] = le # Save encoder for reference
236
- input_df[col] = le.transform(input_df[col])
237
-
238
- # List of features used during training for XGBoost
239
- xgb_training_features = [
240
- "age_level", "gender", "product", "campaign_id", "webpage_id",
241
- "product_category_1", "product_category_2", "user_group_id",
242
- "user_depth", "city_development_index", "var_1",
243
- "click_sum_age_sex_prod", "click_count_age_sex_prod",
244
- "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
245
- "click_sum_city_age_prod", "click_count_city_age_prod",
246
- "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
247
- ]
248
-
249
- xgb_preds = xgb.predict(input_df[xgb_training_features])
250
-
251
- # # 🔥 List of features RandomForest was trained with
252
- # rf_training_features = [
253
- # "age_level", "gender", "product", "campaign_id", "webpage_id",
254
- # "product_category_1", "product_category_2", "user_group_id",
255
- # "user_depth", "city_development_index", "var_1",
256
- # "click_sum_age_sex_prod", "click_count_age_sex_prod",
257
- # "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
258
- # "click_sum_city_age_prod", "click_count_city_age_prod",
259
- # "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
260
- # ]
261
- #
262
- # # ✅ Ensure all training features exist in `input_df`
263
- # for col in rf_training_features:
264
- # if col not in input_df.columns:
265
- # input_df[col] = 0 # Default missing columns to 0
266
- #
267
- # # Get intersection of trained features and current input_df columns
268
- # common_features = list(set(rf.feature_names_in_) & set(input_df.columns))
269
- #
270
- # # Select only the matching features
271
- # input_df_rf = input_df[common_features]
272
- #
273
- # # Predict without needing to add missing features
274
- # rf_preds = rf.predict(input_df_rf)
275
- #
276
- #
277
- # print("RF Model Trained Features:", rf.feature_names_in_)
278
- # print("Input Data Features:", input_df_rf.columns.tolist())
279
- #
280
- # # Debugging: Check for missing or extra features
281
- # missing_features = set(rf.feature_names_in_) - set(input_df_rf.columns)
282
- # extra_features = set(input_df_rf.columns) - set(rf.feature_names_in_)
283
- #
284
- # print("Missing Features in Input:", missing_features)
285
- # print("Extra Features in Input:", extra_features)
286
- # # ✅ Make Predictions with RandomForest
287
- # rf_preds = rf.predict(input_df_rf)
288
-
289
- xgb_probs = xgb.predict_proba(input_df)[:, 1]
290
- #rf_probs = rf.predict_proba(input_df)[:, 1]
291
- #test
292
  # Combine results
293
  predictions_df = pd.DataFrame({
294
  "CatBoost": catboost_preds,
295
  "XGBoost": xgb_preds,
296
- # "RandomForest": rf_preds
297
  })
 
298
  # Apply "at least one model predicts 1" rule
299
  predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
300
 
301
- # Generate probability file
302
- probabilities_df = pd.DataFrame({
303
- "CatBoost_Prob": catboost_probs,
304
- "XGBoost_Prob": xgb_probs,
305
- # "RandomForest_Prob": rf_probs
306
- })
307
-
308
  # Save results
309
- binary_predictions_path = "binary_predictions.csv"
310
- filtered_predictions_path = "filtered_predictions.csv"
311
- probabilities_path = "model_probabilities.csv"
312
-
313
- predictions_df.to_csv(binary_predictions_path, index=False)
314
- predictions_df[predictions_df["is_click_predicted"] == 1].to_csv(filtered_predictions_path, index=False)
315
- probabilities_df.to_csv(probabilities_path, index=False)
316
 
317
  st.success("Predictions completed! Download results below.")
318
-
319
- # Download Buttons
320
- with open(binary_predictions_path, "rb") as f:
321
- st.download_button("Download Binary Predictions (0/1)", f, file_name="binary_predictions.csv")
322
-
323
- with open(filtered_predictions_path, "rb") as f:
324
- st.download_button("Download Clicked Predictions (Only 1s)", f, file_name="filtered_predictions.csv")
325
-
326
- with open(probabilities_path, "rb") as f:
327
- st.download_button("Download Probability Predictions", f, file_name="model_probabilities.csv")
 
1
+ import os
2
+ import shutil
 
3
  import streamlit as st
4
  import pandas as pd
 
5
  import joblib
 
6
  from huggingface_hub import hf_hub_download
 
7
  from catboost import Pool
8
 
9
  # Hugging Face Model Repo
 
12
  os.makedirs(MODEL_DIR, exist_ok=True)
13
 
14
  # Model Filenames
15
+ CATBOOST_MODEL_FILENAME = "catboost_model.pkl"
16
+ XGB_MODEL_FILENAME = "xgb_model.pkl"
17
+ RF_MODEL_FILENAME = "rf_model.pkl"
18
 
19
  # Local Paths
20
+ CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, CATBOOST_MODEL_FILENAME)
21
+ XGB_MODEL_PATH = os.path.join(MODEL_DIR, XGB_MODEL_FILENAME)
22
+ RF_MODEL_PATH = os.path.join(MODEL_DIR, RF_MODEL_FILENAME)
23
 
24
  # Define Features
25
  CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
 
33
 
34
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
36
 
37
  def download_model(filename, local_path):
38
  """Download model from Hugging Face and move it to the correct location."""
39
+ print(f"📥 Downloading {filename} from Hugging Face...")
40
  temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)
41
 
 
42
  if temp_path != local_path:
43
  shutil.move(temp_path, local_path)
44
 
 
50
  try:
51
  print("🔄 Checking and downloading models...")
52
 
 
53
  if not os.path.exists(CATBOOST_MODEL_PATH):
 
54
  download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
55
 
56
  if not os.path.exists(XGB_MODEL_PATH):
 
57
  download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH)
58
 
59
  if not os.path.exists(RF_MODEL_PATH):
 
60
  download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
61
 
 
62
  print("📦 Loading models...")
63
  catboost_model = joblib.load(CATBOOST_MODEL_PATH)
64
  xgb_model = joblib.load(XGB_MODEL_PATH)
 
71
  print(f"❌ Error loading models: {e}")
72
  return None, None, None
73
 
74
+
75
  # Streamlit UI
76
  st.title("Is_Click Predictor - ML Model Inference")
77
  st.info("Upload a CSV file, and the trained models will predict click probability.")
78
 
79
  catboost, xgb, rf = load_models()
80
 
81
+ if not catboost:
82
+ st.error("❌ Error: Failed to load models. Please check your Hugging Face repo.")
83
+ st.stop()
84
+
85
  expected_feature_order = catboost.feature_names_
86
  print("Expected Feature Order:", expected_feature_order)
87
+
88
  # Upload File
89
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
90
  if uploaded_file:
91
  input_df = pd.read_csv(uploaded_file)
92
  st.success("File uploaded successfully!")
93
 
94
+ # Ensure all expected columns exist in the test set
95
+ for col in expected_feature_order:
96
+ if col not in input_df.columns:
97
+ print(f"⚠️ Warning: Missing feature {col}. Filling with 0.")
98
+ input_df[col] = 0
99
+
100
+ # Reorder columns before prediction
101
+ input_df = input_df[expected_feature_order]
102
 
103
  # ✅ Make Predictions
104
  st.subheader("Predictions in Progress...")
 
105
 
106
+ # Create CatBoost pool
107
+ cat_features = CATEGORICAL_COLUMNS
108
+ input_pool = Pool(input_df, cat_features=cat_features)
109
 
110
+ catboost_probs = catboost.predict_proba(input_pool)[:, 1]
 
 
111
 
112
+ # ✅ Adjust decision threshold
113
+ THRESHOLD = 0.6 # Reduce false positives
114
+ catboost_preds = (catboost_probs >= THRESHOLD).astype(int)
115
 
116
+ # Ensure all required columns exist for XGBoost
117
+ for col in xgb.feature_names_in_:
118
+ if col not in input_df.columns:
119
+ input_df[col] = 0
120
+
121
+ xgb_probs = xgb.predict_proba(input_df[xgb.feature_names_in_])[:, 1]
122
+ xgb_preds = (xgb_probs >= THRESHOLD).astype(int)
123
+
124
+ # Ensure all required columns exist for RandomForest
125
+ for col in rf.feature_names_in_:
126
+ if col not in input_df.columns:
127
+ input_df[col] = 0
128
+
129
+ rf_probs = rf.predict_proba(input_df[rf.feature_names_in_])[:, 1]
130
+ rf_preds = (rf_probs >= THRESHOLD).astype(int)
131
+
132
+ # ✅ Debugging: Check probability distributions
133
+ print("🔍 Probability distributions:")
134
+ print("CatBoost:", pd.Series(catboost_probs).describe())
135
+ print("XGBoost:", pd.Series(xgb_probs).describe())
136
+ print("RandomForest:", pd.Series(rf_probs).describe())
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  # Combine results
139
  predictions_df = pd.DataFrame({
140
  "CatBoost": catboost_preds,
141
  "XGBoost": xgb_preds,
142
+ "RandomForest": rf_preds
143
  })
144
+
145
  # Apply "at least one model predicts 1" rule
146
  predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
147
 
 
 
 
 
 
 
 
148
  # Save results
149
+ predictions_df.to_csv("binary_predictions.csv", index=False)
150
+ predictions_df[predictions_df["is_click_predicted"] == 1].to_csv("filtered_predictions.csv", index=False)
 
 
 
 
 
151
 
152
  st.success("Predictions completed! Download results below.")
153
+ st.dataframe(predictions_df)