chkp-talexm committed on
Commit
1812a7a
·
1 Parent(s): 57da9af
Files changed (1) hide show
  1. app.py +19 -67
app.py CHANGED
@@ -1,27 +1,31 @@
 
 
 
1
  import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
- import os, shutil
5
  import joblib
 
6
  from huggingface_hub import hf_hub_download
7
  from sklearn.preprocessing import LabelEncoder, StandardScaler
 
8
 
9
  # Hugging Face Model Repo
10
  MODEL_REPO = "chagu13/is_click_predictor"
11
  MODEL_DIR = "models"
12
- os.makedirs(MODEL_DIR, exist_ok=True) # Ensure directory exists
13
 
14
- # Model Filenames (on Hugging Face)
15
  CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
16
  XGB_MODEL_FILENAME = "models/xgb_model.pkl"
17
  RF_MODEL_FILENAME = "models/rf_model.pkl"
18
 
19
- # Expected Local Paths
20
  CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
21
  XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.pkl")
22
  RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
23
 
24
- # Define feature lists
25
  CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
26
  NUMERICAL_COLUMNS = [
27
  "age_level", "city_development_index", "user_group_id", "user_depth", "var_1",
@@ -33,75 +37,23 @@ NUMERICAL_COLUMNS = [
33
 
34
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
35
 
36
- def preprocess_input(input_df, train_df=None, model_type="catboost"):
37
- """
38
- Preprocess input data before passing it to ML models.
39
- - Removes DateTime columns
40
- - Computes aggregations
41
- - Ensures categorical variables are properly encoded
42
- - Normalizes numerical features
43
- - Selects only required features for the given model
44
- """
45
 
46
- # πŸš€ Step 1: Drop DateTime Columns
47
- datetime_columns = input_df.select_dtypes(include=["datetime", "object"]).columns
48
- for col in datetime_columns:
49
- try:
50
- input_df[col] = pd.to_datetime(input_df[col])
51
- input_df.drop(columns=[col], inplace=True)
52
- except Exception:
53
- continue
54
 
55
- # πŸš€ Step 2: Fill missing values before aggregations
56
- input_df.fillna(0, inplace=True)
57
 
58
- # πŸš€ Step 3: Compute Aggregations (Requires a reference training dataset)
59
- if train_df is not None:
60
- input_df = apply_aggregations(input_df, train_df)
61
 
62
- # πŸš€ Step 4: Ensure Categorical Features Stay as Strings
63
- categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]
64
- for col in categorical_columns:
65
  input_df[col] = input_df[col].astype(str).fillna("missing")
66
 
67
- # πŸš€ Step 5: Ensure Consistent Label Encoding
68
- label_encoders = {}
69
- for col in categorical_columns:
70
- le = LabelEncoder()
71
- input_df[col] = input_df[col].astype(str)
72
- le.fit(input_df[col].unique())
73
- label_encoders[col] = le
74
- input_df[col] = input_df[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
75
-
76
- # πŸš€ Step 6: Normalize Numerical Features
77
- numerical_columns = [
78
- "age_level", "city_development_index", "user_group_id", "user_depth", "var_1",
79
- "click_sum_age_sex_prod", "click_count_age_sex_prod",
80
- "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
81
- "click_sum_city_age_prod", "click_count_city_age_prod",
82
- "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
83
- ]
84
-
85
- # Check if all numerical columns exist
86
- numerical_columns = [col for col in numerical_columns if col in input_df.columns]
87
  scaler = StandardScaler()
88
- input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])
89
-
90
- # πŸš€ Step 7: Select Features Based on Model Type
91
- model_features = {
92
- "catboost": ["age_level", "gender", "product", "campaign_id", "webpage_id"] + numerical_columns,
93
- "xgboost": ["age_level", "gender", "product", "campaign_id", "webpage_id"] + numerical_columns,
94
- "random_forest": [
95
- "age_level", "gender", "product", "campaign_id", "webpage_id",
96
- "product_category_1", "product_category_2", "user_group_id",
97
- "user_depth", "city_development_index", "var_1"
98
- ] + numerical_columns
99
- }
100
-
101
- selected_features = model_features.get(model_type, input_df.columns)
102
-
103
- # πŸš€ Ensure only required features are passed to the model
104
- input_df = input_df[selected_features]
105
 
106
  return input_df
107
 
 
1
+
2
+ import os, shutil
3
+
4
  import streamlit as st
5
  import pandas as pd
6
  import numpy as np
 
7
  import joblib
8
+ import os
9
  from huggingface_hub import hf_hub_download
10
  from sklearn.preprocessing import LabelEncoder, StandardScaler
11
+ from catboost import Pool
12
 
13
  # Hugging Face Model Repo
14
  MODEL_REPO = "chagu13/is_click_predictor"
15
  MODEL_DIR = "models"
16
+ os.makedirs(MODEL_DIR, exist_ok=True)
17
 
18
+ # Model Filenames
19
  CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
20
  XGB_MODEL_FILENAME = "models/xgb_model.pkl"
21
  RF_MODEL_FILENAME = "models/rf_model.pkl"
22
 
23
+ # Local Paths
24
  CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
25
  XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.pkl")
26
  RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
27
 
28
+ # Define Features
29
  CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
30
  NUMERICAL_COLUMNS = [
31
  "age_level", "city_development_index", "user_group_id", "user_depth", "var_1",
 
37
 
38
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
39
 
 
 
 
 
 
 
 
 
 
40
 
41
def preprocess_input(input_df):
    """Preprocess a raw feature frame for the click-prediction models.

    Steps:
      1. Drop duplicated column names (keeping the first occurrence).
      2. Fill missing categorical values with the sentinel "missing",
         then cast categoricals to ``str``.
      3. Fill remaining (numeric) missing values with 0.
      4. Standardize whichever NUMERICAL_COLUMNS are present using a
         freshly fit ``StandardScaler``.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Raw input features. Must contain every column in
        CATEGORICAL_COLUMNS; NUMERICAL_COLUMNS may be partially present
        (aggregation columns are optional).

    Returns
    -------
    pandas.DataFrame
        A new frame with encoded categoricals and scaled numericals.
        The caller's frame is not mutated.
    """
    # Drop duplicate column names and take an explicit copy so the
    # in-place assignments below never touch the caller's DataFrame
    # (and never trigger SettingWithCopyWarning on a .loc slice).
    input_df = input_df.loc[:, ~input_df.columns.duplicated()].copy()

    # Fill categoricals BEFORE casting to str: casting first turns NaN
    # into the string "nan", making a later fillna("missing") dead code
    # (that was the bug in the previous revision of this function).
    for col in CATEGORICAL_COLUMNS:
        input_df[col] = input_df[col].fillna("missing").astype(str)

    # Any remaining missing values are numeric; default them to 0.
    input_df = input_df.fillna(0)

    # Scale only the numerical columns actually present — indexing with
    # absent aggregation columns would raise a KeyError.
    # NOTE(review): the scaler is fit on the inference batch itself, so
    # scaling depends on the batch; ideally it would be fit on training
    # statistics — confirm against how the models were trained.
    present_numericals = [c for c in NUMERICAL_COLUMNS if c in input_df.columns]
    if present_numericals:
        scaler = StandardScaler()
        input_df[present_numericals] = scaler.fit_transform(
            input_df[present_numericals]
        )

    return input_df
59