chkp-talexm committed on
Commit
57da9af
·
1 Parent(s): 855e055
Files changed (1) hide show
  1. app.py +88 -59
app.py CHANGED
@@ -33,21 +33,14 @@ NUMERICAL_COLUMNS = [
33
 
34
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
35
 
36
- import pandas as pd
37
- from sklearn.preprocessing import LabelEncoder, StandardScaler
38
-
39
- import pandas as pd
40
- import numpy as np
41
- from sklearn.preprocessing import LabelEncoder, StandardScaler
42
-
43
-
44
- def preprocess_input(input_df):
45
  """
46
- Preprocess input data before passing it to CatBoost in the Streamlit app.
47
  - Removes DateTime columns
48
  - Computes aggregations
49
- - Ensures categorical variables are kept as strings
50
  - Normalizes numerical features
 
51
  """
52
 
53
  # 🚀 Step 1: Drop DateTime Columns
@@ -62,55 +55,25 @@ def preprocess_input(input_df):
62
  # 🚀 Step 2: Fill missing values before aggregations
63
  input_df.fillna(0, inplace=True)
64
 
65
- # 🚀 Step 3: Compute Aggregations
66
- def compute_aggregations(df, group_cols, agg_dict, suffix):
67
- if not all(col in df.columns for col in group_cols):
68
- print(f"πŸ›‘ Missing grouping columns for aggregation: {group_cols}")
69
- return df # Skip if group columns are missing
70
-
71
- agg_df = df.groupby(group_cols).agg(agg_dict).reset_index()
72
- if agg_df.empty:
73
- print(f"πŸ›‘ Aggregation resulted in an empty DataFrame for {group_cols}")
74
- return df
75
-
76
- agg_df.columns = group_cols + [f"{col}_{suffix}" for col in agg_df.columns[len(group_cols):]]
77
- return df.merge(agg_df, on=group_cols, how="left")
78
-
79
- # Check if required columns are present before aggregating
80
- required_columns = ["age_level", "gender", "product", "city_development_index"]
81
- if not all(col in input_df.columns for col in required_columns):
82
- print("πŸ›‘ Missing required columns for aggregations. Skipping aggregation steps.")
83
- else:
84
- # Aggregation: Age & Gender vs Product
85
- input_df = compute_aggregations(input_df, ["age_level", "gender", "product"], {
86
- "campaign_id": "nunique",
87
- "webpage_id": "nunique"
88
- }, suffix="age_sex_prod")
89
-
90
- # Aggregation: City, Age, Product
91
- input_df = compute_aggregations(input_df, ["city_development_index", "age_level", "product"], {
92
- "campaign_id": "nunique",
93
- "webpage_id": "nunique"
94
- }, suffix="city_age_prod")
95
-
96
- # 🚀 Step 4: Add Missing Aggregated Columns with Default Values
97
- aggregated_features = [
98
- "click_sum_age_sex_prod", "click_count_age_sex_prod", "unique_campaigns_age_sex_prod",
99
- "unique_webpages_age_sex_prod",
100
- "click_sum_city_age_prod", "click_count_city_age_prod", "unique_campaigns_city_age_prod",
101
- "unique_webpages_city_age_prod"
102
- ]
103
-
104
- for col in aggregated_features:
105
- if col not in input_df.columns:
106
- input_df[col] = 0 # Fill missing aggregated columns with default values
107
 
108
- # 🚀 Step 5: **Ensure Categorical Features Stay as Strings**
109
  categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]
110
  for col in categorical_columns:
111
- input_df[col] = input_df[col].astype(str).fillna("missing") # **Convert to string**
 
 
 
 
 
 
 
 
 
112
 
113
- # 🚀 Step 6: **Normalize Only Numerical Features** (DO NOT Normalize Categorical Columns)
114
  numerical_columns = [
115
  "age_level", "city_development_index", "user_group_id", "user_depth", "var_1",
116
  "click_sum_age_sex_prod", "click_count_age_sex_prod",
@@ -119,13 +82,79 @@ def preprocess_input(input_df):
119
  "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
120
  ]
121
 
122
- # Ensure all numerical columns exist
123
  numerical_columns = [col for col in numerical_columns if col in input_df.columns]
124
-
125
- # Normalize numerical features
126
  scaler = StandardScaler()
127
  input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])
128
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
  return input_df
130
 
131
 
 
33
 
34
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
35
 
36
+ def preprocess_input(input_df, train_df=None, model_type="catboost"):
 
 
 
 
 
 
 
 
37
  """
38
+ Preprocess input data before passing it to ML models.
39
  - Removes DateTime columns
40
  - Computes aggregations
41
+ - Ensures categorical variables are properly encoded
42
  - Normalizes numerical features
43
+ - Selects only required features for the given model
44
  """
45
 
46
  # 🚀 Step 1: Drop DateTime Columns
 
55
  # 🚀 Step 2: Fill missing values before aggregations
56
  input_df.fillna(0, inplace=True)
57
 
58
+ # 🚀 Step 3: Compute Aggregations (Requires a reference training dataset)
59
+ if train_df is not None:
60
+ input_df = apply_aggregations(input_df, train_df)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
+ # 🚀 Step 4: Ensure Categorical Features Stay as Strings
63
  categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]
64
  for col in categorical_columns:
65
+ input_df[col] = input_df[col].astype(str).fillna("missing")
66
+
67
+ # 🚀 Step 5: Ensure Consistent Label Encoding
68
+ label_encoders = {}
69
+ for col in categorical_columns:
70
+ le = LabelEncoder()
71
+ input_df[col] = input_df[col].astype(str)
72
+ le.fit(input_df[col].unique())
73
+ label_encoders[col] = le
74
+ input_df[col] = input_df[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
75
 
76
+ # 🚀 Step 6: Normalize Numerical Features
77
  numerical_columns = [
78
  "age_level", "city_development_index", "user_group_id", "user_depth", "var_1",
79
  "click_sum_age_sex_prod", "click_count_age_sex_prod",
 
82
  "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
83
  ]
84
 
85
+ # Check if all numerical columns exist
86
  numerical_columns = [col for col in numerical_columns if col in input_df.columns]
 
 
87
  scaler = StandardScaler()
88
  input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])
89
 
90
+ # 🚀 Step 7: Select Features Based on Model Type
91
+ model_features = {
92
+ "catboost": ["age_level", "gender", "product", "campaign_id", "webpage_id"] + numerical_columns,
93
+ "xgboost": ["age_level", "gender", "product", "campaign_id", "webpage_id"] + numerical_columns,
94
+ "random_forest": [
95
+ "age_level", "gender", "product", "campaign_id", "webpage_id",
96
+ "product_category_1", "product_category_2", "user_group_id",
97
+ "user_depth", "city_development_index", "var_1"
98
+ ] + numerical_columns
99
+ }
100
+
101
+ selected_features = model_features.get(model_type, input_df.columns)
102
+
103
+ # 🚀 Ensure only required features are passed to the model
104
+ input_df = input_df[selected_features]
105
+
106
+ return input_df
107
+
108
+
109
def apply_aggregations(input_df, train_df):
    """
    Attach click/campaign/webpage aggregates computed on ``train_df`` to ``input_df``.

    Two groupings of the reference data are aggregated and left-merged onto
    the input rows:

    - ``age_level`` / ``gender`` / ``product``  -> columns suffixed ``_age_sex_prod``
    - ``city_development_index`` / ``age_level`` / ``product`` -> columns
      suffixed ``_city_age_prod``

    For each grouping four columns are produced: ``click_sum_*`` and
    ``click_count_*`` (sum and count of ``is_click``), ``unique_campaigns_*``
    (distinct ``campaign_id``) and ``unique_webpages_*`` (distinct
    ``webpage_id``).

    Args:
        input_df: DataFrame containing the grouping columns listed above.
        train_df: Reference DataFrame containing the grouping columns plus
            ``is_click``, ``campaign_id`` and ``webpage_id``.

    Returns:
        A new DataFrame (``input_df`` itself is not modified) with the eight
        aggregate columns appended. Rows whose group does not occur in
        ``train_df`` get 0 for every aggregate.
    """
    # (grouping keys, column-name suffix) for every aggregation pass.
    groupings = (
        (["age_level", "gender", "product"], "age_sex_prod"),
        (["city_development_index", "age_level", "product"], "city_age_prod"),
    )

    aggregated_features = []
    for group_cols, suffix in groupings:
        # Named aggregation binds each output name to its source column and
        # function explicitly, instead of renaming a flattened MultiIndex by
        # position.
        named_aggs = {
            f"click_sum_{suffix}": ("is_click", "sum"),
            f"click_count_{suffix}": ("is_click", "count"),
            f"unique_campaigns_{suffix}": ("campaign_id", "nunique"),
            f"unique_webpages_{suffix}": ("webpage_id", "nunique"),
        }
        group_stats = train_df.groupby(group_cols).agg(**named_aggs).reset_index()

        # Left merge keeps every input row; unseen groups come back as NaN.
        input_df = input_df.merge(group_stats, on=group_cols, how="left")
        aggregated_features.extend(named_aggs)

    for col in aggregated_features:
        if col in input_df.columns:
            # Assign the filled column back: a chained
            # `df[col].fillna(..., inplace=True)` operates on a temporary
            # and does not update the frame under pandas Copy-on-Write.
            input_df[col] = input_df[col].fillna(0)
        else:
            # Fill missing aggregated columns with default values
            input_df[col] = 0

    return input_df
159
 
160