chkp-talexm committed on
Commit
8a5806f
Β·
1 Parent(s): 615184d
Files changed (1) hide show
  1. app.py +219 -55
app.py CHANGED
@@ -1,10 +1,13 @@
1
- import os
2
- import shutil
 
3
  import streamlit as st
4
  import pandas as pd
5
  import numpy as np
6
  import joblib
 
7
  from huggingface_hub import hf_hub_download
 
8
  from catboost import Pool
9
 
10
  # Hugging Face Model Repo
@@ -34,10 +37,21 @@ NUMERICAL_COLUMNS = [
34
 
35
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
36
 
37
- def preprocess_input(input_df, expected_feature_order):
38
- """Prepares test data to match the training format"""
39
 
40
- # Drop DateTime column if present
 
 
 
 
 
 
 
 
 
 
 
41
  if "DateTime" in input_df.columns:
42
  input_df.drop(columns=["DateTime"], inplace=True)
43
 
@@ -45,32 +59,124 @@ def preprocess_input(input_df, expected_feature_order):
45
  input_df = input_df.loc[:, ~input_df.columns.duplicated()]
46
  input_df.fillna(0, inplace=True)
47
 
48
- # Ensure missing columns exist
49
- for col in expected_feature_order:
50
- if col not in input_df.columns:
51
- print(f"⚠️ Warning: Missing feature {col}. Filling with 0.")
52
- input_df[col] = 0
 
 
 
 
 
 
53
 
54
- # Reorder columns before prediction
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  input_df = input_df[expected_feature_order]
56
 
57
  return input_df
58
 
59
 
 
 
 
 
 
 
 
 
 
 
 
60
  def load_models():
61
- """Downloads and loads models from Hugging Face."""
62
  try:
63
  print("πŸ”„ Checking and downloading models...")
64
 
 
65
  if not os.path.exists(CATBOOST_MODEL_PATH):
 
66
  download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
67
 
68
  if not os.path.exists(XGB_MODEL_PATH):
 
69
  download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH)
70
 
71
  if not os.path.exists(RF_MODEL_PATH):
 
72
  download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
73
 
 
74
  print("πŸ“¦ Loading models...")
75
  catboost_model = joblib.load(CATBOOST_MODEL_PATH)
76
  xgb_model = joblib.load(XGB_MODEL_PATH)
@@ -83,20 +189,14 @@ def load_models():
83
  print(f"❌ Error loading models: {e}")
84
  return None, None, None
85
 
86
-
87
  # Streamlit UI
88
  st.title("Is_Click Predictor - ML Model Inference")
89
  st.info("Upload a CSV file, and the trained models will predict click probability.")
90
 
91
  catboost, xgb, rf = load_models()
92
 
93
- if not catboost:
94
- st.error("❌ Error: Failed to load models. Please check your Hugging Face repo.")
95
- st.stop()
96
-
97
  expected_feature_order = catboost.feature_names_
98
  print("Expected Feature Order:", expected_feature_order)
99
-
100
  # Upload File
101
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
102
  if uploaded_file:
@@ -106,58 +206,122 @@ if uploaded_file:
106
  # βœ… Compute aggregations & preprocess
107
  input_df = preprocess_input(input_df, expected_feature_order)
108
 
109
- # βœ… Debugging: Check probability distribution before prediction
110
- print("πŸ” Checking feature distributions before prediction...")
111
- print(input_df.describe())
112
-
113
  # βœ… Make Predictions
114
  st.subheader("Predictions in Progress...")
 
115
 
116
- # Create CatBoost pool
117
- cat_features = CATEGORICAL_COLUMNS
118
- input_pool = Pool(input_df, cat_features=cat_features)
119
-
120
- catboost_probs = catboost.predict_proba(input_pool)[:, 1]
121
-
122
- # βœ… Adjust decision threshold
123
- THRESHOLD = 0.6 # Reduce false positives
124
- catboost_preds = (catboost_probs >= THRESHOLD).astype(int)
125
-
126
- # Ensure all required columns exist for XGBoost
127
- for col in xgb.feature_names_in_:
128
- if col not in input_df.columns:
129
- input_df[col] = 0
130
-
131
- xgb_probs = xgb.predict_proba(input_df[xgb.feature_names_in_])[:, 1]
132
- xgb_preds = (xgb_probs >= THRESHOLD).astype(int)
133
 
134
- # Ensure all required columns exist for RandomForest
135
- for col in rf.feature_names_in_:
136
- if col not in input_df.columns:
137
- input_df[col] = 0
138
 
139
- rf_probs = rf.predict_proba(input_df[rf.feature_names_in_])[:, 1]
140
- rf_preds = (rf_probs >= THRESHOLD).astype(int)
141
 
142
- # βœ… Fix: Debug probability distributions to verify realistic predictions
143
- print("πŸ” Probability distributions:")
144
- print("CatBoost:", pd.Series(catboost_probs).describe())
145
- print("XGBoost:", pd.Series(xgb_probs).describe())
146
- print("RandomForest:", pd.Series(rf_probs).describe())
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  # Combine results
149
  predictions_df = pd.DataFrame({
150
  "CatBoost": catboost_preds,
151
  "XGBoost": xgb_preds,
152
- "RandomForest": rf_preds
153
  })
154
-
155
  # Apply "at least one model predicts 1" rule
156
  predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
157
 
 
 
 
 
 
 
 
158
  # Save results
159
- predictions_df.to_csv("binary_predictions.csv", index=False)
160
- predictions_df[predictions_df["is_click_predicted"] == 1].to_csv("filtered_predictions.csv", index=False)
 
 
 
 
 
161
 
162
  st.success("Predictions completed! Download results below.")
163
- st.dataframe(predictions_df)
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os, shutil
3
+
4
  import streamlit as st
5
  import pandas as pd
6
  import numpy as np
7
  import joblib
8
+ import os
9
  from huggingface_hub import hf_hub_download
10
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
11
  from catboost import Pool
12
 
13
  # Hugging Face Model Repo
 
37
 
38
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
39
 
40
+ from sklearn.preprocessing import LabelEncoder, StandardScaler
41
+ from catboost import Pool
42
 
43
+
44
def preprocess_input(input_df, expected_feature_order):
    """Prepare an inference DataFrame to match the model's training format.

    Steps:
      - Drops the DateTime column and duplicate columns.
      - Computes group-aggregation features (unique campaign/webpage counts).
      - Creates the click-aggregation columns (label-derived, so 0 at inference).
      - Label-encodes categorical columns and standardizes numeric ones.
      - Adds `is_click` = 0 for schema compatibility.
      - Adds missing expected columns (0), drops extras, and reorders.

    Args:
        input_df: Raw uploaded data (pandas DataFrame). Not mutated.
        expected_feature_order: Column order the trained model expects
            (e.g. ``model.feature_names_``).

    Returns:
        A new DataFrame whose columns exactly match ``expected_feature_order``.

    NOTE(review): the LabelEncoder and StandardScaler are fit on the
    *inference* data rather than loaded from training artifacts, so codes
    and scales can drift from what the models saw in training — confirm
    against the training pipeline.
    """
    # Work on a copy so the caller's DataFrame is not mutated in place.
    input_df = input_df.copy()

    # Drop the DateTime column if present — the models were not trained on it.
    if "DateTime" in input_df.columns:
        input_df.drop(columns=["DateTime"], inplace=True)

    # Remove duplicate columns and fill missing values.
    input_df = input_df.loc[:, ~input_df.columns.duplicated()]
    input_df.fillna(0, inplace=True)

    # Aggregate by age & gender vs product.
    age_sex_product_agg = input_df.groupby(["age_level", "gender", "product"]).agg({
        "campaign_id": "nunique",
        "webpage_id": "nunique",
    }).reset_index()
    age_sex_product_agg.columns = [
        "age_level", "gender", "product",
        "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
    ]
    input_df = input_df.merge(age_sex_product_agg,
                              on=["age_level", "gender", "product"], how="left")

    # Aggregate by city, age, product.
    city_age_product_agg = input_df.groupby(
        ["city_development_index", "age_level", "product"]).agg({
        "campaign_id": "nunique",
        "webpage_id": "nunique",
    }).reset_index()
    city_age_product_agg.columns = [
        "city_development_index", "age_level", "product",
        "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
    ]
    input_df = input_df.merge(
        city_age_product_agg,
        on=["city_development_index", "age_level", "product"], how="left")
    input_df.fillna(0, inplace=True)

    # Click-aggregation features require the label, which test data lacks;
    # create them with a default of 0 so the schema matches training.
    missing_columns = ["click_sum_age_sex_prod", "click_count_age_sex_prod",
                       "click_sum_city_age_prod", "click_count_city_age_prod"]
    for col in missing_columns:
        if col not in input_df.columns:
            print(f"Warning: Missing column {col}. Filling with 0.")
            input_df[col] = 0

    # Add `is_click` with all zeros for compatibility (ignored for prediction).
    if "is_click" not in input_df.columns:
        print("Adding `is_click` column with all values set to 0.")
        input_df["is_click"] = 0

    # Full feature list (includes `is_click` for compatibility).
    features = ["age_level", "gender", "product", "campaign_id", "webpage_id",
                "product_category_1", "product_category_2", "user_group_id",
                "user_depth", "city_development_index", "var_1",
                "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
                "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
                "click_sum_age_sex_prod", "click_count_age_sex_prod",
                "click_sum_city_age_prod", "click_count_city_age_prod",
                "is_click"]

    categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]

    # Encode categorical features as integer codes. (The per-column encoder
    # dict the original kept was never read, so it is dropped.)
    for col in categorical_columns:
        input_df[col] = LabelEncoder().fit_transform(input_df[col].astype(str))

    # Normalize numerical features.
    numerical_columns = [col for col in features if col not in categorical_columns]
    scaler = StandardScaler()
    input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])

    # Enforce the model's expected feature set and order.
    missing_features = set(expected_feature_order) - set(input_df.columns)
    extra_features = set(input_df.columns) - set(expected_feature_order)

    for col in missing_features:
        print(f"Warning: Missing feature {col}. Filling with 0.")
        input_df[col] = 0

    if extra_features:
        print(f"Warning: Dropping unexpected features: {extra_features}")
        input_df = input_df.drop(columns=list(extra_features))

    # Reorder columns to match the model's expected input.
    return input_df[expected_feature_order]
148
 
149
 
150
def download_model(filename, local_path):
    """Fetch *filename* from the Hugging Face model repo.

    Ensures the file ends up exactly at *local_path* and returns that path.
    """
    downloaded = hf_hub_download(repo_id=MODEL_REPO, filename=filename,
                                 local_dir=MODEL_DIR)
    # hf_hub_download may place the file elsewhere under MODEL_DIR;
    # relocate it to the exact path the loader expects.
    if downloaded != local_path:
        shutil.move(downloaded, local_path)
    return local_path
159
+
160
+
161
  def load_models():
162
+ """Download and load models from Hugging Face."""
163
  try:
164
  print("πŸ”„ Checking and downloading models...")
165
 
166
+ # Ensure models are downloaded and placed correctly
167
  if not os.path.exists(CATBOOST_MODEL_PATH):
168
+ print("πŸš€ Downloading CatBoost model...")
169
  download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
170
 
171
  if not os.path.exists(XGB_MODEL_PATH):
172
+ print("πŸš€ Downloading XGBoost model...")
173
  download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH)
174
 
175
  if not os.path.exists(RF_MODEL_PATH):
176
+ print("πŸš€ Downloading RandomForest model...")
177
  download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
178
 
179
+ # βœ… Load models
180
  print("πŸ“¦ Loading models...")
181
  catboost_model = joblib.load(CATBOOST_MODEL_PATH)
182
  xgb_model = joblib.load(XGB_MODEL_PATH)
 
189
  print(f"❌ Error loading models: {e}")
190
  return None, None, None
191
 
 
192
# Streamlit UI
st.title("Is_Click Predictor - ML Model Inference")
st.info("Upload a CSV file, and the trained models will predict click probability.")

catboost, xgb, rf = load_models()

# Guard: load_models() returns (None, None, None) on failure, so the
# attribute access below would raise AttributeError without this check.
if catboost is None:
    st.error("❌ Error: Failed to load models. Please check your Hugging Face repo.")
    st.stop()

expected_feature_order = catboost.feature_names_
print("Expected Feature Order:", expected_feature_order)
 
200
  # Upload File
201
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
202
  if uploaded_file:
 
206
  # βœ… Compute aggregations & preprocess
207
  input_df = preprocess_input(input_df, expected_feature_order)
208
 
 
 
 
 
209
  # βœ… Make Predictions
210
  st.subheader("Predictions in Progress...")
211
+ from catboost import Pool
212
 
213
+ # Define categorical features (MUST MATCH what was used during training)
214
+ cat_features = ["gender", "product", "campaign_id", "webpage_id"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
216
+ # Convert categorical features to strings (MUST be string, not float)
217
+ for col in cat_features:
218
+ input_df[col] = input_df[col].astype(str)
 
219
 
220
+ expected_feature_order = catboost.feature_names_
221
+ print("Expected Feature Order:", expected_feature_order)
222
 
223
+ # Ensure input_df has the correct column order
224
+ input_df = input_df[expected_feature_order]
 
 
 
225
 
226
+ input_pool = Pool(input_df, cat_features=cat_features)
227
+ catboost_preds = catboost.predict(input_pool)
228
+ catboost_probs = catboost.predict_proba(input_df)[:, 1]
229
+ label_encoders = {} # Store encoders to ensure consistency
230
+
231
+ for col in cat_features:
232
+ le = LabelEncoder()
233
+ input_df[col] = input_df[col].astype(str) # Ensure it's a string
234
+ le.fit(input_df[col]) # Fit only on input_df (since training is done)
235
+ label_encoders[col] = le # Save encoder for reference
236
+ input_df[col] = le.transform(input_df[col])
237
+
238
+ # List of features used during training for XGBoost
239
+ xgb_training_features = [
240
+ "age_level", "gender", "product", "campaign_id", "webpage_id",
241
+ "product_category_1", "product_category_2", "user_group_id",
242
+ "user_depth", "city_development_index", "var_1",
243
+ "click_sum_age_sex_prod", "click_count_age_sex_prod",
244
+ "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
245
+ "click_sum_city_age_prod", "click_count_city_age_prod",
246
+ "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
247
+ ]
248
+
249
+ xgb_preds = xgb.predict(input_df[xgb_training_features])
250
+
251
+ # # πŸ”₯ List of features RandomForest was trained with
252
+ # rf_training_features = [
253
+ # "age_level", "gender", "product", "campaign_id", "webpage_id",
254
+ # "product_category_1", "product_category_2", "user_group_id",
255
+ # "user_depth", "city_development_index", "var_1",
256
+ # "click_sum_age_sex_prod", "click_count_age_sex_prod",
257
+ # "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
258
+ # "click_sum_city_age_prod", "click_count_city_age_prod",
259
+ # "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
260
+ # ]
261
+ #
262
+ # # βœ… Ensure all training features exist in `input_df`
263
+ # for col in rf_training_features:
264
+ # if col not in input_df.columns:
265
+ # input_df[col] = 0 # Default missing columns to 0
266
+ #
267
+ # # Get intersection of trained features and current input_df columns
268
+ # common_features = list(set(rf.feature_names_in_) & set(input_df.columns))
269
+ #
270
+ # # Select only the matching features
271
+ # input_df_rf = input_df[common_features]
272
+ #
273
+ # # Predict without needing to add missing features
274
+ # rf_preds = rf.predict(input_df_rf)
275
+ #
276
+ #
277
+ # print("RF Model Trained Features:", rf.feature_names_in_)
278
+ # print("Input Data Features:", input_df_rf.columns.tolist())
279
+ #
280
+ # # Debugging: Check for missing or extra features
281
+ # missing_features = set(rf.feature_names_in_) - set(input_df_rf.columns)
282
+ # extra_features = set(input_df_rf.columns) - set(rf.feature_names_in_)
283
+ #
284
+ # print("Missing Features in Input:", missing_features)
285
+ # print("Extra Features in Input:", extra_features)
286
+ # # βœ… Make Predictions with RandomForest
287
+ # rf_preds = rf.predict(input_df_rf)
288
+
289
+ xgb_probs = xgb.predict_proba(input_df)[:, 1]
290
+ #rf_probs = rf.predict_proba(input_df)[:, 1]
291
+ #test
292
  # Combine results
293
  predictions_df = pd.DataFrame({
294
  "CatBoost": catboost_preds,
295
  "XGBoost": xgb_preds,
296
+ # "RandomForest": rf_preds
297
  })
 
298
  # Apply "at least one model predicts 1" rule
299
  predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
300
 
301
+ # Generate probability file
302
+ probabilities_df = pd.DataFrame({
303
+ "CatBoost_Prob": catboost_probs,
304
+ "XGBoost_Prob": xgb_probs,
305
+ # "RandomForest_Prob": rf_probs
306
+ })
307
+
308
  # Save results
309
+ binary_predictions_path = "binary_predictions.csv"
310
+ filtered_predictions_path = "filtered_predictions.csv"
311
+ probabilities_path = "model_probabilities.csv"
312
+
313
+ predictions_df.to_csv(binary_predictions_path, index=False)
314
+ predictions_df[predictions_df["is_click_predicted"] == 1].to_csv(filtered_predictions_path, index=False)
315
+ probabilities_df.to_csv(probabilities_path, index=False)
316
 
317
  st.success("Predictions completed! Download results below.")
318
+
319
+ # Download Buttons
320
+ with open(binary_predictions_path, "rb") as f:
321
+ st.download_button("Download Binary Predictions (0/1)", f, file_name="binary_predictions.csv")
322
+
323
+ with open(filtered_predictions_path, "rb") as f:
324
+ st.download_button("Download Clicked Predictions (Only 1s)", f, file_name="filtered_predictions.csv")
325
+
326
+ with open(probabilities_path, "rb") as f:
327
+ st.download_button("Download Probability Predictions", f, file_name="model_probabilities.csv")