chkp-talexm committed on
Commit
14e9f1b
·
1 Parent(s): e285cce
Files changed (1) hide show
  1. app.py +49 -171
app.py CHANGED
@@ -1,11 +1,8 @@
1
-
2
- import os, shutil
3
-
4
  import streamlit as st
5
  import pandas as pd
6
- import numpy as np
7
  import joblib
8
- import os
9
  from huggingface_hub import hf_hub_download
10
  from sklearn.preprocessing import LabelEncoder, StandardScaler
11
  from catboost import Pool
@@ -16,14 +13,14 @@ MODEL_DIR = "models"
16
  os.makedirs(MODEL_DIR, exist_ok=True)
17
 
18
  # Model Filenames
19
- CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
20
- XGB_MODEL_FILENAME = "models/xgb_model.pkl"
21
- RF_MODEL_FILENAME = "models/rf_model.pkl"
22
 
23
  # Local Paths
24
- CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
25
- XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.pkl")
26
- RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
27
 
28
  # Define Features
29
  CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
@@ -37,21 +34,11 @@ NUMERICAL_COLUMNS = [
37
 
38
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
39
 
40
- from sklearn.preprocessing import LabelEncoder, StandardScaler
41
- from catboost import Pool
42
-
43
 
44
  def preprocess_input(input_df, expected_feature_order):
45
- """
46
- Ensure preprocessing is correct:
47
- - Removes duplicate columns
48
- - Computes aggregations using only test data
49
- - Ensures categorical variables are properly encoded
50
- - Normalizes numerical features
51
- - Adds `is_click` column with 0 for compatibility
52
- - Orders columns as expected by the model
53
- """
54
- # Drop the DateTime column if it exists
55
  if "DateTime" in input_df.columns:
56
  input_df.drop(columns=["DateTime"], inplace=True)
57
 
@@ -65,7 +52,6 @@ def preprocess_input(input_df, expected_feature_order):
65
  "webpage_id": "nunique"
66
  }).reset_index()
67
 
68
- # Fix renaming: Remove missing columns
69
  age_sex_product_agg.columns = ["age_level", "gender", "product",
70
  "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"]
71
 
@@ -77,7 +63,6 @@ def preprocess_input(input_df, expected_feature_order):
77
  "webpage_id": "nunique"
78
  }).reset_index()
79
 
80
- # Fix renaming: Remove missing columns
81
  city_age_product_agg.columns = ["city_development_index", "age_level", "product",
82
  "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"]
83
 
@@ -96,52 +81,17 @@ def preprocess_input(input_df, expected_feature_order):
96
  # **Add `is_click` column with 0 for compatibility**
97
  if "is_click" not in input_df.columns:
98
  print("Adding `is_click` column with all values set to 0.")
99
- input_df["is_click"] = 0 # Model will ignore this for prediction
100
-
101
- # Feature List (Now includes `is_click`)
102
- features = ["age_level", "gender", "product", "campaign_id", "webpage_id",
103
- "product_category_1", "product_category_2", "user_group_id",
104
- "user_depth", "city_development_index", "var_1",
105
- "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
106
- "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
107
- "click_sum_age_sex_prod", "click_count_age_sex_prod",
108
- "click_sum_city_age_prod", "click_count_city_age_prod",
109
- "is_click"] # Included for compatibility
110
-
111
- categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]
112
-
113
- # ===========================
114
- # ENCODE CATEGORICAL FEATURES
115
- # ===========================
116
-
117
- label_encoders = {}
118
- for col in categorical_columns:
119
- le = LabelEncoder()
120
- input_df[col] = le.fit_transform(input_df[col].astype(str)) # Apply transformation correctly
121
- label_encoders[col] = le # Store encoder for reference
122
-
123
- # Normalize numerical features
124
- numerical_columns = [col for col in features if col not in categorical_columns]
125
- scaler = StandardScaler()
126
- input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])
127
 
128
  # ===========================
129
  # ENFORCE FEATURE ORDER
130
  # ===========================
131
- missing_features = set(expected_feature_order) - set(input_df.columns)
132
- extra_features = set(input_df.columns) - set(expected_feature_order)
133
-
134
- # Add missing features with default values
135
- for col in missing_features:
136
- print(f"Warning: Missing feature {col}. Filling with 0.")
137
- input_df[col] = 0
138
-
139
- # Drop unexpected features
140
- if extra_features:
141
- print(f"Warning: Dropping unexpected features: {extra_features}")
142
- input_df = input_df.drop(columns=list(extra_features))
143
 
144
- # Reorder columns to match the model's expected input
145
  input_df = input_df[expected_feature_order]
146
 
147
  return input_df
@@ -151,7 +101,6 @@ def download_model(filename, local_path):
151
  """Download model from Hugging Face and move it to the correct location."""
152
  temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)
153
 
154
- # Ensure correct file placement
155
  if temp_path != local_path:
156
  shutil.move(temp_path, local_path)
157
 
@@ -163,7 +112,6 @@ def load_models():
163
  try:
164
  print("πŸ”„ Checking and downloading models...")
165
 
166
- # Ensure models are downloaded and placed correctly
167
  if not os.path.exists(CATBOOST_MODEL_PATH):
168
  print("πŸš€ Downloading CatBoost model...")
169
  download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
@@ -176,7 +124,6 @@ def load_models():
176
  print("πŸš€ Downloading RandomForest model...")
177
  download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
178
 
179
- # βœ… Load models
180
  print("πŸ“¦ Loading models...")
181
  catboost_model = joblib.load(CATBOOST_MODEL_PATH)
182
  xgb_model = joblib.load(XGB_MODEL_PATH)
@@ -189,11 +136,19 @@ def load_models():
189
  print(f"❌ Error loading models: {e}")
190
  return None, None, None
191
 
 
192
  # Streamlit UI
193
  st.title("Is_Click Predictor - ML Model Inference")
194
  st.info("Upload a CSV file, and the trained models will predict click probability.")
195
 
196
- catboost, xgb, rf = load_models()
 
 
 
 
 
 
 
197
 
198
  # Upload File
199
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
@@ -202,124 +157,47 @@ if uploaded_file:
202
  st.success("File uploaded successfully!")
203
 
204
  # βœ… Compute aggregations & preprocess
205
- input_df = preprocess_input(input_df)
206
 
207
  # βœ… Make Predictions
208
  st.subheader("Predictions in Progress...")
209
- from catboost import Pool
210
 
211
  # Define categorical features (MUST MATCH what was used during training)
212
  cat_features = ["gender", "product", "campaign_id", "webpage_id"]
213
 
214
- # Convert categorical features to strings (MUST be string, not float)
215
  for col in cat_features:
216
  input_df[col] = input_df[col].astype(str)
217
 
218
- expected_feature_order = catboost.feature_names_
219
- print("Expected Feature Order:", expected_feature_order)
 
 
220
 
221
- # Ensure input_df has the correct column order
222
- input_df = input_df[expected_feature_order]
 
 
223
 
224
- input_pool = Pool(input_df, cat_features=cat_features)
225
- catboost_preds = catboost.predict(input_pool)
226
- catboost_probs = catboost.predict_proba(input_df)[:, 1]
227
- label_encoders = {} # Store encoders to ensure consistency
 
 
 
 
 
 
228
 
229
- for col in cat_features:
230
- le = LabelEncoder()
231
- input_df[col] = input_df[col].astype(str) # Ensure it's a string
232
- le.fit(input_df[col]) # Fit only on input_df (since training is done)
233
- label_encoders[col] = le # Save encoder for reference
234
- input_df[col] = le.transform(input_df[col])
235
-
236
- # List of features used during training for XGBoost
237
- xgb_training_features = [
238
- "age_level", "gender", "product", "campaign_id", "webpage_id",
239
- "product_category_1", "product_category_2", "user_group_id",
240
- "user_depth", "city_development_index", "var_1",
241
- "click_sum_age_sex_prod", "click_count_age_sex_prod",
242
- "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
243
- "click_sum_city_age_prod", "click_count_city_age_prod",
244
- "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
245
- ]
246
-
247
- xgb_preds = xgb.predict(input_df[xgb_training_features])
248
-
249
- # # πŸ”₯ List of features RandomForest was trained with
250
- # rf_training_features = [
251
- # "age_level", "gender", "product", "campaign_id", "webpage_id",
252
- # "product_category_1", "product_category_2", "user_group_id",
253
- # "user_depth", "city_development_index", "var_1",
254
- # "click_sum_age_sex_prod", "click_count_age_sex_prod",
255
- # "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
256
- # "click_sum_city_age_prod", "click_count_city_age_prod",
257
- # "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
258
- # ]
259
- #
260
- # # βœ… Ensure all training features exist in `input_df`
261
- # for col in rf_training_features:
262
- # if col not in input_df.columns:
263
- # input_df[col] = 0 # Default missing columns to 0
264
- #
265
- # # Get intersection of trained features and current input_df columns
266
- # common_features = list(set(rf.feature_names_in_) & set(input_df.columns))
267
- #
268
- # # Select only the matching features
269
- # input_df_rf = input_df[common_features]
270
- #
271
- # # Predict without needing to add missing features
272
- # rf_preds = rf.predict(input_df_rf)
273
- #
274
- #
275
- # print("RF Model Trained Features:", rf.feature_names_in_)
276
- # print("Input Data Features:", input_df_rf.columns.tolist())
277
- #
278
- # # Debugging: Check for missing or extra features
279
- # missing_features = set(rf.feature_names_in_) - set(input_df_rf.columns)
280
- # extra_features = set(input_df_rf.columns) - set(rf.feature_names_in_)
281
- #
282
- # print("Missing Features in Input:", missing_features)
283
- # print("Extra Features in Input:", extra_features)
284
- # # βœ… Make Predictions with RandomForest
285
- # rf_preds = rf.predict(input_df_rf)
286
-
287
- xgb_probs = xgb.predict_proba(input_df)[:, 1]
288
- #rf_probs = rf.predict_proba(input_df)[:, 1]
289
- #test
290
  # Combine results
291
  predictions_df = pd.DataFrame({
292
  "CatBoost": catboost_preds,
293
  "XGBoost": xgb_preds,
294
- # "RandomForest": rf_preds
295
  })
296
- # Apply "at least one model predicts 1" rule
297
- predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
298
-
299
- # Generate probability file
300
- probabilities_df = pd.DataFrame({
301
- "CatBoost_Prob": catboost_probs,
302
- "XGBoost_Prob": xgb_probs,
303
- # "RandomForest_Prob": rf_probs
304
- })
305
-
306
- # Save results
307
- binary_predictions_path = "binary_predictions.csv"
308
- filtered_predictions_path = "filtered_predictions.csv"
309
- probabilities_path = "model_probabilities.csv"
310
 
311
- predictions_df.to_csv(binary_predictions_path, index=False)
312
- predictions_df[predictions_df["is_click_predicted"] == 1].to_csv(filtered_predictions_path, index=False)
313
- probabilities_df.to_csv(probabilities_path, index=False)
314
 
315
  st.success("Predictions completed! Download results below.")
316
-
317
- # Download Buttons
318
- with open(binary_predictions_path, "rb") as f:
319
- st.download_button("Download Binary Predictions (0/1)", f, file_name="binary_predictions.csv")
320
-
321
- with open(filtered_predictions_path, "rb") as f:
322
- st.download_button("Download Clicked Predictions (Only 1s)", f, file_name="filtered_predictions.csv")
323
-
324
- with open(probabilities_path, "rb") as f:
325
- st.download_button("Download Probability Predictions", f, file_name="model_probabilities.csv")
 
1
+ import os
2
+ import shutil
 
3
  import streamlit as st
4
  import pandas as pd
 
5
  import joblib
 
6
  from huggingface_hub import hf_hub_download
7
  from sklearn.preprocessing import LabelEncoder, StandardScaler
8
  from catboost import Pool
 
13
  os.makedirs(MODEL_DIR, exist_ok=True)
14
 
15
  # Model Filenames
16
+ CATBOOST_MODEL_FILENAME = "catboost_model.pkl"
17
+ XGB_MODEL_FILENAME = "xgb_model.pkl"
18
+ RF_MODEL_FILENAME = "rf_model.pkl"
19
 
20
  # Local Paths
21
+ CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, CATBOOST_MODEL_FILENAME)
22
+ XGB_MODEL_PATH = os.path.join(MODEL_DIR, XGB_MODEL_FILENAME)
23
+ RF_MODEL_PATH = os.path.join(MODEL_DIR, RF_MODEL_FILENAME)
24
 
25
  # Define Features
26
  CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
 
34
 
35
  FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
36
 
 
 
 
37
 
38
  def preprocess_input(input_df, expected_feature_order):
39
+ """ Preprocess input data before making predictions """
40
+
41
+ # Drop DateTime if present
 
 
 
 
 
 
 
42
  if "DateTime" in input_df.columns:
43
  input_df.drop(columns=["DateTime"], inplace=True)
44
 
 
52
  "webpage_id": "nunique"
53
  }).reset_index()
54
 
 
55
  age_sex_product_agg.columns = ["age_level", "gender", "product",
56
  "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"]
57
 
 
63
  "webpage_id": "nunique"
64
  }).reset_index()
65
 
 
66
  city_age_product_agg.columns = ["city_development_index", "age_level", "product",
67
  "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"]
68
 
 
81
  # **Add `is_click` column with 0 for compatibility**
82
  if "is_click" not in input_df.columns:
83
  print("Adding `is_click` column with all values set to 0.")
84
+ input_df["is_click"] = 0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
  # ===========================
87
  # ENFORCE FEATURE ORDER
88
  # ===========================
89
+ for col in expected_feature_order:
90
+ if col not in input_df.columns:
91
+ print(f"Warning: Missing feature {col}. Filling with 0.")
92
+ input_df[col] = 0
 
 
 
 
 
 
 
 
93
 
94
+ # Reorder columns
95
  input_df = input_df[expected_feature_order]
96
 
97
  return input_df
 
101
  """Download model from Hugging Face and move it to the correct location."""
102
  temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)
103
 
 
104
  if temp_path != local_path:
105
  shutil.move(temp_path, local_path)
106
 
 
112
  try:
113
  print("πŸ”„ Checking and downloading models...")
114
 
 
115
  if not os.path.exists(CATBOOST_MODEL_PATH):
116
  print("πŸš€ Downloading CatBoost model...")
117
  download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
 
124
  print("πŸš€ Downloading RandomForest model...")
125
  download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
126
 
 
127
  print("πŸ“¦ Loading models...")
128
  catboost_model = joblib.load(CATBOOST_MODEL_PATH)
129
  xgb_model = joblib.load(XGB_MODEL_PATH)
 
136
  print(f"❌ Error loading models: {e}")
137
  return None, None, None
138
 
139
+
140
  # Streamlit UI
141
  st.title("Is_Click Predictor - ML Model Inference")
142
  st.info("Upload a CSV file, and the trained models will predict click probability.")
143
 
144
+ catboost_model, xgb_model, rf_model = load_models()
145
+
146
+ if not catboost_model:
147
+ st.error("❌ Error: Failed to load models. Please check your Hugging Face repo.")
148
+ st.stop()
149
+
150
+ expected_feature_order = catboost_model.feature_names_
151
+ print("Expected Feature Order:", expected_feature_order)
152
 
153
  # Upload File
154
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
 
157
  st.success("File uploaded successfully!")
158
 
159
  # βœ… Compute aggregations & preprocess
160
+ input_df = preprocess_input(input_df, expected_feature_order)
161
 
162
  # βœ… Make Predictions
163
  st.subheader("Predictions in Progress...")
 
164
 
165
  # Define categorical features (MUST MATCH what was used during training)
166
  cat_features = ["gender", "product", "campaign_id", "webpage_id"]
167
 
168
+ # Convert categorical features to string type
169
  for col in cat_features:
170
  input_df[col] = input_df[col].astype(str)
171
 
172
+ # Create CatBoost pool
173
+ input_pool = Pool(input_df, cat_features=cat_features)
174
+ catboost_preds = catboost_model.predict(input_pool)
175
+ catboost_probs = catboost_model.predict_proba(input_pool)[:, 1]
176
 
177
+ # Ensure all required columns exist for XGBoost
178
+ for col in xgb_model.feature_names_in_:
179
+ if col not in input_df.columns:
180
+ input_df[col] = 0
181
 
182
+ xgb_preds = xgb_model.predict(input_df[xgb_model.feature_names_in_])
183
+ xgb_probs = xgb_model.predict_proba(input_df[xgb_model.feature_names_in_])[:, 1]
184
+
185
+ # Ensure all required columns exist for RandomForest
186
+ for col in rf_model.feature_names_in_:
187
+ if col not in input_df.columns:
188
+ input_df[col] = 0
189
+
190
+ rf_preds = rf_model.predict(input_df[rf_model.feature_names_in_])
191
+ rf_probs = rf_model.predict_proba(input_df[rf_model.feature_names_in_])[:, 1]
192
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
193
  # Combine results
194
  predictions_df = pd.DataFrame({
195
  "CatBoost": catboost_preds,
196
  "XGBoost": xgb_preds,
197
+ "RandomForest": rf_preds
198
  })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
+ predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
 
 
201
 
202
  st.success("Predictions completed! Download results below.")
203
+ st.dataframe(predictions_df)