Spaces:

chagu13
/

is_click

Build error

App Files Files Community

chkp-talexm committed on Feb 18, 2025

Commit

617b96b

1 Parent(s): 8a5806f

update

Browse files

Files changed (1) hide show

app.py +56 -230

app.py CHANGED Viewed

@@ -1,13 +1,9 @@
-import os, shutil
 import streamlit as st
 import pandas as pd
-import numpy as np
 import joblib
-import os
 from huggingface_hub import hf_hub_download
-from sklearn.preprocessing import LabelEncoder, StandardScaler
 from catboost import Pool
 # Hugging Face Model Repo
@@ -16,14 +12,14 @@ MODEL_DIR = "models"
 os.makedirs(MODEL_DIR, exist_ok=True)
 # Model Filenames
-CATBOOST_MODEL_FILENAME = "models/catboost_model.pkl"
-XGB_MODEL_FILENAME = "models/xgb_model.pkl"
-RF_MODEL_FILENAME = "models/rf_model.pkl"
 # Local Paths
-CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
-XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.pkl")
-RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
 # Define Features
 CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
@@ -37,121 +33,12 @@ NUMERICAL_COLUMNS = [
 FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
-from sklearn.preprocessing import LabelEncoder, StandardScaler
-from catboost import Pool
-def preprocess_input(input_df, expected_feature_order):
-    """
-    Ensure preprocessing is correct:
-    - Removes duplicate columns
-    - Computes aggregations using only test data
-    - Ensures categorical variables are properly encoded
-    - Normalizes numerical features
-    - Adds `is_click` column with 0 for compatibility
-    - Orders columns as expected by the model
-    """
-    # Drop the DateTime column if it exists
-    if "DateTime" in input_df.columns:
-        input_df.drop(columns=["DateTime"], inplace=True)
-    # Remove duplicate columns
-    input_df = input_df.loc[:, ~input_df.columns.duplicated()]
-    input_df.fillna(0, inplace=True)
-    # Aggregate by age & gender vs product
-    age_sex_product_agg = input_df.groupby(["age_level", "gender", "product"]).agg({
-        "campaign_id": "nunique",
-        "webpage_id": "nunique"
-    }).reset_index()
-    # Fix renaming: Remove missing columns
-    age_sex_product_agg.columns = ["age_level", "gender", "product",
-                                   "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod"]
-    input_df = input_df.merge(age_sex_product_agg, on=["age_level", "gender", "product"], how="left")
-    # Aggregate by city, age, product
-    city_age_product_agg = input_df.groupby(["city_development_index", "age_level", "product"]).agg({
-        "campaign_id": "nunique",
-        "webpage_id": "nunique"
-    }).reset_index()
-    # Fix renaming: Remove missing columns
-    city_age_product_agg.columns = ["city_development_index", "age_level", "product",
-                                    "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"]
-    input_df = input_df.merge(city_age_product_agg, on=["city_development_index", "age_level", "product"], how="left")
-    input_df.fillna(0, inplace=True)
-    # **Ensure missing columns exist (Important Fix)**
-    missing_columns = ["click_sum_age_sex_prod", "click_count_age_sex_prod",
-                       "click_sum_city_age_prod", "click_count_city_age_prod"]
-    for col in missing_columns:
-        if col not in input_df.columns:
-            print(f"Warning: Missing column {col}. Filling with 0.")
-            input_df[col] = 0  # Fill missing columns with default values
-    # **Add `is_click` column with 0 for compatibility**
-    if "is_click" not in input_df.columns:
-        print("Adding `is_click` column with all values set to 0.")
-        input_df["is_click"] = 0  # Model will ignore this for prediction
-    # Feature List (Now includes `is_click`)
-    features = ["age_level", "gender", "product", "campaign_id", "webpage_id",
-                "product_category_1", "product_category_2", "user_group_id",
-                "user_depth", "city_development_index", "var_1",
-                "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
-                "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod",
-                "click_sum_age_sex_prod", "click_count_age_sex_prod",
-                "click_sum_city_age_prod", "click_count_city_age_prod",
-                "is_click"]  # Included for compatibility
-    categorical_columns = ["gender", "product", "campaign_id", "webpage_id"]
-    # ===========================
-    #  ENCODE CATEGORICAL FEATURES
-    # ===========================
-    label_encoders = {}
-    for col in categorical_columns:
-        le = LabelEncoder()
-        input_df[col] = le.fit_transform(input_df[col].astype(str))  # Apply transformation correctly
-        label_encoders[col] = le  # Store encoder for reference
-    # Normalize numerical features
-    numerical_columns = [col for col in features if col not in categorical_columns]
-    scaler = StandardScaler()
-    input_df[numerical_columns] = scaler.fit_transform(input_df[numerical_columns])
-    # ===========================
-    #  ENFORCE FEATURE ORDER
-    # ===========================
-    missing_features = set(expected_feature_order) - set(input_df.columns)
-    extra_features = set(input_df.columns) - set(expected_feature_order)
-    # Add missing features with default values
-    for col in missing_features:
-        print(f"Warning: Missing feature {col}. Filling with 0.")
-        input_df[col] = 0
-    # Drop unexpected features
-    if extra_features:
-        print(f"Warning: Dropping unexpected features: {extra_features}")
-        input_df = input_df.drop(columns=list(extra_features))
-    # Reorder columns to match the model's expected input
-    input_df = input_df[expected_feature_order]
-    return input_df
 def download_model(filename, local_path):
     """Download model from Hugging Face and move it to the correct location."""
     temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)
-    # Ensure correct file placement
     if temp_path != local_path:
         shutil.move(temp_path, local_path)
@@ -163,20 +50,15 @@ def load_models():
     try:
         print("🔄 Checking and downloading models...")
-        # Ensure models are downloaded and placed correctly
         if not os.path.exists(CATBOOST_MODEL_PATH):
-            print("🚀 Downloading CatBoost model...")
             download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
         if not os.path.exists(XGB_MODEL_PATH):
-            print("🚀 Downloading XGBoost model...")
             download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH)
         if not os.path.exists(RF_MODEL_PATH):
-            print("🚀 Downloading RandomForest model...")
             download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
-        # ✅ Load models
         print("📦 Loading models...")
         catboost_model = joblib.load(CATBOOST_MODEL_PATH)
         xgb_model = joblib.load(XGB_MODEL_PATH)
@@ -189,139 +71,83 @@ def load_models():
         print(f"❌ Error loading models: {e}")
         return None, None, None
 # Streamlit UI
 st.title("Is_Click Predictor - ML Model Inference")
 st.info("Upload a CSV file, and the trained models will predict click probability.")
 catboost, xgb, rf = load_models()
 expected_feature_order = catboost.feature_names_
 print("Expected Feature Order:", expected_feature_order)
 # Upload File
 uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
 if uploaded_file:
     input_df = pd.read_csv(uploaded_file)
     st.success("File uploaded successfully!")
-    # ✅ Compute aggregations & preprocess
-    input_df = preprocess_input(input_df, expected_feature_order)
     # ✅ Make Predictions
     st.subheader("Predictions in Progress...")
-    from catboost import Pool
-    # Define categorical features (MUST MATCH what was used during training)
-    cat_features = ["gender", "product", "campaign_id", "webpage_id"]
-    # Convert categorical features to strings (MUST be string, not float)
-    for col in cat_features:
-        input_df[col] = input_df[col].astype(str)
-    expected_feature_order = catboost.feature_names_
-    print("Expected Feature Order:", expected_feature_order)
-    # Ensure input_df has the correct column order
-    input_df = input_df[expected_feature_order]
-    input_pool = Pool(input_df, cat_features=cat_features)
-    catboost_preds = catboost.predict(input_pool)
-    catboost_probs = catboost.predict_proba(input_df)[:, 1]
-    label_encoders = {}  # Store encoders to ensure consistency
-    for col in cat_features:
-        le = LabelEncoder()
-        input_df[col] = input_df[col].astype(str)  # Ensure it's a string
-        le.fit(input_df[col])  # Fit only on input_df (since training is done)
-        label_encoders[col] = le  # Save encoder for reference
-        input_df[col] = le.transform(input_df[col])
-    # List of features used during training for XGBoost
-    xgb_training_features = [
-        "age_level", "gender", "product", "campaign_id", "webpage_id",
-        "product_category_1", "product_category_2", "user_group_id",
-        "user_depth", "city_development_index", "var_1",
-        "click_sum_age_sex_prod", "click_count_age_sex_prod",
-        "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
-        "click_sum_city_age_prod", "click_count_city_age_prod",
-        "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
-    ]
-    xgb_preds = xgb.predict(input_df[xgb_training_features])
-    # # 🔥 List of features RandomForest was trained with
-    # rf_training_features = [
-    #     "age_level", "gender", "product", "campaign_id", "webpage_id",
-    #     "product_category_1", "product_category_2", "user_group_id",
-    #     "user_depth", "city_development_index", "var_1",
-    #     "click_sum_age_sex_prod", "click_count_age_sex_prod",
-    #     "unique_campaigns_age_sex_prod", "unique_webpages_age_sex_prod",
-    #     "click_sum_city_age_prod", "click_count_city_age_prod",
-    #     "unique_campaigns_city_age_prod", "unique_webpages_city_age_prod"
-    # ]
-    #
-    # # ✅ Ensure all training features exist in `input_df`
-    # for col in rf_training_features:
-    #     if col not in input_df.columns:
-    #         input_df[col] = 0  # Default missing columns to 0
-    #
-    # # Get intersection of trained features and current input_df columns
-    # common_features = list(set(rf.feature_names_in_) & set(input_df.columns))
-    #
-    # # Select only the matching features
-    # input_df_rf = input_df[common_features]
-    #
-    # # Predict without needing to add missing features
-    # rf_preds = rf.predict(input_df_rf)
-    #
-    #
-    # print("RF Model Trained Features:", rf.feature_names_in_)
-    # print("Input Data Features:", input_df_rf.columns.tolist())
-    #
-    # # Debugging: Check for missing or extra features
-    # missing_features = set(rf.feature_names_in_) - set(input_df_rf.columns)
-    # extra_features = set(input_df_rf.columns) - set(rf.feature_names_in_)
-    #
-    # print("Missing Features in Input:", missing_features)
-    # print("Extra Features in Input:", extra_features)
-    # # ✅ Make Predictions with RandomForest
-    # rf_preds = rf.predict(input_df_rf)
-    xgb_probs = xgb.predict_proba(input_df)[:, 1]
-    #rf_probs = rf.predict_proba(input_df)[:, 1]
- #test
     # Combine results
     predictions_df = pd.DataFrame({
         "CatBoost": catboost_preds,
         "XGBoost": xgb_preds,
-      #  "RandomForest": rf_preds
     })
     # Apply "at least one model predicts 1" rule
     predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
-    # Generate probability file
-    probabilities_df = pd.DataFrame({
-        "CatBoost_Prob": catboost_probs,
-        "XGBoost_Prob": xgb_probs,
-      #  "RandomForest_Prob": rf_probs
-    })
     # Save results
-    binary_predictions_path = "binary_predictions.csv"
-    filtered_predictions_path = "filtered_predictions.csv"
-    probabilities_path = "model_probabilities.csv"
-    predictions_df.to_csv(binary_predictions_path, index=False)
-    predictions_df[predictions_df["is_click_predicted"] == 1].to_csv(filtered_predictions_path, index=False)
-    probabilities_df.to_csv(probabilities_path, index=False)
     st.success("Predictions completed! Download results below.")
-    # Download Buttons
-    with open(binary_predictions_path, "rb") as f:
-        st.download_button("Download Binary Predictions (0/1)", f, file_name="binary_predictions.csv")
-    with open(filtered_predictions_path, "rb") as f:
-        st.download_button("Download Clicked Predictions (Only 1s)", f, file_name="filtered_predictions.csv")
-    with open(probabilities_path, "rb") as f:
-        st.download_button("Download Probability Predictions", f, file_name="model_probabilities.csv")

+import os
+import shutil
 import streamlit as st
 import pandas as pd
 import joblib
 from huggingface_hub import hf_hub_download
 from catboost import Pool
 # Hugging Face Model Repo
 os.makedirs(MODEL_DIR, exist_ok=True)
 # Model Filenames
+CATBOOST_MODEL_FILENAME = "catboost_model.pkl"
+XGB_MODEL_FILENAME = "xgb_model.pkl"
+RF_MODEL_FILENAME = "rf_model.pkl"
 # Local Paths
+CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, CATBOOST_MODEL_FILENAME)
+XGB_MODEL_PATH = os.path.join(MODEL_DIR, XGB_MODEL_FILENAME)
+RF_MODEL_PATH = os.path.join(MODEL_DIR, RF_MODEL_FILENAME)
 # Define Features
 CATEGORICAL_COLUMNS = ["gender", "product", "campaign_id", "webpage_id"]
 FEATURE_COLUMNS = CATEGORICAL_COLUMNS + NUMERICAL_COLUMNS
 def download_model(filename, local_path):
     """Download model from Hugging Face and move it to the correct location."""
+    print(f"📥 Downloading {filename} from Hugging Face...")
     temp_path = hf_hub_download(repo_id=MODEL_REPO, filename=filename, local_dir=MODEL_DIR)
     if temp_path != local_path:
         shutil.move(temp_path, local_path)
     try:
         print("🔄 Checking and downloading models...")
         if not os.path.exists(CATBOOST_MODEL_PATH):
             download_model(CATBOOST_MODEL_FILENAME, CATBOOST_MODEL_PATH)
         if not os.path.exists(XGB_MODEL_PATH):
             download_model(XGB_MODEL_FILENAME, XGB_MODEL_PATH)
         if not os.path.exists(RF_MODEL_PATH):
             download_model(RF_MODEL_FILENAME, RF_MODEL_PATH)
         print("📦 Loading models...")
         catboost_model = joblib.load(CATBOOST_MODEL_PATH)
         xgb_model = joblib.load(XGB_MODEL_PATH)
         print(f"❌ Error loading models: {e}")
         return None, None, None
 # Streamlit UI
 st.title("Is_Click Predictor - ML Model Inference")
 st.info("Upload a CSV file, and the trained models will predict click probability.")
 catboost, xgb, rf = load_models()
+if not catboost:
+    st.error("❌ Error: Failed to load models. Please check your Hugging Face repo.")
+    st.stop()
 expected_feature_order = catboost.feature_names_
 print("Expected Feature Order:", expected_feature_order)
 # Upload File
 uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
 if uploaded_file:
     input_df = pd.read_csv(uploaded_file)
     st.success("File uploaded successfully!")
+    # Ensure all expected columns exist in the test set
+    for col in expected_feature_order:
+        if col not in input_df.columns:
+            print(f"⚠️ Warning: Missing feature {col}. Filling with 0.")
+            input_df[col] = 0
+    # Reorder columns before prediction
+    input_df = input_df[expected_feature_order]
     # ✅ Make Predictions
     st.subheader("Predictions in Progress...")
+    # Create CatBoost pool
+    cat_features = CATEGORICAL_COLUMNS
+    input_pool = Pool(input_df, cat_features=cat_features)
+    catboost_probs = catboost.predict_proba(input_pool)[:, 1]
+    # ✅ Adjust decision threshold
+    THRESHOLD = 0.6  # Reduce false positives
+    catboost_preds = (catboost_probs >= THRESHOLD).astype(int)
+    # Ensure all required columns exist for XGBoost
+    for col in xgb.feature_names_in_:
+        if col not in input_df.columns:
+            input_df[col] = 0
+    xgb_probs = xgb.predict_proba(input_df[xgb.feature_names_in_])[:, 1]
+    xgb_preds = (xgb_probs >= THRESHOLD).astype(int)
+    # Ensure all required columns exist for RandomForest
+    for col in rf.feature_names_in_:
+        if col not in input_df.columns:
+            input_df[col] = 0
+    rf_probs = rf.predict_proba(input_df[rf.feature_names_in_])[:, 1]
+    rf_preds = (rf_probs >= THRESHOLD).astype(int)
+    # ✅ Debugging: Check probability distributions
+    print("🔍 Probability distributions:")
+    print("CatBoost:", pd.Series(catboost_probs).describe())
+    print("XGBoost:", pd.Series(xgb_probs).describe())
+    print("RandomForest:", pd.Series(rf_probs).describe())
     # Combine results
     predictions_df = pd.DataFrame({
         "CatBoost": catboost_preds,
         "XGBoost": xgb_preds,
+        "RandomForest": rf_preds
     })
     # Apply "at least one model predicts 1" rule
     predictions_df["is_click_predicted"] = predictions_df.max(axis=1)
     # Save results
+    predictions_df.to_csv("binary_predictions.csv", index=False)
+    predictions_df[predictions_df["is_click_predicted"] == 1].to_csv("filtered_predictions.csv", index=False)
     st.success("Predictions completed! Download results below.")
+    st.dataframe(predictions_df)