kaiquanmah committed
Commit a203ed2 · Parent: 0e613f0

v2: tested pipeline in Colab; updated from git-lfs to git-xet to upload model checkpoint files

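As a hedged sketch of the upload step the commit message refers to (repo id and checkpoint names taken from the diffs below; the write token setup is assumed and not part of this commit), the v2 pickles could be pushed with `huggingface_hub`, which routes large files to the Hub's Xet/LFS-backed storage:

```python
from huggingface_hub import HfApi

# Hedged sketch: push the v2 pickled checkpoints to this Space.
# Assumes a write token is already configured (e.g. via `huggingface-cli login`).
api = HfApi()
for fname in ["catboost_model.pkl", "xgboost_model.pkl", "randomforest_model.pkl"]:
    api.upload_file(
        path_or_fileobj=f"models/{fname}",
        path_in_repo=f"models/{fname}",
        repo_id="KaiquanMah/is_click",
        repo_type="space",
    )
```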
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 models/catboost_model.cbm filter=lfs diff=lfs merge=lfs -text
 data/train_dataset_full[[:space:]]-[[:space:]]train_dataset_full.csv filter=lfs diff=lfs merge=lfs -text
 models/v1_202502/catboost_model.cbm filter=lfs diff=lfs merge=lfs -text
+*.cbm filter=lfs diff=lfs merge=lfs -text
COLAB_verify_pipeline.py ADDED
@@ -0,0 +1,113 @@
+import pandas as pd
+import joblib
+import os
+import numpy as np
+from huggingface_hub import hf_hub_download
+from model_trainer import train_models
+from data_loader import load_and_process_data, CATEGORICAL_COLUMNS, TARGET_COLUMN
+
+# Configuration
+DATA_REPO = "KaiquanMah/is_click"
+MODELS_DIR = "models"
+os.makedirs(MODELS_DIR, exist_ok=True)
+
+def run_verification():
+    print("🚀 Starting Verification Pipeline...")
+
+    # 1. Download Data
+    print("🔹 Downloading datasets from Hugging Face...")
+    try:
+        # The data files live in the 'data/' folder of the Space
+        train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/train_dataset_full - train_dataset_full.csv")
+        # Try the primary test file name, then fall back to the alternate name
+        try:
+            test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st.csv")
+        except Exception:
+            test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st (1).csv")
+
+        print(f"✅ Data downloaded:\n   Train: {train_data_path}\n   Test: {test_data_path}")
+    except Exception as e:
+        print(f"❌ Error downloading data: {e}")
+        return
+
+    # 2. Process Data
+    print("🔹 Processing data...")
+    try:
+        X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
+        print(f"✅ Data processed. X_train shape: {X_train.shape}")
+    except Exception as e:
+        print(f"❌ Error processing data: {e}")
+        return
+
+    # 3. Train All Models
+    print("🔹 Training models (CatBoost, XGBoost, RandomForest)...")
+    try:
+        models = train_models(X_train, y_train, CATEGORICAL_COLUMNS)
+        print("✅ Models trained successfully.")
+    except Exception as e:
+        print(f"❌ Error training models: {e}")
+        return
+
+    # 4. Save All Models
+    print("🔹 Saving models...")
+    saved_paths = []
+    for name, model in models.items():
+        if model is not None:
+            path = os.path.join(MODELS_DIR, f"{name.lower()}_model.pkl")
+            joblib.dump(model, path)
+            saved_paths.append(path)
+            print(f"   Saved {name} to {path}")
+        else:
+            print(f"   ⚠ {name} was not trained (None).")
+
+    # 5. Ensemble Prediction
+    print("🔹 Generating Ensemble Predictions on Test Set...")
+    output_path = None
+    try:
+        # Align test_df columns with X_train so every model sees the same
+        # feature order it was trained on. load_and_process_data already
+        # label-encodes the categorical columns, and CatBoost was fit on the
+        # same DataFrame layout (with cat_features indices), so the aligned
+        # DataFrame can be passed straight to predict_proba.
+        X_test = test_df[X_train.columns]
+
+        predictions = {}
+        probas = []
+
+        for name, model in models.items():
+            if model is not None:
+                p = model.predict_proba(X_test)[:, 1]  # Probability of class 1
+                predictions[name] = p
+                probas.append(p)
+
+        if not probas:
+            print("❌ No models available for prediction.")
+            return
+
+        # Simple average ensemble across the available models
+        avg_proba = np.mean(probas, axis=0)
+        final_preds = (avg_proba >= 0.5).astype(int)
+
+        # Save predictions
+        output_df = test_df.copy()
+        output_df["is_click_predicted_proba"] = avg_proba
+        output_df["is_click_predicted"] = final_preds
+
+        output_path = "ensemble_predictions.csv"
+        output_df.to_csv(output_path, index=False)
+        print(f"✅ Ensemble predictions saved to {output_path}")
+
+    except Exception as e:
+        print(f"❌ Error during prediction: {e}")
+        import traceback
+        traceback.print_exc()
+
+    print("\n🎉 Verification Complete!")
+    print("Files generated:")
+    for p in saved_paths:
+        print(f" - {p}")
+    if output_path:
+        print(f" - {output_path}")
+
+if __name__ == "__main__":
+    run_verification()
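The ensemble step in the script above is an unweighted average of each model's class-1 probability, thresholded at 0.5. A self-contained toy illustration (the numbers are invented, not real model outputs):

```python
import numpy as np

# Toy class-1 probabilities from three models over four test rows
probas = [
    np.array([0.2, 0.8, 0.55, 0.4]),  # e.g. CatBoost
    np.array([0.3, 0.7, 0.60, 0.3]),  # e.g. XGBoost
    np.array([0.1, 0.9, 0.40, 0.5]),  # e.g. RandomForest
]

avg_proba = np.mean(probas, axis=0)           # [0.2, 0.8, 0.5167, 0.4]
final_preds = (avg_proba >= 0.5).astype(int)  # [0, 1, 1, 0]
print(avg_proba, final_preds)
```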
COLAB_zip_project.py ADDED
@@ -0,0 +1,41 @@
+import shutil
+import os
+
+def zip_project():
+    # Files to include
+    files = [
+        "app.py",
+        "config.py",
+        "data_loader.py",
+        "modelConnector.py",
+        "model_trainer.py",
+        "requirements.txt",
+        "COLAB_verify_pipeline.py",
+        "README.md"
+    ]
+
+    # Create a temporary staging directory
+    os.makedirs("colab_pack", exist_ok=True)
+
+    # Copy files into the staging directory
+    for f in files:
+        if os.path.exists(f):
+            shutil.copy(f, os.path.join("colab_pack", f))
+        else:
+            print(f"Warning: {f} not found.")
+
+    # The 'data' folder is intentionally not included:
+    # COLAB_verify_pipeline.py downloads the datasets from Hugging Face,
+    # so only the code files are needed in Colab.
+
+    # Zip the staging directory
+    output_filename = "is_click_project_colab"
+    shutil.make_archive(output_filename, 'zip', "colab_pack")
+
+    print(f"✅ Created {output_filename}.zip")
+
+    # Clean up the staging directory
+    shutil.rmtree("colab_pack")
+
+if __name__ == "__main__":
+    zip_project()
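Before uploading the archive to Colab, its contents can be inspected with the standard library; a minimal sketch, assuming `zip_project()` has already been run in the same directory (note that `shutil.make_archive` stores the staged files at the top level of the zip):

```python
import zipfile

# Inspect the archive produced by zip_project() before uploading it to Colab
with zipfile.ZipFile("is_click_project_colab.zip") as zf:
    names = zf.namelist()
    print("\n".join(names))
    assert "COLAB_verify_pipeline.py" in names, "pipeline script missing from archive"
```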
README_COLAB.md ADDED
@@ -0,0 +1,46 @@
+# Colab Verification Instructions
+
+If you do not have Python locally, follow these steps to verify the code in Google Colab.
+
+## 1. Prepare Files
+Place the following files in a folder named `project`, then zip that folder into **`project.zip`** (the commands below assume this folder name):
+- `app.py`
+- `config.py`
+- `data_loader.py`
+- `modelConnector.py`
+- `model_trainer.py`
+- `requirements.txt`
+- `COLAB_verify_pipeline.py`
+
+## 2. Open Google Colab
+Go to [Google Colab](https://colab.research.google.com/) and create a **New Notebook**.
+
+## 3. Upload Project
+In the left sidebar of Colab, click the **Folder icon (Files)**, then click the **Upload icon**.
+Upload your **`project.zip`** file.
+
+## 4. Run Commands
+Copy and run the following commands in separate code cells:
+
+### Cell 1: Unzip and Install Dependencies
+```bash
+!unzip project.zip
+!pip install -r project/requirements.txt
+```
+
+### Cell 2: Run Verification Pipeline
+This script downloads the data, trains all three models (CatBoost, XGBoost, RandomForest), saves them, and generates ensemble predictions.
+```bash
+!python project/COLAB_verify_pipeline.py
+```
+
+## 5. Download Results
+After the script finishes, check the **Files** sidebar (you may need to refresh it).
+You will find:
+- A `models/` folder containing:
+  - `catboost_model.pkl`
+  - `xgboost_model.pkl`
+  - `randomforest_model.pkl`
+- A file named `ensemble_predictions.csv`
+
+You can right-click these files to download them.
app.py CHANGED
@@ -13,11 +13,11 @@ from data_loader import load_and_process_data, load_data
 
 st.title("📊 Is Click Predictor")
 
 # Download and load the trained model from Hugging Face
 # We wrap this in a try-except block just in case, but modelConnector handles checking too.
 # However, app.py tries to load it directly for the initial "Success" message.
 try:
-    model_path = hf_hub_download(repo_id="chagu13/is_click", repo_type="space", filename="models/rf_model.pkl")
+    model_path = hf_hub_download(repo_id="KaiquanMah/is_click", repo_type="space", filename="models/rf_model.pkl")
     rf_model = joblib.load(model_path)
     st.success("✅ Model Loaded Successfully!")
 except Exception:
config.py CHANGED
@@ -4,9 +4,14 @@ import os
 MODEL_DIR = "models"
 
 # Model File Paths
-CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.cbm")
-XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.json")
-RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
+# v1_202502 - mixture of saved file formats
+# CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.cbm")
+# XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.json")
+# RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
+# v2_202502 - all models saved as pickle
+CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
+XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgboost_model.pkl")
+RF_MODEL_PATH = os.path.join(MODEL_DIR, "randomforest_model.pkl")
 
data/v1_202502/y_test_1st (1).csv ADDED
The diff for this file is too large to render. See raw diff
 
data/y_test_1st (1).csv CHANGED
The diff for this file is too large to render. See raw diff
 
modelConnector.py CHANGED
@@ -6,9 +6,11 @@ from model_trainer import train_models
 from data_loader import load_and_process_data, CATEGORICAL_COLUMNS, load_data, add_aggregated_features, preprocess_data, TARGET_COLUMN
 
 # Hugging Face Model & Dataset Information
-MODEL_REPO = "chagu13/is_click"
+# MODEL_REPO = "chagu13/is_click"  # commented out to avoid broken links when referencing across repos
+MODEL_REPO = "KaiquanMah/is_click"
 MODEL_FILENAME = "models/rf_model.pkl"
-DATA_REPO = "chagu13/is_click_data"
+# DATA_REPO = "chagu13/is_click_data"  # commented out to avoid broken links when referencing across repos
+DATA_REPO = "KaiquanMah/is_click"
 LOCAL_MODEL_PATH = f"models/{MODEL_FILENAME}"
 
 # Hugging Face API
@@ -39,16 +41,21 @@ class ModelConnector:
     def train_model(self):
        """Train a new model and upload it to Hugging Face."""
        try:
-            # Download datasets
-            train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
-            # We also need the test set for the processing pipeline validation/consistency.
-            # hf_hub_download needs the exact file name: app.py mentions "X_test_1st(1).csv"
-            # while data_loader uses "X_test_1st.csv", so try the cleaner name first and fall back.
+            # v1 (dataset repo downloads) - kept for reference
+            # train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
+            # try:
+            #     test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
+            # except Exception:
+            #     test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st (1).csv")
+
+            # v2: download datasets from the Space; the files live in its 'data/' directory
+            train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/train_dataset_full - train_dataset_full.csv")
+
             try:
-                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
-            except:
-                # Fallback to the other name if the first one fails
-                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st (1).csv")
+                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st.csv")
+            except Exception:
+                # Fall back to the alternate file name
+                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st (1).csv")
 
 
            # Load and process data using the central pipeline
@@ -82,12 +89,13 @@ class ModelConnector:
     def retrain_model(self):
        """Retrain the existing model with new data."""
        try:
-            # Download datasets (ensure we have latest)
-            train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
+            # Download the latest datasets from the Space
+            train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/train_dataset_full - train_dataset_full.csv")
+
             try:
-                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
-            except:
-                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st (1).csv")
+                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st.csv")
+            except Exception:
+                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st (1).csv")
 
            # Re-run the full processing pipeline
            X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
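The same two-filename fallback appears in both `train_model` and `retrain_model`; if a third candidate name ever shows up, the nested try/except could be replaced by a small helper. A hedged sketch — `download_first_available` is a hypothetical name, not part of this repo:

```python
from huggingface_hub import hf_hub_download

def download_first_available(repo_id, filenames, repo_type="space"):
    """Hypothetical helper: return the local path of the first filename that downloads."""
    last_err = None
    for name in filenames:
        try:
            return hf_hub_download(repo_id=repo_id, repo_type=repo_type, filename=name)
        except Exception as err:  # e.g. an EntryNotFoundError for a missing file
            last_err = err
    raise FileNotFoundError(f"None of {filenames} found in {repo_id}") from last_err

# Usage mirroring the diff above:
# test_data_path = download_first_available(
#     "KaiquanMah/is_click", ["data/X_test_1st.csv", "data/X_test_1st (1).csv"])
```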
model_manager.py CHANGED
@@ -5,21 +5,46 @@ from config import CATBOOST_MODEL_PATH, XGB_MODEL_PATH, RF_MODEL_PATH
 
 def save_models(models):
     """ Save trained models """
-    models["CatBoost"].save_model(CATBOOST_MODEL_PATH)
-    if models["XGBoost"] is not None:
-        # Save XGBoost model in binary format to reduce memory usage
-        models["XGBoost"].get_booster().save_model(XGB_MODEL_PATH)
-    joblib.dump(models["RandomForest"], RF_MODEL_PATH)
+    # v1 - mixture of native save formats
+    # models["CatBoost"].save_model(CATBOOST_MODEL_PATH)
+    # if models["XGBoost"] is not None:
+    #     # Save XGBoost model in binary format to reduce memory usage
+    #     models["XGBoost"].get_booster().save_model(XGB_MODEL_PATH)
+    # joblib.dump(models["RandomForest"], RF_MODEL_PATH)
+
+    # v2 - save all models using joblib (pickle)
+    for name, path in [("CatBoost", CATBOOST_MODEL_PATH), ("XGBoost", XGB_MODEL_PATH), ("RandomForest", RF_MODEL_PATH)]:
+        if models.get(name) is not None:
+            joblib.dump(models[name], path)
+
     print("✅ Models saved successfully!")
 
 def load_models():
     """ Load trained models """
-    catboost = CatBoostClassifier()
-    catboost.load_model(CATBOOST_MODEL_PATH)
-
-    xgb = XGBClassifier()  # Load XGBoost model in binary format
-    xgb.load_model(XGB_MODEL_PATH)
-
-    rf = joblib.load(RF_MODEL_PATH)
-
-    return {"CatBoost": catboost, "XGBoost": xgb, "RandomForest": rf}
+    # v1 - native formats
+    # catboost = CatBoostClassifier()
+    # catboost.load_model(CATBOOST_MODEL_PATH)
+    # xgb = XGBClassifier()
+    # xgb.load_model(XGB_MODEL_PATH)
+    # rf = joblib.load(RF_MODEL_PATH)
+    # return {"CatBoost": catboost, "XGBoost": xgb, "RandomForest": rf}
+
+    # v2 - load each model with joblib; a missing file yields None
+    models = {}
+
+    try:
+        models["CatBoost"] = joblib.load(CATBOOST_MODEL_PATH)
+    except Exception:
+        models["CatBoost"] = None
+
+    try:
+        models["XGBoost"] = joblib.load(XGB_MODEL_PATH)
+    except Exception:
+        models["XGBoost"] = None
+
+    try:
+        models["RandomForest"] = joblib.load(RF_MODEL_PATH)
+    except Exception:
+        models["RandomForest"] = None
+
+    return models
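Since the v2 `load_models()` returns None for any checkpoint it cannot read, callers can filter before predicting; a minimal usage sketch:

```python
from model_manager import load_models

# load_models() yields None for any checkpoint that failed to load,
# so keep only the usable models before building the ensemble.
models = load_models()
available = {name: m for name, m in models.items() if m is not None}
print(f"Loaded {len(available)}/3 models: {sorted(available)}")
```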
models/catboost_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f39fd49eba13ac1975ff4450dcbf76277ddf6d07f36fc26d6e4ab9a87500b0b7
+size 911336
models/randomforest_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22ca6974136bd0c244026586b6efc06bb37123b5911ea78b8dd9d282a906a397
+size 111639785
models/xgboost_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bce423cd91635d2a7b5be212518b229b6ece8a88562d7ad370e93b8f1ecf3f2
+size 3046668
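The three `.pkl` entries above are Git LFS pointer files (spec version, sha256 oid, and byte size), not the binaries themselves; `hf_hub_download` resolves a pointer to the actual file. A minimal sketch using paths from this commit:

```python
import joblib
from huggingface_hub import hf_hub_download

# Resolve the LFS/Xet pointer to the actual ~911 KB binary and load it
path = hf_hub_download(
    repo_id="KaiquanMah/is_click",
    repo_type="space",
    filename="models/catboost_model.pkl",
)
model = joblib.load(path)  # the v2 checkpoint was saved via joblib
print(type(model))
```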