kaiquanmah committed
Commit a203ed2 · Parent: 0e613f0

v2: tested pipeline in Colab; updated from git-lfs to git-xet to upload model checkpoint files

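As a hedged sketch of the upload step the commit message refers to (repo id and checkpoint names taken from the diffs below; the write token setup is assumed and not part of this commit), the v2 pickles could be pushed with `huggingface_hub`, which routes large files to the Hub's Xet/LFS-backed storage:

```python
from huggingface_hub import HfApi

# Hedged sketch: push the v2 pickled checkpoints to this Space.
# Assumes a write token is already configured (e.g. via `huggingface-cli login`).
api = HfApi()
for fname in ["catboost_model.pkl", "xgboost_model.pkl", "randomforest_model.pkl"]:
    api.upload_file(
        path_or_fileobj=f"models/{fname}",
        path_in_repo=f"models/{fname}",
        repo_id="KaiquanMah/is_click",
        repo_type="space",
    )
```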
.gitattributes CHANGED
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 models/catboost_model.cbm filter=lfs diff=lfs merge=lfs -text
 data/train_dataset_full[[:space:]]-[[:space:]]train_dataset_full.csv filter=lfs diff=lfs merge=lfs -text
 models/v1_202502/catboost_model.cbm filter=lfs diff=lfs merge=lfs -text
+*.cbm filter=lfs diff=lfs merge=lfs -text
COLAB_verify_pipeline.py ADDED
@@ -0,0 +1,113 @@
+import pandas as pd
+import joblib
+import os
+import numpy as np
+from huggingface_hub import hf_hub_download
+from model_trainer import train_models
+from data_loader import load_and_process_data, CATEGORICAL_COLUMNS, TARGET_COLUMN
+
+# Configuration
+DATA_REPO = "KaiquanMah/is_click"
+MODELS_DIR = "models"
+os.makedirs(MODELS_DIR, exist_ok=True)
+
+def run_verification():
+    print("🚀 Starting Verification Pipeline...")
+
+    # 1. Download Data
+    print("🔹 Downloading datasets from Hugging Face...")
+    try:
+        # The data files live in the 'data/' folder of the Space
+        train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/train_dataset_full - train_dataset_full.csv")
+        # Try the primary test file name, then fall back to the alternate name
+        try:
+            test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st.csv")
+        except Exception:
+            test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st (1).csv")
+
+        print(f"✅ Data downloaded:\n   Train: {train_data_path}\n   Test: {test_data_path}")
+    except Exception as e:
+        print(f"❌ Error downloading data: {e}")
+        return
+
+    # 2. Process Data
+    print("🔹 Processing data...")
+    try:
+        X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
+        print(f"✅ Data processed. X_train shape: {X_train.shape}")
+    except Exception as e:
+        print(f"❌ Error processing data: {e}")
+        return
+
+    # 3. Train All Models
+    print("🔹 Training models (CatBoost, XGBoost, RandomForest)...")
+    try:
+        models = train_models(X_train, y_train, CATEGORICAL_COLUMNS)
+        print("✅ Models trained successfully.")
+    except Exception as e:
+        print(f"❌ Error training models: {e}")
+        return
+
+    # 4. Save All Models
+    print("🔹 Saving models...")
+    saved_paths = []
+    for name, model in models.items():
+        if model is not None:
+            path = os.path.join(MODELS_DIR, f"{name.lower()}_model.pkl")
+            joblib.dump(model, path)
+            saved_paths.append(path)
+            print(f"   Saved {name} to {path}")
+        else:
+            print(f"   ⚠ {name} was not trained (None).")
+
+    # 5. Ensemble Prediction
+    print("🔹 Generating Ensemble Predictions on Test Set...")
+    output_path = None
+    try:
+        # Align test_df columns with X_train so every model sees the same
+        # feature order it was trained on. load_and_process_data already
+        # label-encodes the categorical columns, and CatBoost was fit on the
+        # same DataFrame layout (with cat_features indices), so the aligned
+        # DataFrame can be passed straight to predict_proba.
+        X_test = test_df[X_train.columns]
+
+        predictions = {}
+        probas = []
+
+        for name, model in models.items():
+            if model is not None:
+                p = model.predict_proba(X_test)[:, 1]  # Probability of class 1
+                predictions[name] = p
+                probas.append(p)
+
+        if not probas:
+            print("❌ No models available for prediction.")
+            return
+
+        # Simple average ensemble across the available models
+        avg_proba = np.mean(probas, axis=0)
+        final_preds = (avg_proba >= 0.5).astype(int)
+
+        # Save predictions
+        output_df = test_df.copy()
+        output_df["is_click_predicted_proba"] = avg_proba
+        output_df["is_click_predicted"] = final_preds
+
+        output_path = "ensemble_predictions.csv"
+        output_df.to_csv(output_path, index=False)
+        print(f"✅ Ensemble predictions saved to {output_path}")
+
+    except Exception as e:
+        print(f"❌ Error during prediction: {e}")
+        import traceback
+        traceback.print_exc()
+
+    print("\n🎉 Verification Complete!")
+    print("Files generated:")
+    for p in saved_paths:
+        print(f" - {p}")
+    if output_path:
+        print(f" - {output_path}")
+
+if __name__ == "__main__":
+    run_verification()
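The ensemble step in the script above is an unweighted average of each model's class-1 probability, thresholded at 0.5. A self-contained toy illustration (the numbers are invented, not real model outputs):

```python
import numpy as np

# Toy class-1 probabilities from three models over four test rows
probas = [
    np.array([0.2, 0.8, 0.55, 0.4]),  # e.g. CatBoost
    np.array([0.3, 0.7, 0.60, 0.3]),  # e.g. XGBoost
    np.array([0.1, 0.9, 0.40, 0.5]),  # e.g. RandomForest
]

avg_proba = np.mean(probas, axis=0)           # [0.2, 0.8, 0.5167, 0.4]
final_preds = (avg_proba >= 0.5).astype(int)  # [0, 1, 1, 0]
print(avg_proba, final_preds)
```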
COLAB_zip_project.py ADDED
@@ -0,0 +1,41 @@
+import shutil
+import os
+
+def zip_project():
+    # Files to include
+    files = [
+        "app.py",
+        "config.py",
+        "data_loader.py",
+        "modelConnector.py",
+        "model_trainer.py",
+        "requirements.txt",
+        "COLAB_verify_pipeline.py",
+        "README.md"
+    ]
+
+    # Create a temporary staging directory
+    os.makedirs("colab_pack", exist_ok=True)
+
+    # Copy files into the staging directory
+    for f in files:
+        if os.path.exists(f):
+            shutil.copy(f, os.path.join("colab_pack", f))
+        else:
+            print(f"Warning: {f} not found.")
+
+    # The 'data' folder is intentionally not included:
+    # COLAB_verify_pipeline.py downloads the datasets from Hugging Face,
+    # so only the code files are needed in Colab.
+
+    # Zip the staging directory
+    output_filename = "is_click_project_colab"
+    shutil.make_archive(output_filename, 'zip', "colab_pack")
+
+    print(f"✅ Created {output_filename}.zip")
+
+    # Clean up the staging directory
+    shutil.rmtree("colab_pack")
+
+if __name__ == "__main__":
+    zip_project()
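Before uploading the archive to Colab, its contents can be inspected with the standard library; a minimal sketch, assuming `zip_project()` has already been run in the same directory (note that `shutil.make_archive` stores the staged files at the top level of the zip):

```python
import zipfile

# Inspect the archive produced by zip_project() before uploading it to Colab
with zipfile.ZipFile("is_click_project_colab.zip") as zf:
    names = zf.namelist()
    print("\n".join(names))
    assert "COLAB_verify_pipeline.py" in names, "pipeline script missing from archive"
```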
README_COLAB.md ADDED
@@ -0,0 +1,46 @@
+# Colab Verification Instructions
+
+If you do not have Python locally, follow these steps to verify the code in Google Colab.
+
+## 1. Prepare Files
+Place the following files in a folder named `project`, then zip that folder into **`project.zip`** (the commands below assume this folder name):
+- `app.py`
+- `config.py`
+- `data_loader.py`
+- `modelConnector.py`
+- `model_trainer.py`
+- `requirements.txt`
+- `COLAB_verify_pipeline.py`
+
+## 2. Open Google Colab
+Go to [Google Colab](https://colab.research.google.com/) and create a **New Notebook**.
+
+## 3. Upload Project
+In the left sidebar of Colab, click the **Folder icon (Files)**, then click the **Upload icon**.
+Upload your **`project.zip`** file.
+
+## 4. Run Commands
+Copy and run the following commands in separate code cells:
+
+### Cell 1: Unzip and Install Dependencies
+```bash
+!unzip project.zip
+!pip install -r project/requirements.txt
+```
+
+### Cell 2: Run Verification Pipeline
+This script downloads the data, trains all three models (CatBoost, XGBoost, RandomForest), saves them, and generates ensemble predictions.
+```bash
+!python project/COLAB_verify_pipeline.py
+```
+
+## 5. Download Results
+After the script finishes, check the **Files** sidebar (you may need to refresh it).
+You will find:
+- A `models/` folder containing:
+  - `catboost_model.pkl`
+  - `xgboost_model.pkl`
+  - `randomforest_model.pkl`
+- A file named `ensemble_predictions.csv`
+
+You can right-click these files to download them.
app.py CHANGED
@@ -13,11 +13,11 @@ from data_loader import load_and_process_data, load_data
 
 st.title("📊 Is Click Predictor")
 
 # Download and load the trained model from Hugging Face
 # We wrap this in a try-except block just in case, but modelConnector handles checking too.
 # However, app.py tries to load it directly for the initial "Success" message.
 try:
-    model_path = hf_hub_download(repo_id="chagu13/is_click", repo_type="space", filename="models/rf_model.pkl")
+    model_path = hf_hub_download(repo_id="KaiquanMah/is_click", repo_type="space", filename="models/rf_model.pkl")
     rf_model = joblib.load(model_path)
     st.success("✅ Model Loaded Successfully!")
 except Exception:
config.py CHANGED
@@ -4,9 +4,14 @@ import os
 MODEL_DIR = "models"
 
 # Model File Paths
-CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.cbm")
-XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.json")
-RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
+# v1_202502 - mixture of saved file formats
+# CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.cbm")
+# XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.json")
+# RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
+# v2_202502 - all models saved as pickle
+CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
+XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgboost_model.pkl")
+RF_MODEL_PATH = os.path.join(MODEL_DIR, "randomforest_model.pkl")
 
data/v1_202502/y_test_1st (1).csv ADDED
The diff for this file is too large to render. See raw diff
 
data/y_test_1st (1).csv CHANGED
The diff for this file is too large to render. See raw diff
 
modelConnector.py CHANGED
@@ -6,9 +6,11 @@ from model_trainer import train_models
 from data_loader import load_and_process_data, CATEGORICAL_COLUMNS, load_data, add_aggregated_features, preprocess_data, TARGET_COLUMN
 
 # Hugging Face Model & Dataset Information
-MODEL_REPO = "chagu13/is_click"
+# MODEL_REPO = "chagu13/is_click"  # commented out to avoid broken links when referencing across repos
+MODEL_REPO = "KaiquanMah/is_click"
 MODEL_FILENAME = "models/rf_model.pkl"
-DATA_REPO = "chagu13/is_click_data"
+# DATA_REPO = "chagu13/is_click_data"  # commented out to avoid broken links when referencing across repos
+DATA_REPO = "KaiquanMah/is_click"
 LOCAL_MODEL_PATH = f"models/{MODEL_FILENAME}"
 
 # Hugging Face API
@@ -39,16 +41,21 @@ class ModelConnector:
     def train_model(self):
        """Train a new model and upload it to Hugging Face."""
        try:
-            # Download datasets
-            train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
-            # We also need the test set for the processing pipeline validation/consistency.
-            # hf_hub_download needs the exact file name: app.py mentions "X_test_1st(1).csv"
-            # while data_loader uses "X_test_1st.csv", so try the cleaner name first and fall back.
+            # v1 (dataset repo downloads) - kept for reference
+            # train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
+            # try:
+            #     test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
+            # except Exception:
+            #     test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st (1).csv")
+
+            # v2: download datasets from the Space; the files live in its 'data/' directory
+            train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/train_dataset_full - train_dataset_full.csv")
+
             try:
-                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
-            except:
-                # Fallback to the other name if the first one fails
-                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st (1).csv")
+                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st.csv")
+            except Exception:
+                # Fall back to the alternate file name
+                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st (1).csv")
 
 
            # Load and process data using the central pipeline
@@ -82,12 +89,13 @@ class ModelConnector:
     def retrain_model(self):
        """Retrain the existing model with new data."""
        try:
-            # Download datasets (ensure we have latest)
-            train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
+            # Download the latest datasets from the Space
+            train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/train_dataset_full - train_dataset_full.csv")
+
             try:
-                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
-            except:
-                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st (1).csv")
+                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st.csv")
+            except Exception:
+                test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st (1).csv")
 
            # Re-run the full processing pipeline
            X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
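The same two-filename fallback appears in both `train_model` and `retrain_model`; if a third candidate name ever shows up, the nested try/except could be replaced by a small helper. A hedged sketch — `download_first_available` is a hypothetical name, not part of this repo:

```python
from huggingface_hub import hf_hub_download

def download_first_available(repo_id, filenames, repo_type="space"):
    """Hypothetical helper: return the local path of the first filename that downloads."""
    last_err = None
    for name in filenames:
        try:
            return hf_hub_download(repo_id=repo_id, repo_type=repo_type, filename=name)
        except Exception as err:  # e.g. an EntryNotFoundError for a missing file
            last_err = err
    raise FileNotFoundError(f"None of {filenames} found in {repo_id}") from last_err

# Usage mirroring the diff above:
# test_data_path = download_first_available(
#     "KaiquanMah/is_click", ["data/X_test_1st.csv", "data/X_test_1st (1).csv"])
```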
model_manager.py CHANGED
@@ -5,21 +5,46 @@ from config import CATBOOST_MODEL_PATH, XGB_MODEL_PATH, RF_MODEL_PATH
 
 def save_models(models):
     """ Save trained models """
-    models["CatBoost"].save_model(CATBOOST_MODEL_PATH)
-    if models["XGBoost"] is not None:
-        # Save XGBoost model in binary format to reduce memory usage
-        models["XGBoost"].get_booster().save_model(XGB_MODEL_PATH)
-    joblib.dump(models["RandomForest"], RF_MODEL_PATH)
+    # v1 - mixture of native save formats
+    # models["CatBoost"].save_model(CATBOOST_MODEL_PATH)
+    # if models["XGBoost"] is not None:
+    #     # Save XGBoost model in binary format to reduce memory usage
+    #     models["XGBoost"].get_booster().save_model(XGB_MODEL_PATH)
+    # joblib.dump(models["RandomForest"], RF_MODEL_PATH)
+
+    # v2 - save all models using joblib (pickle)
+    for name, path in [("CatBoost", CATBOOST_MODEL_PATH), ("XGBoost", XGB_MODEL_PATH), ("RandomForest", RF_MODEL_PATH)]:
+        if models.get(name) is not None:
+            joblib.dump(models[name], path)
+
     print("✅ Models saved successfully!")
 
 def load_models():
     """ Load trained models """
-    catboost = CatBoostClassifier()
-    catboost.load_model(CATBOOST_MODEL_PATH)
-
-    xgb = XGBClassifier()  # Load XGBoost model in binary format
-    xgb.load_model(XGB_MODEL_PATH)
-
-    rf = joblib.load(RF_MODEL_PATH)
-
-    return {"CatBoost": catboost, "XGBoost": xgb, "RandomForest": rf}
+    # v1 - native formats
+    # catboost = CatBoostClassifier()
+    # catboost.load_model(CATBOOST_MODEL_PATH)
+    # xgb = XGBClassifier()
+    # xgb.load_model(XGB_MODEL_PATH)
+    # rf = joblib.load(RF_MODEL_PATH)
+    # return {"CatBoost": catboost, "XGBoost": xgb, "RandomForest": rf}
+
+    # v2 - load each model with joblib; a missing file yields None
+    models = {}
+
+    try:
+        models["CatBoost"] = joblib.load(CATBOOST_MODEL_PATH)
+    except Exception:
+        models["CatBoost"] = None
+
+    try:
+        models["XGBoost"] = joblib.load(XGB_MODEL_PATH)
+    except Exception:
+        models["XGBoost"] = None
+
+    try:
+        models["RandomForest"] = joblib.load(RF_MODEL_PATH)
+    except Exception:
+        models["RandomForest"] = None
+
+    return models
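Since the v2 `load_models()` returns None for any checkpoint it cannot read, callers can filter before predicting; a minimal usage sketch:

```python
from model_manager import load_models

# load_models() yields None for any checkpoint that failed to load,
# so keep only the usable models before building the ensemble.
models = load_models()
available = {name: m for name, m in models.items() if m is not None}
print(f"Loaded {len(available)}/3 models: {sorted(available)}")
```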
models/catboost_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f39fd49eba13ac1975ff4450dcbf76277ddf6d07f36fc26d6e4ab9a87500b0b7
+size 911336
models/randomforest_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22ca6974136bd0c244026586b6efc06bb37123b5911ea78b8dd9d282a906a397
+size 111639785
models/xgboost_model.pkl ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:8bce423cd91635d2a7b5be212518b229b6ece8a88562d7ad370e93b8f1ecf3f2
+size 3046668
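The three `.pkl` entries above are Git LFS pointer files (spec version, sha256 oid, and byte size), not the binaries themselves; `hf_hub_download` resolves a pointer to the actual file. A minimal sketch using paths from this commit:

```python
import joblib
from huggingface_hub import hf_hub_download

# Resolve the LFS/Xet pointer to the actual ~911 KB binary and load it
path = hf_hub_download(
    repo_id="KaiquanMah/is_click",
    repo_type="space",
    filename="models/catboost_model.pkl",
)
model = joblib.load(path)  # the v2 checkpoint was saved via joblib
print(type(model))
```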