Spaces:
Sleeping
Sleeping
kaiquanmah committed on
Commit ·
a203ed2
1
Parent(s): 0e613f0
v2 tested pipeline in colab. update from git-lfs to git xet to upload model checkpoint files
Browse files- .gitattributes +1 -0
- COLAB_verify_pipeline.py +119 -0
- COLAB_zip_project.py +43 -0
- README_COLAB.md +46 -0
- app.py +4 -1
- config.py +8 -3
- data/v1_202502/y_test_1st (1).csv +0 -0
- data/y_test_1st (1).csv +0 -0
- modelConnector.py +40 -19
- model_manager.py +41 -11
- models/catboost_model.pkl +3 -0
- models/randomforest_model.pkl +3 -0
- models/xgboost_model.pkl +3 -0
.gitattributes
CHANGED
|
@@ -36,3 +36,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 36 |
models/catboost_model.cbm filter=lfs diff=lfs merge=lfs -text
|
| 37 |
data/train_dataset_full[[:space:]]-[[:space:]]train_dataset_full.csv filter=lfs diff=lfs merge=lfs -text
|
| 38 |
models/v1_202502/catboost_model.cbm filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 36 |
models/catboost_model.cbm filter=lfs diff=lfs merge=lfs -text
|
| 37 |
data/train_dataset_full[[:space:]]-[[:space:]]train_dataset_full.csv filter=lfs diff=lfs merge=lfs -text
|
| 38 |
models/v1_202502/catboost_model.cbm filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
*.cbm filter=lfs diff=lfs merge=lfs -text
|
COLAB_verify_pipeline.py
ADDED
|
@@ -0,0 +1,119 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import joblib
|
| 3 |
+
import os
|
| 4 |
+
import numpy as np
|
| 5 |
+
from huggingface_hub import hf_hub_download
|
| 6 |
+
from model_trainer import train_models
|
| 7 |
+
from data_loader import load_and_process_data, CATEGORICAL_COLUMNS, TARGET_COLUMN
|
| 8 |
+
|
| 9 |
+
# Configuration
|
| 10 |
+
# Configuration
|
| 11 |
+
DATA_REPO = "KaiquanMah/is_click"
|
| 12 |
+
MODELS_DIR = "models"
|
| 13 |
+
os.makedirs(MODELS_DIR, exist_ok=True)
|
| 14 |
+
|
| 15 |
+
def run_verification():
    """Run the end-to-end verification pipeline and report what was produced.

    Stages, in order:
      1. Download the training and test CSVs from the ``DATA_REPO`` Space.
      2. Process them with ``load_and_process_data``.
      3. Train the models with ``train_models``.
      4. Dump every non-None model into ``MODELS_DIR`` with joblib.
      5. Average the class-1 probabilities of all trained models, threshold
         at 0.5 and write ``ensemble_predictions.csv``.

    Failures in stages 1-3 are printed and end the run early. A failure in
    stage 5 is printed with a traceback; the saved model files are still
    listed, but the run is not reported as complete.

    Returns:
        None. Progress and results are reported via ``print``.
    """
    print("🚀 Starting Verification Pipeline...")

    # 1. Download Data
    print("🔹 Downloading datasets from Hugging Face...")
    try:
        # Files are expected under 'data/' in the Space.
        train_data_path = hf_hub_download(
            repo_id=DATA_REPO,
            repo_type="space",
            filename="data/train_dataset_full - train_dataset_full.csv",
        )
        # Try the primary test file name first, then the alternate name.
        try:
            test_data_path = hf_hub_download(
                repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st.csv"
            )
        except Exception:
            # Not a bare ``except:`` so Ctrl-C / SystemExit still propagate.
            test_data_path = hf_hub_download(
                repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st (1).csv"
            )

        print(f"✅ Data downloaded:\n Train: {train_data_path}\n Test: {test_data_path}")
    except Exception as e:
        print(f"❌ Error downloading data: {e}")
        return

    # 2. Process Data
    print("🔹 Processing data...")
    try:
        X_train, X_val, y_train, y_val, test_df = load_and_process_data(
            train_path=train_data_path, test_path=test_data_path
        )
        print(f"✅ Data processed. X_train shape: {X_train.shape}")
    except Exception as e:
        print(f"❌ Error processing data: {e}")
        return

    # 3. Train All Models
    print("🔹 Training models (CatBoost, XGBoost, RandomForest)...")
    try:
        models = train_models(X_train, y_train, CATEGORICAL_COLUMNS)
        print("✅ Models trained successfully.")
    except Exception as e:
        print(f"❌ Error training models: {e}")
        return

    # 4. Save All Models
    print("🔹 Saving models...")
    saved_paths = []
    for name, model in models.items():
        if model is not None:
            path = os.path.join(MODELS_DIR, f"{name.lower()}_model.pkl")
            joblib.dump(model, path)
            saved_paths.append(path)
            print(f" Saved {name} to {path}")
        else:
            print(f" ⚠ {name} was not trained (None).")

    # 5. Ensemble Prediction
    print("🔹 Generating Ensemble Predictions on Test Set...")
    # Stays None unless the predictions file is actually written, so the
    # summary below never references an unbound name.
    output_path = None
    try:
        # Feed the models exactly the columns they were fitted on.
        X_test = test_df[X_train.columns]

        # NOTE(review): X_test is passed as a DataFrame, like X_train at fit
        # time; confirm in model_trainer that CatBoost's categorical-feature
        # setup is compatible with this at predict time.
        probas = [
            model.predict_proba(X_test)[:, 1]  # probability of class 1
            for model in models.values()
            if model is not None
        ]

        if not probas:
            print("❌ No models available for prediction.")
            return

        # Simple average ensemble, thresholded at 0.5.
        avg_proba = np.mean(probas, axis=0)
        final_preds = (avg_proba >= 0.5).astype(int)

        # Save predictions alongside the processed test features.
        output_df = test_df.copy()
        output_df["is_click_predicted_proba"] = avg_proba
        output_df["is_click_predicted"] = final_preds

        target_path = "ensemble_predictions.csv"
        output_df.to_csv(target_path, index=False)
        output_path = target_path
        print(f"✅ Ensemble predictions saved to {output_path}")

    except Exception as e:
        print(f"❌ Error during prediction: {e}")
        import traceback
        traceback.print_exc()

    if output_path is None:
        print("\n⚠ Verification finished with errors: ensemble predictions were not written.")
    else:
        print("\n🎉 Verification Complete!")
    print("Files generated:")
    for p in saved_paths:
        print(f" - {p}")
    if output_path is not None:
        print(f" - {output_path}")
|
| 117 |
+
|
| 118 |
+
if __name__ == "__main__":
|
| 119 |
+
run_verification()
|
COLAB_zip_project.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import shutil
|
| 2 |
+
import os
|
| 3 |
+
|
| 4 |
+
def zip_project():
    """Bundle the project's source files into ``is_click_project_colab.zip``.

    Each file in the fixed list below that exists in the current working
    directory is copied into a private temporary staging directory, which is
    then archived. Missing files are reported with a warning and skipped.

    Staging happens in a ``tempfile.TemporaryDirectory`` rather than a fixed
    ``colab_pack`` folder, so a pre-existing ``colab_pack`` directory is
    neither swept into the archive nor deleted, and the staging area is
    removed even if copying or archiving fails.

    Returns:
        None. The archive is written to the current working directory.
    """
    import tempfile  # local import: only this function needs it

    # Files to include
    files = [
        "app.py",
        "config.py",
        "data_loader.py",
        "modelConnector.py",
        "model_trainer.py",
        "requirements.txt",
        "COLAB_verify_pipeline.py",
        "README.md",
    ]

    # The 'data' folder is intentionally not packaged:
    # COLAB_verify_pipeline.py downloads its data from Hugging Face.
    output_filename = "is_click_project_colab"

    with tempfile.TemporaryDirectory(prefix="colab_pack_") as staging_dir:
        # Copy files
        for f in files:
            if os.path.exists(f):
                shutil.copy(f, os.path.join(staging_dir, f))
            else:
                print(f"Warning: {f} not found.")

        # Zip the staged files (archive root = staging directory contents)
        shutil.make_archive(output_filename, "zip", staging_dir)

    print(f"✅ Created {output_filename}.zip")
|
| 41 |
+
|
| 42 |
+
if __name__ == "__main__":
|
| 43 |
+
zip_project()
|
README_COLAB.md
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Colab Verification Instructions
|
| 2 |
+
|
| 3 |
+
If you do not have Python locally, please follow these steps to verify the code in Google Colab.
|
| 4 |
+
|
| 5 |
+
## 1. Prepare Files
|
| 6 |
+
Select the following files from your folder and zip them into a file named **`project.zip`**:
|
| 7 |
+
- `app.py`
|
| 8 |
+
- `config.py`
|
| 9 |
+
- `data_loader.py`
|
| 10 |
+
- `modelConnector.py`
|
| 11 |
+
- `model_trainer.py`
|
| 12 |
+
- `requirements.txt`
|
| 13 |
+
- `COLAB_verify_pipeline.py`
|
| 14 |
+
|
| 15 |
+
## 2. Open Google Colab
|
| 16 |
+
Go to [Google Colab](https://colab.research.google.com/) and create a **New Notebook**.
|
| 17 |
+
|
| 18 |
+
## 3. Upload Project
|
| 19 |
+
In the left sidebar of Colab, click the **Folder icon (Files)**, then click the **Upload icon**.
|
| 20 |
+
Upload your **`project.zip`** file.
|
| 21 |
+
|
| 22 |
+
## 4. Run Commands
|
| 23 |
+
Copy and run the following commands in separate code cells:
|
| 24 |
+
|
| 25 |
+
### Cell 1: Unzip and Install Dependencies
|
| 26 |
+
```bash
|
| 27 |
+
!unzip -o project.zip -d project
|
| 28 |
+
!pip install -r project/requirements.txt
|
| 29 |
+
```
|
| 30 |
+
|
| 31 |
+
### Cell 2: Run Verification Pipeline
|
| 32 |
+
This script will download data, train all 3 models (CatBoost, XGBoost, RandomForest), save them, and generate ensemble predictions.
|
| 33 |
+
```bash
|
| 34 |
+
!python project/COLAB_verify_pipeline.py
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
## 5. Download Results
|
| 38 |
+
After the script finishes, check the **Files** sidebar (you may need to refresh it).
|
| 39 |
+
You will find:
|
| 40 |
+
- A `models/` folder containing:
|
| 41 |
+
- `catboost_model.pkl`
|
| 42 |
+
- `xgboost_model.pkl`
|
| 43 |
+
- `randomforest_model.pkl`
|
| 44 |
+
- A file named `ensemble_predictions.csv`
|
| 45 |
+
|
| 46 |
+
You can right-click these files to download them.
|
app.py
CHANGED
|
@@ -13,11 +13,14 @@ from data_loader import load_and_process_data, load_data
|
|
| 13 |
|
| 14 |
st.title("📊 Is Click Predictor")
|
| 15 |
|
|
|
|
|
|
|
|
|
|
| 16 |
# Download and load the trained model from Hugging Face
|
| 17 |
# We wrap this in a try-except block just in case, but modelConnector handles checking too.
|
| 18 |
# However, app.py tries to load it directly for initial "Success" message.
|
| 19 |
try:
|
| 20 |
-
model_path = hf_hub_download(repo_id="
|
| 21 |
rf_model = joblib.load(model_path)
|
| 22 |
st.success("✅ Model Loaded Successfully!")
|
| 23 |
except Exception:
|
|
|
|
| 13 |
|
| 14 |
st.title("📊 Is Click Predictor")
|
| 15 |
|
| 16 |
+
# Download and load the trained model from Hugging Face
|
| 17 |
+
# We wrap this in a try-except block just in case, but modelConnector handles checking too.
|
| 18 |
+
# However, app.py tries to load it directly for initial "Success" message.
|
| 19 |
# Download and load the trained model from Hugging Face
|
| 20 |
# We wrap this in a try-except block just in case, but modelConnector handles checking too.
|
| 21 |
# However, app.py tries to load it directly for initial "Success" message.
|
| 22 |
try:
|
| 23 |
+
model_path = hf_hub_download(repo_id="KaiquanMah/is_click", repo_type="space", filename="models/rf_model.pkl")
|
| 24 |
rf_model = joblib.load(model_path)
|
| 25 |
st.success("✅ Model Loaded Successfully!")
|
| 26 |
except Exception:
|
config.py
CHANGED
|
@@ -4,9 +4,14 @@ import os
|
|
| 4 |
MODEL_DIR = "models"
|
| 5 |
|
| 6 |
# Model File Paths
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
|
|
|
|
| 4 |
MODEL_DIR = "models"
|
| 5 |
|
| 6 |
# Model File Paths
|
| 7 |
+
# v1_202502 - mixture of saved file formats
|
| 8 |
+
# CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.cbm")
|
| 9 |
+
# XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgb_model.json")
|
| 10 |
+
# RF_MODEL_PATH = os.path.join(MODEL_DIR, "rf_model.pkl")
|
| 11 |
+
# v2_202502 - all saved as pickle
|
| 12 |
+
CATBOOST_MODEL_PATH = os.path.join(MODEL_DIR, "catboost_model.pkl")
|
| 13 |
+
XGB_MODEL_PATH = os.path.join(MODEL_DIR, "xgboost_model.pkl")
|
| 14 |
+
RF_MODEL_PATH = os.path.join(MODEL_DIR, "randomforest_model.pkl")
|
| 15 |
|
| 16 |
|
| 17 |
|
data/v1_202502/y_test_1st (1).csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
data/y_test_1st (1).csv
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
modelConnector.py
CHANGED
|
@@ -6,9 +6,11 @@ from model_trainer import train_models
|
|
| 6 |
from data_loader import load_and_process_data, CATEGORICAL_COLUMNS, load_data, add_aggregated_features, preprocess_data, TARGET_COLUMN
|
| 7 |
|
| 8 |
# Hugging Face Model & Dataset Information
|
| 9 |
-
MODEL_REPO = "chagu13/is_click"
|
|
|
|
| 10 |
MODEL_FILENAME = "models/rf_model.pkl"
|
| 11 |
-
DATA_REPO = "chagu13/is_click_data"
|
|
|
|
| 12 |
LOCAL_MODEL_PATH = f"models/{MODEL_FILENAME}"
|
| 13 |
|
| 14 |
# Hugging Face API
|
|
@@ -39,21 +41,35 @@ class ModelConnector:
|
|
| 39 |
def train_model(self):
|
| 40 |
"""Train a new model and upload it to Hugging Face."""
|
| 41 |
try:
|
| 42 |
-
# Download datasets
|
| 43 |
-
train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
|
| 44 |
-
# We also need the test set for the processing pipeline validation/consistency
|
| 45 |
-
# Try to download standard test file. If not found, it might return an error, but assuming it exists in repo.
|
| 46 |
-
# Based on app.py comments, filename might be "X_test_1st(1).csv" or "X_test_1st.csv".
|
| 47 |
-
# We'll try the one matching local naming first, or fall back to what we know works in data_loader if local.
|
| 48 |
-
# However, hf_hub_download needs exact name.
|
| 49 |
-
# app.py comments: X_test_1st(1).csv
|
| 50 |
-
# data_loader: X_test_1st.csv
|
| 51 |
-
# Let's try "X_test_1st.csv" first as it's cleaner, if it fails user might need to adjust.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
try:
|
| 53 |
-
test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="
|
| 54 |
except:
|
| 55 |
-
# Fallback
|
| 56 |
-
|
|
|
|
| 57 |
|
| 58 |
|
| 59 |
# Load and process data using the central pipeline
|
|
@@ -82,12 +98,17 @@ class ModelConnector:
|
|
| 82 |
def retrain_model(self):
|
| 83 |
"""Retrain the existing model with new data."""
|
| 84 |
try:
|
| 85 |
-
# Download datasets (ensure we have latest)
|
| 86 |
-
train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
|
|
|
|
|
|
|
|
|
|
| 87 |
try:
|
| 88 |
-
|
|
|
|
| 89 |
except:
|
| 90 |
-
|
|
|
|
| 91 |
|
| 92 |
# Re-run the full processing pipeline
|
| 93 |
X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
|
|
|
|
| 6 |
from data_loader import load_and_process_data, CATEGORICAL_COLUMNS, load_data, add_aggregated_features, preprocess_data, TARGET_COLUMN
|
| 7 |
|
| 8 |
# Hugging Face Model & Dataset Information
|
| 9 |
+
# MODEL_REPO = "chagu13/is_click" # commented out to avoid broken links when referencing across repos
|
| 10 |
+
MODEL_REPO = "KaiquanMah/is_click"
|
| 11 |
MODEL_FILENAME = "models/rf_model.pkl"
|
| 12 |
+
# DATA_REPO = "chagu13/is_click_data" # commented out to avoid broken links when referencing across repos
|
| 13 |
+
DATA_REPO = "KaiquanMah/is_click"
|
| 14 |
LOCAL_MODEL_PATH = f"models/{MODEL_FILENAME}"
|
| 15 |
|
| 16 |
# Hugging Face API
|
|
|
|
| 41 |
def train_model(self):
|
| 42 |
"""Train a new model and upload it to Hugging Face."""
|
| 43 |
try:
|
| 44 |
+
# # Download datasets
|
| 45 |
+
# train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
|
| 46 |
+
# # We also need the test set for the processing pipeline validation/consistency
|
| 47 |
+
# # Try to download standard test file. If not found, it might return an error, but assuming it exists in repo.
|
| 48 |
+
# # Based on app.py comments, filename might be "X_test_1st(1).csv" or "X_test_1st.csv".
|
| 49 |
+
# # We'll try the one matching local naming first, or fall back to what we know works in data_loader if local.
|
| 50 |
+
# # However, hf_hub_download needs exact name.
|
| 51 |
+
# # app.py comments: X_test_1st(1).csv
|
| 52 |
+
# # data_loader: X_test_1st.csv
|
| 53 |
+
# # Let's try "X_test_1st.csv" first as it's cleaner, if it fails user might need to adjust.
|
| 54 |
+
# try:
|
| 55 |
+
# test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
|
| 56 |
+
# except:
|
| 57 |
+
# # Fallback to the other name if the first one fails
|
| 58 |
+
# test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st (1).csv")
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
# Download datasets from the Space
|
| 64 |
+
# Assuming the files are in 'data/' directory within the Space
|
| 65 |
+
train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/train_dataset_full - train_dataset_full.csv")
|
| 66 |
+
|
| 67 |
try:
|
| 68 |
+
test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st.csv")
|
| 69 |
except:
|
| 70 |
+
# Fallback if file not found, try without folder prefix or alternate name if needed
|
| 71 |
+
# But sticking to project structure:
|
| 72 |
+
test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st (1).csv")
|
| 73 |
|
| 74 |
|
| 75 |
# Load and process data using the central pipeline
|
|
|
|
| 98 |
def retrain_model(self):
|
| 99 |
"""Retrain the existing model with new data."""
|
| 100 |
try:
|
| 101 |
+
# # Download datasets (ensure we have latest)
|
| 102 |
+
# train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
|
| 103 |
+
# Download datasets from the Space
|
| 104 |
+
train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/train_dataset_full - train_dataset_full.csv")
|
| 105 |
+
|
| 106 |
try:
|
| 107 |
+
# test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
|
| 108 |
+
test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st.csv")
|
| 109 |
except:
|
| 110 |
+
# test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st (1).csv")
|
| 111 |
+
test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="space", filename="data/X_test_1st (1).csv")
|
| 112 |
|
| 113 |
# Re-run the full processing pipeline
|
| 114 |
X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
|
model_manager.py
CHANGED
|
@@ -5,21 +5,51 @@ from config import CATBOOST_MODEL_PATH, XGB_MODEL_PATH, RF_MODEL_PATH
|
|
| 5 |
|
| 6 |
def save_models(models):
|
| 7 |
""" Save trained models """
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
print("✅ Models saved successfully!")
|
| 14 |
|
| 15 |
def load_models():
|
| 16 |
""" Load trained models """
|
| 17 |
-
|
| 18 |
-
catboost
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
-
|
| 21 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
-
return
|
|
|
|
| 5 |
|
| 6 |
def save_models(models):
    """Save the trained models to their configured pickle paths.

    Args:
        models: Mapping from model name ("CatBoost", "XGBoost",
            "RandomForest") to a trained model. Entries that are missing or
            ``None`` are skipped.

    Returns:
        None. Prints a confirmation once the loop has finished.
    """
    import os  # local import: the module's import block is not visible here

    # v1 used mixed formats: CatBoost via save_model(), XGBoost via
    # get_booster().save_model(), RandomForest via joblib.dump().
    # v2 saves all models using joblib (pickle).
    targets = (
        ("CatBoost", CATBOOST_MODEL_PATH),
        ("XGBoost", XGB_MODEL_PATH),
        ("RandomForest", RF_MODEL_PATH),
    )
    for name, path in targets:
        model = models.get(name)
        # Compare against None explicitly: the truthiness of an estimator
        # object is not a reliable "was it trained" signal.
        if model is None:
            continue
        parent = os.path.dirname(path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        joblib.dump(model, path)

    print("✅ Models saved successfully!")
|
| 22 |
|
| 23 |
def load_models():
    """Load the trained models from their configured pickle paths.

    Returns:
        dict: Keys "CatBoost", "XGBoost" and "RandomForest", in that order.
        A value is the loaded model, or ``None`` if that file could not be
        loaded (the reason is printed).
    """
    # v1 loaded CatBoost and XGBoost through their own load_model() methods
    # and only RandomForest through joblib.
    # v2 loads all models using joblib.
    sources = (
        ("CatBoost", CATBOOST_MODEL_PATH),
        ("XGBoost", XGB_MODEL_PATH),
        ("RandomForest", RF_MODEL_PATH),
    )

    models = {}
    for name, path in sources:
        try:
            # NOTE(security): joblib.load unpickles the file, which can run
            # arbitrary code; only load model files from a trusted source.
            models[name] = joblib.load(path)
        except Exception as exc:
            # Best-effort load: keep the None placeholder callers expect, but
            # say why, and let KeyboardInterrupt/SystemExit propagate.
            print(f"⚠ Could not load {name} model from {path}: {exc}")
            models[name] = None

    return models
|
models/catboost_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f39fd49eba13ac1975ff4450dcbf76277ddf6d07f36fc26d6e4ab9a87500b0b7
|
| 3 |
+
size 911336
|
models/randomforest_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:22ca6974136bd0c244026586b6efc06bb37123b5911ea78b8dd9d282a906a397
|
| 3 |
+
size 111639785
|
models/xgboost_model.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8bce423cd91635d2a7b5be212518b229b6ece8a88562d7ad370e93b8f1ecf3f2
|
| 3 |
+
size 3046668
|