Spaces:
Sleeping
Sleeping
kaiquanmah committed on
Commit ·
5e4c84c
1
Parent(s): 440560f
fix dataset loading feeding into modelConnector train_model and retrain_model
Browse files- .gitignore +1 -0
- app.py +65 -19
- data_loader.py +2 -2
- modelConnector.py +34 -17
.gitignore
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
*s.md
|
app.py
CHANGED
|
@@ -5,6 +5,7 @@ from huggingface_hub import hf_hub_download
|
|
| 5 |
import os
|
| 6 |
|
| 7 |
from modelConnector import ModelConnector
|
|
|
|
| 8 |
|
| 9 |
# ===========================
|
| 10 |
# LOAD MODEL & DATASET
|
|
@@ -13,12 +14,19 @@ from modelConnector import ModelConnector
|
|
| 13 |
st.title("📊 Is Click Predictor")
|
| 14 |
|
| 15 |
# Download and load the trained model from Hugging Face
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
|
| 20 |
# ===========================
|
| 21 |
-
# LOAD DATA FROM
|
|
|
|
| 22 |
# ===========================
|
| 23 |
|
| 24 |
st.sidebar.header("Dataset Selection")
|
|
@@ -27,21 +35,59 @@ st.sidebar.header("Dataset Selection")
|
|
| 27 |
# X_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="X_test_1st(1).csv")
|
| 28 |
# y_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="y_test_1st.csv")
|
| 29 |
# train_data_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="train_dataset_full - train_dataset_full (1).csv")
|
| 30 |
-
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
# Initialize Model Connector
|
| 47 |
model_connector = ModelConnector()
|
|
|
|
| 5 |
import os
|
| 6 |
|
| 7 |
from modelConnector import ModelConnector
|
| 8 |
+
from data_loader import load_and_process_data, load_data
|
| 9 |
|
| 10 |
# ===========================
|
| 11 |
# LOAD MODEL & DATASET
|
|
|
|
| 14 |
st.title("📊 Is Click Predictor")
|
| 15 |
|
| 16 |
# Download and load the trained model from Hugging Face
|
| 17 |
+
# We wrap this in a try-except block just in case, but modelConnector handles checking too.
|
| 18 |
+
# However, app.py tries to load it directly for initial "Success" message.
|
| 19 |
+
try:
|
| 20 |
+
model_path = hf_hub_download(repo_id="chagu13/is_click", repo_type="space", filename="models/rf_model.pkl")
|
| 21 |
+
rf_model = joblib.load(model_path)
|
| 22 |
+
st.success("✅ Model Loaded Successfully!")
|
| 23 |
+
except Exception:
|
| 24 |
+
st.warning("⚠ Model not found locally or on HF. Please train it first.")
|
| 25 |
+
rf_model = None
|
| 26 |
|
| 27 |
# ===========================
|
| 28 |
+
# LOAD DATA FROM DATA LOADER
|
| 29 |
+
# No Longer LOAD DATA FROM HUGGING FACE
|
| 30 |
# ===========================
|
| 31 |
|
| 32 |
st.sidebar.header("Dataset Selection")
|
|
|
|
| 35 |
# X_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="X_test_1st(1).csv")
|
| 36 |
# y_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="y_test_1st.csv")
|
| 37 |
# train_data_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="train_dataset_full - train_dataset_full (1).csv")
|
| 38 |
+
# Use the central data loader to get the processed test data
|
| 39 |
+
# This ensures we have the same features (aggregations, encodings) as the model expects
|
| 40 |
+
@st.cache_data
|
| 41 |
+
def get_data():
|
| 42 |
+
return load_and_process_data()
|
| 43 |
+
|
| 44 |
+
try:
|
| 45 |
+
with st.spinner("Loading and processing data..."):
|
| 46 |
+
X_train, X_val, y_train, y_val, test_df = get_data()
|
| 47 |
+
|
| 48 |
+
# We also need the raw y_test labels for "actual_click" comparison
|
| 49 |
+
# The data_loader doesn't return y_test explicitly for the test set split (it treats test_df as unlabeled usually)
|
| 50 |
+
# But based on the original app.py, y_test_1st.csv exists.
|
| 51 |
+
|
| 52 |
+
##########################
|
| 53 |
+
# base_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory of app.py
|
| 54 |
+
# X_test_path = os.path.join(base_dir, "data", "X_test_1st.csv")
|
| 55 |
+
# y_test_path = os.path.join(base_dir, "data", "y_test_1st (1).csv")
|
| 56 |
+
|
| 57 |
+
# train_data_path = os.path.join(base_dir, "data", "train_dataset_full - train_dataset_full.csv")
|
| 58 |
+
|
| 59 |
+
# # Load datasets
|
| 60 |
+
# X_test = pd.read_csv(X_test_path)
|
| 61 |
+
# y_test = pd.read_csv(y_test_path, header=None) # Ensure labels match test dataset index
|
| 62 |
+
# train_data = pd.read_csv(train_data_path)
|
| 63 |
+
################################
|
| 64 |
+
|
| 65 |
+
|
| 66 |
+
# Load datasets
|
| 67 |
+
base_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory of app.py
|
| 68 |
+
y_test_path = os.path.join(base_dir, "data", "y_test_1st (1).csv")
|
| 69 |
+
|
| 70 |
+
if os.path.exists(y_test_path):
|
| 71 |
+
y_test = pd.read_csv(y_test_path, header=None)
|
| 72 |
+
else:
|
| 73 |
+
# Fallback if file not found locally, try standard name
|
| 74 |
+
y_test_path_alt = os.path.join(base_dir, "data", "y_test_1st.csv")
|
| 75 |
+
if os.path.exists(y_test_path_alt):
|
| 76 |
+
y_test = pd.read_csv(y_test_path_alt, header=None)
|
| 77 |
+
else:
|
| 78 |
+
# If still not found, just create dummy labels of 0 to avoid crash, or handle gracefully
|
| 79 |
+
y_test = pd.DataFrame([0]*len(test_df))
|
| 80 |
+
|
| 81 |
+
# X_test in the app context is now 'test_df' which is processed
|
| 82 |
+
X_test = test_df
|
| 83 |
+
|
| 84 |
+
# st.info(f"✅ Loaded datasets: **Train: {len(train_data)} rows**, **Test: {len(X_test)} rows**")
|
| 85 |
+
st.info(f"✅ Loaded datasets: **Train: {len(X_train)} rows**, **Test: {len(X_test)} rows**")
|
| 86 |
+
|
| 87 |
+
except Exception as e:
|
| 88 |
+
st.error(f"Error loading data: {e}")
|
| 89 |
+
X_test = pd.DataFrame()
|
| 90 |
+
y_test = pd.DataFrame()
|
| 91 |
|
| 92 |
# Initialize Model Connector
|
| 93 |
model_connector = ModelConnector()
|
data_loader.py
CHANGED
|
@@ -199,10 +199,10 @@ def visualize_features():
|
|
| 199 |
# RUN FULL DATA PROCESSING PIPELINE
|
| 200 |
# ===========================
|
| 201 |
|
| 202 |
-
def load_and_process_data():
|
| 203 |
"""Runs the full data processing pipeline and returns preprocessed training & test data."""
|
| 204 |
|
| 205 |
-
df, test_df = load_data()
|
| 206 |
df, test_df = add_aggregated_features(df, test_df)
|
| 207 |
df, test_df, label_encoders = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
|
| 208 |
X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
|
|
|
|
| 199 |
# RUN FULL DATA PROCESSING PIPELINE
|
| 200 |
# ===========================
|
| 201 |
|
| 202 |
+
def load_and_process_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
|
| 203 |
"""Runs the full data processing pipeline and returns preprocessed training & test data."""
|
| 204 |
|
| 205 |
+
df, test_df = load_data(train_path, test_path)
|
| 206 |
df, test_df = add_aggregated_features(df, test_df)
|
| 207 |
df, test_df, label_encoders = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
|
| 208 |
X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
|
modelConnector.py
CHANGED
|
@@ -2,7 +2,8 @@ import os
|
|
| 2 |
import joblib
|
| 3 |
import pandas as pd
|
| 4 |
from huggingface_hub import hf_hub_download, HfApi
|
| 5 |
-
from model_trainer import train_models
|
|
|
|
| 6 |
|
| 7 |
# Hugging Face Model & Dataset Information
|
| 8 |
MODEL_REPO = "chagu13/is_click"
|
|
@@ -38,16 +39,28 @@ class ModelConnector:
|
|
| 38 |
def train_model(self):
|
| 39 |
"""Train a new model and upload it to Hugging Face."""
|
| 40 |
try:
|
| 41 |
-
#
|
| 42 |
-
# train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full.csv")
|
| 43 |
train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
#
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
rf_model = models["RandomForest"]
|
| 52 |
|
| 53 |
# Save locally
|
|
@@ -69,14 +82,16 @@ class ModelConnector:
|
|
| 69 |
def retrain_model(self):
|
| 70 |
"""Retrain the existing model with new data."""
|
| 71 |
try:
|
| 72 |
-
#
|
| 73 |
-
# train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full.csv")
|
| 74 |
train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
| 80 |
if self.model is None:
|
| 81 |
return "No existing model found. Train a new model first."
|
| 82 |
|
|
@@ -102,5 +117,7 @@ class ModelConnector:
|
|
| 102 |
return "No model found. Train the model first."
|
| 103 |
|
| 104 |
input_df = pd.DataFrame([input_data])
|
|
|
|
|
|
|
| 105 |
prediction = self.model.predict(input_df)[0]
|
| 106 |
return int(prediction)
|
|
|
|
| 2 |
import joblib
|
| 3 |
import pandas as pd
|
| 4 |
from huggingface_hub import hf_hub_download, HfApi
|
| 5 |
+
from model_trainer import train_models
|
| 6 |
+
from data_loader import load_and_process_data, CATEGORICAL_COLUMNS, load_data, add_aggregated_features, preprocess_data, TARGET_COLUMN
|
| 7 |
|
| 8 |
# Hugging Face Model & Dataset Information
|
| 9 |
MODEL_REPO = "chagu13/is_click"
|
|
|
|
| 39 |
def train_model(self):
|
| 40 |
"""Train a new model and upload it to Hugging Face."""
|
| 41 |
try:
|
| 42 |
+
# Download datasets
|
|
|
|
| 43 |
train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
|
| 44 |
+
# We also need the test set for the processing pipeline validation/consistency
|
| 45 |
+
# Try to download standard test file. If not found, it might return an error, but assuming it exists in repo.
|
| 46 |
+
# Based on app.py comments, filename might be "X_test_1st(1).csv" or "X_test_1st.csv".
|
| 47 |
+
# We'll try the one matching local naming first, or fall back to what we know works in data_loader if local.
|
| 48 |
+
# However, hf_hub_download needs exact name.
|
| 49 |
+
# app.py comments: X_test_1st(1).csv
|
| 50 |
+
# data_loader: X_test_1st.csv
|
| 51 |
+
# Let's try "X_test_1st.csv" first as it's cleaner, if it fails user might need to adjust.
|
| 52 |
+
try:
|
| 53 |
+
test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
|
| 54 |
+
except:
|
| 55 |
+
# Fallback to the other name if the first one fails
|
| 56 |
+
test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st(1).csv")
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
# Load and process data using the central pipeline
|
| 60 |
+
X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
|
| 61 |
+
|
| 62 |
+
# Train models (passing categorical columns as required by model_trainer.py)
|
| 63 |
+
models = train_models(X_train, y_train, CATEGORICAL_COLUMNS)
|
| 64 |
rf_model = models["RandomForest"]
|
| 65 |
|
| 66 |
# Save locally
|
|
|
|
| 82 |
def retrain_model(self):
|
| 83 |
"""Retrain the existing model with new data."""
|
| 84 |
try:
|
| 85 |
+
# Download datasets (ensure we have latest)
|
|
|
|
| 86 |
train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
|
| 87 |
+
try:
|
| 88 |
+
test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
|
| 89 |
+
except:
|
| 90 |
+
test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st(1).csv")
|
| 91 |
+
|
| 92 |
+
# Re-run the full processing pipeline
|
| 93 |
+
X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
|
| 94 |
+
|
| 95 |
if self.model is None:
|
| 96 |
return "No existing model found. Train a new model first."
|
| 97 |
|
|
|
|
| 117 |
return "No model found. Train the model first."
|
| 118 |
|
| 119 |
input_df = pd.DataFrame([input_data])
|
| 120 |
+
# Ensure column order matches training (optional but good practice)
|
| 121 |
+
# Note: input_data passed here is expected to be already preprocessed by app.py
|
| 122 |
prediction = self.model.predict(input_df)[0]
|
| 123 |
return int(prediction)
|