kaiquanmah commited on
Commit
5e4c84c
·
1 Parent(s): 440560f

fix dataset loading that feeds into modelConnector's train_model and retrain_model

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +65 -19
  3. data_loader.py +2 -2
  4. modelConnector.py +34 -17
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *s.md
app.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import hf_hub_download
5
  import os
6
 
7
  from modelConnector import ModelConnector
 
8
 
9
  # ===========================
10
  # LOAD MODEL & DATASET
@@ -13,12 +14,19 @@ from modelConnector import ModelConnector
13
  st.title("📊 Is Click Predictor")
14
 
15
  # Download and load the trained model from Hugging Face
16
- model_path = hf_hub_download(repo_id="chagu13/is_click", repo_type="space", filename="models/rf_model.pkl")
17
- rf_model = joblib.load(model_path)
18
- st.success("✅ Model Loaded Successfully!")
 
 
 
 
 
 
19
 
20
  # ===========================
21
- # LOAD DATA FROM HUGGING FACE
 
22
  # ===========================
23
 
24
  st.sidebar.header("Dataset Selection")
@@ -27,21 +35,59 @@ st.sidebar.header("Dataset Selection")
27
  # X_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="X_test_1st(1).csv")
28
  # y_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="y_test_1st.csv")
29
  # train_data_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="train_dataset_full - train_dataset_full (1).csv")
30
-
31
- base_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory of app.py
32
- X_test_path = os.path.join(base_dir, "data", "X_test_1st.csv")
33
- y_test_path = os.path.join(base_dir, "data", "y_test_1st (1).csv")
34
- train_data_path = os.path.join(base_dir, "data", "train_dataset_full - train_dataset_full.csv")
35
-
36
-
37
- # Load datasets
38
- X_test = pd.read_csv(X_test_path)
39
- y_test = pd.read_csv(y_test_path, header=None) # Ensure labels match test dataset index
40
- train_data = pd.read_csv(train_data_path)
41
-
42
- st.info(f"✅ Loaded datasets: **Train: {len(train_data)} rows**, **Test: {len(X_test)} rows**")
43
-
44
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  # Initialize Model Connector
47
  model_connector = ModelConnector()
 
5
  import os
6
 
7
  from modelConnector import ModelConnector
8
+ from data_loader import load_and_process_data, load_data
9
 
10
  # ===========================
11
  # LOAD MODEL & DATASET
 
14
  st.title("📊 Is Click Predictor")
15
 
16
  # Download and load the trained model from Hugging Face
17
+ # We wrap this in a try-except block just in case, but modelConnector handles checking too.
18
+ # However, app.py tries to load it directly for initial "Success" message.
19
+ try:
20
+ model_path = hf_hub_download(repo_id="chagu13/is_click", repo_type="space", filename="models/rf_model.pkl")
21
+ rf_model = joblib.load(model_path)
22
+ st.success("✅ Model Loaded Successfully!")
23
+ except Exception:
24
+ st.warning("⚠ Model not found locally or on HF. Please train it first.")
25
+ rf_model = None
26
 
27
  # ===========================
28
+ # LOAD DATA FROM DATA LOADER
29
+ # No Longer LOAD DATA FROM HUGGING FACE
30
  # ===========================
31
 
32
  st.sidebar.header("Dataset Selection")
 
35
  # X_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="X_test_1st(1).csv")
36
  # y_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="y_test_1st.csv")
37
  # train_data_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="train_dataset_full - train_dataset_full (1).csv")
38
+ # Use the central data loader to get the processed test data
39
+ # This ensures we have the same features (aggregations, encodings) as the model expects
40
+ @st.cache_data
41
+ def get_data():
42
+ return load_and_process_data()
43
+
44
+ try:
45
+ with st.spinner("Loading and processing data..."):
46
+ X_train, X_val, y_train, y_val, test_df = get_data()
47
+
48
+ # We also need the raw y_test labels for "actual_click" comparison
49
+ # The data_loader doesn't return y_test explicitly for the test set split (it treats test_df as unlabeled usually)
50
+ # But based on the original app.py, y_test_1st.csv exists.
51
+
52
+ ##########################
53
+ # base_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory of app.py
54
+ # X_test_path = os.path.join(base_dir, "data", "X_test_1st.csv")
55
+ # y_test_path = os.path.join(base_dir, "data", "y_test_1st (1).csv")
56
+
57
+ # train_data_path = os.path.join(base_dir, "data", "train_dataset_full - train_dataset_full.csv")
58
+
59
+ # # Load datasets
60
+ # X_test = pd.read_csv(X_test_path)
61
+ # y_test = pd.read_csv(y_test_path, header=None) # Ensure labels match test dataset index
62
+ # train_data = pd.read_csv(train_data_path)
63
+ ################################
64
+
65
+
66
+ # Load datasets
67
+ base_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory of app.py
68
+ y_test_path = os.path.join(base_dir, "data", "y_test_1st (1).csv")
69
+
70
+ if os.path.exists(y_test_path):
71
+ y_test = pd.read_csv(y_test_path, header=None)
72
+ else:
73
+ # Fallback if file not found locally, try standard name
74
+ y_test_path_alt = os.path.join(base_dir, "data", "y_test_1st.csv")
75
+ if os.path.exists(y_test_path_alt):
76
+ y_test = pd.read_csv(y_test_path_alt, header=None)
77
+ else:
78
+ # If still not found, just create dummy labels of 0 to avoid crash, or handle gracefully
79
+ y_test = pd.DataFrame([0]*len(test_df))
80
+
81
+ # X_test in the app context is now 'test_df' which is processed
82
+ X_test = test_df
83
+
84
+ # st.info(f"✅ Loaded datasets: **Train: {len(train_data)} rows**, **Test: {len(X_test)} rows**")
85
+ st.info(f"✅ Loaded datasets: **Train: {len(X_train)} rows**, **Test: {len(X_test)} rows**")
86
+
87
+ except Exception as e:
88
+ st.error(f"Error loading data: {e}")
89
+ X_test = pd.DataFrame()
90
+ y_test = pd.DataFrame()
91
 
92
  # Initialize Model Connector
93
  model_connector = ModelConnector()
data_loader.py CHANGED
@@ -199,10 +199,10 @@ def visualize_features():
199
  # RUN FULL DATA PROCESSING PIPELINE
200
  # ===========================
201
 
202
- def load_and_process_data():
203
  """Runs the full data processing pipeline and returns preprocessed training & test data."""
204
 
205
- df, test_df = load_data()
206
  df, test_df = add_aggregated_features(df, test_df)
207
  df, test_df, label_encoders = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
208
  X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
 
199
  # RUN FULL DATA PROCESSING PIPELINE
200
  # ===========================
201
 
202
+ def load_and_process_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
203
  """Runs the full data processing pipeline and returns preprocessed training & test data."""
204
 
205
+ df, test_df = load_data(train_path, test_path)
206
  df, test_df = add_aggregated_features(df, test_df)
207
  df, test_df, label_encoders = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
208
  X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
modelConnector.py CHANGED
@@ -2,7 +2,8 @@ import os
2
  import joblib
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, HfApi
5
- from model_trainer import train_models # Assumes model_trainer.py exists with train_models function
 
6
 
7
  # Hugging Face Model & Dataset Information
8
  MODEL_REPO = "chagu13/is_click"
@@ -38,16 +39,28 @@ class ModelConnector:
38
  def train_model(self):
39
  """Train a new model and upload it to Hugging Face."""
40
  try:
41
- # Load dataset
42
- # train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full.csv")
43
  train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
44
- train_data = pd.read_csv(train_data_path)
45
-
46
- X_train = train_data.drop(columns=["is_click"])
47
- y_train = train_data["is_click"]
48
-
49
- # Train model
50
- models = train_models(X_train, y_train)
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  rf_model = models["RandomForest"]
52
 
53
  # Save locally
@@ -69,14 +82,16 @@ class ModelConnector:
69
  def retrain_model(self):
70
  """Retrain the existing model with new data."""
71
  try:
72
- # Load dataset
73
- # train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full.csv")
74
  train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
75
- train_data = pd.read_csv(train_data_path)
76
-
77
- X_train = train_data.drop(columns=["is_click"])
78
- y_train = train_data["is_click"]
79
-
 
 
 
80
  if self.model is None:
81
  return "No existing model found. Train a new model first."
82
 
@@ -102,5 +117,7 @@ class ModelConnector:
102
  return "No model found. Train the model first."
103
 
104
  input_df = pd.DataFrame([input_data])
 
 
105
  prediction = self.model.predict(input_df)[0]
106
  return int(prediction)
 
2
  import joblib
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, HfApi
5
+ from model_trainer import train_models
6
+ from data_loader import load_and_process_data, CATEGORICAL_COLUMNS, load_data, add_aggregated_features, preprocess_data, TARGET_COLUMN
7
 
8
  # Hugging Face Model & Dataset Information
9
  MODEL_REPO = "chagu13/is_click"
 
39
  def train_model(self):
40
  """Train a new model and upload it to Hugging Face."""
41
  try:
42
+ # Download datasets
 
43
  train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
44
+ # We also need the test set for the processing pipeline validation/consistency
45
+ # Try to download standard test file. If not found, it might return an error, but assuming it exists in repo.
46
+ # Based on app.py comments, filename might be "X_test_1st(1).csv" or "X_test_1st.csv".
47
+ # We'll try the one matching local naming first, or fall back to what we know works in data_loader if local.
48
+ # However, hf_hub_download needs exact name.
49
+ # app.py comments: X_test_1st(1).csv
50
+ # data_loader: X_test_1st.csv
51
+ # Let's try "X_test_1st.csv" first as it's cleaner, if it fails user might need to adjust.
52
+ try:
53
+ test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
54
+ except:
55
+ # Fallback to the other name if the first one fails
56
+ test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st(1).csv")
57
+
58
+
59
+ # Load and process data using the central pipeline
60
+ X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
61
+
62
+ # Train models (passing categorical columns as required by model_trainer.py)
63
+ models = train_models(X_train, y_train, CATEGORICAL_COLUMNS)
64
  rf_model = models["RandomForest"]
65
 
66
  # Save locally
 
82
  def retrain_model(self):
83
  """Retrain the existing model with new data."""
84
  try:
85
+ # Download datasets (ensure we have latest)
 
86
  train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
87
+ try:
88
+ test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
89
+ except:
90
+ test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st(1).csv")
91
+
92
+ # Re-run the full processing pipeline
93
+ X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
94
+
95
  if self.model is None:
96
  return "No existing model found. Train a new model first."
97
 
 
117
  return "No model found. Train the model first."
118
 
119
  input_df = pd.DataFrame([input_data])
120
+ # Ensure column order matches training (optional but good practice)
121
+ # Note: input_data passed here is expected to be already preprocessed by app.py
122
  prediction = self.model.predict(input_df)[0]
123
  return int(prediction)