kaiquanmah commited on
Commit
5e4c84c
·
1 Parent(s): 440560f

fix dataset loading that feeds into modelConnector's train_model and retrain_model

Browse files
Files changed (4) hide show
  1. .gitignore +1 -0
  2. app.py +65 -19
  3. data_loader.py +2 -2
  4. modelConnector.py +34 -17
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ *s.md
app.py CHANGED
@@ -5,6 +5,7 @@ from huggingface_hub import hf_hub_download
5
  import os
6
 
7
  from modelConnector import ModelConnector
 
8
 
9
  # ===========================
10
  # LOAD MODEL & DATASET
@@ -13,12 +14,19 @@ from modelConnector import ModelConnector
13
  st.title("📊 Is Click Predictor")
14
 
15
  # Download and load the trained model from Hugging Face
16
- model_path = hf_hub_download(repo_id="chagu13/is_click", repo_type="space", filename="models/rf_model.pkl")
17
- rf_model = joblib.load(model_path)
18
- st.success("✅ Model Loaded Successfully!")
 
 
 
 
 
 
19
 
20
  # ===========================
21
- # LOAD DATA FROM HUGGING FACE
 
22
  # ===========================
23
 
24
  st.sidebar.header("Dataset Selection")
@@ -27,21 +35,59 @@ st.sidebar.header("Dataset Selection")
27
  # X_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="X_test_1st(1).csv")
28
  # y_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="y_test_1st.csv")
29
  # train_data_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="train_dataset_full - train_dataset_full (1).csv")
30
-
31
- base_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory of app.py
32
- X_test_path = os.path.join(base_dir, "data", "X_test_1st.csv")
33
- y_test_path = os.path.join(base_dir, "data", "y_test_1st (1).csv")
34
- train_data_path = os.path.join(base_dir, "data", "train_dataset_full - train_dataset_full.csv")
35
-
36
-
37
- # Load datasets
38
- X_test = pd.read_csv(X_test_path)
39
- y_test = pd.read_csv(y_test_path, header=None) # Ensure labels match test dataset index
40
- train_data = pd.read_csv(train_data_path)
41
-
42
- st.info(f"✅ Loaded datasets: **Train: {len(train_data)} rows**, **Test: {len(X_test)} rows**")
43
-
44
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  # Initialize Model Connector
47
  model_connector = ModelConnector()
 
5
  import os
6
 
7
  from modelConnector import ModelConnector
8
+ from data_loader import load_and_process_data, load_data
9
 
10
  # ===========================
11
  # LOAD MODEL & DATASET
 
14
  st.title("📊 Is Click Predictor")
15
 
16
  # Download and load the trained model from Hugging Face
17
+ # We wrap this in a try-except block just in case, but modelConnector handles checking too.
18
+ # However, app.py tries to load it directly for initial "Success" message.
19
+ try:
20
+ model_path = hf_hub_download(repo_id="chagu13/is_click", repo_type="space", filename="models/rf_model.pkl")
21
+ rf_model = joblib.load(model_path)
22
+ st.success("✅ Model Loaded Successfully!")
23
+ except Exception:
24
+ st.warning("⚠ Model not found locally or on HF. Please train it first.")
25
+ rf_model = None
26
 
27
  # ===========================
28
+ # LOAD DATA FROM DATA LOADER
29
+ # No Longer LOAD DATA FROM HUGGING FACE
30
  # ===========================
31
 
32
  st.sidebar.header("Dataset Selection")
 
35
  # X_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="X_test_1st(1).csv")
36
  # y_test_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="y_test_1st.csv")
37
  # train_data_path = hf_hub_download(repo_id="taimax13/is_click_data", filename="train_dataset_full - train_dataset_full (1).csv")
38
+ # Use the central data loader to get the processed test data
39
+ # This ensures we have the same features (aggregations, encodings) as the model expects
40
+ @st.cache_data
41
+ def get_data():
42
+ return load_and_process_data()
43
+
44
+ try:
45
+ with st.spinner("Loading and processing data..."):
46
+ X_train, X_val, y_train, y_val, test_df = get_data()
47
+
48
+ # We also need the raw y_test labels for "actual_click" comparison
49
+ # The data_loader doesn't return y_test explicitly for the test set split (it treats test_df as unlabeled usually)
50
+ # But based on the original app.py, y_test_1st.csv exists.
51
+
52
+ ##########################
53
+ # base_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory of app.py
54
+ # X_test_path = os.path.join(base_dir, "data", "X_test_1st.csv")
55
+ # y_test_path = os.path.join(base_dir, "data", "y_test_1st (1).csv")
56
+
57
+ # train_data_path = os.path.join(base_dir, "data", "train_dataset_full - train_dataset_full.csv")
58
+
59
+ # # Load datasets
60
+ # X_test = pd.read_csv(X_test_path)
61
+ # y_test = pd.read_csv(y_test_path, header=None) # Ensure labels match test dataset index
62
+ # train_data = pd.read_csv(train_data_path)
63
+ ################################
64
+
65
+
66
+ # Load datasets
67
+ base_dir = os.path.dirname(os.path.abspath(__file__)) # Get the directory of app.py
68
+ y_test_path = os.path.join(base_dir, "data", "y_test_1st (1).csv")
69
+
70
+ if os.path.exists(y_test_path):
71
+ y_test = pd.read_csv(y_test_path, header=None)
72
+ else:
73
+ # Fallback if file not found locally, try standard name
74
+ y_test_path_alt = os.path.join(base_dir, "data", "y_test_1st.csv")
75
+ if os.path.exists(y_test_path_alt):
76
+ y_test = pd.read_csv(y_test_path_alt, header=None)
77
+ else:
78
+ # If still not found, just create dummy labels of 0 to avoid crash, or handle gracefully
79
+ y_test = pd.DataFrame([0]*len(test_df))
80
+
81
+ # X_test in the app context is now 'test_df' which is processed
82
+ X_test = test_df
83
+
84
+ # st.info(f"✅ Loaded datasets: **Train: {len(train_data)} rows**, **Test: {len(X_test)} rows**")
85
+ st.info(f"✅ Loaded datasets: **Train: {len(X_train)} rows**, **Test: {len(X_test)} rows**")
86
+
87
+ except Exception as e:
88
+ st.error(f"Error loading data: {e}")
89
+ X_test = pd.DataFrame()
90
+ y_test = pd.DataFrame()
91
 
92
  # Initialize Model Connector
93
  model_connector = ModelConnector()
data_loader.py CHANGED
@@ -199,10 +199,10 @@ def visualize_features():
199
  # RUN FULL DATA PROCESSING PIPELINE
200
  # ===========================
201
 
202
- def load_and_process_data():
203
  """Runs the full data processing pipeline and returns preprocessed training & test data."""
204
 
205
- df, test_df = load_data()
206
  df, test_df = add_aggregated_features(df, test_df)
207
  df, test_df, label_encoders = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
208
  X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
 
199
  # RUN FULL DATA PROCESSING PIPELINE
200
  # ===========================
201
 
202
+ def load_and_process_data(train_path=TRAIN_PATH, test_path=TEST_PATH):
203
  """Runs the full data processing pipeline and returns preprocessed training & test data."""
204
 
205
+ df, test_df = load_data(train_path, test_path)
206
  df, test_df = add_aggregated_features(df, test_df)
207
  df, test_df, label_encoders = preprocess_data(df, test_df, CATEGORICAL_COLUMNS)
208
  X_train, X_val, y_train, y_val = split_and_balance_data(df, TARGET_COLUMN)
modelConnector.py CHANGED
@@ -2,7 +2,8 @@ import os
2
  import joblib
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, HfApi
5
- from model_trainer import train_models # Assumes model_trainer.py exists with train_models function
 
6
 
7
  # Hugging Face Model & Dataset Information
8
  MODEL_REPO = "chagu13/is_click"
@@ -38,16 +39,28 @@ class ModelConnector:
38
  def train_model(self):
39
  """Train a new model and upload it to Hugging Face."""
40
  try:
41
- # Load dataset
42
- # train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full.csv")
43
  train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
44
- train_data = pd.read_csv(train_data_path)
45
-
46
- X_train = train_data.drop(columns=["is_click"])
47
- y_train = train_data["is_click"]
48
-
49
- # Train model
50
- models = train_models(X_train, y_train)
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  rf_model = models["RandomForest"]
52
 
53
  # Save locally
@@ -69,14 +82,16 @@ class ModelConnector:
69
  def retrain_model(self):
70
  """Retrain the existing model with new data."""
71
  try:
72
- # Load dataset
73
- # train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full.csv")
74
  train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
75
- train_data = pd.read_csv(train_data_path)
76
-
77
- X_train = train_data.drop(columns=["is_click"])
78
- y_train = train_data["is_click"]
79
-
 
 
 
80
  if self.model is None:
81
  return "No existing model found. Train a new model first."
82
 
@@ -102,5 +117,7 @@ class ModelConnector:
102
  return "No model found. Train the model first."
103
 
104
  input_df = pd.DataFrame([input_data])
 
 
105
  prediction = self.model.predict(input_df)[0]
106
  return int(prediction)
 
2
  import joblib
3
  import pandas as pd
4
  from huggingface_hub import hf_hub_download, HfApi
5
+ from model_trainer import train_models
6
+ from data_loader import load_and_process_data, CATEGORICAL_COLUMNS, load_data, add_aggregated_features, preprocess_data, TARGET_COLUMN
7
 
8
  # Hugging Face Model & Dataset Information
9
  MODEL_REPO = "chagu13/is_click"
 
39
  def train_model(self):
40
  """Train a new model and upload it to Hugging Face."""
41
  try:
42
+ # Download datasets
 
43
  train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
44
+ # We also need the test set for the processing pipeline validation/consistency
45
+ # Try to download standard test file. If not found, it might return an error, but assuming it exists in repo.
46
+ # Based on app.py comments, filename might be "X_test_1st(1).csv" or "X_test_1st.csv".
47
+ # We'll try the one matching local naming first, or fall back to what we know works in data_loader if local.
48
+ # However, hf_hub_download needs exact name.
49
+ # app.py comments: X_test_1st(1).csv
50
+ # data_loader: X_test_1st.csv
51
+ # Let's try "X_test_1st.csv" first as it's cleaner, if it fails user might need to adjust.
52
+ try:
53
+ test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
54
+ except:
55
+ # Fallback to the other name if the first one fails
56
+ test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st(1).csv")
57
+
58
+
59
+ # Load and process data using the central pipeline
60
+ X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
61
+
62
+ # Train models (passing categorical columns as required by model_trainer.py)
63
+ models = train_models(X_train, y_train, CATEGORICAL_COLUMNS)
64
  rf_model = models["RandomForest"]
65
 
66
  # Save locally
 
82
  def retrain_model(self):
83
  """Retrain the existing model with new data."""
84
  try:
85
+ # Download datasets (ensure we have latest)
 
86
  train_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="train_dataset_full - train_dataset_full (1).csv")
87
+ try:
88
+ test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st.csv")
89
+ except:
90
+ test_data_path = hf_hub_download(repo_id=DATA_REPO, repo_type="dataset", filename="X_test_1st(1).csv")
91
+
92
+ # Re-run the full processing pipeline
93
+ X_train, X_val, y_train, y_val, test_df = load_and_process_data(train_path=train_data_path, test_path=test_data_path)
94
+
95
  if self.model is None:
96
  return "No existing model found. Train a new model first."
97
 
 
117
  return "No model found. Train the model first."
118
 
119
  input_df = pd.DataFrame([input_data])
120
+ # Ensure column order matches training (optional but good practice)
121
+ # Note: input_data passed here is expected to be already preprocessed by app.py
122
  prediction = self.model.predict(input_df)[0]
123
  return int(prediction)