Che237 committed on
Commit
f12e058
·
verified ·
1 Parent(s): a022514

Fix: Use per-dataset scalers to avoid feature mismatch

Browse files
Files changed (1) hide show
  1. notebooks/03_model_training.ipynb +23 -12
notebooks/03_model_training.ipynb CHANGED
@@ -221,14 +221,15 @@
221
  " \"\"\"\n",
222
  " \n",
223
  " def __init__(self):\n",
224
- " self.scaler = StandardScaler()\n",
225
- " self.label_encoder = LabelEncoder()\n",
226
  " self.trained_models = {}\n",
227
  " self.training_metrics = {}\n",
 
 
 
228
  " \n",
229
- " def prepare_data(self, df: pd.DataFrame, label_col: str = 'label', \n",
230
  " test_size: float = 0.2) -> Tuple:\n",
231
- " \"\"\"Prepare data for training\"\"\"\n",
232
  " # Separate features and labels\n",
233
  " y = df[label_col]\n",
234
  " X = df.drop(columns=[label_col])\n",
@@ -236,12 +237,20 @@
236
  " # Keep only numeric columns\n",
237
  " X = X.select_dtypes(include=[np.number]).fillna(0)\n",
238
  " \n",
 
 
 
 
239
  " # Encode labels if needed\n",
240
  " if y.dtype == 'object':\n",
241
- " y = self.label_encoder.fit_transform(y)\n",
 
 
 
242
  " \n",
243
  " # Scale features\n",
244
- " X_scaled = self.scaler.fit_transform(X)\n",
 
245
  " \n",
246
  " # Split\n",
247
  " X_train, X_test, y_train, y_test = train_test_split(\n",
@@ -295,8 +304,8 @@
295
  " print(f\"Training models for: {dataset_name}\")\n",
296
  " print(f\"{'='*50}\")\n",
297
  " \n",
298
- " # Prepare data\n",
299
- " X_train, X_test, y_train, y_test, feature_names = self.prepare_data(df)\n",
300
  " print(f\" Data: {len(X_train)} train, {len(X_test)} test samples\")\n",
301
  " print(f\" Features: {len(feature_names)}\")\n",
302
  " \n",
@@ -334,18 +343,20 @@
334
  " \n",
335
  " print(f\"\\n \u2713 Best model: {best_model} (F1: {best_score:.4f})\")\n",
336
  " \n",
337
- " # Store results\n",
338
  " self.trained_models[dataset_name] = {\n",
339
  " 'models': results,\n",
340
  " 'best_model': best_model,\n",
341
- " 'scaler': self.scaler,\n",
342
- " 'label_encoder': self.label_encoder if hasattr(self.label_encoder, 'classes_') else None\n",
 
343
  " }\n",
344
  " \n",
345
  " return results\n",
346
  "\n",
347
  "trainer = CyberForgeTrainer()\n",
348
- "print(\"\u2713 CyberForge Trainer initialized\")"
 
349
  ]
350
  },
351
  {
 
221
  " \"\"\"\n",
222
  " \n",
223
  " def __init__(self):\n",
 
 
224
  " self.trained_models = {}\n",
225
  " self.training_metrics = {}\n",
226
+ " # Store scalers and encoders per dataset\n",
227
+ " self.scalers = {}\n",
228
+ " self.label_encoders = {}\n",
229
  " \n",
230
+ " def prepare_data(self, df: pd.DataFrame, dataset_name: str, label_col: str = 'label', \n",
231
  " test_size: float = 0.2) -> Tuple:\n",
232
+ " \"\"\"Prepare data for training - creates a new scaler per dataset\"\"\"\n",
233
  " # Separate features and labels\n",
234
  " y = df[label_col]\n",
235
  " X = df.drop(columns=[label_col])\n",
 
237
  " # Keep only numeric columns\n",
238
  " X = X.select_dtypes(include=[np.number]).fillna(0)\n",
239
  " \n",
240
+ " # Create NEW scaler and encoder for THIS dataset\n",
241
+ " scaler = StandardScaler()\n",
242
+ " label_encoder = LabelEncoder()\n",
243
+ " \n",
244
  " # Encode labels if needed\n",
245
  " if y.dtype == 'object':\n",
246
+ " y = label_encoder.fit_transform(y)\n",
247
+ " self.label_encoders[dataset_name] = label_encoder\n",
248
+ " else:\n",
249
+ " y = y.values\n",
250
  " \n",
251
  " # Scale features\n",
252
+ " X_scaled = scaler.fit_transform(X)\n",
253
+ " self.scalers[dataset_name] = scaler\n",
254
  " \n",
255
  " # Split\n",
256
  " X_train, X_test, y_train, y_test = train_test_split(\n",
 
304
  " print(f\"Training models for: {dataset_name}\")\n",
305
  " print(f\"{'='*50}\")\n",
306
  " \n",
307
+ " # Prepare data - pass dataset_name to create per-dataset scaler\n",
308
+ " X_train, X_test, y_train, y_test, feature_names = self.prepare_data(df, dataset_name)\n",
309
  " print(f\" Data: {len(X_train)} train, {len(X_test)} test samples\")\n",
310
  " print(f\" Features: {len(feature_names)}\")\n",
311
  " \n",
 
343
  " \n",
344
  " print(f\"\\n \u2713 Best model: {best_model} (F1: {best_score:.4f})\")\n",
345
  " \n",
346
+ " # Store results with PER-DATASET scaler\n",
347
  " self.trained_models[dataset_name] = {\n",
348
  " 'models': results,\n",
349
  " 'best_model': best_model,\n",
350
+ " 'scaler': self.scalers.get(dataset_name),\n",
351
+ " 'label_encoder': self.label_encoders.get(dataset_name),\n",
352
+ " 'n_features': len(feature_names)\n",
353
  " }\n",
354
  " \n",
355
  " return results\n",
356
  "\n",
357
  "trainer = CyberForgeTrainer()\n",
358
+ "print(\"\u2713 CyberForge Trainer initialized\")\n",
359
+ "\n"
360
  ]
361
  },
362
  {