Spaces:
Sleeping
Sleeping
Fix: Use per-dataset scalers to avoid feature mismatch
Browse files
notebooks/03_model_training.ipynb
CHANGED
|
@@ -221,14 +221,15 @@
|
|
| 221 |
" \"\"\"\n",
|
| 222 |
" \n",
|
| 223 |
" def __init__(self):\n",
|
| 224 |
-
" self.scaler = StandardScaler()\n",
|
| 225 |
-
" self.label_encoder = LabelEncoder()\n",
|
| 226 |
" self.trained_models = {}\n",
|
| 227 |
" self.training_metrics = {}\n",
|
|
|
|
|
|
|
|
|
|
| 228 |
" \n",
|
| 229 |
-
" def prepare_data(self, df: pd.DataFrame, label_col: str = 'label', \n",
|
| 230 |
" test_size: float = 0.2) -> Tuple:\n",
|
| 231 |
-
" \"\"\"Prepare data for training\"\"\"\n",
|
| 232 |
" # Separate features and labels\n",
|
| 233 |
" y = df[label_col]\n",
|
| 234 |
" X = df.drop(columns=[label_col])\n",
|
|
@@ -236,12 +237,20 @@
|
|
| 236 |
" # Keep only numeric columns\n",
|
| 237 |
" X = X.select_dtypes(include=[np.number]).fillna(0)\n",
|
| 238 |
" \n",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
" # Encode labels if needed\n",
|
| 240 |
" if y.dtype == 'object':\n",
|
| 241 |
-
" y = self.label_encoder.fit_transform(y)\n",
|
|
|
|
|
|
|
|
|
|
| 242 |
" \n",
|
| 243 |
" # Scale features\n",
|
| 244 |
-
" X_scaled = self.scaler.fit_transform(X)\n",
|
|
|
|
| 245 |
" \n",
|
| 246 |
" # Split\n",
|
| 247 |
" X_train, X_test, y_train, y_test = train_test_split(\n",
|
|
@@ -295,8 +304,8 @@
|
|
| 295 |
" print(f\"Training models for: {dataset_name}\")\n",
|
| 296 |
" print(f\"{'='*50}\")\n",
|
| 297 |
" \n",
|
| 298 |
-
" # Prepare data\n",
|
| 299 |
-
" X_train, X_test, y_train, y_test, feature_names = self.prepare_data(df)\n",
|
| 300 |
" print(f\" Data: {len(X_train)} train, {len(X_test)} test samples\")\n",
|
| 301 |
" print(f\" Features: {len(feature_names)}\")\n",
|
| 302 |
" \n",
|
|
@@ -334,18 +343,20 @@
|
|
| 334 |
" \n",
|
| 335 |
" print(f\"\\n \u2713 Best model: {best_model} (F1: {best_score:.4f})\")\n",
|
| 336 |
" \n",
|
| 337 |
-
" # Store results\n",
|
| 338 |
" self.trained_models[dataset_name] = {\n",
|
| 339 |
" 'models': results,\n",
|
| 340 |
" 'best_model': best_model,\n",
|
| 341 |
-
" 'scaler': self.scaler,\n",
|
| 342 |
-
" 'label_encoder': self.label_encoder\n",
|
|
|
|
| 343 |
" }\n",
|
| 344 |
" \n",
|
| 345 |
" return results\n",
|
| 346 |
"\n",
|
| 347 |
"trainer = CyberForgeTrainer()\n",
|
| 348 |
-
"print(\"\u2713 CyberForge Trainer initialized\")"
|
|
|
|
| 349 |
]
|
| 350 |
},
|
| 351 |
{
|
|
|
|
| 221 |
" \"\"\"\n",
|
| 222 |
" \n",
|
| 223 |
" def __init__(self):\n",
|
|
|
|
|
|
|
| 224 |
" self.trained_models = {}\n",
|
| 225 |
" self.training_metrics = {}\n",
|
| 226 |
+
" # Store scalers and encoders per dataset\n",
|
| 227 |
+
" self.scalers = {}\n",
|
| 228 |
+
" self.label_encoders = {}\n",
|
| 229 |
" \n",
|
| 230 |
+
" def prepare_data(self, df: pd.DataFrame, dataset_name: str, label_col: str = 'label', \n",
|
| 231 |
" test_size: float = 0.2) -> Tuple:\n",
|
| 232 |
+
" \"\"\"Prepare data for training - creates a new scaler per dataset\"\"\"\n",
|
| 233 |
" # Separate features and labels\n",
|
| 234 |
" y = df[label_col]\n",
|
| 235 |
" X = df.drop(columns=[label_col])\n",
|
|
|
|
| 237 |
" # Keep only numeric columns\n",
|
| 238 |
" X = X.select_dtypes(include=[np.number]).fillna(0)\n",
|
| 239 |
" \n",
|
| 240 |
+
" # Create NEW scaler and encoder for THIS dataset\n",
|
| 241 |
+
" scaler = StandardScaler()\n",
|
| 242 |
+
" label_encoder = LabelEncoder()\n",
|
| 243 |
+
" \n",
|
| 244 |
" # Encode labels if needed\n",
|
| 245 |
" if y.dtype == 'object':\n",
|
| 246 |
+
" y = label_encoder.fit_transform(y)\n",
|
| 247 |
+
" self.label_encoders[dataset_name] = label_encoder\n",
|
| 248 |
+
" else:\n",
|
| 249 |
+
" y = y.values\n",
|
| 250 |
" \n",
|
| 251 |
" # Scale features\n",
|
| 252 |
+
" X_scaled = scaler.fit_transform(X)\n",
|
| 253 |
+
" self.scalers[dataset_name] = scaler\n",
|
| 254 |
" \n",
|
| 255 |
" # Split\n",
|
| 256 |
" X_train, X_test, y_train, y_test = train_test_split(\n",
|
|
|
|
| 304 |
" print(f\"Training models for: {dataset_name}\")\n",
|
| 305 |
" print(f\"{'='*50}\")\n",
|
| 306 |
" \n",
|
| 307 |
+
" # Prepare data - pass dataset_name to create per-dataset scaler\n",
|
| 308 |
+
" X_train, X_test, y_train, y_test, feature_names = self.prepare_data(df, dataset_name)\n",
|
| 309 |
" print(f\" Data: {len(X_train)} train, {len(X_test)} test samples\")\n",
|
| 310 |
" print(f\" Features: {len(feature_names)}\")\n",
|
| 311 |
" \n",
|
|
|
|
| 343 |
" \n",
|
| 344 |
" print(f\"\\n \u2713 Best model: {best_model} (F1: {best_score:.4f})\")\n",
|
| 345 |
" \n",
|
| 346 |
+
" # Store results with PER-DATASET scaler\n",
|
| 347 |
" self.trained_models[dataset_name] = {\n",
|
| 348 |
" 'models': results,\n",
|
| 349 |
" 'best_model': best_model,\n",
|
| 350 |
+
" 'scaler': self.scalers.get(dataset_name),\n",
|
| 351 |
+
" 'label_encoder': self.label_encoders.get(dataset_name),\n",
|
| 352 |
+
" 'n_features': len(feature_names)\n",
|
| 353 |
" }\n",
|
| 354 |
" \n",
|
| 355 |
" return results\n",
|
| 356 |
"\n",
|
| 357 |
"trainer = CyberForgeTrainer()\n",
|
| 358 |
+
"print(\"\u2713 CyberForge Trainer initialized\")\n",
|
| 359 |
+
"\n"
|
| 360 |
]
|
| 361 |
},
|
| 362 |
{
|