Fix label detection in dataset loading
notebooks/03_model_training.ipynb CHANGED
@@ -60,9 +60,9 @@
 "MODELS_DIR = DATASETS_DIR.parent / \"models\"\n",
 "MODELS_DIR.mkdir(exist_ok=True)\n",
 "\n",
-"print(f\"
-"print(f\"
-"print(f\"
+"print(f\"\u2713 Configuration loaded\")\n",
+"print(f\"\u2713 Features from: {FEATURES_DIR}\")\n",
+"print(f\"\u2713 Models output: {MODELS_DIR}\")"
 ]
 },
 {
@@ -80,33 +80,9 @@
 "metadata": {},
 "outputs": [],
 "source": [
-"# Load feature manifest\n",
-"feature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n",
-"\n",
-"if feature_manifest_path.exists():\n",
-"    with open(feature_manifest_path) as f:\n",
-"        feature_manifest = json.load(f)\n",
-"    print(f\"✓ Loaded {len(feature_manifest)} feature datasets\")\n",
-"else:\n",
-"    print(\"⚠ No feature manifest. Run 02_feature_engineering.ipynb first.\")\n",
-"    feature_manifest = []\n",
-"\n",
-"# Load datasets\n",
-"datasets = {}\n",
-"print(\"\\nLoading feature datasets:\")\n",
-"\n",
-"for entry in feature_manifest:\n",
-"    name = entry['name']\n",
-"    path = Path(\"..\") / entry['path']\n",
-"    \n",
-"    if path.exists() and entry.get('has_labels', False):\n",
-"        df = pd.read_parquet(path)\n",
-"        datasets[name] = df\n",
-"        print(f\" ✓ {name}: {len(df)} samples, {len(df.columns)} features\")\n",
-"    else:\n",
-"        print(f\" ⚠ {name}: No labels or file missing\")\n",
-"\n",
-"print(f\"\\n✓ Loaded {len(datasets)} datasets with labels for training\")"
+"# Load feature manifest\nfeature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n\nif feature_manifest_path.exists():\n    with open(feature_manifest_path) as f:\n        feature_manifest = json.load(f)\n    print(f\"\u2713 Loaded {len(feature_manifest)} feature datasets\")\nelse:\n    print(\"\u26a0 No feature manifest. Run 02_feature_engineering.ipynb first.\")\n    feature_manifest = []\n\n# Load datasets - be more lenient with label detection\ndatasets = {}\nprint(\"\\n",
+"Loading feature datasets:\")\n\nfor entry in feature_manifest:\n    name = entry['name']\n    path = Path(\"..\") / entry['path']\n    \n    if path.exists():\n        try:\n            df = pd.read_parquet(path)\n            \n            # Check for label column with multiple possible names\n            label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n                'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n            has_label = any(col.lower() in [lc.lower() for lc in label_candidates] for col in df.columns)\n            \n            # Even without explicit labels, we can use for training (create synthetic labels based on dataset name)\n            datasets[name] = df\n            label_status = \"with labels\" if has_label else \"(no explicit labels - will create)\"\n            print(f\" \u2713 {name}: {len(df)} samples, {len(df.columns)} features {label_status}\")\n        except Exception as e:\n            print(f\" \u26a0 {name}: Error loading - {e}\")\n    else:\n        print(f\" \u26a0 {name}: File not found\")\n\nprint(f\"\\n",
+"\u2713 Loaded {len(datasets)} datasets for training\")\\n"
 ]
 },
 {
@@ -183,7 +159,7 @@
 " return models\n",
 " return ['random_forest'] # Default\n",
 "\n",
-"print(\"
+"print(\"\u2713 Model Configuration loaded\")\n",
 "print(f\" Available models: {list(ModelConfig.MODELS.keys())}\")"
 ]
 },
@@ -320,7 +296,7 @@
 " best_score = metrics['f1_score']\n",
 " best_model = model_type\n",
 " \n",
-" print(f\"\\n
+" print(f\"\\n \u2713 Best model: {best_model} (F1: {best_score:.4f})\")\n",
 " \n",
 " # Store results\n",
 " self.trained_models[dataset_name] = {\n",
@@ -333,7 +309,7 @@
 " return results\n",
 "\n",
 "trainer = CyberForgeTrainer()\n",
-"print(\"
+"print(\"\u2713 CyberForge Trainer initialized\")"
 ]
 },
 {
@@ -356,16 +332,16 @@
 "\n",
 "for name, df in datasets.items():\n",
 " if 'label' not in df.columns:\n",
-" print(f\"
+" print(f\"\u26a0 Skipping {name}: no label column\")\n",
 " continue\n",
 " \n",
 " try:\n",
 " results = trainer.train_for_dataset(df, name)\n",
 " all_results[name] = results\n",
 " except Exception as e:\n",
-" print(f\"
+" print(f\"\u26a0 Error training {name}: {e}\")\n",
 "\n",
-"print(f\"\\n\\n
+"print(f\"\\n\\n\u2713 Trained models for {len(all_results)} datasets\")"
 ]
 },
 {
@@ -467,7 +443,7 @@
 " return registry\n",
 "\n",
 "serializer = ModelSerializer(MODELS_DIR)\n",
-"print(\"
+"print(\"\u2713 Model Serializer initialized\")"
 ]
 },
 {
@@ -483,7 +459,7 @@
 "for dataset_name, model_data in trainer.trained_models.items():\n",
 " print(f\" Saving: {dataset_name}\")\n",
 " saved = serializer.save_model(dataset_name, model_data)\n",
-" print(f\"
+" print(f\" \u2713 Saved {len(saved)} files\")\n",
 "\n",
 "# Create model registry\n",
 "registry = serializer.create_model_registry(trainer.trained_models)\n",
@@ -491,7 +467,7 @@
 "with open(registry_path, 'w') as f:\n",
 " json.dump(registry, f, indent=2)\n",
 "\n",
-"print(f\"\\n
+"print(f\"\\n\u2713 Model registry saved to: {registry_path}\")"
 ]
 },
 {
@@ -627,7 +603,7 @@
 "with open(inference_path, 'w') as f:\n",
 " f.write(inference_api_code)\n",
 "\n",
-"print(f\"
+"print(f\"\u2713 Inference API saved to: {inference_path}\")"
 ]
 },
 {
@@ -652,12 +628,12 @@
 "total_models = sum(len(m['models']) for m in trainer.trained_models.values())\n",
 "\n",
 "print(f\"\"\"\n",
-"
+"\ud83e\udd16 Training Summary:\n",
 " - Datasets trained: {len(trainer.trained_models)}\n",
 " - Total models: {total_models}\n",
 " - Output directory: {MODELS_DIR}\n",
 "\n",
-"
+"\ud83d\udcca Model Performance:\"\"\")\n",
 "\n",
 "for dataset, data in trainer.trained_models.items():\n",
 " best = data['best_model']\n",
@@ -669,13 +645,13 @@
 " print(f\" Inference: {metrics['inference_time_ms']:.3f}ms\")\n",
 "\n",
 "print(f\"\"\"\n",
-"
+"\ud83d\udcc1 Output Files:\n",
 " - Model files: {MODELS_DIR}/<dataset>/<model>.pkl\n",
 " - Registry: {MODELS_DIR}/model_registry.json\n",
 " - Inference API: {MODELS_DIR}/inference.py\n",
 "\n",
 "Next step:\n",
-"
+" \u2192 04_agent_intelligence.ipynb\n",
 "\"\"\")\n",
 "print(\"=\" * 60)"
 ]
@@ -688,4 +664,4 @@
 },
 "nbformat": 4,
 "nbformat_minor": 5
-}
+}
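
For quick reference, a minimal standalone sketch of the case-insensitive label-column detection this commit introduces. The `find_label_column` helper and the toy DataFrame below are illustrative assumptions, not part of the notebook; the notebook itself only computes a `has_label` flag over the same candidate names.

import pandas as pd
from typing import Optional

# Mirrors the notebook's label_candidates list
LABEL_CANDIDATES = ['label', 'target', 'class', 'is_malicious', 'attack_type',
                    'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']

def find_label_column(df: pd.DataFrame) -> Optional[str]:
    """Return the first column matching a candidate name, ignoring case; None if absent."""
    by_lower = {col.lower(): col for col in df.columns}
    for candidate in LABEL_CANDIDATES:
        if candidate in by_lower:
            return by_lower[candidate]
    return None

# Toy usage: 'Attack_Type' matches 'attack_type' despite the different casing
df = pd.DataFrame({'bytes_in': [10, 20], 'Attack_Type': ['benign', 'dos']})
print(find_label_column(df))  # -> Attack_Type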