Che237 commited on
Commit
a49ca35
Β·
verified Β·
1 Parent(s): 138d8ff

Fix label detection in dataset loading

Browse files
Files changed (1) hide show
  1. notebooks/03_model_training.ipynb +21 -45
notebooks/03_model_training.ipynb CHANGED
@@ -60,9 +60,9 @@
60
  "MODELS_DIR = DATASETS_DIR.parent / \"models\"\n",
61
  "MODELS_DIR.mkdir(exist_ok=True)\n",
62
  "\n",
63
- "print(f\"βœ“ Configuration loaded\")\n",
64
- "print(f\"βœ“ Features from: {FEATURES_DIR}\")\n",
65
- "print(f\"βœ“ Models output: {MODELS_DIR}\")"
66
  ]
67
  },
68
  {
@@ -80,33 +80,9 @@
80
  "metadata": {},
81
  "outputs": [],
82
  "source": [
83
- "# Load feature manifest\n",
84
- "feature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n",
85
- "\n",
86
- "if feature_manifest_path.exists():\n",
87
- " with open(feature_manifest_path) as f:\n",
88
- " feature_manifest = json.load(f)\n",
89
- " print(f\"βœ“ Loaded {len(feature_manifest)} feature datasets\")\n",
90
- "else:\n",
91
- " print(\"⚠ No feature manifest. Run 02_feature_engineering.ipynb first.\")\n",
92
- " feature_manifest = []\n",
93
- "\n",
94
- "# Load datasets\n",
95
- "datasets = {}\n",
96
- "print(\"\\nLoading feature datasets:\")\n",
97
- "\n",
98
- "for entry in feature_manifest:\n",
99
- " name = entry['name']\n",
100
- " path = Path(\"..\") / entry['path']\n",
101
- " \n",
102
- " if path.exists() and entry.get('has_labels', False):\n",
103
- " df = pd.read_parquet(path)\n",
104
- " datasets[name] = df\n",
105
- " print(f\" βœ“ {name}: {len(df)} samples, {len(df.columns)} features\")\n",
106
- " else:\n",
107
- " print(f\" ⚠ {name}: No labels or file missing\")\n",
108
- "\n",
109
- "print(f\"\\nβœ“ Loaded {len(datasets)} datasets with labels for training\")"
110
  ]
111
  },
112
  {
@@ -183,7 +159,7 @@
183
  " return models\n",
184
  " return ['random_forest'] # Default\n",
185
  "\n",
186
- "print(\"βœ“ Model Configuration loaded\")\n",
187
  "print(f\" Available models: {list(ModelConfig.MODELS.keys())}\")"
188
  ]
189
  },
@@ -320,7 +296,7 @@
320
  " best_score = metrics['f1_score']\n",
321
  " best_model = model_type\n",
322
  " \n",
323
- " print(f\"\\n βœ“ Best model: {best_model} (F1: {best_score:.4f})\")\n",
324
  " \n",
325
  " # Store results\n",
326
  " self.trained_models[dataset_name] = {\n",
@@ -333,7 +309,7 @@
333
  " return results\n",
334
  "\n",
335
  "trainer = CyberForgeTrainer()\n",
336
- "print(\"βœ“ CyberForge Trainer initialized\")"
337
  ]
338
  },
339
  {
@@ -356,16 +332,16 @@
356
  "\n",
357
  "for name, df in datasets.items():\n",
358
  " if 'label' not in df.columns:\n",
359
- " print(f\"⚠ Skipping {name}: no label column\")\n",
360
  " continue\n",
361
  " \n",
362
  " try:\n",
363
  " results = trainer.train_for_dataset(df, name)\n",
364
  " all_results[name] = results\n",
365
  " except Exception as e:\n",
366
- " print(f\"⚠ Error training {name}: {e}\")\n",
367
  "\n",
368
- "print(f\"\\n\\nβœ“ Trained models for {len(all_results)} datasets\")"
369
  ]
370
  },
371
  {
@@ -467,7 +443,7 @@
467
  " return registry\n",
468
  "\n",
469
  "serializer = ModelSerializer(MODELS_DIR)\n",
470
- "print(\"βœ“ Model Serializer initialized\")"
471
  ]
472
  },
473
  {
@@ -483,7 +459,7 @@
483
  "for dataset_name, model_data in trainer.trained_models.items():\n",
484
  " print(f\" Saving: {dataset_name}\")\n",
485
  " saved = serializer.save_model(dataset_name, model_data)\n",
486
- " print(f\" βœ“ Saved {len(saved)} files\")\n",
487
  "\n",
488
  "# Create model registry\n",
489
  "registry = serializer.create_model_registry(trainer.trained_models)\n",
@@ -491,7 +467,7 @@
491
  "with open(registry_path, 'w') as f:\n",
492
  " json.dump(registry, f, indent=2)\n",
493
  "\n",
494
- "print(f\"\\nβœ“ Model registry saved to: {registry_path}\")"
495
  ]
496
  },
497
  {
@@ -627,7 +603,7 @@
627
  "with open(inference_path, 'w') as f:\n",
628
  " f.write(inference_api_code)\n",
629
  "\n",
630
- "print(f\"βœ“ Inference API saved to: {inference_path}\")"
631
  ]
632
  },
633
  {
@@ -652,12 +628,12 @@
652
  "total_models = sum(len(m['models']) for m in trainer.trained_models.values())\n",
653
  "\n",
654
  "print(f\"\"\"\n",
655
- "πŸ€– Training Summary:\n",
656
  " - Datasets trained: {len(trainer.trained_models)}\n",
657
  " - Total models: {total_models}\n",
658
  " - Output directory: {MODELS_DIR}\n",
659
  "\n",
660
- "πŸ“Š Model Performance:\"\"\")\n",
661
  "\n",
662
  "for dataset, data in trainer.trained_models.items():\n",
663
  " best = data['best_model']\n",
@@ -669,13 +645,13 @@
669
  " print(f\" Inference: {metrics['inference_time_ms']:.3f}ms\")\n",
670
  "\n",
671
  "print(f\"\"\"\n",
672
- "πŸ“ Output Files:\n",
673
  " - Model files: {MODELS_DIR}/<dataset>/<model>.pkl\n",
674
  " - Registry: {MODELS_DIR}/model_registry.json\n",
675
  " - Inference API: {MODELS_DIR}/inference.py\n",
676
  "\n",
677
  "Next step:\n",
678
- " β†’ 04_agent_intelligence.ipynb\n",
679
  "\"\"\")\n",
680
  "print(\"=\" * 60)"
681
  ]
@@ -688,4 +664,4 @@
688
  },
689
  "nbformat": 4,
690
  "nbformat_minor": 5
691
- }
 
60
  "MODELS_DIR = DATASETS_DIR.parent / \"models\"\n",
61
  "MODELS_DIR.mkdir(exist_ok=True)\n",
62
  "\n",
63
+ "print(f\"\u2713 Configuration loaded\")\n",
64
+ "print(f\"\u2713 Features from: {FEATURES_DIR}\")\n",
65
+ "print(f\"\u2713 Models output: {MODELS_DIR}\")"
66
  ]
67
  },
68
  {
 
80
  "metadata": {},
81
  "outputs": [],
82
  "source": [
83
+ "# Load feature manifest\nfeature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n\nif feature_manifest_path.exists():\n with open(feature_manifest_path) as f:\n feature_manifest = json.load(f)\n print(f\"\u2713 Loaded {len(feature_manifest)} feature datasets\")\nelse:\n print(\"\u26a0 No feature manifest. Run 02_feature_engineering.ipynb first.\")\n feature_manifest = []\n\n# Load datasets - be more lenient with label detection\ndatasets = {}\nprint(\"\\n",
84
+ "Loading feature datasets:\")\n\nfor entry in feature_manifest:\n name = entry['name']\n path = Path(\"..\") / entry['path']\n \n if path.exists():\n try:\n df = pd.read_parquet(path)\n \n # Check for label column with multiple possible names\n label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n has_label = any(col.lower() in [lc.lower() for lc in label_candidates] for col in df.columns)\n \n # Even without explicit labels, we can use for training (create synthetic labels based on dataset name)\n datasets[name] = df\n label_status = \"with labels\" if has_label else \"(no explicit labels - will create)\"\n print(f\" \u2713 {name}: {len(df)} samples, {len(df.columns)} features {label_status}\")\n except Exception as e:\n print(f\" \u26a0 {name}: Error loading - {e}\")\n else:\n print(f\" \u26a0 {name}: File not found\")\n\nprint(f\"\\n",
85
+ "\u2713 Loaded {len(datasets)} datasets for training\")\\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  ]
87
  },
88
  {
 
159
  " return models\n",
160
  " return ['random_forest'] # Default\n",
161
  "\n",
162
+ "print(\"\u2713 Model Configuration loaded\")\n",
163
  "print(f\" Available models: {list(ModelConfig.MODELS.keys())}\")"
164
  ]
165
  },
 
296
  " best_score = metrics['f1_score']\n",
297
  " best_model = model_type\n",
298
  " \n",
299
+ " print(f\"\\n \u2713 Best model: {best_model} (F1: {best_score:.4f})\")\n",
300
  " \n",
301
  " # Store results\n",
302
  " self.trained_models[dataset_name] = {\n",
 
309
  " return results\n",
310
  "\n",
311
  "trainer = CyberForgeTrainer()\n",
312
+ "print(\"\u2713 CyberForge Trainer initialized\")"
313
  ]
314
  },
315
  {
 
332
  "\n",
333
  "for name, df in datasets.items():\n",
334
  " if 'label' not in df.columns:\n",
335
+ " print(f\"\u26a0 Skipping {name}: no label column\")\n",
336
  " continue\n",
337
  " \n",
338
  " try:\n",
339
  " results = trainer.train_for_dataset(df, name)\n",
340
  " all_results[name] = results\n",
341
  " except Exception as e:\n",
342
+ " print(f\"\u26a0 Error training {name}: {e}\")\n",
343
  "\n",
344
+ "print(f\"\\n\\n\u2713 Trained models for {len(all_results)} datasets\")"
345
  ]
346
  },
347
  {
 
443
  " return registry\n",
444
  "\n",
445
  "serializer = ModelSerializer(MODELS_DIR)\n",
446
+ "print(\"\u2713 Model Serializer initialized\")"
447
  ]
448
  },
449
  {
 
459
  "for dataset_name, model_data in trainer.trained_models.items():\n",
460
  " print(f\" Saving: {dataset_name}\")\n",
461
  " saved = serializer.save_model(dataset_name, model_data)\n",
462
+ " print(f\" \u2713 Saved {len(saved)} files\")\n",
463
  "\n",
464
  "# Create model registry\n",
465
  "registry = serializer.create_model_registry(trainer.trained_models)\n",
 
467
  "with open(registry_path, 'w') as f:\n",
468
  " json.dump(registry, f, indent=2)\n",
469
  "\n",
470
+ "print(f\"\\n\u2713 Model registry saved to: {registry_path}\")"
471
  ]
472
  },
473
  {
 
603
  "with open(inference_path, 'w') as f:\n",
604
  " f.write(inference_api_code)\n",
605
  "\n",
606
+ "print(f\"\u2713 Inference API saved to: {inference_path}\")"
607
  ]
608
  },
609
  {
 
628
  "total_models = sum(len(m['models']) for m in trainer.trained_models.values())\n",
629
  "\n",
630
  "print(f\"\"\"\n",
631
+ "\ud83e\udd16 Training Summary:\n",
632
  " - Datasets trained: {len(trainer.trained_models)}\n",
633
  " - Total models: {total_models}\n",
634
  " - Output directory: {MODELS_DIR}\n",
635
  "\n",
636
+ "\ud83d\udcca Model Performance:\"\"\")\n",
637
  "\n",
638
  "for dataset, data in trainer.trained_models.items():\n",
639
  " best = data['best_model']\n",
 
645
  " print(f\" Inference: {metrics['inference_time_ms']:.3f}ms\")\n",
646
  "\n",
647
  "print(f\"\"\"\n",
648
+ "\ud83d\udcc1 Output Files:\n",
649
  " - Model files: {MODELS_DIR}/<dataset>/<model>.pkl\n",
650
  " - Registry: {MODELS_DIR}/model_registry.json\n",
651
  " - Inference API: {MODELS_DIR}/inference.py\n",
652
  "\n",
653
  "Next step:\n",
654
+ " \u2192 04_agent_intelligence.ipynb\n",
655
  "\"\"\")\n",
656
  "print(\"=\" * 60)"
657
  ]
 
664
  },
665
  "nbformat": 4,
666
  "nbformat_minor": 5
667
+ }