Spaces:
Running
Running
Fix broken newlines in print statements
Browse files
notebooks/03_model_training.ipynb
CHANGED
|
@@ -80,9 +80,45 @@
|
|
| 80 |
"metadata": {},
|
| 81 |
"outputs": [],
|
| 82 |
"source": [
|
| 83 |
-
"# Load feature manifest\
|
| 84 |
-
"
|
| 85 |
-
"\
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
]
|
| 87 |
},
|
| 88 |
{
|
|
|
|
| 80 |
"metadata": {},
|
| 81 |
"outputs": [],
|
| 82 |
"source": [
|
| 83 |
+
"# Load feature manifest\n",
|
| 84 |
+
"feature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n",
|
| 85 |
+
"\n",
|
| 86 |
+
"if feature_manifest_path.exists():\n",
|
| 87 |
+
" with open(feature_manifest_path) as f:\n",
|
| 88 |
+
" feature_manifest = json.load(f)\n",
|
| 89 |
+
" print(f\"\u2713 Loaded {len(feature_manifest)} feature datasets\")\n",
|
| 90 |
+
"else:\n",
|
| 91 |
+
" print(\"\u26a0 No feature manifest. Run 02_feature_engineering.ipynb first.\")\n",
|
| 92 |
+
" feature_manifest = []\n",
|
| 93 |
+
"\n",
|
| 94 |
+
"# Load datasets - be more lenient with label detection\n",
|
| 95 |
+
"datasets = {}\n",
|
| 96 |
+
"print(\"\\nLoading feature datasets:\")\n",
|
| 97 |
+
"\n",
|
| 98 |
+
"for entry in feature_manifest:\n",
|
| 99 |
+
" name = entry['name']\n",
|
| 100 |
+
" path = Path(\"..\") / entry['path']\n",
|
| 101 |
+
" \n",
|
| 102 |
+
" if path.exists():\n",
|
| 103 |
+
" try:\n",
|
| 104 |
+
" df = pd.read_parquet(path)\n",
|
| 105 |
+
" \n",
|
| 106 |
+
" # Check for label column with multiple possible names\n",
|
| 107 |
+
" label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n",
|
| 108 |
+
" 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n",
|
| 109 |
+
" has_label = any(col.lower() in [lc.lower() for lc in label_candidates] for col in df.columns)\n",
|
| 110 |
+
" \n",
|
| 111 |
+
" # Even without explicit labels, we can use for training\n",
|
| 112 |
+
" datasets[name] = df\n",
|
| 113 |
+
" label_status = \"with labels\" if has_label else \"(no explicit labels - will create)\"\n",
|
| 114 |
+
" print(f\" \u2713 {name}: {len(df)} samples, {len(df.columns)} features {label_status}\")\n",
|
| 115 |
+
" except Exception as e:\n",
|
| 116 |
+
" print(f\" \u26a0 {name}: Error loading - {e}\")\n",
|
| 117 |
+
" else:\n",
|
| 118 |
+
" print(f\" \u26a0 {name}: File not found\")\n",
|
| 119 |
+
"\n",
|
| 120 |
+
"print(f\"\\n\u2713 Loaded {len(datasets)} datasets for training\")\n",
|
| 121 |
+
"\n"
|
| 122 |
]
|
| 123 |
},
|
| 124 |
{
|