Spaces:

Che237
/

cyberforge

Running

App Files Files Community

Che237 committed on Feb 2

Commit

48bf346

verified ·

1 Parent(s): f513f82

Fix broken newlines in print statements

Browse files

Files changed (1) hide show

notebooks/03_model_training.ipynb +39 -3

notebooks/03_model_training.ipynb CHANGED Viewed

@@ -80,9 +80,45 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "# Load feature manifest\nfeature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n\nif feature_manifest_path.exists():\n    with open(feature_manifest_path) as f:\n        feature_manifest = json.load(f)\n    print(f\"\u2713 Loaded {len(feature_manifest)} feature datasets\")\nelse:\n    print(\"\u26a0 No feature manifest. Run 02_feature_engineering.ipynb first.\")\n    feature_manifest = []\n\n# Load datasets - be more lenient with label detection\ndatasets = {}\nprint(\"\n",
-    "Loading feature datasets:\")\n\nfor entry in feature_manifest:\n    name = entry['name']\n    path = Path(\"..\") / entry['path']\n    \n    if path.exists():\n        try:\n            df = pd.read_parquet(path)\n            \n            # Check for label column with multiple possible names\n            label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n                               'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n            has_label = any(col.lower() in [lc.lower() for lc in label_candidates] for col in df.columns)\n            \n            # Even without explicit labels, we can use for training (create synthetic labels based on dataset name)\n            datasets[name] = df\n            label_status = \"with labels\" if has_label else \"(no explicit labels - will create)\"\n            print(f\"  \u2713 {name}: {len(df)} samples, {len(df.columns)} features {label_status}\")\n        except Exception as e:\n            print(f\"  \u26a0 {name}: Error loading - {e}\")\n    else:\n        print(f\"  \u26a0 {name}: File not found\")\n\nprint(f\"\n",
-    "\u2713 Loaded {len(datasets)} datasets for training\")\n"
    ]
   },
   {

    "metadata": {},
    "outputs": [],
    "source": [
+    "# Load feature manifest\n",
+    "feature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n",
+    "\n",
+    "if feature_manifest_path.exists():\n",
+    "    with open(feature_manifest_path) as f:\n",
+    "        feature_manifest = json.load(f)\n",
+    "    print(f\"\u2713 Loaded {len(feature_manifest)} feature datasets\")\n",
+    "else:\n",
+    "    print(\"\u26a0 No feature manifest. Run 02_feature_engineering.ipynb first.\")\n",
+    "    feature_manifest = []\n",
+    "\n",
+    "# Load datasets - be more lenient with label detection\n",
+    "datasets = {}\n",
+    "print(\"\\nLoading feature datasets:\")\n",
+    "\n",
+    "for entry in feature_manifest:\n",
+    "    name = entry['name']\n",
+    "    path = Path(\"..\") / entry['path']\n",
+    "    \n",
+    "    if path.exists():\n",
+    "        try:\n",
+    "            df = pd.read_parquet(path)\n",
+    "            \n",
+    "            # Check for label column with multiple possible names\n",
+    "            label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n",
+    "                               'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n",
+    "            has_label = any(col.lower() in [lc.lower() for lc in label_candidates] for col in df.columns)\n",
+    "            \n",
+    "            # Even without explicit labels, we can use for training\n",
+    "            datasets[name] = df\n",
+    "            label_status = \"with labels\" if has_label else \"(no explicit labels - will create)\"\n",
+    "            print(f\"  \u2713 {name}: {len(df)} samples, {len(df.columns)} features {label_status}\")\n",
+    "        except Exception as e:\n",
+    "            print(f\"  \u26a0 {name}: Error loading - {e}\")\n",
+    "    else:\n",
+    "        print(f\"  \u26a0 {name}: File not found\")\n",
+    "\n",
+    "print(f\"\\n\u2713 Loaded {len(datasets)} datasets for training\")\n",
+    "\n"
    ]
   },
   {