Spaces:

Che237
/

cyberforge

Sleeping

App Files Community

Che237 committed on Feb 2

Commit

a022514

verified ·

1 Parent(s): 76a101b

Fix: Better label preservation during feature engineering

Browse files

Files changed (1) hide show

notebooks/02_feature_engineering.ipynb +24 -3

notebooks/02_feature_engineering.ipynb CHANGED Viewed

@@ -43,7 +43,9 @@
     "warnings.filterwarnings('ignore')\n",
     "\n",
     "# Load configuration\n",
-    "config_path = Path(\"notebook_config.json\")\nif not config_path.exists():\n    config_path = Path(\"/home/user/app/notebooks/notebook_config.json\")\n",
     "with open(config_path) as f:\n",
     "    CONFIG = json.load(f)\n",
     "\n",
@@ -628,6 +630,18 @@
     "    try:\n",
     "        df = pd.read_csv(path)\n",
     "        \n",
     "        # Check for URL column to extract URL features\n",
     "        url_cols = [c for c in df.columns if 'url' in c.lower()]\n",
     "        if url_cols:\n",
@@ -636,6 +650,11 @@
     "        # Prepare for training\n",
     "        X, y = pipeline.prepare_for_training(df)\n",
     "        \n",
     "        processed_datasets[name] = {\n",
     "            'X': X,\n",
     "            'y': y,\n",
@@ -644,7 +663,8 @@
     "            'n_features': len(pipeline.feature_names)\n",
     "        }\n",
     "        \n",
-    "        print(f\"    \u2713 {len(X)} samples, {len(pipeline.feature_names)} features\")\n",
     "        \n",
     "        feature_stats.append({\n",
     "            'name': name,\n",
@@ -656,7 +676,8 @@
     "    except Exception as e:\n",
     "        print(f\"    \u26a0 Error: {e}\")\n",
     "\n",
-    "print(f\"\\n\u2713 Processed {len(processed_datasets)} datasets\")"
    ]
   },
   {

     "warnings.filterwarnings('ignore')\n",
     "\n",
     "# Load configuration\n",
+    "config_path = Path(\"notebook_config.json\")\n",
+    "if not config_path.exists():\n",
+    "    config_path = Path(\"/home/user/app/notebooks/notebook_config.json\")\n",
     "with open(config_path) as f:\n",
     "    CONFIG = json.load(f)\n",
     "\n",
     "    try:\n",
     "        df = pd.read_csv(path)\n",
     "        \n",
+    "        # IMPORTANT: Extract and preserve label column BEFORE processing\n",
+    "        label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n",
+    "                           'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n",
+    "        original_label = None\n",
+    "        label_col_name = None\n",
+    "        for col in df.columns:\n",
+    "            if col.lower() in [lc.lower() for lc in label_candidates]:\n",
+    "                original_label = df[col].copy()\n",
+    "                label_col_name = col\n",
+    "                print(f\"    Found label column: {col}\")\n",
+    "                break\n",
+    "        \n",
     "        # Check for URL column to extract URL features\n",
     "        url_cols = [c for c in df.columns if 'url' in c.lower()]\n",
     "        if url_cols:\n",
     "        # Prepare for training\n",
     "        X, y = pipeline.prepare_for_training(df)\n",
     "        \n",
+    "        # If y is None but we found original_label, use that\n",
+    "        if y is None and original_label is not None:\n",
+    "            y = original_label.values\n",
+    "            print(f\"    Restored label from original: {label_col_name}\")\n",
+    "        \n",
     "        processed_datasets[name] = {\n",
     "            'X': X,\n",
     "            'y': y,\n",
     "            'n_features': len(pipeline.feature_names)\n",
     "        }\n",
     "        \n",
+    "        label_status = \"with labels\" if y is not None else \"no labels\"\n",
+    "        print(f\"    \u2713 {len(X)} samples, {len(pipeline.feature_names)} features ({label_status})\")\n",
     "        \n",
     "        feature_stats.append({\n",
     "            'name': name,\n",
     "    except Exception as e:\n",
     "        print(f\"    \u26a0 Error: {e}\")\n",
     "\n",
+    "print(f\"\\n\u2713 Processed {len(processed_datasets)} datasets\")\n",
+    "\n"
    ]
   },
   {