Che237 committed on
Commit
a022514
·
verified ·
1 Parent(s): 76a101b

Fix: Better label preservation during feature engineering

Browse files
notebooks/02_feature_engineering.ipynb CHANGED
@@ -43,7 +43,9 @@
43
  "warnings.filterwarnings('ignore')\n",
44
  "\n",
45
  "# Load configuration\n",
46
- "config_path = Path(\"notebook_config.json\")\nif not config_path.exists():\n config_path = Path(\"/home/user/app/notebooks/notebook_config.json\")\n",
 
 
47
  "with open(config_path) as f:\n",
48
  " CONFIG = json.load(f)\n",
49
  "\n",
@@ -628,6 +630,18 @@
628
  " try:\n",
629
  " df = pd.read_csv(path)\n",
630
  " \n",
 
 
 
 
 
 
 
 
 
 
 
 
631
  " # Check for URL column to extract URL features\n",
632
  " url_cols = [c for c in df.columns if 'url' in c.lower()]\n",
633
  " if url_cols:\n",
@@ -636,6 +650,11 @@
636
  " # Prepare for training\n",
637
  " X, y = pipeline.prepare_for_training(df)\n",
638
  " \n",
 
 
 
 
 
639
  " processed_datasets[name] = {\n",
640
  " 'X': X,\n",
641
  " 'y': y,\n",
@@ -644,7 +663,8 @@
644
  " 'n_features': len(pipeline.feature_names)\n",
645
  " }\n",
646
  " \n",
647
- " print(f\" \u2713 {len(X)} samples, {len(pipeline.feature_names)} features\")\n",
 
648
  " \n",
649
  " feature_stats.append({\n",
650
  " 'name': name,\n",
@@ -656,7 +676,8 @@
656
  " except Exception as e:\n",
657
  " print(f\" \u26a0 Error: {e}\")\n",
658
  "\n",
659
- "print(f\"\\n\u2713 Processed {len(processed_datasets)} datasets\")"
 
660
  ]
661
  },
662
  {
 
43
  "warnings.filterwarnings('ignore')\n",
44
  "\n",
45
  "# Load configuration\n",
46
+ "config_path = Path(\"notebook_config.json\")\n",
47
+ "if not config_path.exists():\n",
48
+ " config_path = Path(\"/home/user/app/notebooks/notebook_config.json\")\n",
49
  "with open(config_path) as f:\n",
50
  " CONFIG = json.load(f)\n",
51
  "\n",
 
630
  " try:\n",
631
  " df = pd.read_csv(path)\n",
632
  " \n",
633
+ " # IMPORTANT: Extract and preserve label column BEFORE processing\n",
634
+ " label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n",
635
+ " 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n",
636
+ " original_label = None\n",
637
+ " label_col_name = None\n",
638
+ " for col in df.columns:\n",
639
+ " if col.lower() in [lc.lower() for lc in label_candidates]:\n",
640
+ " original_label = df[col].copy()\n",
641
+ " label_col_name = col\n",
642
+ " print(f\" Found label column: {col}\")\n",
643
+ " break\n",
644
+ " \n",
645
  " # Check for URL column to extract URL features\n",
646
  " url_cols = [c for c in df.columns if 'url' in c.lower()]\n",
647
  " if url_cols:\n",
 
650
  " # Prepare for training\n",
651
  " X, y = pipeline.prepare_for_training(df)\n",
652
  " \n",
653
+ " # If y is None but we found original_label, use that\n",
654
+ " if y is None and original_label is not None:\n",
655
+ " y = original_label.values\n",
656
+ " print(f\" Restored label from original: {label_col_name}\")\n",
657
+ " \n",
658
  " processed_datasets[name] = {\n",
659
  " 'X': X,\n",
660
  " 'y': y,\n",
 
663
  " 'n_features': len(pipeline.feature_names)\n",
664
  " }\n",
665
  " \n",
666
+ " label_status = \"with labels\" if y is not None else \"no labels\"\n",
667
+ " print(f\" \u2713 {len(X)} samples, {len(pipeline.feature_names)} features ({label_status})\")\n",
668
  " \n",
669
  " feature_stats.append({\n",
670
  " 'name': name,\n",
 
676
  " except Exception as e:\n",
677
  " print(f\" \u26a0 Error: {e}\")\n",
678
  "\n",
679
+ "print(f\"\\n\u2713 Processed {len(processed_datasets)} datasets\")\n",
680
+ "\n"
681
  ]
682
  },
683
  {