Che237 committed on
Commit
138d8ff
·
verified ·
1 Parent(s): d4bed8e

Fix URL feature extraction column mismatch

Browse files
Files changed (1) hide show
  1. notebooks/02_feature_engineering.ipynb +22 -118
notebooks/02_feature_engineering.ipynb CHANGED
@@ -52,8 +52,8 @@
52
  "FEATURES_DIR = DATASETS_DIR / \"features\"\n",
53
  "FEATURES_DIR.mkdir(exist_ok=True)\n",
54
  "\n",
55
- "print(f\" Configuration loaded\")\n",
56
- "print(f\" Features output: {FEATURES_DIR}\")"
57
  ]
58
  },
59
  {
@@ -165,7 +165,7 @@
165
  " return pd.DataFrame(features)\n",
166
  "\n",
167
  "url_extractor = URLFeatureExtractor()\n",
168
- "print(\" URL Feature Extractor initialized\")\n",
169
  "\n",
170
  "# Test\n",
171
  "test_features = url_extractor.extract(\"https://suspicious-login.example.com/verify?id=123\")\n",
@@ -265,7 +265,7 @@
265
  " }\n",
266
  "\n",
267
  "network_extractor = NetworkFeatureExtractor()\n",
268
- "print(\" Network Feature Extractor initialized\")"
269
  ]
270
  },
271
  {
@@ -348,7 +348,7 @@
348
  " return min(100, max(0, score))\n",
349
  "\n",
350
  "header_extractor = SecurityHeaderExtractor()\n",
351
- "print(\" Security Header Extractor initialized\")"
352
  ]
353
  },
354
  {
@@ -437,7 +437,7 @@
437
  " }\n",
438
  "\n",
439
  "js_extractor = JavaScriptFeatureExtractor()\n",
440
- "print(\" JavaScript Feature Extractor initialized\")"
441
  ]
442
  },
443
  {
@@ -455,103 +455,7 @@
455
  "metadata": {},
456
  "outputs": [],
457
  "source": [
458
- "class CyberForgeFeaturePipeline:\n",
459
- " \"\"\"\n",
460
- " Unified feature extraction pipeline for CyberForge AI.\n",
461
- " Combines all extractors for comprehensive security feature engineering.\n",
462
- " \"\"\"\n",
463
- " \n",
464
- " def __init__(self):\n",
465
- " self.url_extractor = URLFeatureExtractor()\n",
466
- " self.network_extractor = NetworkFeatureExtractor()\n",
467
- " self.header_extractor = SecurityHeaderExtractor()\n",
468
- " self.js_extractor = JavaScriptFeatureExtractor()\n",
469
- " self.scaler = StandardScaler()\n",
470
- " self.label_encoder = LabelEncoder()\n",
471
- " self.feature_names = []\n",
472
- " \n",
473
- " def extract_website_features(self, scraped_data: Dict) -> Dict[str, Any]:\n",
474
- " \"\"\"Extract all features from website scraped data\"\"\"\n",
475
- " features = {}\n",
476
- " \n",
477
- " # URL features\n",
478
- " url_features = self.url_extractor.extract(scraped_data.get('url', ''))\n",
479
- " features.update({f\"url_{k}\": v for k, v in url_features.items() if k != 'tld'})\n",
480
- " \n",
481
- " # Network features\n",
482
- " network_features = self.network_extractor.extract_from_requests(\n",
483
- " scraped_data.get('network_requests', [])\n",
484
- " )\n",
485
- " features.update({f\"net_{k}\": v for k, v in network_features.items()})\n",
486
- " \n",
487
- " # Security header features\n",
488
- " header_features = self.header_extractor.extract(\n",
489
- " scraped_data.get('response_headers', {}),\n",
490
- " scraped_data.get('security_report', {})\n",
491
- " )\n",
492
- " features.update({f\"sec_{k}\": v for k, v in header_features.items()})\n",
493
- " \n",
494
- " # JavaScript features\n",
495
- " js_features = self.js_extractor.extract_from_console_logs(\n",
496
- " scraped_data.get('console_logs', [])\n",
497
- " )\n",
498
- " features.update({f\"js_{k}\": v for k, v in js_features.items()})\n",
499
- " \n",
500
- " # Calculate risk score\n",
501
- " features['security_score'] = self.header_extractor.calculate_security_score(header_features)\n",
502
- " \n",
503
- " return features\n",
504
- " \n",
505
- " def process_dataset(self, df: pd.DataFrame, url_column: str = 'url') -> pd.DataFrame:\n",
506
- " \"\"\"Process a dataset and extract URL features\"\"\"\n",
507
- " if url_column not in df.columns:\n",
508
- " print(f\" ⚠ No '{url_column}' column found\")\n",
509
- " return df\n",
510
- " \n",
511
- " # Extract URL features\n",
512
- " url_features = df[url_column].apply(lambda x: self.url_extractor.extract(x))\n",
513
- " url_df = pd.DataFrame(url_features.tolist())\n",
514
- " url_df.columns = [f\"url_{c}\" for c in url_df.columns if c != 'tld']\n",
515
- " \n",
516
- " # Combine with original features\n",
517
- " result = pd.concat([df.reset_index(drop=True), url_df.reset_index(drop=True)], axis=1)\n",
518
- " \n",
519
- " return result\n",
520
- " \n",
521
- " def prepare_for_training(self, df: pd.DataFrame, label_column: str = 'label') -> tuple:\n",
522
- " \"\"\"Prepare features for model training\"\"\"\n",
523
- " df = df.copy()\n",
524
- " \n",
525
- " # Separate features and labels\n",
526
- " if label_column in df.columns:\n",
527
- " y = df[label_column]\n",
528
- " X = df.drop(columns=[label_column])\n",
529
- " else:\n",
530
- " y = None\n",
531
- " X = df\n",
532
- " \n",
533
- " # Select numeric columns only\n",
534
- " numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n",
535
- " X_numeric = X[numeric_cols].fillna(0)\n",
536
- " \n",
537
- " # Convert boolean to int\n",
538
- " bool_cols = X.select_dtypes(include=[bool]).columns.tolist()\n",
539
- " for col in bool_cols:\n",
540
- " X_numeric[col] = X[col].astype(int)\n",
541
- " \n",
542
- " self.feature_names = X_numeric.columns.tolist()\n",
543
- " \n",
544
- " # Encode labels if present\n",
545
- " if y is not None:\n",
546
- " if y.dtype == 'object':\n",
547
- " y = self.label_encoder.fit_transform(y)\n",
548
- " else:\n",
549
- " y = y.values\n",
550
- " \n",
551
- " return X_numeric, y\n",
552
- "\n",
553
- "pipeline = CyberForgeFeaturePipeline()\n",
554
- "print(\"✓ Feature Pipeline initialized\")"
555
  ]
556
  },
557
  {
@@ -574,9 +478,9 @@
574
  "if manifest_path.exists():\n",
575
  " with open(manifest_path) as f:\n",
576
  " manifest = json.load(f)\n",
577
- " print(f\" Loaded manifest with {len(manifest)} datasets\")\n",
578
  "else:\n",
579
- " print(\" No manifest found. Run 01_data_acquisition.ipynb first.\")\n",
580
  " manifest = []"
581
  ]
582
  },
@@ -598,7 +502,7 @@
598
  " path = Path(\"..\") / entry['path']\n",
599
  " \n",
600
  " if not path.exists():\n",
601
- " print(f\" {name}: File not found\")\n",
602
  " continue\n",
603
  " \n",
604
  " print(f\" Processing: {name}\")\n",
@@ -622,7 +526,7 @@
622
  " 'n_features': len(pipeline.feature_names)\n",
623
  " }\n",
624
  " \n",
625
- " print(f\" {len(X)} samples, {len(pipeline.feature_names)} features\")\n",
626
  " \n",
627
  " feature_stats.append({\n",
628
  " 'name': name,\n",
@@ -632,9 +536,9 @@
632
  " })\n",
633
  " \n",
634
  " except Exception as e:\n",
635
- " print(f\" Error: {e}\")\n",
636
  "\n",
637
- "print(f\"\\n Processed {len(processed_datasets)} datasets\")"
638
  ]
639
  },
640
  {
@@ -679,7 +583,7 @@
679
  " 'has_labels': data['y'] is not None\n",
680
  " })\n",
681
  " \n",
682
- " print(f\" Saved: {output_path.name}\")\n",
683
  "\n",
684
  "# Save feature manifest\n",
685
  "manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n",
@@ -690,8 +594,8 @@
690
  "pipeline_path = FEATURES_DIR / \"feature_pipeline.pkl\"\n",
691
  "joblib.dump(pipeline, pipeline_path)\n",
692
  "\n",
693
- "print(f\"\\n Feature manifest saved to: {manifest_path}\")\n",
694
- "print(f\" Feature pipeline saved to: {pipeline_path}\")"
695
  ]
696
  },
697
  {
@@ -717,26 +621,26 @@
717
  "total_features = max(d['n_features'] for d in processed_datasets.values()) if processed_datasets else 0\n",
718
  "\n",
719
  "print(f\"\"\"\n",
720
- "🔧 Feature Engineering Summary:\n",
721
  " - Datasets processed: {len(processed_datasets)}\n",
722
  " - Total samples: {total_samples:,}\n",
723
  " - Max features: {total_features}\n",
724
  " - Output directory: {FEATURES_DIR}\n",
725
  "\n",
726
- "📊 Feature Categories:\n",
727
  " - URL Features: Domain, path, security indicators\n",
728
  " - Network Features: Request patterns, status codes\n",
729
  " - Security Headers: CSP, HSTS, X-Frame-Options\n",
730
  " - JavaScript: Console logs, suspicious APIs\n",
731
  "\n",
732
- "📁 Datasets Ready for Training:\"\"\")\n",
733
  "\n",
734
  "for entry in feature_manifest:\n",
735
- " print(f\" {entry['name']}: {entry['samples']:,} samples, {entry['features']} features\")\n",
736
  "\n",
737
  "print(f\"\"\"\n",
738
  "Next step:\n",
739
- " 03_model_training.ipynb\n",
740
  "\"\"\")\n",
741
  "print(\"=\" * 60)"
742
  ]
@@ -749,4 +653,4 @@
749
  },
750
  "nbformat": 4,
751
  "nbformat_minor": 5
752
- }
 
52
  "FEATURES_DIR = DATASETS_DIR / \"features\"\n",
53
  "FEATURES_DIR.mkdir(exist_ok=True)\n",
54
  "\n",
55
+ "print(f\"\u2713 Configuration loaded\")\n",
56
+ "print(f\"\u2713 Features output: {FEATURES_DIR}\")"
57
  ]
58
  },
59
  {
 
165
  " return pd.DataFrame(features)\n",
166
  "\n",
167
  "url_extractor = URLFeatureExtractor()\n",
168
+ "print(\"\u2713 URL Feature Extractor initialized\")\n",
169
  "\n",
170
  "# Test\n",
171
  "test_features = url_extractor.extract(\"https://suspicious-login.example.com/verify?id=123\")\n",
 
265
  " }\n",
266
  "\n",
267
  "network_extractor = NetworkFeatureExtractor()\n",
268
+ "print(\"\u2713 Network Feature Extractor initialized\")"
269
  ]
270
  },
271
  {
 
348
  " return min(100, max(0, score))\n",
349
  "\n",
350
  "header_extractor = SecurityHeaderExtractor()\n",
351
+ "print(\"\u2713 Security Header Extractor initialized\")"
352
  ]
353
  },
354
  {
 
437
  " }\n",
438
  "\n",
439
  "js_extractor = JavaScriptFeatureExtractor()\n",
440
+ "print(\"\u2713 JavaScript Feature Extractor initialized\")"
441
  ]
442
  },
443
  {
 
455
  "metadata": {},
456
  "outputs": [],
457
  "source": [
458
+ " def process_dataset(self, df: pd.DataFrame, url_column: str = 'url') -> pd.DataFrame:\n \"\"\"Process a dataset and extract URL features\"\"\"\n if url_column not in df.columns:\n print(f\" \u26a0 No '{url_column}' column found\")\n return df\n \n try:\n # Extract URL features\n url_features = df[url_column].apply(lambda x: self.url_extractor.extract(x))\n url_df = pd.DataFrame(url_features.tolist())\n \n # Drop non-numeric 'tld' column before renaming\n if 'tld' in url_df.columns:\n url_df = url_df.drop(columns=['tld'])\n \n # Rename columns with url_ prefix\n url_df.columns = [f\"url_{c}\" for c in url_df.columns]\n \n # Combine with original features (drop original url column to avoid issues)\n result_df = df.drop(columns=[url_column]).reset_index(drop=True)\n result = pd.concat([result_df, url_df.reset_index(drop=True)], axis=1)\n \n return result\n except Exception as e:\n print(f\" \u26a0 URL feature extraction error: {e}\")\n return df\n \n def prepare_for_training(self, df: pd.DataFrame, label_column: str = 'label') -> tuple:\n \"\"\"Prepare features for model training\"\"\"\n df = df.copy()\n \n # Find label column (case insensitive, multiple names)\n label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n actual_label_col = None\n for col in df.columns:\n if col.lower() in [lc.lower() for lc in label_candidates]:\n actual_label_col = col\n break\n \n # Separate features and labels\n if actual_label_col:\n y = df[actual_label_col]\n X = df.drop(columns=[actual_label_col])\n else:\n y = None\n X = df\n \n # Select numeric columns only\n numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n bool_cols = X.select_dtypes(include=[bool]).columns.tolist()\n \n X_numeric = X[numeric_cols].fillna(0)\n \n # Convert boolean to int\n for col in bool_cols:\n if col in X.columns:\n X_numeric[col] = X[col].astype(int)\n \n self.feature_names = X_numeric.columns.tolist()\n \n # Encode labels if present\n if y is not None:\n if y.dtype == 'object':\n y = self.label_encoder.fit_transform(y)\n else:\n y = y.values\n \n return X_numeric, y\\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  ]
460
  },
461
  {
 
478
  "if manifest_path.exists():\n",
479
  " with open(manifest_path) as f:\n",
480
  " manifest = json.load(f)\n",
481
+ " print(f\"\u2713 Loaded manifest with {len(manifest)} datasets\")\n",
482
  "else:\n",
483
+ " print(\"\u26a0 No manifest found. Run 01_data_acquisition.ipynb first.\")\n",
484
  " manifest = []"
485
  ]
486
  },
 
502
  " path = Path(\"..\") / entry['path']\n",
503
  " \n",
504
  " if not path.exists():\n",
505
+ " print(f\" \u26a0 {name}: File not found\")\n",
506
  " continue\n",
507
  " \n",
508
  " print(f\" Processing: {name}\")\n",
 
526
  " 'n_features': len(pipeline.feature_names)\n",
527
  " }\n",
528
  " \n",
529
+ " print(f\" \u2713 {len(X)} samples, {len(pipeline.feature_names)} features\")\n",
530
  " \n",
531
  " feature_stats.append({\n",
532
  " 'name': name,\n",
 
536
  " })\n",
537
  " \n",
538
  " except Exception as e:\n",
539
+ " print(f\" \u26a0 Error: {e}\")\n",
540
  "\n",
541
+ "print(f\"\\n\u2713 Processed {len(processed_datasets)} datasets\")"
542
  ]
543
  },
544
  {
 
583
  " 'has_labels': data['y'] is not None\n",
584
  " })\n",
585
  " \n",
586
+ " print(f\" \u2713 Saved: {output_path.name}\")\n",
587
  "\n",
588
  "# Save feature manifest\n",
589
  "manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n",
 
594
  "pipeline_path = FEATURES_DIR / \"feature_pipeline.pkl\"\n",
595
  "joblib.dump(pipeline, pipeline_path)\n",
596
  "\n",
597
+ "print(f\"\\n\u2713 Feature manifest saved to: {manifest_path}\")\n",
598
+ "print(f\"\u2713 Feature pipeline saved to: {pipeline_path}\")"
599
  ]
600
  },
601
  {
 
621
  "total_features = max(d['n_features'] for d in processed_datasets.values()) if processed_datasets else 0\n",
622
  "\n",
623
  "print(f\"\"\"\n",
624
+ "\ud83d\udd27 Feature Engineering Summary:\n",
625
  " - Datasets processed: {len(processed_datasets)}\n",
626
  " - Total samples: {total_samples:,}\n",
627
  " - Max features: {total_features}\n",
628
  " - Output directory: {FEATURES_DIR}\n",
629
  "\n",
630
+ "\ud83d\udcca Feature Categories:\n",
631
  " - URL Features: Domain, path, security indicators\n",
632
  " - Network Features: Request patterns, status codes\n",
633
  " - Security Headers: CSP, HSTS, X-Frame-Options\n",
634
  " - JavaScript: Console logs, suspicious APIs\n",
635
  "\n",
636
+ "\ud83d\udcc1 Datasets Ready for Training:\"\"\")\n",
637
  "\n",
638
  "for entry in feature_manifest:\n",
639
+ " print(f\" \u2713 {entry['name']}: {entry['samples']:,} samples, {entry['features']} features\")\n",
640
  "\n",
641
  "print(f\"\"\"\n",
642
  "Next step:\n",
643
+ " \u2192 03_model_training.ipynb\n",
644
  "\"\"\")\n",
645
  "print(\"=\" * 60)"
646
  ]
 
653
  },
654
  "nbformat": 4,
655
  "nbformat_minor": 5
656
+ }