Che237 committed on
Commit
f513f82
·
verified ·
1 Parent(s): 117c333

Fix CyberForgeFeaturePipeline class definition

Browse files
notebooks/02_feature_engineering.ipynb CHANGED
@@ -455,7 +455,125 @@
455
  "metadata": {},
456
  "outputs": [],
457
  "source": [
458
- " def process_dataset(self, df: pd.DataFrame, url_column: str = 'url') -> pd.DataFrame:\n \"\"\"Process a dataset and extract URL features\"\"\"\n if url_column not in df.columns:\n print(f\" \u26a0 No '{url_column}' column found\")\n return df\n \n try:\n # Extract URL features\n url_features = df[url_column].apply(lambda x: self.url_extractor.extract(x))\n url_df = pd.DataFrame(url_features.tolist())\n \n # Drop non-numeric 'tld' column before renaming\n if 'tld' in url_df.columns:\n url_df = url_df.drop(columns=['tld'])\n \n # Rename columns with url_ prefix\n url_df.columns = [f\"url_{c}\" for c in url_df.columns]\n \n # Combine with original features (drop original url column to avoid issues)\n result_df = df.drop(columns=[url_column]).reset_index(drop=True)\n result = pd.concat([result_df, url_df.reset_index(drop=True)], axis=1)\n \n return result\n except Exception as e:\n print(f\" \u26a0 URL feature extraction error: {e}\")\n return df\n \n def prepare_for_training(self, df: pd.DataFrame, label_column: str = 'label') -> tuple:\n \"\"\"Prepare features for model training\"\"\"\n df = df.copy()\n \n # Find label column (case insensitive, multiple names)\n label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n actual_label_col = None\n for col in df.columns:\n if col.lower() in [lc.lower() for lc in label_candidates]:\n actual_label_col = col\n break\n \n # Separate features and labels\n if actual_label_col:\n y = df[actual_label_col]\n X = df.drop(columns=[actual_label_col])\n else:\n y = None\n X = df\n \n # Select numeric columns only\n numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n bool_cols = X.select_dtypes(include=[bool]).columns.tolist()\n \n X_numeric = X[numeric_cols].fillna(0)\n \n # Convert boolean to int\n for col in bool_cols:\n if col in X.columns:\n X_numeric[col] = X[col].astype(int)\n \n self.feature_names = 
X_numeric.columns.tolist()\n \n # Encode labels if present\n if y is not None:\n if y.dtype == 'object':\n y = self.label_encoder.fit_transform(y)\n else:\n y = y.values\n \n return X_numeric, y\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
459
  ]
460
  },
461
  {
 
455
  "metadata": {},
456
  "outputs": [],
457
  "source": [
458
+ "class CyberForgeFeaturePipeline:\n",
459
+ " \"\"\"\n",
460
+ " Unified feature extraction pipeline.\n",
461
+ " Combines all extractors for complete feature engineering.\n",
462
+ " \"\"\"\n",
463
+ " \n",
464
+ " def __init__(self):\n",
465
+ " self.url_extractor = url_extractor\n",
466
+ " self.network_extractor = network_extractor\n",
467
+ " self.header_extractor = header_extractor\n",
468
+ " self.js_extractor = js_extractor\n",
469
+ " self.scaler = StandardScaler()\n",
470
+ " self.label_encoder = LabelEncoder()\n",
471
+ " self.feature_names = []\n",
472
+ " \n",
473
+ " def extract_website_features(self, scraped_data: Dict) -> Dict[str, Any]:\n",
474
+ " \"\"\"Extract all features from website scraped data\"\"\"\n",
475
+ " features = {}\n",
476
+ " \n",
477
+ " # URL features\n",
478
+ " url_features = self.url_extractor.extract(scraped_data.get('url', ''))\n",
479
+ " features.update({f\"url_{k}\": v for k, v in url_features.items() if k != 'tld'})\n",
480
+ " \n",
481
+ " # Network features\n",
482
+ " network_features = self.network_extractor.extract_from_requests(\n",
483
+ " scraped_data.get('network_requests', [])\n",
484
+ " )\n",
485
+ " features.update({f\"net_{k}\": v for k, v in network_features.items()})\n",
486
+ " \n",
487
+ " # Security header features\n",
488
+ " header_features = self.header_extractor.extract(\n",
489
+ " scraped_data.get('response_headers', {}),\n",
490
+ " scraped_data.get('security_report', {})\n",
491
+ " )\n",
492
+ " features.update({f\"sec_{k}\": v for k, v in header_features.items()})\n",
493
+ " \n",
494
+ " # JavaScript features\n",
495
+ " js_features = self.js_extractor.extract_from_console_logs(\n",
496
+ " scraped_data.get('console_logs', [])\n",
497
+ " )\n",
498
+ " features.update({f\"js_{k}\": v for k, v in js_features.items()})\n",
499
+ " \n",
500
+ " # Calculate risk score\n",
501
+ " features['security_score'] = self.header_extractor.calculate_security_score(header_features)\n",
502
+ " \n",
503
+ " return features\n",
504
+ " \n",
505
+ " def process_dataset(self, df: pd.DataFrame, url_column: str = 'url') -> pd.DataFrame:\n",
506
+ " \"\"\"Process a dataset and extract URL features\"\"\"\n",
507
+ " if url_column not in df.columns:\n",
508
+ " print(f\" Warning: No '{url_column}' column found\")\n",
509
+ " return df\n",
510
+ " \n",
511
+ " try:\n",
512
+ " # Extract URL features\n",
513
+ " url_features = df[url_column].apply(lambda x: self.url_extractor.extract(x))\n",
514
+ " url_df = pd.DataFrame(url_features.tolist())\n",
515
+ " \n",
516
+ " # Drop non-numeric 'tld' column before renaming\n",
517
+ " if 'tld' in url_df.columns:\n",
518
+ " url_df = url_df.drop(columns=['tld'])\n",
519
+ " \n",
520
+ " # Rename columns with url_ prefix\n",
521
+ " url_df.columns = [f\"url_{c}\" for c in url_df.columns]\n",
522
+ " \n",
523
+ " # Combine with original features (drop original url column to avoid issues)\n",
524
+ " result_df = df.drop(columns=[url_column]).reset_index(drop=True)\n",
525
+ " result = pd.concat([result_df, url_df.reset_index(drop=True)], axis=1)\n",
526
+ " \n",
527
+ " return result\n",
528
+ " except Exception as e:\n",
529
+ " print(f\" Warning: URL feature extraction error: {e}\")\n",
530
+ " return df\n",
531
+ " \n",
532
+ " def prepare_for_training(self, df: pd.DataFrame, label_column: str = 'label') -> tuple:\n",
533
+ " \"\"\"Prepare features for model training\"\"\"\n",
534
+ " df = df.copy()\n",
535
+ " \n",
536
+ " # Find label column (case insensitive, multiple names)\n",
537
+ " label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n",
538
+ " 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n",
539
+ " actual_label_col = None\n",
540
+ " for col in df.columns:\n",
541
+ " if col.lower() in [lc.lower() for lc in label_candidates]:\n",
542
+ " actual_label_col = col\n",
543
+ " break\n",
544
+ " \n",
545
+ " # Separate features and labels\n",
546
+ " if actual_label_col:\n",
547
+ " y = df[actual_label_col]\n",
548
+ " X = df.drop(columns=[actual_label_col])\n",
549
+ " else:\n",
550
+ " y = None\n",
551
+ " X = df\n",
552
+ " \n",
553
+ " # Select numeric columns only\n",
554
+ " numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n",
555
+ " bool_cols = X.select_dtypes(include=[bool]).columns.tolist()\n",
556
+ " \n",
557
+ " X_numeric = X[numeric_cols].fillna(0)\n",
558
+ " \n",
559
+ " # Convert boolean to int\n",
560
+ " for col in bool_cols:\n",
561
+ " if col in X.columns:\n",
562
+ " X_numeric[col] = X[col].astype(int)\n",
563
+ " \n",
564
+ " self.feature_names = X_numeric.columns.tolist()\n",
565
+ " \n",
566
+ " # Encode labels if present\n",
567
+ " if y is not None:\n",
568
+ " if y.dtype == 'object':\n",
569
+ " y = self.label_encoder.fit_transform(y)\n",
570
+ " else:\n",
571
+ " y = y.values\n",
572
+ " \n",
573
+ " return X_numeric, y\n",
574
+ "\n",
575
+ "pipeline = CyberForgeFeaturePipeline()\n",
576
+ "print(\"Feature Pipeline initialized\")\n"
577
  ]
578
  },
579
  {