Che237 committed on
Commit
eb4b92c
·
verified ·
1 Parent(s): a49ca35

Fix syntax error - remove literal backslash-n

Browse files
notebooks/02_feature_engineering.ipynb CHANGED
@@ -455,7 +455,7 @@
455
  "metadata": {},
456
  "outputs": [],
457
  "source": [
458
- " def process_dataset(self, df: pd.DataFrame, url_column: str = 'url') -> pd.DataFrame:\n \"\"\"Process a dataset and extract URL features\"\"\"\n if url_column not in df.columns:\n print(f\" \u26a0 No '{url_column}' column found\")\n return df\n \n try:\n # Extract URL features\n url_features = df[url_column].apply(lambda x: self.url_extractor.extract(x))\n url_df = pd.DataFrame(url_features.tolist())\n \n # Drop non-numeric 'tld' column before renaming\n if 'tld' in url_df.columns:\n url_df = url_df.drop(columns=['tld'])\n \n # Rename columns with url_ prefix\n url_df.columns = [f\"url_{c}\" for c in url_df.columns]\n \n # Combine with original features (drop original url column to avoid issues)\n result_df = df.drop(columns=[url_column]).reset_index(drop=True)\n result = pd.concat([result_df, url_df.reset_index(drop=True)], axis=1)\n \n return result\n except Exception as e:\n print(f\" \u26a0 URL feature extraction error: {e}\")\n return df\n \n def prepare_for_training(self, df: pd.DataFrame, label_column: str = 'label') -> tuple:\n \"\"\"Prepare features for model training\"\"\"\n df = df.copy()\n \n # Find label column (case insensitive, multiple names)\n label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n actual_label_col = None\n for col in df.columns:\n if col.lower() in [lc.lower() for lc in label_candidates]:\n actual_label_col = col\n break\n \n # Separate features and labels\n if actual_label_col:\n y = df[actual_label_col]\n X = df.drop(columns=[actual_label_col])\n else:\n y = None\n X = df\n \n # Select numeric columns only\n numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n bool_cols = X.select_dtypes(include=[bool]).columns.tolist()\n \n X_numeric = X[numeric_cols].fillna(0)\n \n # Convert boolean to int\n for col in bool_cols:\n if col in X.columns:\n X_numeric[col] = X[col].astype(int)\n \n self.feature_names = 
X_numeric.columns.tolist()\n \n # Encode labels if present\n if y is not None:\n if y.dtype == 'object':\n y = self.label_encoder.fit_transform(y)\n else:\n y = y.values\n \n return X_numeric, y\\n"
459
  ]
460
  },
461
  {
 
455
  "metadata": {},
456
  "outputs": [],
457
  "source": [
458
+ " def process_dataset(self, df: pd.DataFrame, url_column: str = 'url') -> pd.DataFrame:\n \"\"\"Process a dataset and extract URL features\"\"\"\n if url_column not in df.columns:\n print(f\" \u26a0 No '{url_column}' column found\")\n return df\n \n try:\n # Extract URL features\n url_features = df[url_column].apply(lambda x: self.url_extractor.extract(x))\n url_df = pd.DataFrame(url_features.tolist())\n \n # Drop non-numeric 'tld' column before renaming\n if 'tld' in url_df.columns:\n url_df = url_df.drop(columns=['tld'])\n \n # Rename columns with url_ prefix\n url_df.columns = [f\"url_{c}\" for c in url_df.columns]\n \n # Combine with original features (drop original url column to avoid issues)\n result_df = df.drop(columns=[url_column]).reset_index(drop=True)\n result = pd.concat([result_df, url_df.reset_index(drop=True)], axis=1)\n \n return result\n except Exception as e:\n print(f\" \u26a0 URL feature extraction error: {e}\")\n return df\n \n def prepare_for_training(self, df: pd.DataFrame, label_column: str = 'label') -> tuple:\n \"\"\"Prepare features for model training\"\"\"\n df = df.copy()\n \n # Find label column (case insensitive, multiple names)\n label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n actual_label_col = None\n for col in df.columns:\n if col.lower() in [lc.lower() for lc in label_candidates]:\n actual_label_col = col\n break\n \n # Separate features and labels\n if actual_label_col:\n y = df[actual_label_col]\n X = df.drop(columns=[actual_label_col])\n else:\n y = None\n X = df\n \n # Select numeric columns only\n numeric_cols = X.select_dtypes(include=[np.number]).columns.tolist()\n bool_cols = X.select_dtypes(include=[bool]).columns.tolist()\n \n X_numeric = X[numeric_cols].fillna(0)\n \n # Convert boolean to int\n for col in bool_cols:\n if col in X.columns:\n X_numeric[col] = X[col].astype(int)\n \n self.feature_names = 
X_numeric.columns.tolist()\n \n # Encode labels if present\n if y is not None:\n if y.dtype == 'object':\n y = self.label_encoder.fit_transform(y)\n else:\n y = y.values\n \n return X_numeric, y\n"
459
  ]
460
  },
461
  {