Spaces:

Che237
/

cyberforge

Sleeping

App Files Files Community

Che237 commited on Jan 31

Commit

2fecf00

verified ·

1 Parent(s): 460801e

Fix validation and normalization logic

Browse files

Files changed (1) hide show

notebooks/01_data_acquisition.ipynb +32 -22

notebooks/01_data_acquisition.ipynb CHANGED Viewed

@@ -352,28 +352,32 @@
    "source": [
     "def validate_dataset(name: str, df: pd.DataFrame) -> Dict[str, Any]:\n",
     "    \"\"\"Validate dataset quality and return report\"\"\"\n",
     "    report = {\n",
     "        \"name\": name,\n",
     "        \"samples\": len(df),\n",
     "        \"features\": len(df.columns),\n",
     "        \"missing_values\": df.isnull().sum().sum(),\n",
-    "        \"missing_pct\": (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100,\n",
     "        \"duplicate_rows\": df.duplicated().sum(),\n",
     "        \"numeric_columns\": len(df.select_dtypes(include=[np.number]).columns),\n",
     "        \"categorical_columns\": len(df.select_dtypes(include=['object', 'category']).columns),\n",
     "        \"memory_mb\": df.memory_usage(deep=True).sum() / (1024 * 1024),\n",
-    "        \"has_label\": any(col in df.columns for col in ['label', 'target', 'class', 'is_malicious', 'attack_type']),\n",
     "        \"valid\": True\n",
     "    }\n",
     "    \n",
-    "    # Validation checks\n",
     "    issues = []\n",
-    "    if report[\"samples\"] < 100:\n",
-    "        issues.append(\"Too few samples (<100)\")\n",
-    "    if report[\"missing_pct\"] > 50:\n",
-    "        issues.append(\"Too many missing values (>50%)\")\n",
-    "    if not report[\"has_label\"]:\n",
-    "        issues.append(\"No label column found\")\n",
     "    \n",
     "    report[\"issues\"] = issues\n",
     "    report[\"valid\"] = len(issues) == 0\n",
@@ -393,7 +397,7 @@
     "    print(f\"{name:<30} {report['samples']:>10} {report['features']:>10} {report['missing_pct']:>9.2f}% {status:>8}\")\n",
     "\n",
     "valid_datasets = [r[\"name\"] for r in validation_reports if r[\"valid\"]]\n",
-    "print(f\"\\n\u2713 {len(valid_datasets)} datasets passed validation\")"
    ]
   },
   {
@@ -418,16 +422,20 @@
     "    # Standardize column names\n",
     "    df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')\n",
     "    \n",
-    "    # Find and standardize label column\n",
-    "    label_columns = ['label', 'target', 'class', 'is_malicious', 'attack_type', 'attack', 'category']\n",
     "    for col in label_columns:\n",
-    "        if col in df.columns:\n",
-    "            df = df.rename(columns={col: 'label'})\n",
     "            break\n",
     "    \n",
     "    # Handle missing values\n",
     "    numeric_cols = df.select_dtypes(include=[np.number]).columns\n",
-    "    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())\n",
     "    \n",
     "    categorical_cols = df.select_dtypes(include=['object', 'category']).columns\n",
     "    for col in categorical_cols:\n",
@@ -443,17 +451,19 @@
     "    \n",
     "    return df\n",
     "\n",
-    "# Normalize all valid datasets\n",
     "normalized_datasets = {}\n",
     "print(\"Normalizing datasets...\")\n",
     "\n",
-    "for name in valid_datasets:\n",
-    "    if name in loaded_datasets:\n",
-    "        df = normalize_dataset(loaded_datasets[name], name)\n",
-    "        normalized_datasets[name] = df\n",
-    "        print(f\"  \u2713 {name}: {len(df)} samples after normalization\")\n",
     "\n",
-    "print(f\"\\n\u2713 Normalized {len(normalized_datasets)} datasets\")"
    ]
   },
   {

    "source": [
     "def validate_dataset(name: str, df: pd.DataFrame) -> Dict[str, Any]:\n",
     "    \"\"\"Validate dataset quality and return report\"\"\"\n",
+    "    # Expanded label column detection\n",
+    "    label_columns = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n",
+    "                     'attack', 'category', 'malware', 'phishing', 'threat', \n",
+    "                     'classification', 'type', 'result', 'output', 'y']\n",
+    "    has_label = any(col.lower() in [c.lower() for c in df.columns] for col in label_columns)\n",
+    "    \n",
     "    report = {\n",
     "        \"name\": name,\n",
     "        \"samples\": len(df),\n",
     "        \"features\": len(df.columns),\n",
     "        \"missing_values\": df.isnull().sum().sum(),\n",
+    "        \"missing_pct\": (df.isnull().sum().sum() / (len(df) * len(df.columns))) * 100 if len(df) > 0 else 0,\n",
     "        \"duplicate_rows\": df.duplicated().sum(),\n",
     "        \"numeric_columns\": len(df.select_dtypes(include=[np.number]).columns),\n",
     "        \"categorical_columns\": len(df.select_dtypes(include=['object', 'category']).columns),\n",
     "        \"memory_mb\": df.memory_usage(deep=True).sum() / (1024 * 1024),\n",
+    "        \"has_label\": has_label,\n",
     "        \"valid\": True\n",
     "    }\n",
     "    \n",
+    "    # More lenient validation - only fail on critical issues\n",
     "    issues = []\n",
+    "    if report[\"samples\"] < 10:\n",
+    "        issues.append(\"Too few samples (<10)\")\n",
+    "    if report[\"missing_pct\"] > 80:\n",
+    "        issues.append(\"Too many missing values (>80%)\")\n",
     "    \n",
     "    report[\"issues\"] = issues\n",
     "    report[\"valid\"] = len(issues) == 0\n",
     "    print(f\"{name:<30} {report['samples']:>10} {report['features']:>10} {report['missing_pct']:>9.2f}% {status:>8}\")\n",
     "\n",
     "valid_datasets = [r[\"name\"] for r in validation_reports if r[\"valid\"]]\n",
+    "print(f\"\\n\u2713 {len(valid_datasets)} datasets passed validation\")\n"
    ]
   },
   {
     "    # Standardize column names\n",
     "    df.columns = df.columns.str.lower().str.replace(' ', '_').str.replace('-', '_')\n",
     "    \n",
+    "    # Find and standardize label column (expanded list)\n",
+    "    label_columns = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n",
+    "                     'attack', 'category', 'malware', 'phishing', 'threat',\n",
+    "                     'classification', 'type', 'result', 'output', 'y']\n",
     "    for col in label_columns:\n",
+    "        matching_cols = [c for c in df.columns if c.lower() == col.lower()]\n",
+    "        if matching_cols:\n",
+    "            df = df.rename(columns={matching_cols[0]: 'label'})\n",
     "            break\n",
     "    \n",
     "    # Handle missing values\n",
     "    numeric_cols = df.select_dtypes(include=[np.number]).columns\n",
+    "    if len(numeric_cols) > 0:\n",
+    "        df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())\n",
     "    \n",
     "    categorical_cols = df.select_dtypes(include=['object', 'category']).columns\n",
     "    for col in categorical_cols:\n",
     "    \n",
     "    return df\n",
     "\n",
+    "# Normalize ALL loaded datasets (not just valid ones)\n",
     "normalized_datasets = {}\n",
     "print(\"Normalizing datasets...\")\n",
     "\n",
+    "for name, df in loaded_datasets.items():\n",
+    "    try:\n",
+    "        normalized_df = normalize_dataset(df, name)\n",
+    "        normalized_datasets[name] = normalized_df\n",
+    "        print(f\"  \u2713 {name}: {len(normalized_df)} samples after normalization\")\n",
+    "    except Exception as e:\n",
+    "        print(f\"  \u26a0 {name}: Error during normalization - {e}\")\n",
     "\n",
+    "print(f\"\\n\u2713 Normalized {len(normalized_datasets)} datasets\")\n"
    ]
   },
   {