Che237 committed on
Commit
48bf346
·
verified ·
1 Parent(s): f513f82

Fix broken newlines in print statements

Browse files
Files changed (1) hide show
  1. notebooks/03_model_training.ipynb +39 -3
notebooks/03_model_training.ipynb CHANGED
@@ -80,9 +80,45 @@
80
  "metadata": {},
81
  "outputs": [],
82
  "source": [
83
- "# Load feature manifest\nfeature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n\nif feature_manifest_path.exists():\n with open(feature_manifest_path) as f:\n feature_manifest = json.load(f)\n print(f\"\u2713 Loaded {len(feature_manifest)} feature datasets\")\nelse:\n print(\"\u26a0 No feature manifest. Run 02_feature_engineering.ipynb first.\")\n feature_manifest = []\n\n# Load datasets - be more lenient with label detection\ndatasets = {}\nprint(\"\n",
84
- "Loading feature datasets:\")\n\nfor entry in feature_manifest:\n name = entry['name']\n path = Path(\"..\") / entry['path']\n \n if path.exists():\n try:\n df = pd.read_parquet(path)\n \n # Check for label column with multiple possible names\n label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n has_label = any(col.lower() in [lc.lower() for lc in label_candidates] for col in df.columns)\n \n # Even without explicit labels, we can use for training (create synthetic labels based on dataset name)\n datasets[name] = df\n label_status = \"with labels\" if has_label else \"(no explicit labels - will create)\"\n print(f\" \u2713 {name}: {len(df)} samples, {len(df.columns)} features {label_status}\")\n except Exception as e:\n print(f\" \u26a0 {name}: Error loading - {e}\")\n else:\n print(f\" \u26a0 {name}: File not found\")\n\nprint(f\"\n",
85
- "\u2713 Loaded {len(datasets)} datasets for training\")\n"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  ]
87
  },
88
  {
 
80
  "metadata": {},
81
  "outputs": [],
82
  "source": [
83
+ "# Load feature manifest\n",
84
+ "feature_manifest_path = FEATURES_DIR / \"feature_manifest.json\"\n",
85
+ "\n",
86
+ "if feature_manifest_path.exists():\n",
87
+ " with open(feature_manifest_path) as f:\n",
88
+ " feature_manifest = json.load(f)\n",
89
+ " print(f\"\u2713 Loaded {len(feature_manifest)} feature datasets\")\n",
90
+ "else:\n",
91
+ " print(\"\u26a0 No feature manifest. Run 02_feature_engineering.ipynb first.\")\n",
92
+ " feature_manifest = []\n",
93
+ "\n",
94
+ "# Load datasets - be more lenient with label detection\n",
95
+ "datasets = {}\n",
96
+ "print(\"\\nLoading feature datasets:\")\n",
97
+ "\n",
98
+ "for entry in feature_manifest:\n",
99
+ " name = entry['name']\n",
100
+ " path = Path(\"..\") / entry['path']\n",
101
+ " \n",
102
+ " if path.exists():\n",
103
+ " try:\n",
104
+ " df = pd.read_parquet(path)\n",
105
+ " \n",
106
+ " # Check for label column with multiple possible names\n",
107
+ " label_candidates = ['label', 'target', 'class', 'is_malicious', 'attack_type', \n",
108
+ " 'attack', 'category', 'malware', 'phishing', 'threat', 'type', 'y']\n",
109
+ " has_label = any(col.lower() in [lc.lower() for lc in label_candidates] for col in df.columns)\n",
110
+ " \n",
111
+ " # Even without explicit labels, we can use for training\n",
112
+ " datasets[name] = df\n",
113
+ " label_status = \"with labels\" if has_label else \"(no explicit labels - will create)\"\n",
114
+ " print(f\" \u2713 {name}: {len(df)} samples, {len(df.columns)} features {label_status}\")\n",
115
+ " except Exception as e:\n",
116
+ " print(f\" \u26a0 {name}: Error loading - {e}\")\n",
117
+ " else:\n",
118
+ " print(f\" \u26a0 {name}: File not found\")\n",
119
+ "\n",
120
+ "print(f\"\\n\u2713 Loaded {len(datasets)} datasets for training\")\n",
121
+ "\n"
122
  ]
123
  },
124
  {