Che237 committed on
Commit
f5fe12d
·
verified ·
1 Parent(s): 0955fe4

Add notebook 08: train all models + upload to Che237/cyberforge-models

Browse files
Files changed (1) hide show
  1. notebooks/08_upload_to_hub.ipynb +344 -0
notebooks/08_upload_to_hub.ipynb ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "nbformat": 4,
3
+ "nbformat_minor": 5,
4
+ "metadata": {"kernelspec": {"display_name": "Python 3","language": "python","name": "python3"},"language_info": {"name": "python","version": "3.11.0"}},
5
+ "cells": [
6
+ {
7
+ "cell_type": "markdown",
8
+ "metadata": {},
9
+ "source": ["# 08 - Upload Trained Models to HuggingFace Hub\n\nTrains all 4 CyberForge models from scratch (or loads existing ones) then uploads to `Che237/cyberforge-models`."]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": null,
14
+ "metadata": {},
15
+ "outputs": [],
16
+ "source": [
17
+ "import os, json, joblib, logging\n",
18
+ "import numpy as np\n",
19
+ "import pandas as pd\n",
20
+ "from pathlib import Path\n",
21
+ "from datetime import datetime\n",
22
+ "from sklearn.ensemble import GradientBoostingClassifier, IsolationForest\n",
23
+ "from sklearn.preprocessing import StandardScaler\n",
24
+ "from sklearn.model_selection import train_test_split\n",
25
+ "from sklearn.metrics import accuracy_score, f1_score\n",
26
+ "from huggingface_hub import HfApi, create_repo\n",
27
+ "\n",
28
+ "logging.basicConfig(level=logging.INFO, format='%(levelname)s | %(message)s')\n",
29
+ "log = logging.getLogger(__name__)\n",
30
+ "\n",
31
+ "HF_TOKEN = os.environ.get('HF_TOKEN', '')\n",
32
+ "MODEL_REPO = 'Che237/cyberforge-models'\n",
33
+ "NB_DIR = Path('.').absolute()\n",
34
+ "MODELS_DIR = NB_DIR.parent / 'models'\n",
35
+ "DATASETS = NB_DIR.parent / 'datasets'\n",
36
+ "UPLOAD_DIR = NB_DIR.parent / 'trained_models'\n",
37
+ "UPLOAD_DIR.mkdir(exist_ok=True)\n",
38
+ "\n",
39
+ "FEATURE_NAMES = [\n",
40
+ " 'url_length','hostname_length','path_length','is_https',\n",
41
+ " 'has_ip_address','has_suspicious_tld','subdomain_count',\n",
42
+ " 'has_port','query_params_count','has_at_symbol',\n",
43
+ " 'has_double_slash','special_char_count'\n",
44
+ "]\n",
45
+ "rng = np.random.default_rng(42)\n",
46
+ "print(f'Working dir: {NB_DIR}')\n",
47
+ "print(f'Models dir: {MODELS_DIR} (exists={MODELS_DIR.exists()})')\n",
48
+ "print(f'Upload dir: {UPLOAD_DIR}')\n",
49
+ "print(f'HF_TOKEN set: {bool(HF_TOKEN)}')"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": null,
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "# ── Synthetic data generators ───────────────────────────────────────────────\n",
59
+ "def synth_benign(n=1500):\n",
60
+ " d = {\n",
61
+ " 'url_length': rng.integers(15, 60, n),\n",
62
+ " 'hostname_length': rng.integers(5, 25, n),\n",
63
+ " 'path_length': rng.integers(0, 30, n),\n",
64
+ " 'is_https': rng.choice([1,1,1,0], n),\n",
65
+ " 'has_ip_address': rng.choice([0,0,0,0,1], n),\n",
66
+ " 'has_suspicious_tld': rng.choice([0,0,0,1], n),\n",
67
+ " 'subdomain_count': rng.integers(0, 2, n),\n",
68
+ " 'has_port': rng.choice([0,0,0,1], n),\n",
69
+ " 'query_params_count': rng.integers(0, 3, n),\n",
70
+ " 'has_at_symbol': rng.choice([0,0,0,0,1], n),\n",
71
+ " 'has_double_slash': rng.choice([0,0,0,1], n),\n",
72
+ " 'special_char_count': rng.integers(0, 4, n),\n",
73
+ " }\n",
74
+ " return pd.DataFrame(d), np.zeros(n, dtype=int)\n",
75
+ "\n",
76
+ "def synth_malicious(n=1500):\n",
77
+ " d = {\n",
78
+ " 'url_length': rng.integers(60, 300, n),\n",
79
+ " 'hostname_length': rng.integers(20, 80, n),\n",
80
+ " 'path_length': rng.integers(10, 120, n),\n",
81
+ " 'is_https': rng.choice([1,0,0], n),\n",
82
+ " 'has_ip_address': rng.choice([0,0,1,1], n),\n",
83
+ " 'has_suspicious_tld': rng.choice([0,1,1,1], n),\n",
84
+ " 'subdomain_count': rng.integers(1, 5, n),\n",
85
+ " 'has_port': rng.choice([0,0,1,1], n),\n",
86
+ " 'query_params_count': rng.integers(2, 10, n),\n",
87
+ " 'has_at_symbol': rng.choice([0,0,0,1,1], n),\n",
88
+ " 'has_double_slash': rng.choice([0,0,1,1], n),\n",
89
+ " 'special_char_count': rng.integers(5, 25, n),\n",
90
+ " }\n",
91
+ " return pd.DataFrame(d), np.ones(n, dtype=int)\n",
92
+ "\n",
93
+ "print('✓ Synthetic data generators ready')"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": null,
99
+ "metadata": {},
100
+ "outputs": [],
101
+ "source": [
102
+ "# ── Load real phishing dataset ───────────────────────────────────────────────\n",
103
+ "def load_phishing():\n",
104
+ " csv_path = DATASETS / 'phishing_detection' / 'phishing_detection_processed.csv'\n",
105
+ " X_b, y_b = synth_benign(2000)\n",
106
+ " X_m, y_m = synth_malicious(2000)\n",
107
+ " if csv_path.exists():\n",
108
+ " df = pd.read_csv(csv_path)\n",
109
+ " mapped = pd.DataFrame()\n",
110
+ " mapped['url_length'] = df.get('url_length', rng.integers(15,200,len(df)))\n",
111
+ " mapped['hostname_length'] = (df.get('url_length',40)*0.3).astype(int)\n",
112
+ " mapped['path_length'] = (df.get('url_length',40)*0.4).astype(int)\n",
113
+ " mapped['is_https'] = df.get('https',1)\n",
114
+ " mapped['has_ip_address'] = rng.integers(0,2,len(df))\n",
115
+ " mapped['has_suspicious_tld'] = (df.get('suspicious_words',0)>3).astype(int)\n",
116
+ " mapped['subdomain_count'] = df.get('subdomain_level',rng.integers(0,3,len(df)))\n",
117
+ " mapped['has_port'] = rng.choice([0,1],len(df),p=[0.85,0.15])\n",
118
+ " mapped['query_params_count'] = rng.integers(0,6,len(df))\n",
119
+ " mapped['has_at_symbol'] = rng.choice([0,1],len(df),p=[0.9,0.1])\n",
120
+ " mapped['has_double_slash'] = rng.choice([0,1],len(df),p=[0.85,0.15])\n",
121
+ " mapped['special_char_count'] = df.get('suspicious_words',rng.integers(0,15,len(df)))\n",
122
+ " y_real = df['is_phishing'].values\n",
123
+ " X = pd.concat([mapped, X_b, X_m], ignore_index=True)\n",
124
+ " y = np.concatenate([y_real, y_b, y_m])\n",
125
+ " print(f'Phishing: {len(X)} samples (real CSV + synthetic)')\n",
126
+ " else:\n",
127
+ " X = pd.concat([X_b, X_m], ignore_index=True)\n",
128
+ " y = np.concatenate([y_b, y_m])\n",
129
+ " print(f'Phishing: {len(X)} samples (synthetic only)')\n",
130
+ " return X, y\n",
131
+ "\n",
132
+ "def load_malware():\n",
133
+ " X_b, y_b = synth_benign(2000)\n",
134
+ " X_m, y_m = synth_malicious(2000)\n",
135
+ " csv_path = DATASETS / 'malware_detection' / 'malware_detection_processed.csv'\n",
136
+ " if csv_path.exists():\n",
137
+ " df = pd.read_csv(csv_path)\n",
138
+ " mapped = pd.DataFrame()\n",
139
+ " mapped['url_length'] = (df.get('file_size',50000)/1000).clip(10,300).astype(int)\n",
140
+ " mapped['hostname_length'] = (df.get('entropy',4)*5).clip(5,40).astype(int)\n",
141
+ " mapped['path_length'] = (df.get('strings_count',500)/100).clip(0,80).astype(int)\n",
142
+ " mapped['is_https'] = rng.choice([0,1],len(df),p=[0.6,0.4])\n",
143
+ " mapped['has_ip_address'] = (df.get('entropy',0)>6).astype(int)\n",
144
+ " mapped['has_suspicious_tld'] = rng.integers(0,2,len(df))\n",
145
+ " mapped['subdomain_count'] = df.get('pe_sections',rng.integers(0,4,len(df))).clip(0,6).astype(int)\n",
146
+ " mapped['has_port'] = rng.choice([0,1],len(df),p=[0.7,0.3])\n",
147
+ " mapped['query_params_count'] = (df.get('exports',0)/20).clip(0,10).astype(int)\n",
148
+ " mapped['has_at_symbol'] = rng.choice([0,1],len(df),p=[0.85,0.15])\n",
149
+ " mapped['has_double_slash'] = rng.choice([0,1],len(df),p=[0.8,0.2])\n",
150
+ " mapped['special_char_count'] = (df.get('entropy',4)*2).clip(0,25).astype(int)\n",
151
+ " y_real = df['is_malware'].values\n",
152
+ " X = pd.concat([mapped, X_b, X_m], ignore_index=True)\n",
153
+ " y = np.concatenate([y_real, y_b, y_m])\n",
154
+ " print(f'Malware: {len(X)} samples (real CSV + synthetic)')\n",
155
+ " else:\n",
156
+ " X = pd.concat([X_b, X_m], ignore_index=True)\n",
157
+ " y = np.concatenate([y_b, y_m])\n",
158
+ " print(f'Malware: {len(X)} samples (synthetic only)')\n",
159
+ " return X, y\n",
160
+ "\n",
161
+ "def load_web_attack():\n",
162
+ " X_b, y_b = synth_benign(2000)\n",
163
+ " X_m, y_m = synth_malicious(2000)\n",
164
+ " X = pd.concat([X_b, X_m], ignore_index=True)\n",
165
+ " y = np.concatenate([y_b, y_m])\n",
166
+ " print(f'WebAttack: {len(X)} samples (synthetic)')\n",
167
+ " return X, y\n",
168
+ "\n",
169
+ "def load_anomaly():\n",
170
+ " X_b, y_b = synth_benign(3000)\n",
171
+ " X_m, y_m = synth_malicious(600)\n",
172
+ " X = pd.concat([X_b, X_m], ignore_index=True)\n",
173
+ " y = np.concatenate([y_b, y_m])\n",
174
+ " print(f'Anomaly: {len(X)} samples (synthetic)')\n",
175
+ " return X, y\n",
176
+ "\n",
177
+ "print('✓ Dataset loaders ready')"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": null,
183
+ "metadata": {},
184
+ "outputs": [],
185
+ "source": [
186
+ "# ── Train one model ──────────────────────────────────────────────────────────\n",
187
+ "def train_model(name, X, y, isolation_forest=False):\n",
188
+ " for col in FEATURE_NAMES:\n",
189
+ " if col not in X.columns:\n",
190
+ " X[col] = 0\n",
191
+ " X = X[FEATURE_NAMES].fillna(0).astype(float)\n",
192
+ "\n",
193
+ " X_tr, X_te, y_tr, y_te = train_test_split(\n",
194
+ " X, y, test_size=0.2, random_state=42, stratify=y\n",
195
+ " )\n",
196
+ " scaler = StandardScaler()\n",
197
+ " X_tr_s = scaler.fit_transform(X_tr)\n",
198
+ " X_te_s = scaler.transform(X_te)\n",
199
+ "\n",
200
+ " if isolation_forest:\n",
201
+ " X_benign = X_tr_s[y_tr == 0]\n",
202
+ " model = IsolationForest(n_estimators=200, contamination=0.1, random_state=42)\n",
203
+ " model.fit(X_benign)\n",
204
+ " preds = model.predict(X_te_s)\n",
205
+ " y_pred = (preds == -1).astype(int)\n",
206
+ " else:\n",
207
+ " model = GradientBoostingClassifier(\n",
208
+ " n_estimators=200, learning_rate=0.1, max_depth=5,\n",
209
+ " subsample=0.8, random_state=42\n",
210
+ " )\n",
211
+ " model.fit(X_tr_s, y_tr)\n",
212
+ " y_pred = model.predict(X_te_s)\n",
213
+ "\n",
214
+ " acc = accuracy_score(y_te, y_pred)\n",
215
+ " f1 = f1_score(y_te, y_pred, zero_division=0)\n",
216
+ "\n",
217
+ " # Save to UPLOAD_DIR (= trained_models/) for app.py to pick up\n",
218
+ " model_dir = UPLOAD_DIR / name\n",
219
+ " model_dir.mkdir(parents=True, exist_ok=True)\n",
220
+ " joblib.dump(model, model_dir / 'best_model.pkl')\n",
221
+ " joblib.dump(scaler, model_dir / 'scaler.pkl')\n",
222
+ " meta = {\n",
223
+ " 'name': name, 'trained_at': datetime.utcnow().isoformat(),\n",
224
+ " 'samples': int(len(X)), 'threat_rate': float(y.mean()),\n",
225
+ " 'accuracy': float(acc), 'f1': float(f1),\n",
226
+ " 'feature_names': FEATURE_NAMES,\n",
227
+ " 'model_type': 'IsolationForest' if isolation_forest else 'GradientBoostingClassifier',\n",
228
+ " }\n",
229
+ " with open(model_dir / 'metadata.json', 'w') as f:\n",
230
+ " json.dump(meta, f, indent=2)\n",
231
+ "\n",
232
+ " print(f' ✓ {name}: acc={acc:.3f} f1={f1:.3f} ({len(X)} samples)')\n",
233
+ " return meta\n",
234
+ "\n",
235
+ "print('✓ Trainer ready — starting training pipeline')"
236
+ ]
237
+ },
238
+ {
239
+ "cell_type": "code",
240
+ "execution_count": null,
241
+ "metadata": {},
242
+ "outputs": [],
243
+ "source": [
244
+ "# ── Run training ─────────────────────────────────────────────────────────────\n",
245
+ "results = {}\n",
246
+ "print('Training phishing_detection...')\n",
247
+ "X, y = load_phishing()\n",
248
+ "results['phishing_detection'] = train_model('phishing_detection', X, y)\n",
249
+ "\n",
250
+ "print('Training malware_detection...')\n",
251
+ "X, y = load_malware()\n",
252
+ "results['malware_detection'] = train_model('malware_detection', X, y)\n",
253
+ "\n",
254
+ "print('Training web_attack_detection...')\n",
255
+ "X, y = load_web_attack()\n",
256
+ "results['web_attack_detection'] = train_model('web_attack_detection', X, y)\n",
257
+ "\n",
258
+ "print('Training anomaly_detection...')\n",
259
+ "X, y = load_anomaly()\n",
260
+ "results['anomaly_detection'] = train_model('anomaly_detection', X, y, isolation_forest=True)\n",
261
+ "\n",
262
+ "print()\n",
263
+ "print('='*50)\n",
264
+ "print('TRAINING COMPLETE')\n",
265
+ "for name, m in results.items():\n",
266
+ " print(f' {name}: acc={m[\"accuracy\"]:.3f} f1={m[\"f1\"]:.3f}')"
267
+ ]
268
+ },
269
+ {
270
+ "cell_type": "code",
271
+ "execution_count": null,
272
+ "metadata": {},
273
+ "outputs": [],
274
+ "source": [
275
+ "# ── Upload to HuggingFace model repo ─────────────────────────────────────────\n",
276
+ "if not HF_TOKEN:\n",
277
+ " print('⚠ HF_TOKEN not set — skipping upload. Models saved locally only.')\n",
278
+ "else:\n",
279
+ " api = HfApi(token=HF_TOKEN)\n",
280
+ " try:\n",
281
+ " create_repo(MODEL_REPO, repo_type='model', token=HF_TOKEN, exist_ok=True, private=False)\n",
282
+ " print(f'✓ Repo ready: {MODEL_REPO}')\n",
283
+ " except Exception as e:\n",
284
+ " print(f'Repo create: {e}')\n",
285
+ "\n",
286
+ " uploaded = 0\n",
287
+ " for name in results:\n",
288
+ " model_dir = UPLOAD_DIR / name\n",
289
+ " for fname in ['best_model.pkl', 'scaler.pkl', 'metadata.json']:\n",
290
+ " fpath = model_dir / fname\n",
291
+ " if not fpath.exists():\n",
292
+ " print(f' Missing: {fpath}')\n",
293
+ " continue\n",
294
+ " try:\n",
295
+ " api.upload_file(\n",
296
+ " path_or_fileobj=str(fpath),\n",
297
+ " path_in_repo=f'{name}/{fname}',\n",
298
+ " repo_id=MODEL_REPO,\n",
299
+ " repo_type='model',\n",
300
+ " token=HF_TOKEN,\n",
301
+ " )\n",
302
+ " uploaded += 1\n",
303
+ " print(f' ✅ {name}/{fname}')\n",
304
+ " except Exception as e:\n",
305
+ " print(f' ❌ {name}/{fname}: {e}')\n",
306
+ "\n",
307
+ " print()\n",
308
+ " print(f'Upload complete: {uploaded} files → {MODEL_REPO}')\n",
309
+ " print(f'View: https://huggingface.co/{MODEL_REPO}')"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": null,
315
+ "metadata": {},
316
+ "outputs": [],
317
+ "source": [
318
+ "# ── Verify models are accessible ─────────────────────────────────────────────\n",
319
+ "print('Verifying models in trained_models/')\n",
320
+ "for name in results:\n",
321
+ " model_path = UPLOAD_DIR / name / 'best_model.pkl'\n",
322
+ " if model_path.exists():\n",
323
+ " m = joblib.load(model_path)\n",
324
+ " # Quick test prediction\n",
325
+ " import numpy as np\n",
326
+ " X_test = np.array([[100,20,30,0,1,1,2,1,3,0,1,8]]) # suspicious URL features\n",
327
+ " try:\n",
328
+ " scaler_path = UPLOAD_DIR / name / 'scaler.pkl'\n",
329
+ " if scaler_path.exists():\n",
330
+ " sc = joblib.load(scaler_path)\n",
331
+ " X_test = sc.transform(X_test)\n",
332
+ " pred = m.predict(X_test)\n",
333
+ " label = 'THREAT' if pred[0] == 1 else 'BENIGN'\n",
334
+ " print(f' ✓ {name}: predict={label} (model loaded OK)')\n",
335
+ " except Exception as e:\n",
336
+ " print(f' ✓ {name}: model loaded (predict error: {e})')\n",
337
+ " else:\n",
338
+ " print(f' ✗ {name}: model not found at {model_path}')\n",
339
+ "print()\n",
340
+ "print('All done! Models available at:', str(UPLOAD_DIR))"
341
+ ]
342
+ }
343
+ ]
344
+ }