Che237 committed on
Commit
7f42f86
·
verified ·
1 Parent(s): 94487f7

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +393 -634
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
- 🔐 CyberForge AI - ML Training & Inference Platform
3
- Hugging Face Spaces deployment for training cybersecurity ML models
4
  """
5
 
6
  import gradio as gr
@@ -8,12 +8,12 @@ import pandas as pd
8
  import numpy as np
9
  import json
10
  import os
11
- import joblib
 
12
  from pathlib import Path
13
  from datetime import datetime
14
  import logging
15
  from typing import Dict, List, Any, Optional, Tuple
16
- import asyncio
17
 
18
  # ML Libraries
19
  from sklearn.model_selection import train_test_split, cross_val_score
@@ -21,12 +21,10 @@ from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier,
21
  from sklearn.linear_model import LogisticRegression
22
  from sklearn.preprocessing import StandardScaler, LabelEncoder
23
  from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
24
- import torch
25
- import torch.nn as nn
26
- from transformers import AutoTokenizer, AutoModel
27
 
28
  # Hugging Face Hub
29
- from huggingface_hub import HfApi, hf_hub_download, upload_file, create_repo
30
 
31
  logging.basicConfig(level=logging.INFO)
32
  logger = logging.getLogger(__name__)
@@ -41,6 +39,8 @@ MODELS_DIR.mkdir(exist_ok=True)
41
  DATASETS_DIR = Path("./datasets")
42
  DATASETS_DIR.mkdir(exist_ok=True)
43
 
 
 
44
  # Model types available for training
45
  MODEL_TYPES = {
46
  "Random Forest": RandomForestClassifier,
@@ -64,709 +64,468 @@ SECURITY_TASKS = [
64
  ]
65
 
66
  # ============================================================================
67
- # MODEL REGISTRY
68
  # ============================================================================
69
 
70
class ModelRegistry:
    """In-memory + on-disk registry of trained models.

    Fitted estimators and scalers are cached in dicts; metadata (artifact
    paths, metrics, timestamps) is persisted to MODELS_DIR/registry.json so
    registered models survive a process restart.
    """

    def __init__(self):
        self.models = {}    # model_id -> fitted estimator (in-memory cache)
        self.scalers = {}   # model_id -> fitted scaler (in-memory cache)
        self.metadata = {}  # model_id -> persisted registry entry
        self.registry_file = MODELS_DIR / "registry.json"
        self._load_registry()

    def _load_registry(self):
        """Load registry metadata from disk, tolerating a missing or corrupt file."""
        self.metadata = {}
        if self.registry_file.exists():
            try:
                with open(self.registry_file, 'r') as f:
                    self.metadata = json.load(f)
            except (json.JSONDecodeError, OSError) as e:
                # A corrupt registry file must not prevent the app from starting.
                logger.warning(f"Could not read registry file: {e}")

    def _save_registry(self):
        """Persist registry metadata to disk."""
        with open(self.registry_file, 'w') as f:
            json.dump(self.metadata, f, indent=2, default=str)

    def register_model(self, model_id: str, model, scaler, metrics: Dict):
        """Cache, serialize, and record a freshly trained model.

        Returns:
            The model_id, for caller convenience.
        """
        self.models[model_id] = model
        self.scalers[model_id] = scaler

        # Serialize estimator and scaler next to the registry file.
        model_path = MODELS_DIR / f"{model_id}_model.pkl"
        scaler_path = MODELS_DIR / f"{model_id}_scaler.pkl"
        joblib.dump(model, model_path)
        joblib.dump(scaler, scaler_path)

        self.metadata[model_id] = {
            "created_at": datetime.now().isoformat(),
            "metrics": metrics,
            "model_path": str(model_path),
            "scaler_path": str(scaler_path),
            "status": "ready"
        }
        self._save_registry()

        return model_id

    def get_model(self, model_id: str):
        """Return (model, scaler) for model_id, or (None, None) if unavailable.

        Checks the in-memory cache first, then falls back to loading the
        pickled artifacts referenced by the metadata. Missing or unreadable
        artifact files now yield (None, None) instead of raising.
        """
        if model_id in self.models:
            return self.models[model_id], self.scalers[model_id]

        if model_id in self.metadata:
            try:
                model = joblib.load(self.metadata[model_id]["model_path"])
                scaler = joblib.load(self.metadata[model_id]["scaler_path"])
            except (OSError, KeyError) as e:
                # Registry entry exists but its artifacts are gone or corrupt.
                logger.warning(f"Could not load model '{model_id}': {e}")
                return None, None
            self.models[model_id] = model
            self.scalers[model_id] = scaler
            return model, scaler

        return None, None

    def list_models(self) -> List[Dict]:
        """Return all registry entries, each flattened with its id."""
        return [{"id": k, **v} for k, v in self.metadata.items()]


# Global registry shared by all Gradio handlers.
model_registry = ModelRegistry()
139
-
140
- # ============================================================================
141
- # TRAINING FUNCTIONS
142
- # ============================================================================
143
-
144
def prepare_dataset(file, task_type: str) -> Tuple[pd.DataFrame, str]:
    """Load an uploaded dataset file into a DataFrame.

    Args:
        file: Gradio file handle (its .name attribute is the temp file path).
        task_type: Security task label (currently unused by loading logic).

    Returns:
        (DataFrame, status message) on success; (None, error message) on failure.
    """
    try:
        if file is None:
            return None, "No file uploaded"

        # Dispatch on the file extension, case-insensitively, so that e.g.
        # ".CSV" uploads are accepted (the original endswith() check was
        # case-sensitive and rejected them).
        suffix = Path(file.name).suffix.lower()
        readers = {
            '.csv': pd.read_csv,
            '.json': pd.read_json,
            '.parquet': pd.read_parquet,
        }
        reader = readers.get(suffix)
        if reader is None:
            return None, f"Unsupported file format: {file.name}"
        df = reader(file.name)

        logger.info(f"Loaded dataset with shape: {df.shape}")
        return df, f"✅ Loaded dataset with {len(df)} samples and {len(df.columns)} features"

    except Exception as e:
        logger.error(f"Error loading dataset: {e}")
        return None, f"❌ Error: {str(e)}"
166
-
167
-
168
def train_model(
    file,
    task_type: str,
    model_type: str,
    target_column: str,
    test_size: float,
    model_name: str,
    progress=gr.Progress()
) -> Tuple[str, str, str]:
    """Train an ML model on an uploaded dataset and register it.

    Args:
        file: Gradio file handle for the uploaded dataset.
        task_type: Security task label (stored as metadata; does not change training).
        model_type: Key into MODEL_TYPES selecting the estimator class.
        target_column: Name of the label column in the dataset.
        test_size: Fraction of rows held out for evaluation.
        model_name: Prefix used to build the generated model id.
        progress: Gradio progress reporter.

    Returns:
        (training summary markdown, classification report text, model id);
        on failure the first element is an error message and the rest are "".
    """
    try:
        progress(0, desc="Loading dataset...")

        df, msg = prepare_dataset(file, task_type)
        if df is None:
            return msg, "", ""

        progress(0.1, desc="Preparing features...")

        if target_column not in df.columns:
            return f"❌ Target column '{target_column}' not found in dataset. Available: {list(df.columns)}", "", ""

        X = df.drop(columns=[target_column])
        y = df[target_column]

        # Label-encode every categorical feature column in place.
        for col in X.select_dtypes(include=['object', 'category']).columns:
            le = LabelEncoder()
            X[col] = le.fit_transform(X[col].astype(str))

        # Encode a non-numeric target as integer class labels.
        if y.dtype == 'object' or y.dtype.name == 'category':
            le = LabelEncoder()
            y = le.fit_transform(y.astype(str))

        # NOTE(review): NaNs are imputed with 0 — confirm this suits the
        # expected feature distributions.
        X = X.fillna(0)

        progress(0.2, desc="Splitting data...")

        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        progress(0.3, desc="Scaling features...")

        # Fit the scaler on the training split only to avoid test leakage.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        progress(0.4, desc=f"Training {model_type}...")

        if model_type not in MODEL_TYPES:
            return f"Unknown model type: {model_type}", "", ""

        model_class = MODEL_TYPES[model_type]

        if model_type == "Isolation Forest (Anomaly)":
            # Unsupervised: fit on features only. predict() returns -1 for
            # outliers / 1 for inliers; map -1 -> 1 and 1 -> 0 so the metrics
            # below treat outliers as the positive class.
            model = model_class(contamination=0.1, random_state=42, n_estimators=100)
            model.fit(X_train_scaled)
            y_pred = model.predict(X_test_scaled)
            y_pred = np.where(y_pred == -1, 1, 0)  # Convert to binary
        else:
            model = model_class(random_state=42)
            model.fit(X_train_scaled, y_train)
            y_pred = model.predict(X_test_scaled)

        progress(0.7, desc="Evaluating model...")

        accuracy = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred, average='weighted')

        metrics = {
            "accuracy": float(accuracy),
            "f1_score": float(f1),
            "model_type": model_type,
            "task_type": task_type,
            "samples": len(df),
            "features": len(X.columns),
        }

        progress(0.85, desc="Saving model...")

        model_id = f"{model_name}_{task_type.lower().replace(' ', '_')}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"

        model_registry.register_model(model_id, model, scaler, metrics)

        progress(1.0, desc="Complete!")

        training_log = f"""
## 🎯 Training Complete!

**Model ID:** `{model_id}`
**Task:** {task_type}
**Model Type:** {model_type}

### 📊 Dataset Info
- Samples: {len(df):,}
- Features: {len(X.columns)}
- Train/Test Split: {int((1-test_size)*100)}/{int(test_size*100)}

### 📈 Metrics
- **Accuracy:** {accuracy:.4f} ({accuracy*100:.2f}%)
- **F1 Score:** {f1:.4f}

### 💾 Model Saved
- Path: `{MODELS_DIR / f'{model_id}_model.pkl'}`
"""

        # Was a bare `except:`; narrowed so KeyboardInterrupt/SystemExit propagate.
        try:
            report = classification_report(y_test, y_pred)
        except Exception:
            report = "Classification report not available for this model type"

        return training_log, report, model_id

    except Exception as e:
        logger.error(f"Training error: {e}")
        import traceback
        return f"❌ Training failed: {str(e)}\n\n{traceback.format_exc()}", "", ""
300
-
301
-
302
def list_trained_models() -> str:
    """Render a markdown summary of every model in the registry."""
    entries = model_registry.list_models()

    if not entries:
        return "No models trained yet. Upload a dataset and train a model to get started!"

    # Assemble one markdown section per model and join once at the end.
    sections = ["## 🤖 Trained Models\n\n"]
    for entry in entries:
        metrics = entry.get('metrics', {})
        sections.append(f"""
### {entry['id']}
- **Created:** {entry.get('created_at', 'Unknown')}
- **Accuracy:** {metrics.get('accuracy', 0):.4f}
- **F1 Score:** {metrics.get('f1_score', 0):.4f}
- **Status:** {entry.get('status', 'Unknown')}

---
""")
    return "".join(sections)
321
-
322
-
323
def run_inference(model_id: str, input_data: str) -> str:
    """Run predictions for JSON-encoded samples with a registered model.

    Args:
        model_id: Registry id of the model to use.
        input_data: JSON string — a single object or a list of objects, each
            mapping feature name -> value.

    Returns:
        JSON string with one result per sample, or a "❌ ..." error message.
    """
    try:
        model, scaler = model_registry.get_model(model_id)

        if model is None:
            return f"❌ Model '{model_id}' not found"

        # Parse input data (expect JSON format); a lone object is wrapped
        # into a one-element batch.
        try:
            data = json.loads(input_data)
            if isinstance(data, dict):
                data = [data]
            df = pd.DataFrame(data)
        except json.JSONDecodeError:
            return "❌ Invalid JSON input. Please provide data in JSON format."

        # Scale with the scaler fitted at training time, then predict.
        X_scaled = scaler.transform(df.fillna(0))
        predictions = model.predict(X_scaled)

        # Include class probabilities when the estimator supports them.
        # Was a bare `except:`; narrowed to Exception so interrupts propagate.
        try:
            probabilities = model.predict_proba(X_scaled)
            results = []
            for i, (pred, probs) in enumerate(zip(predictions, probabilities)):
                results.append({
                    "sample": i,
                    "prediction": int(pred),
                    "confidence": float(max(probs)),
                    "probabilities": probs.tolist()
                })
        except Exception:
            results = [{"sample": i, "prediction": int(p)} for i, p in enumerate(predictions)]

        return json.dumps(results, indent=2)

    except Exception as e:
        logger.error(f"Inference error: {e}")
        return f"❌ Inference failed: {str(e)}"
363
-
364
-
365
- # ============================================================================
366
- # HUGGING FACE INTEGRATION
367
- # ============================================================================
368
-
369
def upload_model_to_hub(model_id: str, repo_id: str, hf_token: str) -> str:
    """Push a registered model's artifacts (model, scaler, metadata) to the Hub."""
    try:
        if not hf_token:
            return "❌ Hugging Face token required for upload"

        model, scaler = model_registry.get_model(model_id)
        if model is None:
            return f"❌ Model '{model_id}' not found"

        api = HfApi(token=hf_token)

        # Create repo if it doesn't exist
        try:
            create_repo(repo_id, token=hf_token, repo_type="model", exist_ok=True)
        except Exception as e:
            logger.warning(f"Repo creation note: {e}")

        # Write the registry metadata to a sidecar JSON file first, then
        # push all three artifacts in one pass.
        metadata = model_registry.metadata.get(model_id, {})
        metadata_json = json.dumps(metadata, indent=2, default=str)
        with open(MODELS_DIR / f"{model_id}_metadata.json", 'w') as f:
            f.write(metadata_json)

        artifact_names = (
            f"{model_id}_model.pkl",
            f"{model_id}_scaler.pkl",
            f"{model_id}_metadata.json",
        )
        for artifact_name in artifact_names:
            upload_file(
                path_or_fileobj=str(MODELS_DIR / artifact_name),
                path_in_repo=artifact_name,
                repo_id=repo_id,
                token=hf_token,
                repo_type="model"
            )

        return f"""
## Model Uploaded Successfully!

**Model ID:** `{model_id}`
**Repository:** `{repo_id}`
**URL:** https://huggingface.co/{repo_id}

### Files Uploaded:
- `{model_id}_model.pkl`
- `{model_id}_scaler.pkl`
- `{model_id}_metadata.json`

You can now use this model from the Hub!
"""

    except Exception as e:
        logger.error(f"Upload error: {e}")
        return f"❌ Upload failed: {str(e)}"
440
-
441
 
442
def download_model_from_hub(repo_id: str, model_filename: str, hf_token: str) -> str:
    """Download a model (and its scaler, if present) from the Hub and register it.

    Args:
        repo_id: Hub repository id, e.g. "username/repo-name".
        model_filename: Filename of the pickled model, "<id>_model.pkl".
        hf_token: Optional token for private repos ("" means anonymous).

    Returns:
        Markdown success message, or a "❌ ..." error message.
    """
    try:
        model_path = hf_hub_download(
            repo_id=repo_id,
            filename=model_filename,
            token=hf_token if hf_token else None
        )

        # The scaler is optional; a missing file is not an error.
        # Was a bare `except:`; narrowed to Exception.
        scaler_filename = model_filename.replace("_model.pkl", "_scaler.pkl")
        try:
            scaler_path = hf_hub_download(
                repo_id=repo_id,
                filename=scaler_filename,
                token=hf_token if hf_token else None
            )
        except Exception:
            scaler_path = None

        # SECURITY: joblib.load unpickles arbitrary code — only download
        # from repositories you trust.
        model = joblib.load(model_path)
        # NOTE(review): the fallback StandardScaler() is *unfitted*, so
        # inference with it will fail until fit — confirm intended.
        scaler = joblib.load(scaler_path) if scaler_path else StandardScaler()

        # Register in the in-memory caches so inference can use it immediately.
        model_id = model_filename.replace("_model.pkl", "")
        model_registry.models[model_id] = model
        model_registry.scalers[model_id] = scaler

        return f"""
## Model Downloaded Successfully!

**Model ID:** `{model_id}`
**Source:** `{repo_id}`

The model is now available for inference.
"""

    except Exception as e:
        logger.error(f"Download error: {e}")
        return f"❌ Download failed: {str(e)}"
482
-
483
-
484
- # ============================================================================
485
- # API ENDPOINTS (For Backend Integration)
486
- # ============================================================================
487
-
488
def api_predict(model_id: str, features: Dict) -> Dict:
    """API endpoint: predict a single sample.

    Args:
        model_id: Registry id of the model to use.
        features: Mapping of feature name -> value for one sample.

    Returns:
        Dict with prediction, optional confidence, and timestamp; on any
        failure an {"error": ...} dict (this endpoint never raises).
    """
    try:
        model, scaler = model_registry.get_model(model_id)
        if model is None:
            return {"error": f"Model '{model_id}' not found"}

        df = pd.DataFrame([features])
        X_scaled = scaler.transform(df.fillna(0))
        prediction = model.predict(X_scaled)[0]

        # Not every estimator exposes predict_proba (e.g. IsolationForest).
        # Was a bare `except:`; narrowed to Exception.
        try:
            proba = model.predict_proba(X_scaled)[0]
            confidence = float(max(proba))
        except Exception:
            confidence = None

        return {
            "model_id": model_id,
            "prediction": int(prediction),
            "confidence": confidence,
            "timestamp": datetime.now().isoformat()
        }
    except Exception as e:
        return {"error": str(e)}
513
-
514
-
515
def api_batch_predict(model_id: str, batch_data: List[Dict]) -> List[Dict]:
    """API endpoint: run api_predict over each item and collect the results."""
    return [api_predict(model_id, sample) for sample in batch_data]
522
 
 
 
 
 
 
 
 
 
 
 
 
 
523
 
524
  # ============================================================================
525
  # GRADIO INTERFACE
526
  # ============================================================================
527
 
528
# Custom CSS applied to the whole Blocks app.
custom_css = """
.gradio-container {
    font-family: 'Inter', sans-serif;
}
.main-title {
    text-align: center;
    color: #1a1a2e;
    margin-bottom: 20px;
}
.tab-content {
    padding: 20px;
}
"""

# Build interface: five tabs (train / inference / models / hub / API docs),
# each wired to the handler functions defined above.
with gr.Blocks(css=custom_css, title="CyberForge AI - ML Training Platform") as demo:
    gr.Markdown("""
# 🔐 CyberForge AI - ML Training Platform

**Train, Deploy, and Serve Cybersecurity ML Models**

This platform enables you to:
- 📊 Upload and train models on cybersecurity datasets
- 🚀 Deploy models to Hugging Face Hub
- 🔗 Integrate with your backend via API
- 🤖 Run inference on trained models
""")

    with gr.Tabs():
        # ==================== TRAINING TAB ====================
        with gr.TabItem("🎯 Train Model"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### Dataset Configuration")

                    train_file = gr.File(
                        label="Upload Dataset (CSV, JSON, or Parquet)",
                        file_types=[".csv", ".json", ".parquet"]
                    )

                    task_type = gr.Dropdown(
                        choices=SECURITY_TASKS,
                        value="Malware Detection",
                        label="Security Task Type"
                    )

                    model_type = gr.Dropdown(
                        choices=list(MODEL_TYPES.keys()),
                        value="Random Forest",
                        label="Model Type"
                    )

                    target_column = gr.Textbox(
                        label="Target Column Name",
                        placeholder="e.g., 'label', 'is_malicious', 'attack_type'"
                    )

                    test_size = gr.Slider(
                        minimum=0.1,
                        maximum=0.4,
                        value=0.2,
                        step=0.05,
                        label="Test Size"
                    )

                    model_name = gr.Textbox(
                        label="Model Name",
                        placeholder="e.g., 'malware_detector_v1'",
                        value="cyberforge_model"
                    )

                    train_btn = gr.Button("🚀 Train Model", variant="primary")

                with gr.Column(scale=1):
                    gr.Markdown("### Training Results")
                    training_output = gr.Markdown()
                    classification_report_output = gr.Textbox(
                        label="Classification Report",
                        lines=10
                    )
                    trained_model_id = gr.Textbox(
                        label="Trained Model ID",
                        interactive=False
                    )

            # Wire the training button to train_model (outputs: log, report, id).
            train_btn.click(
                fn=train_model,
                inputs=[train_file, task_type, model_type, target_column, test_size, model_name],
                outputs=[training_output, classification_report_output, trained_model_id]
            )

        # ==================== INFERENCE TAB ====================
        with gr.TabItem("🔮 Run Inference"):
            with gr.Row():
                with gr.Column():
                    inference_model_id = gr.Textbox(
                        label="Model ID",
                        placeholder="Enter the model ID to use"
                    )

                    inference_input = gr.Textbox(
                        label="Input Data (JSON format)",
                        placeholder='[{"feature1": 0.5, "feature2": 1.2, ...}]',
                        lines=5
                    )

                    inference_btn = gr.Button("🔮 Run Inference", variant="primary")

                with gr.Column():
                    inference_output = gr.Textbox(
                        label="Predictions",
                        lines=10
                    )

            inference_btn.click(
                fn=run_inference,
                inputs=[inference_model_id, inference_input],
                outputs=[inference_output]
            )

        # ==================== MODELS TAB ====================
        with gr.TabItem("🤖 Models"):
            gr.Markdown("### Trained Models")

            refresh_btn = gr.Button("🔄 Refresh Models List")
            models_list = gr.Markdown()

            refresh_btn.click(
                fn=list_trained_models,
                outputs=[models_list]
            )

            # Auto-refresh on load
            demo.load(
                fn=list_trained_models,
                outputs=[models_list]
            )

        # ==================== HUB TAB ====================
        with gr.TabItem("☁️ Hugging Face Hub"):
            gr.Markdown("### Upload & Download Models")

            with gr.Row():
                with gr.Column():
                    gr.Markdown("#### Upload to Hub")
                    upload_model_id = gr.Textbox(
                        label="Model ID to Upload"
                    )
                    upload_repo_id = gr.Textbox(
                        label="Hub Repository ID",
                        placeholder="username/repo-name"
                    )
                    upload_token = gr.Textbox(
                        label="Hugging Face Token",
                        type="password"
                    )
                    upload_btn = gr.Button("⬆️ Upload Model", variant="primary")
                    upload_result = gr.Markdown()

                with gr.Column():
                    gr.Markdown("#### Download from Hub")
                    download_repo_id = gr.Textbox(
                        label="Hub Repository ID",
                        placeholder="username/repo-name"
                    )
                    download_filename = gr.Textbox(
                        label="Model Filename",
                        placeholder="model_name_model.pkl"
                    )
                    download_token = gr.Textbox(
                        label="Hugging Face Token (optional)",
                        type="password"
                    )
                    download_btn = gr.Button("⬇️ Download Model", variant="secondary")
                    download_result = gr.Markdown()

            upload_btn.click(
                fn=upload_model_to_hub,
                inputs=[upload_model_id, upload_repo_id, upload_token],
                outputs=[upload_result]
            )

            download_btn.click(
                fn=download_model_from_hub,
                inputs=[download_repo_id, download_filename, download_token],
                outputs=[download_result]
            )

        # ==================== API TAB ====================
        # Static documentation only — no event handlers in this tab.
        with gr.TabItem("🔗 API Integration"):
            gr.Markdown("""
### API Integration Guide

Your backend can integrate with this Space using the Gradio Client library or direct API calls.

#### Python Client Example:

```python
from gradio_client import Client

# Connect to your Space
client = Client("Che237/cyberforge")

# Run inference
result = client.predict(
    model_id="your_model_id",
    input_data='[{"feature1": 0.5, "feature2": 1.2}]',
    api_name="/run_inference"
)
print(result)
```

#### API Endpoints:

| Endpoint | Description |
|----------|-------------|
| `/train_model` | Train a new model |
| `/run_inference` | Run predictions |
| `/list_trained_models` | List available models |
| `/upload_model_to_hub` | Upload model to Hub |

#### Backend Integration (Node.js):

```javascript
const { Client } = require("@gradio/client");

async function runPrediction(modelId, features) {
    const client = await Client.connect("Che237/cyberforge");
    const result = await client.predict("/run_inference", {
        model_id: modelId,
        input_data: JSON.stringify([features])
    });
    return JSON.parse(result.data);
}
```
""")
765
-
766
- # Launch the demo
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
767
if __name__ == "__main__":
    # Bind to all interfaces on the standard Hugging Face Spaces port.
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )
 
1
  """
2
+ CyberForge AI - ML Training & Inference Platform
3
+ Hugging Face Spaces deployment with Notebook execution support
4
  """
5
 
6
  import gradio as gr
 
8
  import numpy as np
9
  import json
10
  import os
11
+ import subprocess
12
+ import sys
13
  from pathlib import Path
14
  from datetime import datetime
15
  import logging
16
  from typing import Dict, List, Any, Optional, Tuple
 
17
 
18
  # ML Libraries
19
  from sklearn.model_selection import train_test_split, cross_val_score
 
21
  from sklearn.linear_model import LogisticRegression
22
  from sklearn.preprocessing import StandardScaler, LabelEncoder
23
  from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score
24
+ import joblib
 
 
25
 
26
  # Hugging Face Hub
27
+ from huggingface_hub import HfApi, hf_hub_download, upload_file
28
 
29
  logging.basicConfig(level=logging.INFO)
30
  logger = logging.getLogger(__name__)
 
39
  DATASETS_DIR = Path("./datasets")
40
  DATASETS_DIR.mkdir(exist_ok=True)
41
 
42
+ NOTEBOOKS_DIR = Path("./notebooks")
43
+
44
  # Model types available for training
45
  MODEL_TYPES = {
46
  "Random Forest": RandomForestClassifier,
 
64
  ]
65
 
66
  # ============================================================================
67
+ # NOTEBOOK EXECUTION
68
  # ============================================================================
69
 
70
def get_available_notebooks() -> List[str]:
    """Return the sorted names of all *.ipynb files in NOTEBOOKS_DIR.

    An absent notebooks directory yields an empty list.
    """
    if not NOTEBOOKS_DIR.exists():
        return []
    return sorted(path.name for path in NOTEBOOKS_DIR.glob("*.ipynb"))
79
+
80
def read_notebook_content(notebook_name: str) -> str:
    """Render a notebook's cells as a markdown document.

    Markdown cells are emitted verbatim; code cells are wrapped in fenced
    python blocks with a 1-based cell header.

    Args:
        notebook_name: Filename relative to NOTEBOOKS_DIR.

    Returns:
        Markdown string, or an error message if the file is missing/unreadable.
    """
    notebook_path = NOTEBOOKS_DIR / notebook_name
    if not notebook_path.exists():
        return f"Notebook not found: {notebook_name}"

    try:
        # .ipynb files are UTF-8 JSON per the nbformat spec; be explicit so
        # this does not break on platforms with a non-UTF-8 locale default.
        with open(notebook_path, 'r', encoding='utf-8') as f:
            nb = json.load(f)

        output = f"# {notebook_name}\n\n"

        for i, cell in enumerate(nb.get('cells', []), 1):
            cell_type = cell.get('cell_type', 'code')
            source = ''.join(cell.get('source', []))

            if cell_type == 'markdown':
                output += f"{source}\n\n"
            else:
                output += f"### Cell {i} (Python)\n```python\n{source}\n```\n\n"

        return output
    except Exception as e:
        return f"Error reading notebook: {str(e)}"
104
+
105
def execute_notebook(notebook_name: str, progress=gr.Progress()) -> Tuple[str, str]:
    """Execute a notebook via nbconvert in a subprocess and collect its outputs.

    Args:
        notebook_name: Filename relative to NOTEBOOKS_DIR.
        progress: Gradio progress reporter.

    Returns:
        (status message, collected text output). Per-cell timeout is 600 s;
        the whole subprocess is killed after 900 s.
    """
    notebook_path = NOTEBOOKS_DIR / notebook_name
    output_path = NOTEBOOKS_DIR / f"output_{notebook_name}"

    if not notebook_path.exists():
        return f"Error: Notebook not found: {notebook_name}", ""

    progress(0.1, desc="Starting notebook execution...")

    try:
        # Execute notebook using nbconvert; list-form argv avoids the shell.
        cmd = [
            sys.executable, "-m", "nbconvert",
            "--to", "notebook",
            "--execute",
            "--output", str(output_path.name),
            "--ExecutePreprocessor.timeout=600",
            "--ExecutePreprocessor.kernel_name=python3",
            str(notebook_path)
        ]

        progress(0.3, desc="Executing cells...")

        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            cwd=str(NOTEBOOKS_DIR),
            timeout=900
        )

        progress(0.8, desc="Processing output...")

        if result.returncode == 0:
            # Read the executed copy back and harvest text outputs per cell.
            if output_path.exists():
                # .ipynb is UTF-8 JSON; be explicit about the encoding.
                with open(output_path, 'r', encoding='utf-8') as f:
                    executed_nb = json.load(f)

                outputs = []
                for i, cell in enumerate(executed_nb.get('cells', []), 1):
                    if cell.get('cell_type') == 'code':
                        cell_outputs = cell.get('outputs', [])
                        for out in cell_outputs:
                            # Stream outputs carry 'text'; rich outputs carry a
                            # mime-keyed 'data' dict — take text/plain if present.
                            if 'text' in out:
                                text = ''.join(out['text'])
                                outputs.append(f"Cell {i}:\n{text}")
                            elif 'data' in out:
                                if 'text/plain' in out['data']:
                                    text = ''.join(out['data']['text/plain'])
                                    outputs.append(f"Cell {i}:\n{text}")

                progress(1.0, desc="Complete!")
                return "Notebook executed successfully!", "\n\n".join(outputs)
            else:
                return "Notebook executed but output file not found", result.stdout
        else:
            return f"Execution failed:\n{result.stderr}", result.stdout

    except subprocess.TimeoutExpired:
        return "Error: Notebook execution timed out (15 min limit)", ""
    except Exception as e:
        return f"Error executing notebook: {str(e)}", ""
169
+
170
def run_notebook_cell(notebook_name: str, cell_number: int) -> str:
    """Execute one code cell from a notebook inside this process.

    SECURITY: the cell source is run with exec() in this app's global
    namespace — it can read/overwrite module globals and do anything the
    server process can. Only expose this with trusted notebooks.

    Args:
        notebook_name: Filename relative to NOTEBOOKS_DIR.
        cell_number: 1-based index among *code* cells only.

    Returns:
        Markdown-formatted captured stdout/stderr, or an "Error: ..." message.
    """
    notebook_path = NOTEBOOKS_DIR / notebook_name

    if not notebook_path.exists():
        return f"Error: Notebook not found"

    try:
        # .ipynb is UTF-8 JSON; be explicit about the encoding.
        with open(notebook_path, 'r', encoding='utf-8') as f:
            nb = json.load(f)

        # Only code cells count toward the user-facing cell numbering.
        cells = [c for c in nb.get('cells', []) if c.get('cell_type') == 'code']

        if cell_number < 1 or cell_number > len(cells):
            return f"Error: Cell {cell_number} not found. Available: 1-{len(cells)}"

        cell = cells[cell_number - 1]
        source = ''.join(cell.get('source', []))

        # Execute the code, capturing both output streams.
        import io
        from contextlib import redirect_stdout, redirect_stderr

        stdout_capture = io.StringIO()
        stderr_capture = io.StringIO()

        with redirect_stdout(stdout_capture), redirect_stderr(stderr_capture):
            try:
                # Runs in globals() on purpose so later cells can see earlier
                # cells' names — but see the SECURITY note above.
                exec(source, globals())
            except Exception as e:
                return f"Error: {str(e)}"

        output = stdout_capture.getvalue()
        errors = stderr_capture.getvalue()

        result = f"### Cell {cell_number} Output:\n"
        if output:
            result += f"```\n{output}\n```\n"
        if errors:
            result += f"\n**Warnings/Errors:**\n```\n{errors}\n```"
        if not output and not errors:
            result += "*(No output)*"

        return result

    except Exception as e:
        return f"Error: {str(e)}"
217
+
218
+ # ============================================================================
219
+ # MODEL TRAINING (existing functionality)
220
+ # ============================================================================
221
+
222
class SecurityModelTrainer:
    """Train and evaluate scikit-learn models for cybersecurity tasks.

    Holds a StandardScaler and LabelEncoder that are (re)fitted on each
    prepare_data call, so one trainer instance handles one dataset at a time.
    """

    def __init__(self):
        self.scaler = StandardScaler()          # fitted in prepare_data
        self.label_encoder = LabelEncoder()     # fitted if the target is textual
        self.models = {}                        # reserved for trained models

    def prepare_data(self, df: pd.DataFrame, target_col: str = 'label') -> Tuple:
        """Split df into scaled train/test arrays.

        Raises:
            ValueError: if target_col is absent, or no numeric feature
                columns remain after filtering.
        """
        if target_col not in df.columns:
            raise ValueError(f"Target column '{target_col}' not found")

        X = df.drop(columns=[target_col])
        y = df[target_col]

        # Keep only numeric feature columns (non-numeric columns are DROPPED,
        # not encoded) and impute NaNs with 0.
        X = X.select_dtypes(include=[np.number]).fillna(0)
        if X.shape[1] == 0:
            # Fail with a clear message instead of an opaque sklearn error.
            raise ValueError("No numeric feature columns found in dataset")

        # Encode a textual target as integer class labels.
        if y.dtype == 'object':
            y = self.label_encoder.fit_transform(y)

        X_scaled = self.scaler.fit_transform(X)

        return train_test_split(X_scaled, y, test_size=0.2, random_state=42)

    def train_model(self, model_type: str, X_train, y_train):
        """Instantiate and fit the estimator selected by model_type.

        Raises:
            ValueError: if model_type is not a key of MODEL_TYPES.
        """
        if model_type not in MODEL_TYPES:
            raise ValueError(f"Unknown model type: {model_type}")

        model_class = MODEL_TYPES[model_type]

        if model_type == "Isolation Forest (Anomaly)":
            model = model_class(contamination=0.1, random_state=42)
        else:
            model = model_class(random_state=42)

        # IsolationForest ignores y during fit, so one call works for both.
        model.fit(X_train, y_train)
        return model

    def evaluate_model(self, model, X_test, y_test) -> Dict:
        """Return accuracy and weighted F1 for model on the test split.

        NOTE(review): IsolationForest.predict returns -1/1, not the 0/1
        labels in y_test, so these metrics are not meaningful for the
        anomaly model — confirm whether a -1 -> 1 remap is intended.
        """
        y_pred = model.predict(X_test)

        metrics = {
            'accuracy': accuracy_score(y_test, y_pred),
            'f1_score': f1_score(y_test, y_pred, average='weighted', zero_division=0)
        }

        return metrics


# Module-level trainer shared by the Gradio handlers.
trainer = SecurityModelTrainer()
 
 
 
275
 
276
def train_model_from_data(data_file, model_type: str, task: str, progress=gr.Progress()):
    """Train a model from an uploaded CSV and save it under MODELS_DIR.

    Returns (markdown summary, saved model path, metrics JSON), or an error
    message with (None, None) on failure.
    """
    if data_file is None:
        return "Please upload a CSV file", None, None

    progress(0.1, desc="Loading data...")

    try:
        df = pd.read_csv(data_file.name)
        progress(0.3, desc="Preparing data...")

        X_train, X_test, y_train, y_test = trainer.prepare_data(df)

        progress(0.5, desc=f"Training {model_type}...")
        model = trainer.train_model(model_type, X_train, y_train)

        progress(0.8, desc="Evaluating model...")
        metrics = trainer.evaluate_model(model, X_test, y_test)

        # Persist under a slug built from the task and model names.
        task_slug = task.lower().replace(' ', '_')
        type_slug = model_type.lower().replace(' ', '_')
        model_path = MODELS_DIR / f"{task_slug}_{type_slug}.pkl"
        joblib.dump(model, model_path)

        progress(1.0, desc="Complete!")

        result = f"""
## Training Complete!

**Task:** {task}
**Model:** {model_type}
**Samples:** {len(df)}

### Metrics
- Accuracy: {metrics['accuracy']:.4f}
- F1 Score: {metrics['f1_score']:.4f}

**Model saved to:** {model_path}
"""

        return result, str(model_path), json.dumps(metrics, indent=2)

    except Exception as e:
        return f"Error: {str(e)}", None, None
 
 
320
 
321
def run_inference(model_file, features_text: str):
    """Run a single prediction with an uploaded, pickled model.

    Args:
        model_file: Uploaded .pkl — either a plain path string (Gradio 4.x)
            or a tempfile-like object with a ``.name`` attribute.
        features_text: JSON object mapping feature names to numeric values.
            Insertion order must match the column order used at training
            time, since only the values are forwarded to the model.

    Returns:
        Pretty-printed JSON with 'prediction', 'features_used' and, when the
        model supports predict_proba, 'confidence'; or an error string.
    """
    if model_file is None:
        return "Please upload a model file"

    try:
        # Accept both a str path and a file-like object across Gradio versions.
        model_path = model_file if isinstance(model_file, str) else model_file.name
        model = joblib.load(model_path)

        features = json.loads(features_text)
        if not isinstance(features, dict):
            # list(features.values()) below requires a mapping; fail clearly.
            return 'Error: features must be a JSON object, e.g. {"url_length": 50}'
        X = np.array([list(features.values())])

        prediction = model.predict(X)[0]

        result = {
            'prediction': int(prediction),
            'features_used': len(features)
        }

        # Report the winning class probability when the model exposes it.
        if hasattr(model, 'predict_proba'):
            proba = model.predict_proba(X)[0]
            result['confidence'] = float(max(proba))

        return json.dumps(result, indent=2)

    except Exception as e:
        return f"Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
347
 
348
def list_trained_models():
    """Render a markdown bullet list of every saved .pkl model in MODELS_DIR."""
    saved = list(MODELS_DIR.glob("*.pkl"))
    if not saved:
        return "No trained models found"

    # One bullet per artifact, with its on-disk size in kilobytes.
    entries = [
        f"- **{path.name}** ({path.stat().st_size / 1024:.1f} KB)\n"
        for path in saved
    ]
    return "## Trained Models\n\n" + "".join(entries)
360
 
361
  # ============================================================================
362
  # GRADIO INTERFACE
363
  # ============================================================================
364
 
365
def create_interface():
    """Build and return the Gradio Blocks app.

    Tabs: notebook execution, model training, inference, saved-model listing,
    and static API documentation. Event handlers reference the module-level
    callbacks (train_model_from_data, run_inference, notebook helpers).
    """

    with gr.Blocks(title="CyberForge AI", theme=gr.themes.Soft()) as demo:
        gr.Markdown("""
        # 🔐 CyberForge AI - ML Training Platform

        Train cybersecurity ML models and run Jupyter notebooks on Hugging Face.
        """)

        with gr.Tabs():
            # ============ NOTEBOOKS TAB ============
            with gr.TabItem("📓 Notebooks"):
                gr.Markdown("""
                ### Run ML Pipeline Notebooks
                Execute the CyberForge ML notebooks directly in the cloud.
                """)

                with gr.Row():
                    with gr.Column(scale=1):
                        # Scan the notebook directory once and reuse the result
                        # for both the choices and the initial value.
                        notebooks = get_available_notebooks()
                        notebook_dropdown = gr.Dropdown(
                            choices=notebooks,
                            label="Select Notebook",
                            value=notebooks[0] if notebooks else None
                        )

                        refresh_btn = gr.Button("🔄 Refresh List")
                        view_btn = gr.Button("👁 View Content", variant="secondary")
                        execute_btn = gr.Button("▶ Execute Notebook", variant="primary")

                        gr.Markdown("### Run Single Cell")
                        cell_number = gr.Number(label="Cell Number", value=1, minimum=1)
                        run_cell_btn = gr.Button("Run Cell")

                    with gr.Column(scale=2):
                        notebook_status = gr.Markdown("Select a notebook to view or execute.")
                        notebook_output = gr.Markdown("", label="Output")

                def refresh_notebooks():
                    # Re-scan on demand so notebooks uploaded after startup appear.
                    notebooks = get_available_notebooks()
                    return gr.update(choices=notebooks, value=notebooks[0] if notebooks else None)

                refresh_btn.click(refresh_notebooks, outputs=notebook_dropdown)
                view_btn.click(read_notebook_content, inputs=notebook_dropdown, outputs=notebook_output)
                execute_btn.click(execute_notebook, inputs=notebook_dropdown, outputs=[notebook_status, notebook_output])
                run_cell_btn.click(run_notebook_cell, inputs=[notebook_dropdown, cell_number], outputs=notebook_output)

            # ============ TRAIN MODEL TAB ============
            with gr.TabItem("🎯 Train Model"):
                gr.Markdown("""
                ### Train a Security ML Model
                Upload your dataset and train a model for threat detection.
                """)

                with gr.Row():
                    with gr.Column():
                        task_dropdown = gr.Dropdown(
                            choices=SECURITY_TASKS,
                            label="Security Task",
                            value="Phishing Detection"
                        )
                        model_dropdown = gr.Dropdown(
                            choices=list(MODEL_TYPES.keys()),
                            label="Model Type",
                            value="Random Forest"
                        )
                        data_upload = gr.File(label="Upload Training Data (CSV)", file_types=[".csv"])
                        train_btn = gr.Button("🚀 Train Model", variant="primary")

                    with gr.Column():
                        train_output = gr.Markdown("Upload data and click Train to begin.")
                        # Hidden outputs: kept for programmatic/API consumers.
                        model_path_output = gr.Textbox(label="Model Path", visible=False)
                        metrics_output = gr.Textbox(label="Metrics JSON", visible=False)

                train_btn.click(
                    train_model_from_data,
                    inputs=[data_upload, model_dropdown, task_dropdown],
                    outputs=[train_output, model_path_output, metrics_output]
                )

            # ============ INFERENCE TAB ============
            with gr.TabItem("🔍 Inference"):
                gr.Markdown("""
                ### Run Model Inference
                Load a trained model and make predictions.
                """)

                with gr.Row():
                    with gr.Column():
                        model_upload = gr.File(label="Upload Model (.pkl)")
                        features_input = gr.Textbox(
                            label="Features (JSON)",
                            value='{"url_length": 50, "has_https": 1, "digit_count": 5}',
                            lines=5
                        )
                        predict_btn = gr.Button("🎯 Predict", variant="primary")

                    with gr.Column():
                        prediction_output = gr.Textbox(label="Prediction Result", lines=10)

                predict_btn.click(run_inference, inputs=[model_upload, features_input], outputs=prediction_output)

            # ============ MODELS TAB ============
            with gr.TabItem("📦 Models"):
                gr.Markdown("### Trained Models")
                models_list = gr.Markdown(list_trained_models())
                refresh_models_btn = gr.Button("🔄 Refresh")
                refresh_models_btn.click(list_trained_models, outputs=models_list)

            # ============ API TAB ============
            with gr.TabItem("🔌 API"):
                gr.Markdown("""
                ## API Integration

                ### Python Client

                ```python
                from huggingface_hub import InferenceClient

                client = InferenceClient("Che237/cyberforge")

                # Make prediction
                result = client.predict(
                    model_name="phishing_detection",
                    features={"url_length": 50, "has_https": 1}
                )
                print(result)
                ```

                ### REST API

                ```bash
                curl -X POST https://che237-cyberforge.hf.space/api/predict \\
                  -H "Content-Type: application/json" \\
                  -d '{"model_name": "phishing_detection", "features": {"url_length": 50}}'
                ```

                ### Notebook Execution

                The notebooks in this Space implement the complete CyberForge ML pipeline:

                | # | Notebook | Purpose |
                |---|----------|---------|
                | 00 | environment_setup | System validation |
                | 01 | data_acquisition | Data collection |
                | 02 | feature_engineering | Feature extraction |
                | 03 | model_training | Train models |
                | 04 | agent_intelligence | AI reasoning |
                | 05 | model_validation | Testing |
                | 06 | backend_integration | API packaging |
                | 07 | deployment_artifacts | Deployment |
                """)

        gr.Markdown("""
        ---
        **CyberForge AI** | [GitHub](https://github.com/Che237/cyberforge) | [Datasets](https://huggingface.co/datasets/Che237/cyberforge-datasets)
        """)

    return demo
524
+
525
+ # ============================================================================
526
+ # MAIN
527
+ # ============================================================================
528
+
529
if __name__ == "__main__":
    # Build the Gradio app and serve it; launch() blocks, which is what
    # Hugging Face Spaces expects from the entry script.
    demo = create_interface()
    demo.launch()