Spaces:

LorenzoNava
/

deberta-cwe-training

Paused

App Files Files Community

deberta-cwe-training / app.py

LorenzoNava

feat: Use cleaned dataset (local or HuggingFace)

152f94b about 2 months ago

raw

history blame contribute delete

14.5 kB

	#!/usr/bin/env python3
	"""
	DeBERTa CWE Classification - Minimal Training Interface
	=======================================================
	Minimal Gradio interface for training DeBERTa models on CVE-CWE classification.
	Optimized for 4x NVIDIA L4 GPUs (96GB total VRAM).

	Author: Berghem - Smart Information Security
	License: MIT
	"""

	import os
	import sys
	import gradio as gr
	import torch
	from datasets import load_dataset, load_from_disk
	from transformers import (
	AutoTokenizer,
	AutoModelForSequenceClassification,
	TrainingArguments,
	Trainer,
	EarlyStoppingCallback,
	TrainerCallback,
	)
	from sklearn.metrics import accuracy_score, f1_score
	import numpy as np
	import time

	# ============================================================================
	# CONFIGURATION
	# ============================================================================

	# Display name shown to the user -> Hugging Face Hub model identifier.
	MODELS = {"DeBERTa-v3-Base (86M params) - Recommended": "microsoft/deberta-v3-base"}

	# Local directory checked first for the cleaned dataset before falling back to the Hub.
	DATASET_PATH = "./dataset/cleaned"

	# ============================================================================
	# CUDA CACHE CLEARING CALLBACK
	# ============================================================================


	class CUDACacheClearCallback(TrainerCallback):
	    """Trainer callback that empties the CUDA allocator cache at the end of every epoch."""

	    def on_epoch_end(self, args, state, control, **kwargs):
	        """Call ``torch.cuda.empty_cache()`` and print a notice when CUDA is available.

	        Does nothing on CPU-only machines. ``state.epoch`` is used only for the
	        printed message.
	        """
	        if not torch.cuda.is_available():
	            return
	        torch.cuda.empty_cache()
	        print(f"\n🧹 CUDA cache cleared after epoch {state.epoch}")


	# ============================================================================
	# TRAINING FUNCTION
	# ============================================================================


	def _select_mixed_precision():
	    """Pick the mixed-precision mode for the first visible CUDA device.

	    Returns:
	        tuple: ``(use_bf16, use_fp16, gpu_name)``. Both flags are ``False`` and
	        ``gpu_name`` is ``None`` when CUDA is unavailable.
	    """
	    if not torch.cuda.is_available():
	        return False, False, None
	    gpu_name = torch.cuda.get_device_name(0).upper()
	    # Name-based allow-list of GPUs trained in bf16; every other GPU uses fp16.
	    use_bf16 = any(tag in gpu_name for tag in ("A100", "H100", "L4", "L40"))
	    return use_bf16, not use_bf16, gpu_name


	def _plan_batch_sizes(batch_size, num_gpus):
	    """Split a total batch size into per-device batch and gradient accumulation steps.

	    Multi-GPU runs spread the batch across devices (at least 4 samples each);
	    single-device runs cap the per-device batch at 8 and make up the rest with
	    gradient accumulation.

	    Returns:
	        tuple: ``(per_device_batch, gradient_accumulation_steps)``, both >= 1.
	    """
	    if num_gpus > 1:
	        # Covers every multi-GPU count (previously 3 GPUs fell through to the
	        # single-GPU rule and tripled the effective batch size).
	        per_device_batch = max(4, batch_size // num_gpus)
	        devices = num_gpus
	    else:
	        per_device_batch = max(1, min(8, batch_size))
	        devices = 1
	    gradient_accum = max(1, batch_size // (per_device_batch * devices))
	    return per_device_batch, gradient_accum


	def train_model(
	    model_name="microsoft/deberta-v3-base",
	    epochs=10,
	    batch_size=32,
	    learning_rate=2e-5,
	    max_length=256,
	    early_stopping_patience=5,
	):
	    """Fine-tune a sequence-classification model on the CVE -> CWE dataset.

	    The dataset is read from ``DATASET_PATH`` when that directory exists,
	    otherwise downloaded from the Hub. A 90/10 train/validation split is created
	    when the dataset has no ``validation`` split. The trained model and tokenizer
	    are saved to ``./models/deberta-cwe-final``.

	    Args:
	        model_name: Hub identifier of the pretrained checkpoint.
	        epochs: Maximum number of training epochs.
	        batch_size: Target total batch size across all devices.
	        learning_rate: Peak learning rate for the cosine schedule.
	        max_length: Maximum token length; inputs are padded/truncated to it.
	        early_stopping_patience: Evaluations without F1 improvement before stopping.

	    Returns:
	        str: The accumulated log text. Errors are caught, logged with their
	        traceback, and returned as part of the text instead of being raised.
	    """
	    import traceback

	    logs = []

	    def log(msg):
	        """Record a log line and echo it to the console."""
	        logs.append(msg)
	        print(msg)

	    try:
	        log("=" * 80)
	        log("DEBERTA CWE CLASSIFICATION TRAINING")
	        log("=" * 80)
	        log(f"Model: {model_name}")
	        log(f"Epochs: {epochs}")
	        log(f"Total batch size: {batch_size}")
	        log(f"Learning rate: {learning_rate}")
	        log(f"Max length: {max_length}")
	        log("=" * 80)

	        # Check device
	        cuda_available = torch.cuda.is_available()
	        if cuda_available:
	            device = "cuda"
	            log(f"\n🖥️ Device: {device}")
	            log(f" GPU: {torch.cuda.get_device_name(0)}")
	        else:
	            device = "cpu"
	            log(f"\n🖥️ Device: {device} (CPU only)")

	        # Load cleaned dataset (try local first, then HuggingFace)
	        log("\n📦 Loading cleaned dataset...")
	        if os.path.exists(DATASET_PATH):
	            log(f" Using local: {DATASET_PATH}")
	            dataset = load_from_disk(DATASET_PATH)
	        else:
	            log(" Local dataset not found, downloading from HuggingFace...")
	            dataset = load_dataset("LorenzoNava/cve-cwe-dataset-cleaned")
	        log(f" ✅ Loaded {len(dataset['train']):,} samples (cleaned, no NVD-CWE-Other)")

	        # Evaluation during training, early stopping and best-model selection all
	        # need a validation split, so create one whenever it is missing -- even if
	        # a held-out "test" split exists (it is left untouched).
	        if "validation" not in dataset:
	            log("\n📊 Creating 90/10 train/validation split...")
	            split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)
	            dataset["train"] = split_dataset["train"]
	            dataset["validation"] = split_dataset["test"]
	            log(f" Train: {len(dataset['train']):,} samples")
	            log(f" Validation: {len(dataset['validation']):,} samples")

	        # Build label mapping from the training split only; falsy IDs are skipped.
	        log("\n🏷️ Building CWE label mapping...")
	        cwe_list = sorted({cwe for cwe in dataset["train"]["CWE-ID"] if cwe})
	        label2id = {cwe: idx for idx, cwe in enumerate(cwe_list)}
	        id2label = {idx: cwe for cwe, idx in label2id.items()}
	        num_labels = len(label2id)
	        if num_labels == 0:
	            raise ValueError("No CWE labels found in the training split")
	        log(f" ✅ Found {num_labels} unique CWE classes")

	        # Load tokenizer
	        log(f"\n📚 Loading tokenizer: {model_name}")
	        tokenizer = AutoTokenizer.from_pretrained(model_name)

	        # Tokenize dataset
	        log("\n🔤 Tokenizing dataset...")

	        def encode_batch(examples):
	            """Tokenize descriptions and attach integer labels in a single pass."""
	            encoded = tokenizer(
	                examples["DESCRIPTION"],
	                padding="max_length",
	                truncation=True,
	                max_length=max_length,
	            )
	            # -100 marks samples whose CWE is missing or unseen in training;
	            # they are filtered out below.
	            encoded["labels"] = [label2id.get(cwe, -100) for cwe in examples["CWE-ID"]]
	            return encoded

	        tokenized_dataset = dataset.map(
	            encode_batch, batched=True, remove_columns=dataset["train"].column_names
	        )
	        log(" ✅ Tokenization complete")

	        # Clear CUDA cache
	        if cuda_available:
	            torch.cuda.empty_cache()

	        # Filter invalid labels
	        log("\n🔍 Filtering invalid samples...")
	        for split in ("train", "validation"):
	            tokenized_dataset[split] = tokenized_dataset[split].filter(
	                lambda x: x["labels"] != -100
	            )
	        log(f" ✅ Train: {len(tokenized_dataset['train']):,} valid samples")
	        log(f" ✅ Validation: {len(tokenized_dataset['validation']):,} valid samples")

	        # Load model
	        log(f"\n🤖 Loading model: {model_name}")

	        # Weights stay in fp32: the Trainer's fp16/bf16 flags enable autocast mixed
	        # precision on top of fp32 master weights. Loading the weights themselves
	        # in fp16 makes the fp16 gradient scaler fail when unscaling gradients.
	        model = AutoModelForSequenceClassification.from_pretrained(
	            model_name,
	            num_labels=num_labels,
	            label2id=label2id,
	            id2label=id2label,
	        )

	        model = model.to(device)
	        log(f" ✅ Model loaded on {device}")
	        log(f" Parameters: {sum(p.numel() for p in model.parameters()):,}")

	        # Clear CUDA cache
	        if cuda_available:
	            torch.cuda.empty_cache()

	        # Training configuration
	        log("\n⚙️ Configuring training...")
	        output_dir = "./models/deberta-cwe-final"

	        # Precision settings
	        use_bf16, use_fp16, gpu_name = _select_mixed_precision()
	        if use_bf16:
	            log(f" Using bf16 precision (optimal for {gpu_name})")
	        elif use_fp16:
	            log(f" Using fp16 precision ({gpu_name})")

	        # Multi-GPU detection
	        num_gpus = torch.cuda.device_count() if cuda_available else 1
	        log(f" GPUs detected: {num_gpus}")

	        # Memory monitoring
	        if cuda_available:
	            for i in range(num_gpus):
	                mem_total = torch.cuda.get_device_properties(i).total_memory / 1e9
	                mem_allocated = torch.cuda.memory_allocated(i) / 1e9
	                log(f" GPU {i}: {mem_total:.1f}GB total, {mem_allocated:.1f}GB allocated")

	        per_device_batch, gradient_accum = _plan_batch_sizes(batch_size, num_gpus)

	        log(f" Per-device batch: {per_device_batch}")
	        log(f" Gradient accumulation: {gradient_accum}")
	        log(f" Effective batch: {per_device_batch * gradient_accum * num_gpus}")

	        # The paged 8-bit optimizer is a CUDA (bitsandbytes) feature; use the
	        # standard PyTorch AdamW on CPU-only machines.
	        optimizer_name = "paged_adamw_8bit" if cuda_available else "adamw_torch"

	        training_args = TrainingArguments(
	            output_dir=output_dir,
	            num_train_epochs=epochs,
	            per_device_train_batch_size=per_device_batch,
	            per_device_eval_batch_size=per_device_batch * 2,
	            gradient_accumulation_steps=gradient_accum,
	            learning_rate=learning_rate,
	            weight_decay=0.01,
	            warmup_ratio=0.1,
	            lr_scheduler_type="cosine",
	            eval_strategy="steps",
	            eval_steps=500,
	            save_strategy="steps",
	            save_steps=500,
	            save_total_limit=2,
	            load_best_model_at_end=True,
	            metric_for_best_model="f1",
	            greater_is_better=True,
	            logging_steps=100,
	            logging_dir=f"{output_dir}/logs",
	            fp16=use_fp16,
	            bf16=use_bf16,
	            dataloader_num_workers=0,
	            report_to="none",
	            push_to_hub=False,
	            ddp_find_unused_parameters=False if num_gpus > 1 else None,
	            # CRITICAL: Disable gradient checkpointing (fixes backward pass error)
	            gradient_checkpointing=False,
	            optim=optimizer_name,
	            max_grad_norm=1.0,
	        )

	        # Metrics
	        def compute_metrics(eval_pred):
	            """Return accuracy and weighted F1 for one evaluation pass."""
	            logits, labels = eval_pred
	            predictions = np.argmax(logits, axis=-1)
	            acc = accuracy_score(labels, predictions)
	            f1 = f1_score(labels, predictions, average="weighted")
	            return {"accuracy": acc, "f1": f1}

	        # Create trainer
	        log("\n🚀 Starting training...")
	        trainer = Trainer(
	            model=model,
	            args=training_args,
	            train_dataset=tokenized_dataset["train"],
	            eval_dataset=tokenized_dataset["validation"],
	            tokenizer=tokenizer,
	            compute_metrics=compute_metrics,
	            callbacks=[
	                EarlyStoppingCallback(early_stopping_patience=early_stopping_patience),
	                CUDACacheClearCallback(),
	            ],
	        )

	        # Clear cache before training
	        if cuda_available:
	            torch.cuda.empty_cache()

	        # Train with OOM error handling
	        try:
	            train_result = trainer.train()
	        except torch.cuda.OutOfMemoryError:
	            log("\n❌ Out of Memory!")
	            log(" Reduce batch size or max length")
	            raise

	        # Evaluate
	        log("\n📊 Final evaluation...")
	        eval_result = trainer.evaluate()

	        log("\n✅ Training complete!")
	        log(f" Final Loss: {train_result.training_loss:.4f}")
	        log(f" Accuracy: {eval_result.get('eval_accuracy', 0):.4f}")
	        log(f" F1 Score: {eval_result.get('eval_f1', 0):.4f}")

	        # Save model
	        log(f"\n💾 Saving model to: {output_dir}")
	        trainer.save_model(output_dir)
	        tokenizer.save_pretrained(output_dir)

	        # Final CUDA cache clear
	        if cuda_available:
	            torch.cuda.empty_cache()
	            log("\n🧹 Final CUDA cache clear complete")

	        log("\n🎉 Done! Model saved successfully.")
	        log("=" * 80)

	        return "\n".join(logs)

	    except Exception as e:
	        # Top-level boundary for the UI: report the failure in the returned log
	        # text rather than letting the exception propagate to Gradio.
	        log(f"\n❌ Error: {str(e)}")
	        log(f"\n{traceback.format_exc()}")
	        return "\n".join(logs)


	# ============================================================================
	# GRADIO INTERFACE - MINIMAL VERSION
	# ============================================================================

	# Build the single-page UI: a header, one button that starts training with the
	# default hyper-parameters, and a read-only textbox that receives the log text.
	with gr.Blocks(title="DeBERTa CWE Training", theme=gr.themes.Soft()) as demo:

	    gr.Markdown(
	        """
	        # 🤖 DeBERTa CWE Classification Training

	        Optimized for 4x NVIDIA L4 GPUs (96GB VRAM)

	        Click the button below to start training with optimized settings:
	        - Model: DeBERTa-v3-Base (86M params)
	        - Batch size: 32 (8 per GPU)
	        - Epochs: 10
	        - Learning rate: 2e-5
	        - Gradient checkpointing: DISABLED (fixes errors, we have plenty of VRAM)
	        """
	    )

	    train_btn = gr.Button("🚀 Start Training", variant="primary", size="lg")

	    logs_output = gr.Textbox(
	        label="Training Logs",
	        lines=30,
	        max_lines=50,
	        interactive=False,
	        show_copy_button=True,
	    )

	    # The footer uses a plain "|" separator: "\|" is not a valid Python escape
	    # sequence and triggers a SyntaxWarning on recent interpreters.
	    gr.Markdown(
	        """
	        ---
	        ### After Training

	        Model will be saved to: `./models/deberta-cwe-final/`

	        Developed by: Berghem - Smart Information Security | License: MIT
	        """
	    )

	    # Wire up the training: no inputs, so train_model runs with its defaults and
	    # its returned log text fills the textbox when training finishes.
	    train_btn.click(fn=train_model, inputs=[], outputs=logs_output)


	# ============================================================================
	# LAUNCH
	# ============================================================================

	if __name__ == "__main__":
	    import os

	    # Enable request queuing before the server starts.
	    demo.queue()

	    # SPACE_ID is set in the environment when running inside HuggingFace Spaces.
	    if os.getenv("SPACE_ID") is None:
	        # Local development
	        demo.launch(share=True)
	    else:
	        # HF Spaces: explicit server configuration
	        space_launch_options = dict(
	            server_name="0.0.0.0",
	            server_port=7860,
	            share=False,
	            show_error=True,
	            quiet=False,
	            prevent_thread_lock=False,
	            inbrowser=False,
	            show_api=False,
	        )
	        demo.launch(**space_launch_options)