Spaces:

alwin00007
/

phishing_email_detector

Sleeping

alwinvargheset@outlook.com

added_model

8ebacce over 1 year ago

3.41 kB

	from datasets import load_dataset, Dataset
	from sklearn.model_selection import train_test_split
	from transformers import (
	BertTokenizer,
	AutoModelForSequenceClassification,
	Trainer,
	TrainingArguments
	)
	import torch
	from sklearn.metrics import accuracy_score, precision_recall_fscore_support
	import numpy as np


	def compute_metrics(eval_pred):
	    """Turn raw evaluation outputs into classification metrics.

	    Args:
	        eval_pred: A two-item pair that unpacks into ``(logits, labels)``.
	            The predicted class of each row is the argmax of ``logits``
	            over its last axis.

	    Returns:
	        dict: ``accuracy``, ``f1``, ``precision`` and ``recall``. The last
	        three are computed with ``average='binary'``.
	    """
	    scores, targets = eval_pred
	    predicted = np.argmax(scores, axis=-1)

	    # The sklearn helper returns (precision, recall, f1, support); the
	    # support entry at index 3 is not reported.
	    prf = precision_recall_fscore_support(targets, predicted, average='binary')

	    metrics = {'accuracy': accuracy_score(targets, predicted)}
	    metrics['f1'] = prf[2]
	    metrics['precision'] = prf[0]
	    metrics['recall'] = prf[1]
	    return metrics


	def _first_supported_kwarg(func, *candidates):
	    """Return the first name in *candidates* that *func* accepts.

	    Used to stay compatible with library versions that renamed a keyword
	    argument. List the preferred (newer) spelling first and the legacy
	    spelling last: when none of the names appears in the signature of
	    *func*, the last candidate is returned as the fallback.
	    """
	    import inspect  # local import so the file's import block stays untouched

	    accepted = inspect.signature(func).parameters
	    return next((name for name in candidates if name in accepted), candidates[-1])


	def main():
	    """Fine-tune ``bert-large-uncased`` as a two-label phishing classifier.

	    Pipeline:
	        1. Load the ``combined_reduced`` configuration of
	           ``ealvaradob/phishing-dataset`` and take its ``train`` split.
	        2. Split it 80/20 with a fixed seed (``random_state=42``).
	        3. Tokenize the ``text`` column, padded/truncated to 128 tokens.
	        4. Train for 3 epochs (batch size 64, learning rate 5e-5, weight
	           decay 0.01), evaluating and checkpointing after every epoch.
	        5. Print the final evaluation metrics and save the model and
	           tokenizer to ``./phishing_model``.

	    Side effects: writes checkpoints to ``./results``, logs to ``./logs``
	    and the final model to ``./phishing_model``; prints progress to stdout.

	    Returns:
	        None
	    """
	    # Check for GPU availability
	    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	    print(f"Using device: {device}")

	    # Load and prepare dataset
	    print("Loading dataset...")
	    # NOTE(review): trust_remote_code=True allows code shipped in the dataset
	    # repository to run on this machine -- confirm the repository is trusted.
	    dataset = load_dataset("ealvaradob/phishing-dataset", "combined_reduced", trust_remote_code=True)
	    df = dataset['train'].to_pandas()

	    # Split dataset
	    train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
	    train_dataset = Dataset.from_pandas(train_df, preserve_index=False)
	    test_dataset = Dataset.from_pandas(test_df, preserve_index=False)

	    # Initialize tokenizer and model
	    print("Initializing model...")
	    tokenizer = BertTokenizer.from_pretrained('bert-large-uncased')
	    model = AutoModelForSequenceClassification.from_pretrained(
	        'bert-large-uncased',
	        num_labels=2
	    ).to(device)

	    def tokenize_function(examples):
	        """Tokenize a batch of rows, padding/truncating each text to 128 tokens."""
	        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

	    # Tokenize datasets
	    print("Tokenizing datasets...")
	    train_dataset = train_dataset.map(tokenize_function, batched=True)
	    test_dataset = test_dataset.map(tokenize_function, batched=True)

	    # Convert to PyTorch datasets
	    train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])
	    test_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

	    # Set up training arguments
	    epochs = 3
	    batch_size = 64
	    # Newer transformers releases call this argument `eval_strategy`; older
	    # ones only know `evaluation_strategy`. Passing the wrong spelling raises
	    # TypeError, so use whichever name the installed version accepts.
	    eval_strategy_key = _first_supported_kwarg(
	        TrainingArguments.__init__, "eval_strategy", "evaluation_strategy"
	    )
	    training_args = TrainingArguments(
	        output_dir="./results",
	        save_strategy="epoch",
	        learning_rate=5e-5,
	        per_device_train_batch_size=batch_size,
	        per_device_eval_batch_size=batch_size,
	        num_train_epochs=epochs,
	        weight_decay=0.01,
	        logging_dir='./logs',
	        logging_steps=50,
	        load_best_model_at_end=True,
	        metric_for_best_model="accuracy",
	        **{eval_strategy_key: "epoch"},
	    )

	    # Define Trainer
	    # Same compatibility concern: newer releases take the tokenizer as
	    # `processing_class` and deprecate the `tokenizer` keyword.
	    tokenizer_key = _first_supported_kwarg(
	        Trainer.__init__, "processing_class", "tokenizer"
	    )
	    # NOTE(review): the evaluation split below is also the one that
	    # load_best_model_at_end uses to pick the checkpoint, so the metrics
	    # printed at the end are not measured on untouched data.
	    trainer = Trainer(
	        model=model,
	        args=training_args,
	        train_dataset=train_dataset,
	        eval_dataset=test_dataset,
	        compute_metrics=compute_metrics,
	        **{tokenizer_key: tokenizer},
	    )

	    # Train model
	    print("Starting training...")
	    trainer.train()

	    # Evaluate the model
	    print("Evaluating model...")
	    eval_results = trainer.evaluate()
	    print(eval_results)

	    # Save the model and tokenizer
	    print("Saving model...")
	    model_path = "./phishing_model"
	    model.save_pretrained(model_path)
	    tokenizer.save_pretrained(model_path)
	    print(f"Model and tokenizer saved to {model_path}")

	    print("Training completed and model saved!")


	# Run the training pipeline only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()