Spaces:

moraeslucas
/

PhisHunter

Sleeping

App Files Files Community

PhisHunter / app.py

moraeslucas

First Commit with 25 files

fdff15a verified 6 months ago

raw

history blame contribute delete

3.85 kB


	import torch
	from transformers import AutoTokenizer, AutoModelForSequenceClassification
	import gradio as gr
	from utils.heuristics import load_rules, explain_email, extract_keywords
	from langdetect import detect
	from pathlib import Path
	import re
	from utils.virustotal import check_url_virustotal
	import extract_msg

	# Model: pretrained phishing-email classifier and its tokenizer, both loaded
	# once at import time and shared by every call to classify_email().
	model_name = "ElSlay/BERT-Phishing-Email-Model"
	tokenizer = AutoTokenizer.from_pretrained(model_name)
	model = AutoModelForSequenceClassification.from_pretrained(model_name)

	# Heuristics: rule set read from rules.yaml and passed to explain_email()
	# during analysis.
	rules = load_rules("rules.yaml")

	# BERT-based classification
	def classify_email(email_text):
	    """Classify an email as legitimate or phishing with the loaded model.

	    Args:
	        email_text: Raw text of the email to classify.

	    Returns:
	        A ``(prediction, confidence)`` tuple where ``prediction`` is
	        ``"Legitimate"`` or ``"Phishing"`` and ``confidence`` is the softmax
	        probability of the predicted class.
	    """
	    inputs = tokenizer(email_text, return_tensors="pt", truncation=True, padding=True)
	    # Inference only: disable autograd so no gradient graph is built (and
	    # kept alive) for every request.
	    with torch.no_grad():
	        logits = model(**inputs).logits
	        probs = torch.softmax(logits, dim=1).numpy()[0]
	    # NOTE(review): assumes class index 0 is "legitimate" and index 1 is
	    # "phishing" — confirm against the model's id2label mapping.
	    labels = ["Legitimate", "Phishing"]
	    prediction = labels[probs.argmax()]
	    confidence = probs.max()
	    return prediction, confidence

	# Full email analysis
	def analyze_email(file_input=None, text_input=None):
	    """Run the complete phishing analysis on pasted text or an uploaded file.

	    Pasted text takes precedence over the uploaded file; blank text is
	    treated as "not provided" so an uploaded file is still used.

	    Args:
	        file_input: Uploaded file, either a path string or an object with a
	            ``name`` attribute holding the path. ``.txt`` and ``.msg`` files
	            are supported (extension match is case-insensitive).
	        text_input: Email content pasted by the user.

	    Returns:
	        A human-readable report (classification, heuristic explanation,
	        score, keywords and VirusTotal results), or a short error message
	        when no usable input could be obtained.
	    """
	    email_text = None

	    if text_input and text_input.strip():
	        email_text = text_input
	    elif file_input:
	        path = file_input.name if hasattr(file_input, "name") else file_input
	        # Compare extensions case-insensitively so "MAIL.TXT" is accepted.
	        lowered_path = str(path).lower()
	        if lowered_path.endswith(".txt"):
	            # errors="replace" keeps a non-UTF-8 file from aborting the analysis.
	            with open(path, "r", encoding="utf-8", errors="replace") as f:
	                email_text = f.read()
	        elif lowered_path.endswith(".msg"):
	            msg = extract_msg.Message(path)
	            try:
	                email_text = f"{msg.subject or ''}\n{msg.body or ''}"
	            finally:
	                # Release the underlying file handle even if parsing fails.
	                msg.close()
	        else:
	            return "Unsupported file type."
	    else:
	        return "No input provided."

	    # A file holding only whitespace (or a .msg with empty subject and body)
	    # has nothing to analyse.
	    if not email_text or not email_text.strip():
	        return "Could not extract text from file."

	    # Classification
	    label, confidence = classify_email(email_text)

	    # Heuristics
	    explanations, score = explain_email(email_text, rules)

	    # Keywords. Imported here because the module only imports ``detect``.
	    from langdetect import LangDetectException

	    try:
	        lang = detect(email_text)
	    except LangDetectException:
	        # Raised when the text has no detectable language features (e.g.
	        # only digits or URLs); fall back to English instead of crashing.
	        lang = "en"
	    keywords = extract_keywords(email_text, lang)
	    keywords_text = "Top keywords: " + ", ".join(keywords)

	    # Heuristic explanation
	    explanation_text = "📌 Explanation:\n• " + "\n• ".join(explanations)

	    # VirusTotal check. Trailing sentence punctuation is stripped and
	    # duplicates are dropped (order preserved) so each URL is looked up once.
	    found_urls = re.findall(r"http[s]?://\S+", email_text)
	    urls = list(dict.fromkeys(u.rstrip(".,;:!?\"'>") for u in found_urls))
	    vt_results = []
	    for url in urls:
	        stats = check_url_virustotal(url)
	        if "error" in stats:
	            vt_results.append(f"URL: {url} | VT: {stats['error']}")
	        else:
	            vt_results.append(
	                f"URL: {url} | Malicious: {stats.get('malicious', 0)}, "
	                f"Suspicious: {stats.get('suspicious', 0)}, "
	                f"Harmless: {stats.get('harmless', 0)}"
	            )
	    vt_text = "\n".join(vt_results) if vt_results else "No URLs found."

	    return f"Classification: {label} ({confidence:.2%})\n\n{explanation_text}\n\nScore: {score}\n\n{keywords_text}\n\nVirusTotal Results:\n{vt_text}"

	# Load examples
	def update_text_from_example(example_name):
	    """Return the text of the selected example email.

	    Args:
	        example_name: Key into ``example_emails`` (the example's file name),
	            as chosen in the dropdown.

	    Returns:
	        The example's text, or an empty string when nothing is selected or
	        the name is unknown, so a cleared dropdown does not raise KeyError.
	    """
	    return example_emails.get(example_name, "")

	def load_example_files():
	    """Read every ``*.txt`` file found directly inside the ``examples`` folder.

	    Returns:
	        A dict mapping each file name to its UTF-8 decoded contents, with
	        entries inserted in sorted path order.
	    """
	    emails_by_name = {}
	    for txt_path in sorted(Path("examples").glob("*.txt")):
	        emails_by_name[txt_path.name] = txt_path.read_text(encoding="utf-8")
	    return emails_by_name

	example_emails = load_example_files()

	# Gradio interface
	# NOTE(review): indentation was lost in this copy, so which components sit
	# inside gr.Row() cannot be determined here — confirm against the original.
	with gr.Blocks() as demo:
	gr.Markdown("## 🛡️ PhishHunter – Email Phishing Detector")

	with gr.Row():
	# Picker listing the example emails held in example_emails.
	dropdown = gr.Dropdown(
	choices=list(example_emails.keys()),
	label="Load Example Email",
	info="Select a sample email to test",
	)
	# Free-text input; also the target the selected example is copied into.
	textbox = gr.Textbox(lines=15, label="Paste or load email content")
	# Alternative input: an uploaded .txt or .msg file.
	filebox = gr.File(label="Upload email (.txt or .msg)", file_types=[".txt", ".msg"])

	# Selecting an example fills the textbox with that example's content.
	dropdown.change(fn=update_text_from_example, inputs=dropdown, outputs=textbox)

	output = gr.Textbox(label="Classification & Explanation")

	# Input order matches analyze_email(file_input, text_input).
	btn = gr.Button("Analyze")
	btn.click(fn=analyze_email, inputs=[filebox, textbox], outputs=output)

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	demo.launch()