# Source: Hugging Face Space moraeslucas/PhisHunter (status: Sleeping)
# File size: 3,849 bytes; commit fdff15a


import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import gradio as gr
from utils.heuristics import load_rules, explain_email, extract_keywords
from langdetect import detect
from pathlib import Path
import re
from utils.virustotal import check_url_virustotal
import extract_msg

# Model: Hugging Face checkpoint name plus the tokenizer and the
# sequence-classification model built from it. Both from_pretrained calls
# run once, at import time; classify_email reads these module-level objects.
model_name = "ElSlay/BERT-Phishing-Email-Model"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

# Heuristics: rule set read from rules.yaml and later handed to explain_email.
rules = load_rules("rules.yaml")

# BERT classification
def classify_email(email_text):
    """Classify an email body with the module-level BERT model.

    Args:
        email_text: Raw email text. It is tokenized with truncation enabled,
            so text beyond the tokenizer's length limit is ignored.

    Returns:
        A ``(prediction, confidence)`` tuple where ``prediction`` is
        ``"Legitimate"`` or ``"Phishing"`` and ``confidence`` is the softmax
        probability of that class as a Python ``float``.
    """
    inputs = tokenizer(email_text, return_tensors="pt", truncation=True, padding=True)

    # Inference only: skip autograd bookkeeping so no graph is built (and no
    # activation memory retained) for every request.
    with torch.no_grad():
        logits = model(**inputs).logits

    # A single text was passed in, so row 0 holds its class probabilities.
    probs = torch.softmax(logits, dim=1)[0]

    # NOTE(review): assumes class index 0 = legitimate and 1 = phishing;
    # confirm against the model's id2label mapping.
    labels = ["Legitimate", "Phishing"]
    best = int(probs.argmax())
    return labels[best], float(probs[best])

# Full email analysis

# Characters that commonly trail a URL in running text ("see http://x.y/z.")
# but are not part of it.
_URL_TRAILING_PUNCT = ".,;:!?)]}>'\""


def _extract_urls(text):
    """Return the distinct http(s) URLs in *text*, in first-seen order."""
    cleaned = (
        raw.rstrip(_URL_TRAILING_PUNCT)
        for raw in re.findall(r"http[s]?://\S+", text)
    )
    # dict.fromkeys de-duplicates while keeping order, so each URL is looked
    # up only once.
    return list(dict.fromkeys(cleaned))


def _detect_language(text, fallback="en"):
    """Return the langdetect language code for *text*, or *fallback* on failure."""
    # Local import keeps the file's top-level import block untouched.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        # Raised when the text has no usable features (e.g. only digits,
        # symbols or URLs).
        return fallback


def analyze_email(file_input=None, text_input=None):
    """Analyze an email and build a plain-text report.

    Non-blank ``text_input`` takes priority; otherwise the text is read from
    ``file_input`` (a ``.txt`` or ``.msg`` file, extension matched
    case-insensitively).

    Args:
        file_input: A path string, or an object whose ``name`` attribute is
            the path of the uploaded file. May be ``None``.
        text_input: Email text pasted by the user. May be ``None`` or blank.

    Returns:
        A report string containing the model classification, the heuristic
        explanations and score, the top keywords and the VirusTotal results
        for each distinct URL; or a short error message when no usable input
        is available.
    """
    email_text = None

    if text_input and text_input.strip():
        email_text = text_input
    elif file_input:
        path = file_input.name if hasattr(file_input, "name") else file_input
        lowered = str(path).lower()
        if lowered.endswith(".txt"):
            # errors="replace" keeps non-UTF-8 files analyzable instead of
            # failing with UnicodeDecodeError.
            with open(path, "r", encoding="utf-8", errors="replace") as f:
                email_text = f.read()
        elif lowered.endswith(".msg"):
            msg = extract_msg.Message(path)
            try:
                email_text = f"{msg.subject or ''}\n{msg.body or ''}"
            finally:
                # Release the underlying file handle.
                msg.close()
        else:
            return "Unsupported file type."
    else:
        return "No input provided."

    # A .msg with empty subject and body still yields "\n", so test for
    # whitespace-only content rather than plain truthiness.
    if not email_text or not email_text.strip():
        return "Could not extract text from file."

    # Classification
    label, confidence = classify_email(email_text)

    # Heuristics
    explanations, score = explain_email(email_text, rules)

    # Keywords
    lang = _detect_language(email_text)
    keywords = extract_keywords(email_text, lang)
    keywords_text = "Top keywords: " + ", ".join(keywords)

    # Heuristic explanation
    explanation_text = "📌 Explanation:\n• " + "\n• ".join(explanations)

    # VirusTotal check
    vt_results = []
    for url in _extract_urls(email_text):
        stats = check_url_virustotal(url)
        if "error" in stats:
            vt_results.append(f"URL: {url} | VT: {stats['error']}")
        else:
            vt_results.append(f"URL: {url} | Malicious: {stats.get('malicious', 0)}, Suspicious: {stats.get('suspicious', 0)}, Harmless: {stats.get('harmless', 0)}")
    vt_text = "\n".join(vt_results) if vt_results else "No URLs found."

    return f"Classification: {label} ({confidence:.2%})\n\n{explanation_text}\n\nScore: {score}\n\n{keywords_text}\n\nVirusTotal Results:\n{vt_text}"

# Load examples
def update_text_from_example(example_name):
    """Return the text of the example email named *example_name*.

    Args:
        example_name: A key of the module-level ``example_emails`` mapping
            (the example's file name), or ``None`` when nothing is selected.

    Returns:
        The example's text, or an empty string when the name is ``None`` or
        unknown, so a cleared selection does not raise ``KeyError``.
    """
    return example_emails.get(example_name, "")

def load_example_files():
    """Read every ``*.txt`` file in the ``examples`` directory.

    Returns:
        A dict mapping each file name to its UTF-8 decoded contents, with
        entries inserted in sorted path order.
    """
    contents_by_name = {}
    for txt_path in sorted(Path("examples").glob("*.txt")):
        contents_by_name[txt_path.name] = txt_path.read_text(encoding="utf-8")
    return contents_by_name

# Example emails, keyed by file name; read once at import time and used both
# for the dropdown choices and by update_text_from_example.
example_emails = load_example_files()

# Gradio interface
with gr.Blocks() as demo:
    gr.Markdown("## 🛡️ PhishHunter – Email Phishing Detector")

    # Input row: example picker, free-text box and file upload.
    with gr.Row():
        dropdown = gr.Dropdown(
            choices=list(example_emails.keys()),
            label="Load Example Email",
            info="Select a sample email to test",
        )
        textbox = gr.Textbox(lines=15, label="Paste or load email content")
        filebox = gr.File(label="Upload email (.txt or .msg)", file_types=[".txt", ".msg"])

    # Choosing an example copies its text into the textbox.
    dropdown.change(fn=update_text_from_example, inputs=dropdown, outputs=textbox)

    output = gr.Textbox(label="Classification & Explanation")

    # Input order matches analyze_email(file_input, text_input).
    btn = gr.Button("Analyze")
    btn.click(fn=analyze_email, inputs=[filebox, textbox], outputs=output)

# Start the app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()