Spaces:

moraeslucas
/

PhisHunter

Sleeping

App Files Files Community

moraeslucas commited on Jul 28, 2025

Commit

fdff15a

verified ·

1 Parent(s): 05524f6

First Commit with 25 files

Browse files

Files changed (25) hide show

.gitignore +23 -0
.gradio/flagged/dataset1.csv +55 -0
README.md +118 -11
app.py +111 -0
examples/Alerta Novas mensagens de Self-service.msg +0 -0
examples/Email_Phis_URL_mal.txt +7 -0
examples/Leg_email_1.txt +8 -0
examples/Leg_email_2.txt +8 -0
examples/email_5_real.txt +10 -0
examples/email_5_urg_leg.txt +8 -0
examples/email_6_phs_congrats.txt +12 -0
examples/email_7_ph_microst.txt +11 -0
examples/phishing_email_1.txt +0 -0
examples/phishing_email_2.txt +12 -0
examples/phishing_email_3.txt +6 -0
init.bat +3 -0
requirements.txt +112 -0
rules.yaml +47 -0
utils/__init__.py +1 -0
utils/heuristics.py +67 -0
utils/heuristics_old1.py +47 -0
utils/heuristics_old2.py +54 -0
utils/keyword_extractor.py +10 -0
utils/lang_detect.py +11 -0
utils/virustotal.py +28 -0

.gitignore ADDED Viewed

	@@ -0,0 +1,23 @@

+# Ambientes virtuais
+venv/
+env/
+# Ficheiros temporários e compilados
+__pycache__/
+*.py[cod]
+*.log
+# Ficheiros do sistema
+.DS_Store
+Thumbs.db
+# Ficheiros de IDEs
+.vscode/
+.idea/
+# Ficheiros de testes ou exemplos temporários
+*.tmp
+*.bak
+# Chaves ou configs privadas
+*.env

.gradio/flagged/dataset1.csv ADDED Viewed

	@@ -0,0 +1,55 @@

+email_text,output 0,Explicação,timestamp
+"'Subject: Congratulations! You've Won a Gift Card 🎁
+Hi there,
+You’ve been selected as a lucky winner of a $100 Amazon gift card! To claim your prize, simply fill in your details at the link below:
+http://amaz0n-prize-now.ru
+Hurry — this offer expires in 24 hours!
+Best of luck,
+Rewards Center","{""label"": ""Phishing \ud83d\udea8"", ""confidences"": null}","🧠 Modelo BERT: Phishing (99.98%)
+🔎 Heurística: Score = 4.8
+• [prize] Matched 'gift.?card' (global, weight=0.9)
+• [prize] Matched 'congratulations' (en, weight=0.7)
+• [prize] Matched 'winner' (en, weight=0.8)
+• [prize] Matched 'prize' (en, weight=0.8)
+• [urgency] Matched '24 hours' (global, weight=0.6)
+• Contains suspicious link(s): http://amaz0n-prize-now.ru
+🗝️ Palavras-chave: Amazon gift card, Gift Card, Congratulations, Amazon gift, Subject",2025-07-21 15:33:49.523444
+"'Subject: Congratulations! You've Won a Gift Card 🎁
+Hi there,
+You’ve been selected as a lucky winner of a $100 Amazon gift card! To claim your prize, simply fill in your details at the link below:
+http://amaz0n-prize-now.ru
+Hurry — this offer expires in 24 hours!
+Best of luck,
+Rewards Center","{""label"": ""Phishing \ud83d\udea8"", ""confidences"": null}","🧠 Modelo BERT: Phishing (99.98%)
+🔎 Heurística: Score = 4.8
+• [prize] Matched 'gift.?card' (global, weight=0.9)
+• [prize] Matched 'congratulations' (en, weight=0.7)
+• [prize] Matched 'winner' (en, weight=0.8)
+• [prize] Matched 'prize' (en, weight=0.8)
+• [urgency] Matched '24 hours' (global, weight=0.6)
+• Contains suspicious link(s): http://amaz0n-prize-now.ru
+🗝️ Palavras-chave: Amazon gift card, Gift Card, Congratulations, Amazon gift, Subject",2025-07-21 15:33:53.036101
+"'Subject: Meeting Rescheduled
+Hi John,
+The client meeting has been rescheduled to Thursday at 3 PM. Please let me know if that works for you.
+Best,
+Maria","{""label"": ""Leg\u00edtimo \u2705"", ""confidences"": null}","🧠 Modelo BERT: Legitimate (99.99%)
+🔎 Heurística: Score = 0.0
+🗝️ Palavras-chave: Rescheduled Hi John, Meeting Rescheduled, rescheduled to Thursday, client meeting, John",2025-07-21 15:34:16.300979

README.md CHANGED Viewed

@@ -1,13 +1,120 @@
 ---
-title: PhisHunter
-emoji: 👁
-colorFrom: blue
-colorTo: gray
-sdk: gradio
-sdk_version: 5.38.2
-app_file: app.py
-pinned: false
-license: mit
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+# PhishHunter
+PhishHunter is an open-source NLP-based email classification tool that detects phishing attempts and explains why an email might be suspicious.
+## 🔧 Technologies Used
+- [Hugging Face Transformers](https://huggingface.co/)
+- [Gradio](https://gradio.app/)
+- [NLTK](https://www.nltk.org/)
+- [YAKE](https://github.com/LIAAD/yake)
+- [LangDetect](https://pypi.org/project/langdetect/)
+- [extract-msg](https://pypi.org/project/extract-msg/)
+- Python 3.8+
+> **Note:** `spaCy` is listed in the requirements and imported by `utils/heuristics.py`, but none of its functionality is actually used by the codebase.
+## 🚀 Getting Started
+1. Clone the repository or download the project folder:
+   ```bash
+   git clone https://github.com/YOUR_USERNAME/phishhunter.git
+   cd phishhunter
+   ```
+2. Create and activate a virtual environment:
+   ```bash
+   python -m venv venv
+   venv\Scripts\activate      # Windows
+   # or
+   source venv/bin/activate   # Linux/macOS
+   ```
+3. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+4. Run the app:
+   ```bash
+   python app.py
+   ```
+## 📦 Features
+- Classify email text using a fine-tuned BERT model
+- Heuristic-based rules per language (via `rules.yaml`)
+- Language detection for multilingual support
+- Keyword extraction
+- URL verification via VirusTotal API
+- Gradio-based interface for easy use
 ---
+## 📜 License
+This project is licensed under the MIT License.

app.py ADDED Viewed

	@@ -0,0 +1,111 @@

+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+import gradio as gr
+from utils.heuristics import load_rules, explain_email, extract_keywords
+from langdetect import detect
+from pathlib import Path
+import re
+from utils.virustotal import check_url_virustotal
+import extract_msg
+# Model: Hugging Face checkpoint name; tokenizer and classifier are loaded once at import time
+model_name = "ElSlay/BERT-Phishing-Email-Model"
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForSequenceClassification.from_pretrained(model_name)
+# Heuristics: rules are read from rules.yaml (path is relative to the working directory)
+rules = load_rules("rules.yaml")
+# Classificação com BERT
+def classify_email(email_text):
+    inputs = tokenizer(email_text, return_tensors="pt", truncation=True, padding=True)
+    outputs = model(**inputs)
+    probs = torch.softmax(outputs.logits, dim=1).detach().numpy()[0]
+    labels = ["Legitimate", "Phishing"]
+    prediction = labels[probs.argmax()]
+    confidence = probs.max()
+    return prediction, confidence
+# Análise completa do email
+def analyze_email(file_input=None, text_input=None):
+    email_text = None
+    if text_input:
+        email_text = text_input
+    elif file_input:
+        path = file_input.name if hasattr(file_input, "name") else file_input
+        if path.endswith(".txt"):
+            with open(path, "r", encoding="utf-8") as f:
+                email_text = f.read()
+        elif path.endswith(".msg"):
+            msg = extract_msg.Message(path)
+            email_text = f"{msg.subject or ''}\n{msg.body or ''}"
+        else:
+            return "Unsupported file type."
+    else:
+        return "No input provided."
+    if not email_text:
+        return "Could not extract text from file."
+    # Classificação
+    label, confidence = classify_email(email_text)
+    # Heurística
+    explanations, score = explain_email(email_text, rules)
+    # Keywords
+    lang = detect(email_text)
+    keywords = extract_keywords(email_text, lang)
+    keywords_text = "Top keywords: " + ", ".join(keywords)
+    # Explicação heurística
+    explanation_text = "📌 Explanation:\n• " + "\n• ".join(explanations)
+    # Verificação VirusTotal
+    urls = re.findall(r"http[s]?://\S+", email_text)
+    vt_results = []
+    for url in urls:
+        stats = check_url_virustotal(url)
+        if "error" in stats:
+            vt_results.append(f"URL: {url} | VT: {stats['error']}")
+        else:
+            vt_results.append(f"URL: {url} | Malicious: {stats.get('malicious', 0)}, Suspicious: {stats.get('suspicious', 0)}, Harmless: {stats.get('harmless', 0)}")
+    vt_text = "\n".join(vt_results) if vt_results else "No URLs found."
+    return f"Classification: {label} ({confidence:.2%})\n\n{explanation_text}\n\nScore: {score}\n\n{keywords_text}\n\nVirusTotal Results:\n{vt_text}"
+# Carregar exemplos
+def update_text_from_example(example_name):
+    return example_emails[example_name]
+def load_example_files():
+    examples_path = Path("examples")
+    files = sorted(examples_path.glob("*.txt"))
+    return {file.name: file.read_text(encoding="utf-8") for file in files}
+example_emails = load_example_files()  # example file name -> file text, read once at import time
+# Gradio interface
+with gr.Blocks() as demo:
+    gr.Markdown("## 🛡️ PhishHunter – Email Phishing Detector")
+    with gr.Row():
+        dropdown = gr.Dropdown(
+            choices=list(example_emails.keys()),
+            label="Load Example Email",
+            info="Select a sample email to test",
+        )
+        textbox = gr.Textbox(lines=15, label="Paste or load email content")
+        filebox = gr.File(label="Upload email (.txt or .msg)", file_types=[".txt", ".msg"])
+    # Choosing an example copies its text into the textbox.
+    dropdown.change(fn=update_text_from_example, inputs=dropdown, outputs=textbox)
+    output = gr.Textbox(label="Classification & Explanation")
+    btn = gr.Button("Analyze")
+    # Input order must match analyze_email(file_input, text_input).
+    btn.click(fn=analyze_email, inputs=[filebox, textbox], outputs=output)
+if __name__ == "__main__":
+    demo.launch()

examples/Alerta Novas mensagens de Self-service.msg ADDED Viewed

Binary file (80.4 kB). View file

examples/Email_Phis_URL_mal.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+Hi,
+it's not importante, but click in link ...
+http://www.eicar.org/download/eicar.com

examples/Leg_email_1.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+Subject: Meeting Rescheduled
+Hi John,
+The client meeting has been rescheduled to Thursday at 3 PM. Please let me know if that works for you.
+Best,
+Maria

examples/Leg_email_2.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+Subject: Invoice for Your Purchase
+Dear Customer,
+Thank you for your recent order. Attached is the invoice for your purchase. If you have any questions, feel free to contact our support team.
+Best regards,
+Online Store Team

examples/email_5_real.txt ADDED Viewed

	@@ -0,0 +1,10 @@

+Subject: [Internal] Firewall Maintenance Notification
+Dear team,
+Please note that the firewall will be updated on Saturday at 02:00 AM. Expect short-term service interruptions during the maintenance window.
+If you experience any issues, contact the IT team.
+Regards,
+IT Operations

examples/email_5_urg_leg.txt ADDED Viewed

	@@ -0,0 +1,8 @@

+Subject: Action Required – Submit Report by EOD
+Hi Ana,
+Just a quick reminder to submit the monthly expense report by the end of the day. Let me know if you need any help.
+Thanks,
+Carla

examples/email_6_phs_congrats.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+Subject: Congratulations! You've Won a Gift Card 🎁
+Hi there,
+You’ve been selected as a lucky winner of a $100 Amazon gift card! To claim your prize, simply fill in your details at the link below:
+http://amaz0n-prize-now.ru
+Hurry — this offer expires in 24 hours!
+Best of luck,
+Rewards Center

examples/email_7_ph_microst.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+Subject: Microsoft Security Alert
+Dear user,
+We have noticed a new sign-in to your Microsoft account from an unrecognized device. If this was not you, please secure your account immediately:
+https://account.microsoft-security-check.com
+Ignoring this alert may result in temporary suspension for your protection.
+Microsoft Account Team

examples/phishing_email_1.txt ADDED Viewed

File without changes

examples/phishing_email_2.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+Subject: Urgent Account Verification Required
+Dear user,
+We have detected unusual activity on your account. To secure your data, please verify your identity immediately by clicking the link below:
+http://secure-account-verify.com/login
+Failure to comply within 24 hours will result in permanent suspension of your account.
+Sincerely,
+Security Team

examples/phishing_email_3.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+Subject: Important Security Update
+Your email account needs to be updated urgently. Please download the attached file and follow the instructions to avoid service interruption.
+Best regards,
+IT Support

init.bat ADDED Viewed

	@@ -0,0 +1,3 @@

+REM init.bat - install the Python dependencies, then download the spaCy English model
+pip install -r requirements.txt
+python -m spacy download en_core_web_sm

requirements.txt ADDED Viewed

	@@ -0,0 +1,112 @@

+aiofiles==24.1.0
+annotated-types==0.7.0
+anyio==4.9.0
+audioop-lts==0.2.1
+beautifulsoup4==4.13.4
+blis==1.3.0
+Brotli==1.1.0
+catalogue==2.0.10
+certifi==2025.7.14
+cffi==1.17.1
+charset-normalizer==3.4.2
+click==8.2.1
+cloudpathlib==0.21.1
+colorama==0.4.6
+colorclass==2.2.2
+compressed-rtf==1.0.7
+confection==0.1.5
+cryptography==45.0.5
+cymem==2.0.11
+easygui==0.98.3
+ebcdic==1.1.1
+en_core_web_sm @ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl#sha256=1932429db727d4bff3deed6b34cfc05df17794f4a52eeb26cf8928f7c1a0fb85
+extract-msg==0.54.1
+fastapi==0.116.1
+ffmpy==0.6.0
+filelock==3.18.0
+fsspec==2025.7.0
+gradio==5.38.0
+gradio_client==1.11.0
+groovy==0.1.2
+h11==0.16.0
+httpcore==1.0.9
+httpx==0.28.1
+huggingface-hub==0.33.4
+idna==3.10
+jellyfish==1.2.0
+Jinja2==3.1.6
+joblib==1.5.1
+langcodes==3.5.0
+langdetect==1.0.9
+language_data==1.3.0
+lark==1.1.9
+marisa-trie==1.2.1
+markdown-it-py==3.0.0
+MarkupSafe==3.0.2
+mdurl==0.1.2
+mpmath==1.3.0
+msoffcrypto-tool==5.4.2
+murmurhash==1.0.13
+networkx==3.5
+nltk==3.9.1
+numpy==2.3.1
+olefile==0.47
+oletools==0.60.2
+orjson==3.11.0
+packaging==25.0
+pandas==2.3.1
+pcodedmp==1.2.6
+pillow==11.3.0
+preshed==3.0.10
+pycparser==2.22
+pydantic==2.11.7
+pydantic_core==2.33.2
+pydub==0.25.1
+Pygments==2.19.2
+pyparsing==3.2.3
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2025.2
+PyYAML==6.0.2
+red-black-tree-mod==1.22
+regex==2024.11.6
+requests==2.32.4
+rich==14.0.0
+RTFDE==0.1.2.1
+ruff==0.12.4
+safehttpx==0.1.6
+safetensors==0.5.3
+segtok==1.5.11
+semantic-version==2.10.0
+setuptools==80.9.0
+shellingham==1.5.4
+six==1.17.0
+smart_open==7.3.0.post1
+sniffio==1.3.1
+soupsieve==2.7
+spacy==3.8.7
+spacy-legacy==3.0.12
+spacy-loggers==1.0.5
+srsly==2.5.1
+starlette==0.47.1
+sympy==1.14.0
+tabulate==0.9.0
+thinc==8.3.6
+tokenizers==0.21.2
+tomlkit==0.13.3
+torch==2.7.1
+tqdm==4.67.1
+transformers==4.53.2
+typer==0.16.0
+typing-inspection==0.4.1
+typing_extensions==4.14.1
+tzdata==2025.2
+tzlocal==5.3.1
+urllib3==2.5.0
+uvicorn==0.35.0
+wasabi==1.1.3
+weasel==0.4.1
+websockets==15.0.1
+win_unicode_console==0.5
+wrapt==1.17.2
+yake==0.6.0

rules.yaml ADDED Viewed

	@@ -0,0 +1,47 @@

+keywords:
+  impersonation:
+    en:
+      - { term: "Microsoft", weight: 0.5 }
+      - { term: "DocuSign", weight: 0.7 }
+      - { term: "PayPal", weight: 0.8 }
+    pt:
+      - { term: "DocuSign", weight: 0.7 }
+      - { term: "Caixa", weight: 0.6 }
+      - { term: "Segurança Social", weight: 0.8 }
+  prize:
+    en:
+      - { term: "congratulations", weight: 0.7 }
+      - { term: "winner", weight: 0.8 }
+      - { term: "prize", weight: 0.8 }
+    pt:
+      - { term: "par(á|a)b(é|e)ns", weight: 0.9 }
+      - { term: "pr(é|e)mio", weight: 0.8 }
+      - { term: "ganhador", weight: 0.7 }
+    global:
+      - { term: "gift.?card", weight: 0.9 }
+  sensitive_info:
+    en:
+      - { term: "password", weight: 0.9 }
+      - { term: "login", weight: 0.8 }
+      - { term: "verify your account", weight: 1.0 }
+      - { term: "credentials", weight: 0.9 }
+    pt:
+      - { term: "senha", weight: 0.9 }
+      - { term: "palavra.?passe", weight: 0.8 }
+      - { term: "credenciais", weight: 0.9 }
+      - { term: "verifique sua conta", weight: 1.0 }
+    global: []
+  urgency:
+    en:
+      - { term: "urgent", weight: 0.9 }
+      - { term: "action required", weight: 0.8 }
+    pt:
+      - { term: "urgente", weight: 0.9 }
+      - { term: "ação imediata", weight: 0.8 }
+    global:
+      - { term: "immediately", weight: 0.7 }
+      - { term: "24 hours", weight: 0.6 }

utils/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


1	+

utils/heuristics.py ADDED Viewed

	@@ -0,0 +1,67 @@

+import re
+import yaml
+import yake
+import spacy
+from langdetect import detect
+# Carregar regras heurísticas com pesos
+def load_rules(filepath="rules_weighted.yaml"):
+    with open(filepath, "r", encoding="utf-8") as f:
+        return yaml.safe_load(f)
+# Aplicar regras com base no idioma e calcular score
+def apply_heuristics(email_text, rules):
+    reasons = []
+    total_score = 0.0
+    lower = email_text.lower()
+    lang = detect(lower)
+    # Regras de negação que reduzem o score
+    negations = [
+        "não é urgente",
+        "sem urgência",
+        "não necessita ação",
+        "não requer ação imediata",
+        "sem necessidade imediata"
+    ]
+    for neg in negations:
+        if neg in lower:
+            reasons.append(f"Found negation: '{neg}' (reduces score)")
+            total_score -= 0.5
+    for category, keywords in rules.get("keywords", {}).items():
+        # Global keywords
+        for entry in keywords.get("global", []):
+            pattern = entry["term"]
+            weight = entry.get("weight", 1.0)
+            if re.search(pattern, lower, re.IGNORECASE):
+                reasons.append(f"[{category}] Matched '{pattern}' (global, weight={weight})")
+                total_score += weight
+        # Language-specific keywords
+        for entry in keywords.get(lang, []):
+            pattern = entry["term"]
+            weight = entry.get("weight", 1.0)
+            if re.search(pattern, lower, re.IGNORECASE):
+                reasons.append(f"[{category}] Matched '{pattern}' ({lang}, weight={weight})")
+                total_score += weight
+    # Heurística de links
+    urls = re.findall(r"http[s]?://\S+", email_text)
+    if urls:
+        reasons.append(f"Contains suspicious link(s): {', '.join(urls)}")
+        total_score += 1.0
+    return reasons, total_score, lang
+# Extração de palavras-chave com YAKE
+def extract_keywords(email_text, lang="en"):
+    extractor = yake.KeywordExtractor(lan=lang, top=5)
+    keywords = extractor.extract_keywords(email_text)
+    return [kw for kw, score in keywords]
+# Explicação combinada
+def explain_email(email_text, rules):
+    reasons, score, lang = apply_heuristics(email_text, rules)
+    return reasons, score

utils/heuristics_old1.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import re
+import yaml
+import spacy
+import yake
+from langdetect import detect
+# Carregar regras heurísticas a partir de ficheiro YAML
+def load_rules(filepath="rules.yaml"):
+    with open(filepath, "r", encoding="utf-8") as f:
+        return yaml.safe_load(f)
+# Aplicar regras com base no idioma
+def apply_heuristics(email_text, rules):
+    reasons = []
+    lower = email_text.lower()
+    lang = detect(lower)
+    for category, keywords in rules.get("keywords", {}).items():
+        # Aplica regras globais (sem idioma)
+        for kw in keywords.get("global", []):
+            if kw in lower:
+                reasons.append(f"Contains keyword '{kw}' related to {category} (global)")
+                break
+        # Aplica regras específicas do idioma
+        for kw in keywords.get(lang, []):
+            if kw in lower:
+                reasons.append(f"Contains keyword '{kw}' related to {category} ({lang})")
+                break
+    # Heurística de links
+    urls = re.findall(r"http[s]?://\S+", email_text)
+    if urls:
+        reasons.append(f"Contains suspicious link(s): {', '.join(urls)}")
+    return reasons, lang
+# Extrair palavras-chave com YAKE (para info complementar)
+def extract_keywords(email_text, lang="en"):
+    extractor = yake.KeywordExtractor(lan=lang, top=5)
+    keywords = extractor.extract_keywords(email_text)
+    return [kw for kw, score in keywords]
+# Explicação combinada
+def explain_email(email_text, rules):
+    reasons, lang = apply_heuristics(email_text, rules)
+    return reasons

utils/heuristics_old2.py ADDED Viewed

	@@ -0,0 +1,54 @@

+import re
+import yaml
+import yake
+import spacy
+from langdetect import detect
+# Carregar regras heurísticas com pesos
+def load_rules(filepath="rules_weighted.yaml"):
+    with open(filepath, "r", encoding="utf-8") as f:
+        return yaml.safe_load(f)
+# Aplicar regras com base no idioma e calcular score
+def apply_heuristics(email_text, rules):
+    reasons = []
+    total_score = 0.0
+    lower = email_text.lower()
+    lang = detect(lower)
+    for category, keywords in rules.get("keywords", {}).items():
+        # Global keywords
+        for entry in keywords.get("global", []):
+            pattern = entry["term"]
+            weight = entry.get("weight", 1.0)
+            if re.search(pattern, lower, re.IGNORECASE):
+                reasons.append(f"[{category}] Matched '{pattern}' (global, weight={weight})")
+                total_score += weight
+        # Language-specific keywords
+        for entry in keywords.get(lang, []):
+            pattern = entry["term"]
+            weight = entry.get("weight", 1.0)
+            if re.search(pattern, lower, re.IGNORECASE):
+                reasons.append(f"[{category}] Matched '{pattern}' ({lang}, weight={weight})")
+                total_score += weight
+    # Heurística de links
+    urls = re.findall(r"http[s]?://\S+", email_text)
+    if urls:
+        reasons.append(f"Contains suspicious link(s): {', '.join(urls)}")
+        total_score += 1.0  # peso fixo para presença de links
+    return reasons, total_score, lang
+# Extração de palavras-chave com YAKE
+def extract_keywords(email_text, lang="en"):
+    extractor = yake.KeywordExtractor(lan=lang, top=5)
+    keywords = extractor.extract_keywords(email_text)
+    return [kw for kw, score in keywords]
+# Explicação combinada
+def explain_email(email_text, rules):
+    reasons, score, lang = apply_heuristics(email_text, rules)
+    return reasons, score

utils/keyword_extractor.py ADDED Viewed

	@@ -0,0 +1,10 @@

+import yake
+# Extrai palavras-chave com YAKE
+def extract_keywords(text, lang="en", max_keywords=5):
+    try:
+        extractor = yake.KeywordExtractor(lan=lang, top=max_keywords)
+        keywords = extractor.extract_keywords(text)
+        return [kw for kw, _ in keywords]
+    except Exception:
+        return []

utils/lang_detect.py ADDED Viewed

	@@ -0,0 +1,11 @@

+from langdetect import detect, DetectorFactory
+# Garante resultados consistentes
+DetectorFactory.seed = 0
+def detect_language(text):
+    try:
+        lang = detect(text)
+        return lang
+    except Exception:
+        return "unknown"

utils/virustotal.py ADDED Viewed

	@@ -0,0 +1,28 @@

+import requests
+import os
+# Insira sua API Key do VirusTotal aqui ou defina a variável de ambiente VT_API_KEY
+VT_API_KEY = os.getenv("VT_API_KEY", "02f4bfdb0435f5235201013bd18fe7d5b0793f5fd37952eedec138b0560cdd68")
+VT_BASE_URL = "https://www.virustotal.com/api/v3"
+def check_url_virustotal(url):
+    headers = {
+        "x-apikey": VT_API_KEY
+    }
+    # Primeiro, enviar a URL para análise
+    response = requests.post(f"{VT_BASE_URL}/urls", headers=headers, data={"url": url})
+    if response.status_code == 200:
+        analysis_id = response.json()["data"]["id"]
+        # Buscar o resultado da análise
+        analysis_url = f"{VT_BASE_URL}/analyses/{analysis_id}"
+        analysis_response = requests.get(analysis_url, headers=headers)
+        if analysis_response.status_code == 200:
+            stats = analysis_response.json()["data"]["attributes"]["stats"]
+            # Exemplo: {'harmless': 70, 'malicious': 1, 'suspicious': 0, ...}
+            return stats
+        else:
+            return {"error": f"Erro ao buscar análise: {analysis_response.status_code}"}
+    else:
+        return {"error": f"Erro ao enviar URL: {response.status_code}"}