# PhisHunter/utils/heuristics.py
# NOTE(review): the original lines here were Hugging Face web-page residue
# ("moraeslucas's picture / First Commit with 25 files / fdff15a verified"),
# which is not valid Python; preserved as a comment so the module parses.
import re

import spacy
import yaml
import yake
from langdetect import LangDetectException, detect
# Load weighted heuristic rules from a YAML file
def load_rules(filepath="rules_weighted.yaml"):
    """Load the weighted rule set used by apply_heuristics().

    Args:
        filepath: Path to the YAML rules file (UTF-8 encoded).

    Returns:
        The parsed rule mapping. An empty dict is returned when the file
        is empty: yaml.safe_load() yields None for an empty document,
        which would break `rules.get("keywords", {})` in apply_heuristics.
    """
    with open(filepath, "r", encoding="utf-8") as f:
        return yaml.safe_load(f) or {}
# Apply language-aware heuristic rules and compute a phishing score
def apply_heuristics(email_text, rules):
    """Score an email body against weighted keyword rules.

    Args:
        email_text: Raw email text.
        rules: Rule mapping as produced by load_rules(); expected shape is
            {"keywords": {category: {"global" | <lang-code>: [{"term", "weight"}]}}}.

    Returns:
        (reasons, total_score, lang): human-readable match explanations,
        the accumulated score (negations subtract, matches and links add),
        and the detected language code.
    """
    reasons = []
    total_score = 0.0
    lower = email_text.lower()
    # langdetect raises LangDetectException on text it cannot profile
    # (empty string, URLs/digits only); fall back to English so the
    # scorer still runs instead of crashing the whole pipeline.
    try:
        lang = detect(lower)
    except LangDetectException:
        lang = "en"
    # Negation phrases that reduce the score
    negations = [
        "não é urgente",
        "sem urgência",
        "não necessita ação",
        "não requer ação imediata",
        "sem necessidade imediata"
    ]
    for neg in negations:
        if neg in lower:
            reasons.append(f"Found negation: '{neg}' (reduces score)")
            total_score -= 0.5
    for category, keywords in rules.get("keywords", {}).items():
        # Global keywords apply regardless of the detected language
        for entry in keywords.get("global", []):
            pattern = entry["term"]
            weight = entry.get("weight", 1.0)
            # NOTE: terms are treated as regex patterns; IGNORECASE is
            # redundant on lowered text but kept for uppercase patterns.
            if re.search(pattern, lower, re.IGNORECASE):
                reasons.append(f"[{category}] Matched '{pattern}' (global, weight={weight})")
                total_score += weight
        # Keywords specific to the detected language
        for entry in keywords.get(lang, []):
            pattern = entry["term"]
            weight = entry.get("weight", 1.0)
            if re.search(pattern, lower, re.IGNORECASE):
                reasons.append(f"[{category}] Matched '{pattern}' ({lang}, weight={weight})")
                total_score += weight
    # Link heuristic: any URL raises the score by a flat 1.0
    urls = re.findall(r"http[s]?://\S+", email_text)
    if urls:
        reasons.append(f"Contains suspicious link(s): {', '.join(urls)}")
        total_score += 1.0
    return reasons, total_score, lang
# Keyword extraction via YAKE
def extract_keywords(email_text, lang="en"):
    """Return the top five YAKE keywords found in *email_text*.

    Args:
        email_text: Text to extract keywords from.
        lang: Language code passed to the YAKE extractor (default "en").

    Returns:
        A list of keyword strings, relevance scores discarded.
    """
    scored_terms = yake.KeywordExtractor(lan=lang, top=5).extract_keywords(email_text)
    return [term for term, _score in scored_terms]
# Combined explanation
def explain_email(email_text, rules):
    """Run the heuristics and return (reasons, score).

    Thin wrapper around apply_heuristics() that discards the detected
    language and exposes only the explanation list and the final score.
    """
    reasons, score, _lang = apply_heuristics(email_text, rules)
    return reasons, score