Spaces:
Running
Running
Phase 3 fix: Replace DGA transformer with Shannon-entropy heuristic (model not transformers-compatible)
Browse files
app.py
CHANGED
|
@@ -517,6 +517,9 @@ class TransformerModelLoader:
|
|
| 517 |
"""Loads pretrained Transformer classifiers from the HF Hub on demand."""
|
| 518 |
|
| 519 |
# Model registry — name → HF repo + task description
|
|
|
|
|
|
|
|
|
|
| 520 |
REGISTRY = {
|
| 521 |
"url_phishing_bert": {
|
| 522 |
"repo": "elftsdmr/malware-url-detect",
|
|
@@ -524,12 +527,6 @@ class TransformerModelLoader:
|
|
| 524 |
"labels": ["benign", "malicious"],
|
| 525 |
"desc": "BERT-based URL phishing/malware classifier",
|
| 526 |
},
|
| 527 |
-
"dga_detector": {
|
| 528 |
-
"repo": "YangYang-Research/dga-detection",
|
| 529 |
-
"task": "text-classification",
|
| 530 |
-
"labels": ["legit", "dga"],
|
| 531 |
-
"desc": "Domain Generation Algorithm detector (45-char domain input)",
|
| 532 |
-
},
|
| 533 |
}
|
| 534 |
SECURITY_LLM_REPO = "ZySec-AI/SecurityLLM" # Used via HF Inference API only
|
| 535 |
|
|
@@ -584,18 +581,58 @@ class TransformerModelLoader:
|
|
| 584 |
return self._error("url_phishing_bert", e)
|
| 585 |
|
| 586 |
def predict_dga(self, domain: str) -> Dict:
|
| 587 |
-
"""
|
| 588 |
-
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
| 593 |
-
|
| 594 |
-
|
| 595 |
-
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 599 |
|
| 600 |
def security_chat(self, query: str, max_tokens: int = 512) -> Dict:
|
| 601 |
"""Cybersecurity Q&A via ZySec-AI/SecurityLLM hosted on HF Inference API.
|
|
|
|
| 517 |
"""Loads pretrained Transformer classifiers from the HF Hub on demand."""
|
| 518 |
|
| 519 |
# Model registry — name → HF repo + task description
|
| 520 |
+
# NOTE: DGA detector uses an inline entropy heuristic, not a transformer
|
| 521 |
+
# (YangYang-Research/dga-detection has a non-standard model config that
|
| 522 |
+
# isn't loadable via transformers.pipeline()).
|
| 523 |
REGISTRY = {
|
| 524 |
"url_phishing_bert": {
|
| 525 |
"repo": "elftsdmr/malware-url-detect",
|
|
|
|
| 527 |
"labels": ["benign", "malicious"],
|
| 528 |
"desc": "BERT-based URL phishing/malware classifier",
|
| 529 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 530 |
}
|
| 531 |
SECURITY_LLM_REPO = "ZySec-AI/SecurityLLM" # Used via HF Inference API only
|
| 532 |
|
|
|
|
| 581 |
return self._error("url_phishing_bert", e)
|
| 582 |
|
| 583 |
def predict_dga(self, domain: str) -> Dict:
    """Score a domain name for DGA likelihood using a lexical heuristic.

    No model is loaded; the verdict is computed from character statistics of
    a single DNS label: Shannon entropy, vowel ratio, digit ratio, longest
    consonant run, and label length.

    Label selection: the last label is treated as the TLD and ignored. Among
    the remaining labels the longest one is analysed (first one on ties), so
    ``www.<random>.com`` is scored on ``<random>`` rather than on ``www``.
    This is a heuristic stand-in for a real public-suffix lookup. For plain
    ``name.tld`` input it selects ``name``.

    Args:
        domain: Host or domain name, e.g. ``"example.com"``. Case, surrounding
            whitespace and a trailing root dot are ignored.

    Returns:
        On success, a dict with keys ``model``, ``prediction`` (``"dga"`` or
        ``"legit"``), ``is_threat``, ``confidence`` (percent), ``threat_score``
        (0..1), ``features`` and ``inference_source``.
        If no analysable label is found (empty or non-string input), returns
        ``{"model": "dga_detector", "error": "empty domain"}``.
    """
    import math
    from collections import Counter

    empty_result = {"model": "dga_detector", "error": "empty domain"}
    if not isinstance(domain, str):
        # Report bad input the same way as an empty string instead of raising.
        return empty_result

    host = domain.lower().strip().rstrip('.')
    labels = host.split('.')
    # Drop the TLD when there is one; a bare single label is analysed as-is.
    candidates = labels[:-1] if len(labels) > 1 else labels
    d = max(candidates, key=len)[:60]
    if not d:
        return empty_result

    n = len(d)

    # Shannon entropy (bits per character) of the label's character distribution.
    freq = Counter(d)
    entropy = -sum((c / n) * math.log2(c / n) for c in freq.values())

    # Vowel ratio: a low share of vowels counts towards DGA.
    vowel_chars = 'aeiou'
    vowel_ratio = sum(1 for c in d if c in vowel_chars) / n

    # Digit ratio: a high share of digits counts towards DGA.
    digit_ratio = sum(1 for c in d if c.isdigit()) / n

    # Length signal: full weight for 12-30 chars, half for 8-40, else 0.2.
    if 12 <= n <= 30:
        length_signal = 1.0
    elif 8 <= n <= 40:
        length_signal = 0.5
    else:
        length_signal = 0.2

    # Longest run of consecutive consonants; any non-consonant resets the run.
    max_consonant_run = 0
    run = 0
    for c in d:
        if c.isalpha() and c not in vowel_chars:
            run += 1
            max_consonant_run = max(max_consonant_run, run)
        else:
            run = 0

    # Additive score over the triggered signals, then scaled by length.
    score = 0.0
    if entropy > 3.5:
        score += 0.35
    if vowel_ratio < 0.25:
        score += 0.20
    if digit_ratio > 0.15:
        score += 0.15
    if max_consonant_run >= 4:
        score += 0.20
    score *= length_signal
    score = min(1.0, score)

    is_dga = score >= 0.45
    # Confidence is reported for whichever class was predicted.
    confidence = round(score * 100, 2) if is_dga else round((1 - score) * 100, 2)
    return {
        "model": "dga_detector",
        "prediction": "dga" if is_dga else "legit",
        "is_threat": is_dga,
        "confidence": confidence,
        "threat_score": round(score, 4),
        "features": {
            "entropy": round(entropy, 3),
            "vowel_ratio": round(vowel_ratio, 3),
            "digit_ratio": round(digit_ratio, 3),
            "max_consonant_run": max_consonant_run,
            "length": n,
        },
        "inference_source": "entropy-heuristic",
    }
|
| 636 |
|
| 637 |
def security_chat(self, query: str, max_tokens: int = 512) -> Dict:
|
| 638 |
"""Cybersecurity Q&A via ZySec-AI/SecurityLLM hosted on HF Inference API.
|