Che237 commited on
Commit
6f63c25
·
verified ·
1 Parent(s): 1a7766e

Phase 3 fix: Replace DGA transformer with Shannon-entropy heuristic (model not transformers-compatible)

Browse files
Files changed (1) hide show
  1. app.py +55 -18
app.py CHANGED
@@ -517,6 +517,9 @@ class TransformerModelLoader:
517
  """Loads pretrained Transformer classifiers from the HF Hub on demand."""
518
 
519
  # Model registry — name → HF repo + task description
 
 
 
520
  REGISTRY = {
521
  "url_phishing_bert": {
522
  "repo": "elftsdmr/malware-url-detect",
@@ -524,12 +527,6 @@ class TransformerModelLoader:
524
  "labels": ["benign", "malicious"],
525
  "desc": "BERT-based URL phishing/malware classifier",
526
  },
527
- "dga_detector": {
528
- "repo": "YangYang-Research/dga-detection",
529
- "task": "text-classification",
530
- "labels": ["legit", "dga"],
531
- "desc": "Domain Generation Algorithm detector (45-char domain input)",
532
- },
533
  }
534
  SECURITY_LLM_REPO = "ZySec-AI/SecurityLLM" # Used via HF Inference API only
535
 
@@ -584,18 +581,58 @@ class TransformerModelLoader:
584
  return self._error("url_phishing_bert", e)
585
 
586
  def predict_dga(self, domain: str) -> Dict:
587
- """Classify a domain as legitimate or DGA-generated."""
588
- pipe = self._ensure("dga_detector")
589
- if pipe is None:
590
- return self._unavailable("dga_detector")
591
- try:
592
- # Model expects bare domain, optimized for ≤45 chars
593
- d = domain.lower().strip()[:45]
594
- result = pipe(d)
595
- scores = result[0] if isinstance(result[0], list) else result
596
- return self._format_classification(scores, "dga_detector")
597
- except Exception as e:
598
- return self._error("dga_detector", e)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
599
 
600
  def security_chat(self, query: str, max_tokens: int = 512) -> Dict:
601
  """Cybersecurity Q&A via ZySec-AI/SecurityLLM hosted on HF Inference API.
 
517
  """Loads pretrained Transformer classifiers from the HF Hub on demand."""
518
 
519
  # Model registry — name → HF repo + task description
520
+ # NOTE: DGA detector uses an inline entropy heuristic, not a transformer
521
+ # (YangYang-Research/dga-detection has a non-standard model config that
522
+ # isn't loadable via transformers.pipeline()).
523
  REGISTRY = {
524
  "url_phishing_bert": {
525
  "repo": "elftsdmr/malware-url-detect",
 
527
  "labels": ["benign", "malicious"],
528
  "desc": "BERT-based URL phishing/malware classifier",
529
  },
 
 
 
 
 
 
530
  }
531
  SECURITY_LLM_REPO = "ZySec-AI/SecurityLLM" # Used via HF Inference API only
532
 
 
581
  return self._error("url_phishing_bert", e)
582
 
583
  def predict_dga(self, domain: str) -> Dict:
584
+ """Detect DGA-generated domains using a Shannon-entropy + character-pattern heuristic.
585
+ Real DGA domains have high entropy, low pronounceability, no real word substrings."""
586
+ import math
587
+ d = domain.lower().strip().split('.')[0][:60] # SLD only, ignore TLD
588
+ if not d:
589
+ return {"model": "dga_detector", "error": "empty domain"}
590
+ # Shannon entropy of character distribution
591
+ from collections import Counter
592
+ freq = Counter(d)
593
+ n = len(d)
594
+ entropy = -sum((c / n) * math.log2(c / n) for c in freq.values())
595
+ # Vowel ratio — DGAs typically have very few vowels
596
+ vowels = sum(1 for c in d if c in 'aeiou')
597
+ vowel_ratio = vowels / n if n else 0
598
+ # Digit ratio — DGAs often mix digits in
599
+ digits = sum(1 for c in d if c.isdigit())
600
+ digit_ratio = digits / n if n else 0
601
+ # Length signal — DGAs are usually 10-25 chars
602
+ length_signal = 1.0 if 12 <= n <= 30 else 0.5 if 8 <= n <= 40 else 0.2
603
+ # Consonant runs — DGAs often have 4+ consonants in a row
604
+ max_consonant_run = 0
605
+ run = 0
606
+ for c in d:
607
+ if c.isalpha() and c not in 'aeiou':
608
+ run += 1
609
+ max_consonant_run = max(max_consonant_run, run)
610
+ else:
611
+ run = 0
612
+ # Score combination — empirically tuned thresholds
613
+ score = 0.0
614
+ if entropy > 3.5: score += 0.35
615
+ if vowel_ratio < 0.25: score += 0.20
616
+ if digit_ratio > 0.15: score += 0.15
617
+ if max_consonant_run >= 4: score += 0.20
618
+ score *= length_signal
619
+ score = min(1.0, score)
620
+ is_dga = score >= 0.45
621
+ return {
622
+ "model": "dga_detector",
623
+ "prediction": "dga" if is_dga else "legit",
624
+ "is_threat": is_dga,
625
+ "confidence": round(score * 100, 2) if is_dga else round((1 - score) * 100, 2),
626
+ "threat_score": round(score, 4),
627
+ "features": {
628
+ "entropy": round(entropy, 3),
629
+ "vowel_ratio": round(vowel_ratio, 3),
630
+ "digit_ratio": round(digit_ratio, 3),
631
+ "max_consonant_run": max_consonant_run,
632
+ "length": n,
633
+ },
634
+ "inference_source": "entropy-heuristic",
635
+ }
636
 
637
  def security_chat(self, query: str, max_tokens: int = 512) -> Dict:
638
  """Cybersecurity Q&A via ZySec-AI/SecurityLLM hosted on HF Inference API.