Perth0603 committed on
Commit
a90fa5f
·
verified ·
1 Parent(s): 092cc82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +206 -25
app.py CHANGED
@@ -343,6 +343,128 @@ def _normalize_url_string(url: str) -> str:
343
  return (url or "").strip().rstrip("/")
344
 
345
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346
  # ============================================================================
347
  # API ENDPOINTS
348
  # ============================================================================
@@ -433,6 +555,13 @@ def preprocess_text(payload: PreprocessTextPayload):
433
  ])
434
  emotional_appeal = blob.sentiment.subjectivity > 0.6
435
 
 
 
 
 
 
 
 
436
  phishing_indicators = {
437
  "suspicious_keywords": detected_keywords,
438
  "keyword_count": len(detected_keywords),
@@ -440,11 +569,11 @@ def preprocess_text(payload: PreprocessTextPayload):
440
  "urgency_detected": urgency_detected,
441
  "emotional_appeal": emotional_appeal,
442
  "high_subjectivity": blob.sentiment.subjectivity > 0.6,
443
- "risk_score": min(1.0,
444
- len(detected_keywords) * 0.12 +
445
- (0.25 if urgency_detected else 0) +
446
- (0.20 if emotional_appeal else 0) +
447
- (keyword_density * 0.3)
448
  ),
449
  "risk_level": (
450
  "HIGH" if len(detected_keywords) >= 3 or urgency_detected else
@@ -488,12 +617,8 @@ def predict_url(payload: PredictUrlPayload):
488
  """
489
  Module 4: URL Analyzer
490
 
491
- Analyzes URLs for phishing using Random Forest model with:
492
- - Structural analysis (length, symbols, patterns)
493
- - Domain analysis (SLD, TLD, subdomains)
494
- - Typosquatting detection
495
- - Lookalike character detection
496
- - Brand similarity analysis
497
  """
498
  try:
499
  _load_url_model()
@@ -517,27 +642,43 @@ def predict_url(payload: PredictUrlPayload):
517
  if not url_str:
518
  return JSONResponse(status_code=400, content={"error": "Empty url"})
519
 
 
 
520
  # URL-level override via CSV lists
521
  norm_url = _normalize_url_string(url_str)
522
  phishy_set = { _normalize_url_string(u) for u in phishy_list }
523
  legit_set = { _normalize_url_string(u) for u in legit_list }
524
 
525
  if norm_url in phishy_set or norm_url in legit_set:
526
- phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
527
  label = "PHISH" if norm_url in phishy_set else "LEGIT"
 
 
 
 
 
 
 
 
 
 
 
 
 
528
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
529
- phish_proba = 0.99 if label == "PHISH" else 0.01
530
  score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
 
531
  return {
532
  "module": "url_analyzer",
533
  "label": label,
534
  "predicted_label": int(predicted_label),
535
  "score": float(score),
536
  "phishing_probability": float(phish_proba),
 
 
 
537
  "backend": str(model_type),
538
  "threshold": 0.5,
539
  "url_col": url_col,
540
- "override": {"reason": "csv_url_match"},
541
  }
542
 
543
  # Known-host override
@@ -545,21 +686,35 @@ def predict_url(payload: PredictUrlPayload):
545
  if host and host_map:
546
  for h, lbl in host_map.items():
547
  if _host_matches_any(host, [h]):
548
- phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
549
  label = lbl
 
 
 
 
 
 
 
 
 
 
 
 
 
550
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
551
- phish_proba = 0.99 if label == "PHISH" else 0.01
552
  score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
 
553
  return {
554
  "module": "url_analyzer",
555
  "label": label,
556
  "predicted_label": int(predicted_label),
557
  "score": float(score),
558
  "phishing_probability": float(phish_proba),
 
 
 
559
  "backend": str(model_type),
560
  "threshold": 0.5,
561
  "url_col": url_col,
562
- "override": {"reason": "known_host_match"},
563
  }
564
 
565
  # Lookalike character guard
@@ -579,21 +734,26 @@ def predict_url(payload: PredictUrlPayload):
579
 
580
  for char in url_str:
581
  if char in all_lookalikes:
582
- phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
583
  label = "PHISH"
 
 
 
 
584
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
585
- phish_proba = 0.95
586
  score = phish_proba
 
587
  return {
588
  "module": "url_analyzer",
589
  "label": label,
590
  "predicted_label": int(predicted_label),
591
  "score": float(score),
592
  "phishing_probability": float(phish_proba),
 
 
 
593
  "backend": "lookalike_guard",
594
  "threshold": 0.5,
595
  "url_col": url_col,
596
- "rule": "lookalike_character_detected",
597
  }
598
 
599
  # Typosquat guard
@@ -614,22 +774,28 @@ def predict_url(payload: PredictUrlPayload):
614
  has_digits = bool(re.search(r"\d", s_sld))
615
  has_hyphen = ("-" in s_sld)
616
  is_official = any(s_host.endswith(f"{_normalize_brand(b)}.com") for b in brands)
 
617
  if (best >= 0.90) and (has_digits or has_hyphen) and (not is_official):
618
- phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
619
  label = "PHISH"
 
 
 
 
620
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
621
- phish_proba = 0.90
622
  score = phish_proba
 
623
  return {
624
  "module": "url_analyzer",
625
  "label": label,
626
  "predicted_label": int(predicted_label),
627
  "score": float(score),
628
  "phishing_probability": float(phish_proba),
 
 
 
629
  "backend": "typosquat_guard",
630
  "threshold": 0.5,
631
  "url_col": url_col,
632
- "rule": "typosquat_guard",
633
  }
634
 
635
  # ML model inference
@@ -645,8 +811,20 @@ def predict_url(payload: PredictUrlPayload):
645
  pred = model.predict(feats)[0]
646
  raw_p_class1 = 1.0 if int(pred) == 1 else 0.0
647
 
648
- phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
649
- phish_proba = raw_p_class1 if phish_is_positive else (1.0 - raw_p_class1)
 
 
 
 
 
 
 
 
 
 
 
 
650
  label = "PHISH" if phish_proba >= 0.5 else "LEGIT"
651
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
652
  score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
@@ -657,6 +835,9 @@ def predict_url(payload: PredictUrlPayload):
657
  "predicted_label": int(predicted_label),
658
  "score": float(score),
659
  "phishing_probability": float(phish_proba),
 
 
 
660
  "backend": str(model_type),
661
  "threshold": 0.5,
662
  "url_col": url_col,
 
343
  return (url or "").strip().rstrip("/")
344
 
345
 
346
+ def _calibrate_confidence(raw_proba: float, url_str: str, detection_reason: Optional[str] = None) -> Dict[str, Any]:
347
+ """
348
+ Calibrate confidence score based on detection method to avoid showing 100% confidence.
349
+
350
+ Returns dict with:
351
+ - calibrated_proba: adjusted probability (65-95% for heuristic detections, 50-92% for ML)
352
+ - confidence_level: HIGH/MEDIUM/LOW
353
+ - detection_method: what triggered the detection
354
+ """
355
+
356
+ # Heuristic-based detections get fixed confidence bands (65-95% overall)
357
+ if detection_reason == "lookalike_character_detected":
358
+ # Lookalike characters: 70-80% confidence
359
+ calibrated = 0.70 + (raw_proba * 0.10)
360
+ return {
361
+ "calibrated_proba": float(calibrated),
362
+ "confidence_level": "MEDIUM-HIGH",
363
+ "detection_method": "Homoglyph/Lookalike Character Pattern",
364
+ "explanation": "URL contains characters that visually resemble legitimate letters (e.g., Cyrillic 'а' instead of 'a')"
365
+ }
366
+
367
+ elif detection_reason == "typosquat_guard":
368
+ # Typosquatting: 65-78% confidence
369
+ calibrated = 0.65 + (raw_proba * 0.13)
370
+ return {
371
+ "calibrated_proba": float(calibrated),
372
+ "confidence_level": "MEDIUM",
373
+ "detection_method": "Brand Typosquatting Pattern",
374
+ "explanation": "Domain name closely resembles a popular brand with suspicious modifications (digits/hyphens)"
375
+ }
376
+
377
+ elif detection_reason == "csv_url_match":
378
+ # Known phishing URL from CSV: 85-95% confidence
379
+ calibrated = 0.85 + (raw_proba * 0.10)
380
+ return {
381
+ "calibrated_proba": float(calibrated),
382
+ "confidence_level": "HIGH",
383
+ "detection_method": "Known Phishing URL Database Match",
384
+ "explanation": "URL matches a verified phishing URL in our database"
385
+ }
386
+
387
+ elif detection_reason == "known_host_match":
388
+ # Known host from CSV: 80-92% confidence
389
+ calibrated = 0.80 + (raw_proba * 0.12)
390
+ return {
391
+ "calibrated_proba": float(calibrated),
392
+ "confidence_level": "HIGH",
393
+ "detection_method": "Known Malicious Host Database Match",
394
+ "explanation": "Domain is listed in our verified malicious hosts database"
395
+ }
396
+
397
+ # ML model detections: calibrate based on raw probability
398
+ else:
399
+ # Parse URL for additional context
400
+ host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
401
+
402
+ # Count suspicious indicators
403
+ suspicious_count = 0
404
+ suspicious_features = []
405
+
406
+ # Check for suspicious keywords
407
+ suspicious_keywords = ["login", "verify", "secure", "update", "bank", "pay", "account", "webscr"]
408
+ for kw in suspicious_keywords:
409
+ if kw in url_str.lower():
410
+ suspicious_count += 1
411
+ suspicious_features.append(f"keyword:{kw}")
412
+
413
+ # Check for IP address
414
+ if re.search(r"(?:\d{1,3}\.){3}\d{1,3}", url_str):
415
+ suspicious_count += 1
416
+ suspicious_features.append("ip_address")
417
+
418
+ # Check for excessive length
419
+ if len(url_str) > 75:
420
+ suspicious_count += 1
421
+ suspicious_features.append("excessive_length")
422
+
423
+ # Check for many subdomains
424
+ if host.count('.') > 3:
425
+ suspicious_count += 1
426
+ suspicious_features.append("many_subdomains")
427
+
428
+ # Calibrate based on ML confidence and feature count
429
+ if raw_proba >= 0.90:
430
+ # Very high ML confidence + multiple indicators: 82-92%
431
+ if suspicious_count >= 3:
432
+ calibrated = 0.82 + (raw_proba * 0.10)
433
+ confidence = "HIGH"
434
+ # High ML confidence with fewer indicators: 75-88%
435
+ else:
436
+ calibrated = 0.75 + (raw_proba * 0.13)
437
+ confidence = "MEDIUM-HIGH"
438
+
439
+ elif raw_proba >= 0.75:
440
+ # Medium-high ML confidence: 68-82%
441
+ if suspicious_count >= 2:
442
+ calibrated = 0.68 + (raw_proba * 0.14)
443
+ confidence = "MEDIUM-HIGH"
444
+ else:
445
+ calibrated = 0.60 + (raw_proba * 0.15)
446
+ confidence = "MEDIUM"
447
+
448
+ elif raw_proba >= 0.60:
449
+ # Medium ML confidence: 55-72%
450
+ calibrated = 0.55 + (raw_proba * 0.17)
451
+ confidence = "MEDIUM"
452
+
453
+ else:
454
+ # Lower confidence: keep closer to original but cap at 65%
455
+ calibrated = min(0.65, 0.50 + (raw_proba * 0.15))
456
+ confidence = "LOW-MEDIUM"
457
+
458
+ feature_text = f" (Detected: {', '.join(suspicious_features[:3])})" if suspicious_features else ""
459
+
460
+ return {
461
+ "calibrated_proba": float(calibrated),
462
+ "confidence_level": confidence,
463
+ "detection_method": f"Machine Learning Analysis ({suspicious_count} suspicious indicators){feature_text}",
464
+ "explanation": "Random Forest model detected multiple phishing patterns in URL structure and content"
465
+ }
466
+
467
+
468
  # ============================================================================
469
  # API ENDPOINTS
470
  # ============================================================================
 
555
  ])
556
  emotional_appeal = blob.sentiment.subjectivity > 0.6
557
 
558
+ # Calibrated confidence for text analysis (50-85%)
559
+ base_score = min(0.85, 0.50 + (len(detected_keywords) * 0.10) + (keyword_density * 0.25))
560
+ if urgency_detected:
561
+ base_score = min(0.85, base_score + 0.15)
562
+ if emotional_appeal:
563
+ base_score = min(0.85, base_score + 0.10)
564
+
565
  phishing_indicators = {
566
  "suspicious_keywords": detected_keywords,
567
  "keyword_count": len(detected_keywords),
 
569
  "urgency_detected": urgency_detected,
570
  "emotional_appeal": emotional_appeal,
571
  "high_subjectivity": blob.sentiment.subjectivity > 0.6,
572
+ "risk_score": float(base_score),
573
+ "confidence_level": (
574
+ "HIGH" if base_score >= 0.75 else
575
+ "MEDIUM" if base_score >= 0.60 else
576
+ "LOW"
577
  ),
578
  "risk_level": (
579
  "HIGH" if len(detected_keywords) >= 3 or urgency_detected else
 
617
  """
618
  Module 4: URL Analyzer
619
 
620
+ Analyzes URLs for phishing using Random Forest model with calibrated confidence scores.
621
+ Confidence ranges for PHISH verdicts: 65-95% (heuristic), 50-92% (ML model)
 
 
 
 
622
  """
623
  try:
624
  _load_url_model()
 
642
  if not url_str:
643
  return JSONResponse(status_code=400, content={"error": "Empty url"})
644
 
645
+ phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
646
+
647
  # URL-level override via CSV lists
648
  norm_url = _normalize_url_string(url_str)
649
  phishy_set = { _normalize_url_string(u) for u in phishy_list }
650
  legit_set = { _normalize_url_string(u) for u in legit_list }
651
 
652
  if norm_url in phishy_set or norm_url in legit_set:
 
653
  label = "PHISH" if norm_url in phishy_set else "LEGIT"
654
+ raw_proba = 0.99 if label == "PHISH" else 0.01
655
+
656
+ if label == "PHISH":
657
+ calibration = _calibrate_confidence(raw_proba, url_str, "csv_url_match")
658
+ phish_proba = calibration["calibrated_proba"]
659
+ else:
660
+ phish_proba = raw_proba
661
+ calibration = {
662
+ "confidence_level": "HIGH",
663
+ "detection_method": "Known Legitimate URL",
664
+ "explanation": "URL verified as legitimate in our database"
665
+ }
666
+
667
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
 
668
  score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
669
+
670
  return {
671
  "module": "url_analyzer",
672
  "label": label,
673
  "predicted_label": int(predicted_label),
674
  "score": float(score),
675
  "phishing_probability": float(phish_proba),
676
+ "confidence_level": calibration["confidence_level"],
677
+ "detection_method": calibration["detection_method"],
678
+ "explanation": calibration["explanation"],
679
  "backend": str(model_type),
680
  "threshold": 0.5,
681
  "url_col": url_col,
 
682
  }
683
 
684
  # Known-host override
 
686
  if host and host_map:
687
  for h, lbl in host_map.items():
688
  if _host_matches_any(host, [h]):
 
689
  label = lbl
690
+ raw_proba = 0.99 if label == "PHISH" else 0.01
691
+
692
+ if label == "PHISH":
693
+ calibration = _calibrate_confidence(raw_proba, url_str, "known_host_match")
694
+ phish_proba = calibration["calibrated_proba"]
695
+ else:
696
+ phish_proba = raw_proba
697
+ calibration = {
698
+ "confidence_level": "HIGH",
699
+ "detection_method": "Known Legitimate Host",
700
+ "explanation": "Domain verified as legitimate"
701
+ }
702
+
703
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
 
704
  score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
705
+
706
  return {
707
  "module": "url_analyzer",
708
  "label": label,
709
  "predicted_label": int(predicted_label),
710
  "score": float(score),
711
  "phishing_probability": float(phish_proba),
712
+ "confidence_level": calibration["confidence_level"],
713
+ "detection_method": calibration["detection_method"],
714
+ "explanation": calibration["explanation"],
715
  "backend": str(model_type),
716
  "threshold": 0.5,
717
  "url_col": url_col,
 
718
  }
719
 
720
  # Lookalike character guard
 
734
 
735
  for char in url_str:
736
  if char in all_lookalikes:
 
737
  label = "PHISH"
738
+ raw_proba = 0.95
739
+ calibration = _calibrate_confidence(raw_proba, url_str, "lookalike_character_detected")
740
+ phish_proba = calibration["calibrated_proba"]
741
+
742
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
 
743
  score = phish_proba
744
+
745
  return {
746
  "module": "url_analyzer",
747
  "label": label,
748
  "predicted_label": int(predicted_label),
749
  "score": float(score),
750
  "phishing_probability": float(phish_proba),
751
+ "confidence_level": calibration["confidence_level"],
752
+ "detection_method": calibration["detection_method"],
753
+ "explanation": calibration["explanation"],
754
  "backend": "lookalike_guard",
755
  "threshold": 0.5,
756
  "url_col": url_col,
 
757
  }
758
 
759
  # Typosquat guard
 
774
  has_digits = bool(re.search(r"\d", s_sld))
775
  has_hyphen = ("-" in s_sld)
776
  is_official = any(s_host.endswith(f"{_normalize_brand(b)}.com") for b in brands)
777
+
778
  if (best >= 0.90) and (has_digits or has_hyphen) and (not is_official):
 
779
  label = "PHISH"
780
+ raw_proba = 0.90
781
+ calibration = _calibrate_confidence(raw_proba, url_str, "typosquat_guard")
782
+ phish_proba = calibration["calibrated_proba"]
783
+
784
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
 
785
  score = phish_proba
786
+
787
  return {
788
  "module": "url_analyzer",
789
  "label": label,
790
  "predicted_label": int(predicted_label),
791
  "score": float(score),
792
  "phishing_probability": float(phish_proba),
793
+ "confidence_level": calibration["confidence_level"],
794
+ "detection_method": calibration["detection_method"],
795
+ "explanation": calibration["explanation"],
796
  "backend": "typosquat_guard",
797
  "threshold": 0.5,
798
  "url_col": url_col,
 
799
  }
800
 
801
  # ML model inference
 
811
  pred = model.predict(feats)[0]
812
  raw_p_class1 = 1.0 if int(pred) == 1 else 0.0
813
 
814
+ raw_phish_proba = raw_p_class1 if phish_is_positive else (1.0 - raw_p_class1)
815
+
816
+ # Calibrate ML model predictions
817
+ if raw_phish_proba >= 0.5:
818
+ calibration = _calibrate_confidence(raw_phish_proba, url_str, None)
819
+ phish_proba = calibration["calibrated_proba"]
820
+ else:
821
+ phish_proba = raw_phish_proba
822
+ calibration = {
823
+ "confidence_level": "HIGH",
824
+ "detection_method": "Machine Learning Analysis",
825
+ "explanation": "Random Forest model analysis indicates legitimate URL patterns"
826
+ }
827
+
828
  label = "PHISH" if phish_proba >= 0.5 else "LEGIT"
829
  predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
830
  score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
 
835
  "predicted_label": int(predicted_label),
836
  "score": float(score),
837
  "phishing_probability": float(phish_proba),
838
+ "confidence_level": calibration.get("confidence_level", "MEDIUM"),
839
+ "detection_method": calibration.get("detection_method", "ML Analysis"),
840
+ "explanation": calibration.get("explanation", "Statistical analysis of URL patterns"),
841
  "backend": str(model_type),
842
  "threshold": 0.5,
843
  "url_col": url_col,