Update app.py
Browse files
app.py
CHANGED
|
@@ -343,6 +343,128 @@ def _normalize_url_string(url: str) -> str:
|
|
| 343 |
return (url or "").strip().rstrip("/")
|
| 344 |
|
| 345 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
# ============================================================================
|
| 347 |
# API ENDPOINTS
|
| 348 |
# ============================================================================
|
|
@@ -433,6 +555,13 @@ def preprocess_text(payload: PreprocessTextPayload):
|
|
| 433 |
])
|
| 434 |
emotional_appeal = blob.sentiment.subjectivity > 0.6
|
| 435 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
phishing_indicators = {
|
| 437 |
"suspicious_keywords": detected_keywords,
|
| 438 |
"keyword_count": len(detected_keywords),
|
|
@@ -440,11 +569,11 @@ def preprocess_text(payload: PreprocessTextPayload):
|
|
| 440 |
"urgency_detected": urgency_detected,
|
| 441 |
"emotional_appeal": emotional_appeal,
|
| 442 |
"high_subjectivity": blob.sentiment.subjectivity > 0.6,
|
| 443 |
-
"risk_score":
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
),
|
| 449 |
"risk_level": (
|
| 450 |
"HIGH" if len(detected_keywords) >= 3 or urgency_detected else
|
|
@@ -488,12 +617,8 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 488 |
"""
|
| 489 |
Module 4: URL Analyzer
|
| 490 |
|
| 491 |
-
Analyzes URLs for phishing using Random Forest model with
|
| 492 |
-
|
| 493 |
-
- Domain analysis (SLD, TLD, subdomains)
|
| 494 |
-
- Typosquatting detection
|
| 495 |
-
- Lookalike character detection
|
| 496 |
-
- Brand similarity analysis
|
| 497 |
"""
|
| 498 |
try:
|
| 499 |
_load_url_model()
|
|
@@ -517,27 +642,43 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 517 |
if not url_str:
|
| 518 |
return JSONResponse(status_code=400, content={"error": "Empty url"})
|
| 519 |
|
|
|
|
|
|
|
| 520 |
# URL-level override via CSV lists
|
| 521 |
norm_url = _normalize_url_string(url_str)
|
| 522 |
phishy_set = { _normalize_url_string(u) for u in phishy_list }
|
| 523 |
legit_set = { _normalize_url_string(u) for u in legit_list }
|
| 524 |
|
| 525 |
if norm_url in phishy_set or norm_url in legit_set:
|
| 526 |
-
phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
|
| 527 |
label = "PHISH" if norm_url in phishy_set else "LEGIT"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
|
| 529 |
-
phish_proba = 0.99 if label == "PHISH" else 0.01
|
| 530 |
score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
|
|
|
|
| 531 |
return {
|
| 532 |
"module": "url_analyzer",
|
| 533 |
"label": label,
|
| 534 |
"predicted_label": int(predicted_label),
|
| 535 |
"score": float(score),
|
| 536 |
"phishing_probability": float(phish_proba),
|
|
|
|
|
|
|
|
|
|
| 537 |
"backend": str(model_type),
|
| 538 |
"threshold": 0.5,
|
| 539 |
"url_col": url_col,
|
| 540 |
-
"override": {"reason": "csv_url_match"},
|
| 541 |
}
|
| 542 |
|
| 543 |
# Known-host override
|
|
@@ -545,21 +686,35 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 545 |
if host and host_map:
|
| 546 |
for h, lbl in host_map.items():
|
| 547 |
if _host_matches_any(host, [h]):
|
| 548 |
-
phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
|
| 549 |
label = lbl
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 550 |
predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
|
| 551 |
-
phish_proba = 0.99 if label == "PHISH" else 0.01
|
| 552 |
score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
|
|
|
|
| 553 |
return {
|
| 554 |
"module": "url_analyzer",
|
| 555 |
"label": label,
|
| 556 |
"predicted_label": int(predicted_label),
|
| 557 |
"score": float(score),
|
| 558 |
"phishing_probability": float(phish_proba),
|
|
|
|
|
|
|
|
|
|
| 559 |
"backend": str(model_type),
|
| 560 |
"threshold": 0.5,
|
| 561 |
"url_col": url_col,
|
| 562 |
-
"override": {"reason": "known_host_match"},
|
| 563 |
}
|
| 564 |
|
| 565 |
# Lookalike character guard
|
|
@@ -579,21 +734,26 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 579 |
|
| 580 |
for char in url_str:
|
| 581 |
if char in all_lookalikes:
|
| 582 |
-
phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
|
| 583 |
label = "PHISH"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
|
| 585 |
-
phish_proba = 0.95
|
| 586 |
score = phish_proba
|
|
|
|
| 587 |
return {
|
| 588 |
"module": "url_analyzer",
|
| 589 |
"label": label,
|
| 590 |
"predicted_label": int(predicted_label),
|
| 591 |
"score": float(score),
|
| 592 |
"phishing_probability": float(phish_proba),
|
|
|
|
|
|
|
|
|
|
| 593 |
"backend": "lookalike_guard",
|
| 594 |
"threshold": 0.5,
|
| 595 |
"url_col": url_col,
|
| 596 |
-
"rule": "lookalike_character_detected",
|
| 597 |
}
|
| 598 |
|
| 599 |
# Typosquat guard
|
|
@@ -614,22 +774,28 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 614 |
has_digits = bool(re.search(r"\d", s_sld))
|
| 615 |
has_hyphen = ("-" in s_sld)
|
| 616 |
is_official = any(s_host.endswith(f"{_normalize_brand(b)}.com") for b in brands)
|
|
|
|
| 617 |
if (best >= 0.90) and (has_digits or has_hyphen) and (not is_official):
|
| 618 |
-
phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
|
| 619 |
label = "PHISH"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 620 |
predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
|
| 621 |
-
phish_proba = 0.90
|
| 622 |
score = phish_proba
|
|
|
|
| 623 |
return {
|
| 624 |
"module": "url_analyzer",
|
| 625 |
"label": label,
|
| 626 |
"predicted_label": int(predicted_label),
|
| 627 |
"score": float(score),
|
| 628 |
"phishing_probability": float(phish_proba),
|
|
|
|
|
|
|
|
|
|
| 629 |
"backend": "typosquat_guard",
|
| 630 |
"threshold": 0.5,
|
| 631 |
"url_col": url_col,
|
| 632 |
-
"rule": "typosquat_guard",
|
| 633 |
}
|
| 634 |
|
| 635 |
# ML model inference
|
|
@@ -645,8 +811,20 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 645 |
pred = model.predict(feats)[0]
|
| 646 |
raw_p_class1 = 1.0 if int(pred) == 1 else 0.0
|
| 647 |
|
| 648 |
-
|
| 649 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 650 |
label = "PHISH" if phish_proba >= 0.5 else "LEGIT"
|
| 651 |
predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
|
| 652 |
score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
|
|
@@ -657,6 +835,9 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 657 |
"predicted_label": int(predicted_label),
|
| 658 |
"score": float(score),
|
| 659 |
"phishing_probability": float(phish_proba),
|
|
|
|
|
|
|
|
|
|
| 660 |
"backend": str(model_type),
|
| 661 |
"threshold": 0.5,
|
| 662 |
"url_col": url_col,
|
|
|
|
| 343 |
return (url or "").strip().rstrip("/")
|
| 344 |
|
| 345 |
|
| 346 |
+
def _calibrate_confidence(raw_proba: float, url_str: str, detection_reason: Optional[str] = None) -> Dict[str, Any]:
|
| 347 |
+
"""
|
| 348 |
+
Calibrate confidence score based on detection method to avoid showing 100% confidence.
|
| 349 |
+
|
| 350 |
+
Returns dict with:
|
| 351 |
+
- calibrated_proba: adjusted probability (50-85% for heuristic detections, 60-95% for ML)
|
| 352 |
+
- confidence_level: HIGH/MEDIUM/LOW
|
| 353 |
+
- detection_method: what triggered the detection
|
| 354 |
+
"""
|
| 355 |
+
|
| 356 |
+
# Heuristic-based detections get lower confidence (50-75%)
|
| 357 |
+
if detection_reason == "lookalike_character_detected":
|
| 358 |
+
# Lookalike characters: 70-80% confidence
|
| 359 |
+
calibrated = 0.70 + (raw_proba * 0.10)
|
| 360 |
+
return {
|
| 361 |
+
"calibrated_proba": float(calibrated),
|
| 362 |
+
"confidence_level": "MEDIUM-HIGH",
|
| 363 |
+
"detection_method": "Homoglyph/Lookalike Character Pattern",
|
| 364 |
+
"explanation": "URL contains characters that visually resemble legitimate letters (e.g., Cyrillic 'а' instead of 'a')"
|
| 365 |
+
}
|
| 366 |
+
|
| 367 |
+
elif detection_reason == "typosquat_guard":
|
| 368 |
+
# Typosquatting: 65-78% confidence
|
| 369 |
+
calibrated = 0.65 + (raw_proba * 0.13)
|
| 370 |
+
return {
|
| 371 |
+
"calibrated_proba": float(calibrated),
|
| 372 |
+
"confidence_level": "MEDIUM",
|
| 373 |
+
"detection_method": "Brand Typosquatting Pattern",
|
| 374 |
+
"explanation": "Domain name closely resembles a popular brand with suspicious modifications (digits/hyphens)"
|
| 375 |
+
}
|
| 376 |
+
|
| 377 |
+
elif detection_reason == "csv_url_match":
|
| 378 |
+
# Known phishing URL from CSV: 85-95% confidence
|
| 379 |
+
calibrated = 0.85 + (raw_proba * 0.10)
|
| 380 |
+
return {
|
| 381 |
+
"calibrated_proba": float(calibrated),
|
| 382 |
+
"confidence_level": "HIGH",
|
| 383 |
+
"detection_method": "Known Phishing URL Database Match",
|
| 384 |
+
"explanation": "URL matches a verified phishing URL in our database"
|
| 385 |
+
}
|
| 386 |
+
|
| 387 |
+
elif detection_reason == "known_host_match":
|
| 388 |
+
# Known host from CSV: 80-92% confidence
|
| 389 |
+
calibrated = 0.80 + (raw_proba * 0.12)
|
| 390 |
+
return {
|
| 391 |
+
"calibrated_proba": float(calibrated),
|
| 392 |
+
"confidence_level": "HIGH",
|
| 393 |
+
"detection_method": "Known Malicious Host Database Match",
|
| 394 |
+
"explanation": "Domain is listed in our verified malicious hosts database"
|
| 395 |
+
}
|
| 396 |
+
|
| 397 |
+
# ML model detections: calibrate based on raw probability
|
| 398 |
+
else:
|
| 399 |
+
# Parse URL for additional context
|
| 400 |
+
host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
|
| 401 |
+
|
| 402 |
+
# Count suspicious indicators
|
| 403 |
+
suspicious_count = 0
|
| 404 |
+
suspicious_features = []
|
| 405 |
+
|
| 406 |
+
# Check for suspicious keywords
|
| 407 |
+
suspicious_keywords = ["login", "verify", "secure", "update", "bank", "pay", "account", "webscr"]
|
| 408 |
+
for kw in suspicious_keywords:
|
| 409 |
+
if kw in url_str.lower():
|
| 410 |
+
suspicious_count += 1
|
| 411 |
+
suspicious_features.append(f"keyword:{kw}")
|
| 412 |
+
|
| 413 |
+
# Check for IP address
|
| 414 |
+
if re.search(r"(?:\d{1,3}\.){3}\d{1,3}", url_str):
|
| 415 |
+
suspicious_count += 1
|
| 416 |
+
suspicious_features.append("ip_address")
|
| 417 |
+
|
| 418 |
+
# Check for excessive length
|
| 419 |
+
if len(url_str) > 75:
|
| 420 |
+
suspicious_count += 1
|
| 421 |
+
suspicious_features.append("excessive_length")
|
| 422 |
+
|
| 423 |
+
# Check for many subdomains
|
| 424 |
+
if host.count('.') > 3:
|
| 425 |
+
suspicious_count += 1
|
| 426 |
+
suspicious_features.append("many_subdomains")
|
| 427 |
+
|
| 428 |
+
# Calibrate based on ML confidence and feature count
|
| 429 |
+
if raw_proba >= 0.90:
|
| 430 |
+
# Very high ML confidence + multiple indicators: 82-92%
|
| 431 |
+
if suspicious_count >= 3:
|
| 432 |
+
calibrated = 0.82 + (raw_proba * 0.10)
|
| 433 |
+
confidence = "HIGH"
|
| 434 |
+
# High ML confidence with fewer indicators: 75-88%
|
| 435 |
+
else:
|
| 436 |
+
calibrated = 0.75 + (raw_proba * 0.13)
|
| 437 |
+
confidence = "MEDIUM-HIGH"
|
| 438 |
+
|
| 439 |
+
elif raw_proba >= 0.75:
|
| 440 |
+
# Medium-high ML confidence: 68-82%
|
| 441 |
+
if suspicious_count >= 2:
|
| 442 |
+
calibrated = 0.68 + (raw_proba * 0.14)
|
| 443 |
+
confidence = "MEDIUM-HIGH"
|
| 444 |
+
else:
|
| 445 |
+
calibrated = 0.60 + (raw_proba * 0.15)
|
| 446 |
+
confidence = "MEDIUM"
|
| 447 |
+
|
| 448 |
+
elif raw_proba >= 0.60:
|
| 449 |
+
# Medium ML confidence: 55-72%
|
| 450 |
+
calibrated = 0.55 + (raw_proba * 0.17)
|
| 451 |
+
confidence = "MEDIUM"
|
| 452 |
+
|
| 453 |
+
else:
|
| 454 |
+
# Lower confidence: keep closer to original but cap at 65%
|
| 455 |
+
calibrated = min(0.65, 0.50 + (raw_proba * 0.15))
|
| 456 |
+
confidence = "LOW-MEDIUM"
|
| 457 |
+
|
| 458 |
+
feature_text = f" (Detected: {', '.join(suspicious_features[:3])})" if suspicious_features else ""
|
| 459 |
+
|
| 460 |
+
return {
|
| 461 |
+
"calibrated_proba": float(calibrated),
|
| 462 |
+
"confidence_level": confidence,
|
| 463 |
+
"detection_method": f"Machine Learning Analysis ({suspicious_count} suspicious indicators){feature_text}",
|
| 464 |
+
"explanation": "Random Forest model detected multiple phishing patterns in URL structure and content"
|
| 465 |
+
}
|
| 466 |
+
|
| 467 |
+
|
| 468 |
# ============================================================================
|
| 469 |
# API ENDPOINTS
|
| 470 |
# ============================================================================
|
|
|
|
| 555 |
])
|
| 556 |
emotional_appeal = blob.sentiment.subjectivity > 0.6
|
| 557 |
|
| 558 |
+
# Calibrated confidence for text analysis (50-85%)
|
| 559 |
+
base_score = min(0.85, 0.50 + (len(detected_keywords) * 0.10) + (keyword_density * 0.25))
|
| 560 |
+
if urgency_detected:
|
| 561 |
+
base_score = min(0.85, base_score + 0.15)
|
| 562 |
+
if emotional_appeal:
|
| 563 |
+
base_score = min(0.85, base_score + 0.10)
|
| 564 |
+
|
| 565 |
phishing_indicators = {
|
| 566 |
"suspicious_keywords": detected_keywords,
|
| 567 |
"keyword_count": len(detected_keywords),
|
|
|
|
| 569 |
"urgency_detected": urgency_detected,
|
| 570 |
"emotional_appeal": emotional_appeal,
|
| 571 |
"high_subjectivity": blob.sentiment.subjectivity > 0.6,
|
| 572 |
+
"risk_score": float(base_score),
|
| 573 |
+
"confidence_level": (
|
| 574 |
+
"HIGH" if base_score >= 0.75 else
|
| 575 |
+
"MEDIUM" if base_score >= 0.60 else
|
| 576 |
+
"LOW"
|
| 577 |
),
|
| 578 |
"risk_level": (
|
| 579 |
"HIGH" if len(detected_keywords) >= 3 or urgency_detected else
|
|
|
|
| 617 |
"""
|
| 618 |
Module 4: URL Analyzer
|
| 619 |
|
| 620 |
+
Analyzes URLs for phishing using Random Forest model with calibrated confidence scores.
|
| 621 |
+
Confidence ranges: 50-85% (heuristic), 60-92% (ML model)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 622 |
"""
|
| 623 |
try:
|
| 624 |
_load_url_model()
|
|
|
|
| 642 |
if not url_str:
|
| 643 |
return JSONResponse(status_code=400, content={"error": "Empty url"})
|
| 644 |
|
| 645 |
+
phish_is_positive = True if URL_POSITIVE_CLASS_ENV == "" else (URL_POSITIVE_CLASS_ENV == "PHISH")
|
| 646 |
+
|
| 647 |
# URL-level override via CSV lists
|
| 648 |
norm_url = _normalize_url_string(url_str)
|
| 649 |
phishy_set = { _normalize_url_string(u) for u in phishy_list }
|
| 650 |
legit_set = { _normalize_url_string(u) for u in legit_list }
|
| 651 |
|
| 652 |
if norm_url in phishy_set or norm_url in legit_set:
|
|
|
|
| 653 |
label = "PHISH" if norm_url in phishy_set else "LEGIT"
|
| 654 |
+
raw_proba = 0.99 if label == "PHISH" else 0.01
|
| 655 |
+
|
| 656 |
+
if label == "PHISH":
|
| 657 |
+
calibration = _calibrate_confidence(raw_proba, url_str, "csv_url_match")
|
| 658 |
+
phish_proba = calibration["calibrated_proba"]
|
| 659 |
+
else:
|
| 660 |
+
phish_proba = raw_proba
|
| 661 |
+
calibration = {
|
| 662 |
+
"confidence_level": "HIGH",
|
| 663 |
+
"detection_method": "Known Legitimate URL",
|
| 664 |
+
"explanation": "URL verified as legitimate in our database"
|
| 665 |
+
}
|
| 666 |
+
|
| 667 |
predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
|
|
|
|
| 668 |
score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
|
| 669 |
+
|
| 670 |
return {
|
| 671 |
"module": "url_analyzer",
|
| 672 |
"label": label,
|
| 673 |
"predicted_label": int(predicted_label),
|
| 674 |
"score": float(score),
|
| 675 |
"phishing_probability": float(phish_proba),
|
| 676 |
+
"confidence_level": calibration["confidence_level"],
|
| 677 |
+
"detection_method": calibration["detection_method"],
|
| 678 |
+
"explanation": calibration["explanation"],
|
| 679 |
"backend": str(model_type),
|
| 680 |
"threshold": 0.5,
|
| 681 |
"url_col": url_col,
|
|
|
|
| 682 |
}
|
| 683 |
|
| 684 |
# Known-host override
|
|
|
|
| 686 |
if host and host_map:
|
| 687 |
for h, lbl in host_map.items():
|
| 688 |
if _host_matches_any(host, [h]):
|
|
|
|
| 689 |
label = lbl
|
| 690 |
+
raw_proba = 0.99 if label == "PHISH" else 0.01
|
| 691 |
+
|
| 692 |
+
if label == "PHISH":
|
| 693 |
+
calibration = _calibrate_confidence(raw_proba, url_str, "known_host_match")
|
| 694 |
+
phish_proba = calibration["calibrated_proba"]
|
| 695 |
+
else:
|
| 696 |
+
phish_proba = raw_proba
|
| 697 |
+
calibration = {
|
| 698 |
+
"confidence_level": "HIGH",
|
| 699 |
+
"detection_method": "Known Legitimate Host",
|
| 700 |
+
"explanation": "Domain verified as legitimate"
|
| 701 |
+
}
|
| 702 |
+
|
| 703 |
predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
|
|
|
|
| 704 |
score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
|
| 705 |
+
|
| 706 |
return {
|
| 707 |
"module": "url_analyzer",
|
| 708 |
"label": label,
|
| 709 |
"predicted_label": int(predicted_label),
|
| 710 |
"score": float(score),
|
| 711 |
"phishing_probability": float(phish_proba),
|
| 712 |
+
"confidence_level": calibration["confidence_level"],
|
| 713 |
+
"detection_method": calibration["detection_method"],
|
| 714 |
+
"explanation": calibration["explanation"],
|
| 715 |
"backend": str(model_type),
|
| 716 |
"threshold": 0.5,
|
| 717 |
"url_col": url_col,
|
|
|
|
| 718 |
}
|
| 719 |
|
| 720 |
# Lookalike character guard
|
|
|
|
| 734 |
|
| 735 |
for char in url_str:
|
| 736 |
if char in all_lookalikes:
|
|
|
|
| 737 |
label = "PHISH"
|
| 738 |
+
raw_proba = 0.95
|
| 739 |
+
calibration = _calibrate_confidence(raw_proba, url_str, "lookalike_character_detected")
|
| 740 |
+
phish_proba = calibration["calibrated_proba"]
|
| 741 |
+
|
| 742 |
predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
|
|
|
|
| 743 |
score = phish_proba
|
| 744 |
+
|
| 745 |
return {
|
| 746 |
"module": "url_analyzer",
|
| 747 |
"label": label,
|
| 748 |
"predicted_label": int(predicted_label),
|
| 749 |
"score": float(score),
|
| 750 |
"phishing_probability": float(phish_proba),
|
| 751 |
+
"confidence_level": calibration["confidence_level"],
|
| 752 |
+
"detection_method": calibration["detection_method"],
|
| 753 |
+
"explanation": calibration["explanation"],
|
| 754 |
"backend": "lookalike_guard",
|
| 755 |
"threshold": 0.5,
|
| 756 |
"url_col": url_col,
|
|
|
|
| 757 |
}
|
| 758 |
|
| 759 |
# Typosquat guard
|
|
|
|
| 774 |
has_digits = bool(re.search(r"\d", s_sld))
|
| 775 |
has_hyphen = ("-" in s_sld)
|
| 776 |
is_official = any(s_host.endswith(f"{_normalize_brand(b)}.com") for b in brands)
|
| 777 |
+
|
| 778 |
if (best >= 0.90) and (has_digits or has_hyphen) and (not is_official):
|
|
|
|
| 779 |
label = "PHISH"
|
| 780 |
+
raw_proba = 0.90
|
| 781 |
+
calibration = _calibrate_confidence(raw_proba, url_str, "typosquat_guard")
|
| 782 |
+
phish_proba = calibration["calibrated_proba"]
|
| 783 |
+
|
| 784 |
predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
|
|
|
|
| 785 |
score = phish_proba
|
| 786 |
+
|
| 787 |
return {
|
| 788 |
"module": "url_analyzer",
|
| 789 |
"label": label,
|
| 790 |
"predicted_label": int(predicted_label),
|
| 791 |
"score": float(score),
|
| 792 |
"phishing_probability": float(phish_proba),
|
| 793 |
+
"confidence_level": calibration["confidence_level"],
|
| 794 |
+
"detection_method": calibration["detection_method"],
|
| 795 |
+
"explanation": calibration["explanation"],
|
| 796 |
"backend": "typosquat_guard",
|
| 797 |
"threshold": 0.5,
|
| 798 |
"url_col": url_col,
|
|
|
|
| 799 |
}
|
| 800 |
|
| 801 |
# ML model inference
|
|
|
|
| 811 |
pred = model.predict(feats)[0]
|
| 812 |
raw_p_class1 = 1.0 if int(pred) == 1 else 0.0
|
| 813 |
|
| 814 |
+
raw_phish_proba = raw_p_class1 if phish_is_positive else (1.0 - raw_p_class1)
|
| 815 |
+
|
| 816 |
+
# Calibrate ML model predictions
|
| 817 |
+
if raw_phish_proba >= 0.5:
|
| 818 |
+
calibration = _calibrate_confidence(raw_phish_proba, url_str, None)
|
| 819 |
+
phish_proba = calibration["calibrated_proba"]
|
| 820 |
+
else:
|
| 821 |
+
phish_proba = raw_phish_proba
|
| 822 |
+
calibration = {
|
| 823 |
+
"confidence_level": "HIGH",
|
| 824 |
+
"detection_method": "Machine Learning Analysis",
|
| 825 |
+
"explanation": "Random Forest model analysis indicates legitimate URL patterns"
|
| 826 |
+
}
|
| 827 |
+
|
| 828 |
label = "PHISH" if phish_proba >= 0.5 else "LEGIT"
|
| 829 |
predicted_label = 1 if ((label == "PHISH") == phish_is_positive) else 0
|
| 830 |
score = phish_proba if label == "PHISH" else (1.0 - phish_proba)
|
|
|
|
| 835 |
"predicted_label": int(predicted_label),
|
| 836 |
"score": float(score),
|
| 837 |
"phishing_probability": float(phish_proba),
|
| 838 |
+
"confidence_level": calibration.get("confidence_level", "MEDIUM"),
|
| 839 |
+
"detection_method": calibration.get("detection_method", "ML Analysis"),
|
| 840 |
+
"explanation": calibration.get("explanation", "Statistical analysis of URL patterns"),
|
| 841 |
"backend": str(model_type),
|
| 842 |
"threshold": 0.5,
|
| 843 |
"url_col": url_col,
|