Update app.py
Browse files
app.py
CHANGED
|
@@ -75,7 +75,7 @@ if NLTK_AVAILABLE:
|
|
| 75 |
'alert', 'warning', 'action required', 'unusual activity', 'compromised'
|
| 76 |
}
|
| 77 |
|
| 78 |
-
# Consolidated lookalike characters dictionary
|
| 79 |
LOOKALIKE_CHARS = {
|
| 80 |
# Cyrillic
|
| 81 |
'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
|
|
@@ -90,11 +90,11 @@ LOOKALIKE_CHARS = {
|
|
| 90 |
|
| 91 |
BRAND_NAMES = [
|
| 92 |
"facebook", "linkedin", "paypal", "google", "amazon", "apple",
|
| 93 |
-
"microsoft", "instagram", "netflix", "twitter", "whatsapp"
|
| 94 |
]
|
| 95 |
|
| 96 |
SUSPICIOUS_KEYWORDS = ["login", "verify", "secure", "update", "bank", "pay", "account", "webscr"]
|
| 97 |
-
SUSPICIOUS_TLDS = {"tk", "ml", "ga", "cf", "gq", "xyz", "top", "buzz", "icu", "fit", "rest", "work", "click", "country", "zip"}
|
| 98 |
|
| 99 |
|
| 100 |
app = FastAPI(
|
|
@@ -425,7 +425,8 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
|
|
| 425 |
def _ratio_digits(txt: str) -> float:
|
| 426 |
if not txt:
|
| 427 |
return 0.0
|
| 428 |
-
|
|
|
|
| 429 |
|
| 430 |
out["sld_digit_ratio"] = sld_series.apply(_ratio_digits)
|
| 431 |
out["sld_entropy"] = sld_series.apply(_shannon_entropy)
|
|
@@ -435,10 +436,11 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
|
|
| 435 |
if not host:
|
| 436 |
return 0.0
|
| 437 |
sld = host.split(".")[-2] if "." in host else host
|
| 438 |
-
|
| 439 |
-
|
| 440 |
-
SequenceMatcher(None,
|
| 441 |
-
|
|
|
|
| 442 |
|
| 443 |
out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
|
| 444 |
out["like_facebook"] = hosts.apply(
|
|
|
|
| 75 |
'alert', 'warning', 'action required', 'unusual activity', 'compromised'
|
| 76 |
}
|
| 77 |
|
| 78 |
+
# Consolidated lookalike characters dictionary
|
| 79 |
LOOKALIKE_CHARS = {
|
| 80 |
# Cyrillic
|
| 81 |
'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
|
|
|
|
| 90 |
|
| 91 |
BRAND_NAMES = [
|
| 92 |
"facebook", "linkedin", "paypal", "google", "amazon", "apple",
|
| 93 |
+
"microsoft", "instagram", "netflix", "twitter", "whatsapp", "bank", "hsbc", "yahoo", "outlook"
|
| 94 |
]
|
| 95 |
|
| 96 |
SUSPICIOUS_KEYWORDS = ["login", "verify", "secure", "update", "bank", "pay", "account", "webscr"]
|
| 97 |
+
SUSPICIOUS_TLDS = {"tk", "ml", "ga", "cf", "gq", "xyz", "top", "buzz", "icu", "fit", "rest", "work", "click", "country", "zip", "ru", "kim", "support", "ltd"}
|
| 98 |
|
| 99 |
|
| 100 |
app = FastAPI(
|
|
|
|
| 425 |
def _ratio_digits(txt: str) -> float:
|
| 426 |
if not txt:
|
| 427 |
return 0.0
|
| 428 |
+
digits = sum(c.isdigit() for c in txt)
|
| 429 |
+
return float(digits) / float(len(txt))
|
| 430 |
|
| 431 |
out["sld_digit_ratio"] = sld_series.apply(_ratio_digits)
|
| 432 |
out["sld_entropy"] = sld_series.apply(_shannon_entropy)
|
|
|
|
| 436 |
if not host:
|
| 437 |
return 0.0
|
| 438 |
sld = host.split(".")[-2] if "." in host else host
|
| 439 |
+
similarities = []
|
| 440 |
+
for brand in BRAND_NAMES:
|
| 441 |
+
similarities.append(SequenceMatcher(None, host, brand).ratio())
|
| 442 |
+
similarities.append(SequenceMatcher(None, sld, brand).ratio())
|
| 443 |
+
return max(similarities) if similarities else 0.0
|
| 444 |
|
| 445 |
out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
|
| 446 |
out["like_facebook"] = hosts.apply(
|