Upload app.py
Browse files
app.py
CHANGED
|
@@ -247,6 +247,45 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
|
|
| 247 |
out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
|
| 248 |
out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
|
| 249 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
# Return columns in the exact order expected by the model; fill any
|
| 251 |
# still-missing engineered columns with zeros to stay robust across
|
| 252 |
# model updates.
|
|
|
|
| 247 |
out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
|
| 248 |
out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
|
| 249 |
|
| 250 |
+
# Lookalike/homoglyph detection: unusual Unicode symbols that resemble ASCII letters
|
| 251 |
+
# Examples: Cyrillic а (U+0430) looks like 'a', Greek α (U+03B1) looks like 'a', etc.
|
| 252 |
+
def _detect_lookalike_chars(url: str) -> int:
|
| 253 |
+
"""
|
| 254 |
+
Detects if URL contains Unicode characters that visually resemble ASCII letters.
|
| 255 |
+
Common lookalikes used in phishing:
|
| 256 |
+
- Cyrillic: а, е, о, р, с, х, у, ч, ы, ь (look like a,e,o,p,c,x,y,4,b,b)
|
| 257 |
+
- Greek: α, ο (look like a, o)
|
| 258 |
+
- Latin Extended: ɑ, ɢ, ᴅ, ɡ, ɪ, ɴ, ɪ (look like a,G,D,g,i,N,I)
|
| 259 |
+
"""
|
| 260 |
+
url_str = url or ""
|
| 261 |
+
|
| 262 |
+
# Cyrillic characters that look like ASCII letters
|
| 263 |
+
lookalikes_cyrillic = {
|
| 264 |
+
'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
|
| 265 |
+
'у': 'y', 'ч': '4', 'ы': 'b', 'ь': 'b', 'і': 'i', 'ї': 'yi',
|
| 266 |
+
'ґ': 'g', 'ė': 'e', 'ń': 'n', 'ș': 's', 'ț': 't'
|
| 267 |
+
}
|
| 268 |
+
|
| 269 |
+
# Greek characters that look like ASCII letters
|
| 270 |
+
lookalikes_greek = {
|
| 271 |
+
'α': 'a', 'ο': 'o', 'ν': 'v', 'τ': 't', 'ρ': 'p'
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
# Latin Extended lookalikes
|
| 275 |
+
lookalikes_latin = {
|
| 276 |
+
'ɑ': 'a', 'ɢ': 'g', 'ᴅ': 'd', 'ɡ': 'g', 'ɪ': 'i',
|
| 277 |
+
'ɴ': 'n', 'ᴘ': 'p', 'ᴠ': 'v', 'ᴡ': 'w', 'ɨ': 'i'
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
all_lookalikes = {**lookalikes_cyrillic, **lookalikes_greek, **lookalikes_latin}
|
| 281 |
+
|
| 282 |
+
for char in url_str:
|
| 283 |
+
if char in all_lookalikes:
|
| 284 |
+
return 1
|
| 285 |
+
return 0
|
| 286 |
+
|
| 287 |
+
out["has_lookalike_chars"] = s.apply(_detect_lookalike_chars)
|
| 288 |
+
|
| 289 |
# Return columns in the exact order expected by the model; fill any
|
| 290 |
# still-missing engineered columns with zeros to stay robust across
|
| 291 |
# model updates.
|