Perth0603 committed on
Commit
aafb4b7
·
verified ·
1 Parent(s): 0319812

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +39 -0
app.py CHANGED
@@ -247,6 +247,45 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
247
  out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
248
  out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
249
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
250
  # Return columns in the exact order expected by the model; fill any
251
  # still-missing engineered columns with zeros to stay robust across
252
  # model updates.
 
247
  out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
248
  out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
249
 
250
+ # Lookalike/homoglyph detection: unusual Unicode symbols that resemble ASCII letters
251
+ # Examples: Cyrillic а (U+0430) looks like 'a', Greek α (U+03B1) looks like 'a', etc.
252
+ def _detect_lookalike_chars(url: str) -> int:
253
+ """
254
+ Detects if URL contains Unicode characters that visually resemble ASCII letters.
255
+ Common lookalikes used in phishing:
256
+ - Cyrillic: а, е, о, р, с, х, у, ч, ы, ь (look like a,e,o,p,c,x,y,4,b,b)
257
+ - Greek: α, ο (look like a, o)
258
+ - Latin Extended: ɑ, ɢ, ᴅ, ɡ, ɪ, ɴ, ɪ (look like a,G,D,g,i,N,I)
259
+ """
260
+ url_str = url or ""
261
+
262
+ # Cyrillic characters that look like ASCII letters
263
+ lookalikes_cyrillic = {
264
+ 'а': 'a', 'е': 'e', 'о': 'o', 'р': 'p', 'с': 'c', 'х': 'x',
265
+ 'у': 'y', 'ч': '4', 'ы': 'b', 'ь': 'b', 'і': 'i', 'ї': 'yi',
266
+ 'ґ': 'g', 'ė': 'e', 'ń': 'n', 'ș': 's', 'ț': 't'
267
+ }
268
+
269
+ # Greek characters that look like ASCII letters
270
+ lookalikes_greek = {
271
+ 'α': 'a', 'ο': 'o', 'ν': 'v', 'τ': 't', 'ρ': 'p'
272
+ }
273
+
274
+ # Latin Extended lookalikes
275
+ lookalikes_latin = {
276
+ 'ɑ': 'a', 'ɢ': 'g', 'ᴅ': 'd', 'ɡ': 'g', 'ɪ': 'i',
277
+ 'ɴ': 'n', 'ᴘ': 'p', 'ᴠ': 'v', 'ᴡ': 'w', 'ɨ': 'i'
278
+ }
279
+
280
+ all_lookalikes = {**lookalikes_cyrillic, **lookalikes_greek, **lookalikes_latin}
281
+
282
+ for char in url_str:
283
+ if char in all_lookalikes:
284
+ return 1
285
+ return 0
286
+
287
+ out["has_lookalike_chars"] = s.apply(_detect_lookalike_chars)
288
+
289
  # Return columns in the exact order expected by the model; fill any
290
  # still-missing engineered columns with zeros to stay robust across
291
  # model updates.