Perth0603 committed on
Commit
20cb166
·
verified ·
1 Parent(s): b72253c

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -1
app.py CHANGED
@@ -3,6 +3,7 @@ import csv
3
  import re
4
  import threading
5
  from typing import Optional, List, Dict, Any
 
6
 
7
  import joblib
8
  import numpy as np
@@ -124,6 +125,8 @@ def _read_hosts_from_csv(path: str) -> Dict[str, str]:
124
  def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame:
125
  s = pd.Series(urls, dtype=str)
126
  out = pd.DataFrame()
 
 
127
  out["url_len"] = s.str.len().fillna(0)
128
  out["count_dot"] = s.str.count(r"\.")
129
  out["count_hyphen"] = s.str.count("-")
@@ -139,7 +142,98 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
139
  out["starts_https"] = s.str.startswith("https").astype(int)
140
  out["ends_with_exe"] = s.str.endswith(".exe").astype(int)
141
  out["ends_with_zip"] = s.str.endswith(".zip").astype(int)
142
- return out[feature_cols]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
 
145
  def _load_url_model():
 
3
  import re
4
  import threading
5
  from typing import Optional, List, Dict, Any
6
+ from difflib import SequenceMatcher
7
 
8
  import joblib
9
  import numpy as np
 
125
  def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame:
126
  s = pd.Series(urls, dtype=str)
127
  out = pd.DataFrame()
128
+
129
+ # Base URL-wide counts used by older models
130
  out["url_len"] = s.str.len().fillna(0)
131
  out["count_dot"] = s.str.count(r"\.")
132
  out["count_hyphen"] = s.str.count("-")
 
142
  out["starts_https"] = s.str.startswith("https").astype(int)
143
  out["ends_with_exe"] = s.str.endswith(".exe").astype(int)
144
  out["ends_with_zip"] = s.str.endswith(".zip").astype(int)
145
+
146
+ # Host/SLD/TLD derived features used by newer models
147
+ hosts = s.apply(lambda x: (urlparse(x).hostname or "").lower())
148
+ out["host_len"] = hosts.str.len().fillna(0)
149
+
150
+ # Subdomain count: number of labels minus 2 (for sld.tld); never below 0
151
+ label_counts = hosts.str.count(r"\.") + 1
152
+ sub_count = (label_counts - 2).clip(lower=0)
153
+ out["subdomain_count"] = sub_count.fillna(0)
154
+
155
+ # TLD and SLD extraction (simple heuristic; handles common cases)
156
+ parts_series = hosts.str.split(".")
157
+ tld_series = parts_series.apply(lambda p: p[-1] if len(p) >= 1 else "")
158
+ sld_series = parts_series.apply(lambda p: p[-2] if len(p) >= 2 else "")
159
+
160
+ # Suspicious TLD flag (expand as needed)
161
+ suspicious_tlds = {
162
+ "tk", "ml", "ga", "cf", "gq", "xyz", "top", "buzz", "icu",
163
+ "fit", "rest", "work", "click", "country", "zip"
164
+ }
165
+ out["tld_suspicious"] = tld_series.apply(lambda t: 1 if t.lower() in suspicious_tlds else 0)
166
+
167
+ # Punycode indicator
168
+ out["has_punycode"] = hosts.str.contains("xn--").astype(int)
169
+
170
+ # SLD stats
171
+ out["sld_len"] = sld_series.str.len().fillna(0)
172
+ def _ratio_digits(txt: str) -> float:
173
+ txt = txt or ""
174
+ if not txt:
175
+ return 0.0
176
+ digits = sum(c.isdigit() for c in txt)
177
+ return float(digits) / float(len(txt))
178
+ out["sld_digit_ratio"] = sld_series.apply(_ratio_digits)
179
+
180
+ def _shannon_entropy(txt: str) -> float:
181
+ txt = txt or ""
182
+ if not txt:
183
+ return 0.0
184
+ counts: Dict[str, int] = {}
185
+ for ch in txt:
186
+ counts[ch] = counts.get(ch, 0) + 1
187
+ total = float(len(txt))
188
+ entropy = 0.0
189
+ for n in counts.values():
190
+ p = n / total
191
+ entropy -= p * np.log2(p)
192
+ return float(entropy)
193
+ out["sld_entropy"] = sld_series.apply(_shannon_entropy)
194
+
195
+ # Brand similarity features (lightweight; stdlib only)
196
+ common_brands = [
197
+ "facebook", "google", "youtube", "apple", "microsoft",
198
+ "paypal", "amazon", "netflix", "instagram", "whatsapp",
199
+ "tiktok", "twitter", "telegram", "bank", "login"
200
+ ]
201
+
202
+ def _max_brand_similarity(host: str) -> float:
203
+ host = host or ""
204
+ if not host:
205
+ return 0.0
206
+ # Compare against host and sld specifically
207
+ best = 0.0
208
+ sld_local = host.split(".")[-2] if "." in host else host
209
+ for brand in common_brands:
210
+ best = max(
211
+ best,
212
+ SequenceMatcher(None, host, brand).ratio(),
213
+ SequenceMatcher(None, sld_local, brand).ratio(),
214
+ )
215
+ return float(best)
216
+
217
+ def _like_brand(host: str, brand: str, threshold: float = 0.82) -> int:
218
+ h = host or ""
219
+ if not h:
220
+ return 0
221
+ if brand in h:
222
+ return 1
223
+ sld_local = h.split(".")[-2] if "." in h else h
224
+ score = max(
225
+ SequenceMatcher(None, h, brand).ratio(),
226
+ SequenceMatcher(None, sld_local, brand).ratio(),
227
+ )
228
+ return 1 if score >= threshold else 0
229
+
230
+ out["max_brand_sim"] = hosts.apply(_max_brand_similarity)
231
+ out["like_facebook"] = hosts.apply(lambda h: _like_brand(h, "facebook"))
232
+
233
+ # Return columns in the exact order expected by the model; fill any
234
+ # still-missing engineered columns with zeros to stay robust across
235
+ # model updates.
236
+ return out.reindex(columns=feature_cols, fill_value=0)
237
 
238
 
239
  def _load_url_model():