Perth0603 committed on
Commit
639276e
·
verified ·
1 Parent(s): 20cb166

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +21 -3
app.py CHANGED
@@ -75,6 +75,23 @@ def _host_matches_any(host: str, known: List[str]) -> bool:
75
  return False
76
 
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  def _read_urls_from_csv(path: str) -> List[str]:
79
  urls: List[str] = []
80
  try:
@@ -144,7 +161,7 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
144
  out["ends_with_zip"] = s.str.endswith(".zip").astype(int)
145
 
146
  # Host/SLD/TLD derived features used by newer models
147
- hosts = s.apply(lambda x: (urlparse(x).hostname or "").lower())
148
  out["host_len"] = hosts.str.len().fillna(0)
149
 
150
  # Subdomain count: number of labels minus 2 (for sld.tld); never below 0
@@ -282,7 +299,8 @@ def predict_url(payload: PredictUrlPayload):
282
  url_col: str = bundle.get("url_col") or "url"
283
  model_type: str = bundle.get("model_type") or ""
284
 
285
- url_str = (payload.url or "").strip()
 
286
  if not url_str:
287
  return JSONResponse(status_code=400, content={"error": "Empty url"})
288
 
@@ -309,7 +327,7 @@ def predict_url(payload: PredictUrlPayload):
309
  }
310
 
311
  # Known-host override (suffix match)
312
- host = (urlparse(url_str).hostname or "").lower()
313
  if host and host_map:
314
  for h, lbl in host_map.items():
315
  if _host_matches_any(host, [h]):
 
75
  return False
76
 
77
 
78
# Matches the first http(s) URL embedded in free text. The match stops at
# whitespace, angle brackets, quotes, ")" or "]" so surrounding markup or
# punctuation wrappers are not swallowed into the URL.
_URL_EXTRACT_RE = re.compile(r"(https?://[^\s<>\"'\)\]]+)", re.IGNORECASE)


def _sanitize_input_url(text: str) -> str:
    """Return a cleaned URL candidate extracted from user-supplied text.

    Steps:
      1. Treat ``None``/empty input as ``""`` and trim outer whitespace.
      2. Drop any leading ``@`` characters (e.g. a pasted "@https://...").
      3. If an http(s) URL is embedded in the text, keep only that URL.
      4. Strip wrapper characters (brackets, quotes, whitespace) from both
         ends of whatever remains.

    Args:
        text: Raw input; may be ``None`` or empty.

    Returns:
        The sanitized string; ``""`` when nothing usable is left.
    """
    candidate = (text or "").strip()
    if candidate.startswith("@"):
        candidate = candidate.lstrip("@").strip()

    found = _URL_EXTRACT_RE.search(candidate)
    if found:
        candidate = found.group(1)

    # Quotes and whitespace are stripped along with brackets so the fallback
    # path (no scheme found, e.g. '"example.com"' or "< example.com >")
    # agrees with the delimiters the extraction regex already honours.
    return candidate.strip("<>[]()\"' \t\r\n")
89
+
90
# Matches a leading "<scheme>://" prefix: a letter followed by letters,
# digits, "+", "-" or ".", then "://".
_SCHEME_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://")


def _ensure_scheme(u: str) -> str:
    """Return *u* with a URL scheme, defaulting to ``http`` when absent.

    The result is meant to be fed to ``urlparse`` so that the host part is
    recognised as the network location rather than as a path.

    Args:
        u: URL or bare host/path; may be ``None`` or empty.

    Returns:
        ``u`` (trimmed) unchanged when it already starts with ``scheme://``;
        ``"http:" + u`` for protocol-relative input (``//host/...``);
        otherwise ``"http://" + u``. Empty input yields ``"http://"``.
    """
    u = (u or "").strip()
    if _SCHEME_RE.match(u):
        return u
    if u.startswith("//"):
        # Protocol-relative URL: the authority marker is already present.
        # Prepending "http://" would produce "http:////host", which parses
        # with an empty network location and therefore no hostname.
        return "http:" + u
    return "http://" + u
94
+
95
  def _read_urls_from_csv(path: str) -> List[str]:
96
  urls: List[str] = []
97
  try:
 
161
  out["ends_with_zip"] = s.str.endswith(".zip").astype(int)
162
 
163
  # Host/SLD/TLD derived features used by newer models
164
+ hosts = s.apply(lambda x: (urlparse(_ensure_scheme(x)).hostname or "").lower())
165
  out["host_len"] = hosts.str.len().fillna(0)
166
 
167
  # Subdomain count: number of labels minus 2 (for sld.tld); never below 0
 
299
  url_col: str = bundle.get("url_col") or "url"
300
  model_type: str = bundle.get("model_type") or ""
301
 
302
+ raw_input = (payload.url or "").strip()
303
+ url_str = _sanitize_input_url(raw_input)
304
  if not url_str:
305
  return JSONResponse(status_code=400, content={"error": "Empty url"})
306
 
 
327
  }
328
 
329
  # Known-host override (suffix match)
330
+ host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
331
  if host and host_map:
332
  for h, lbl in host_map.items():
333
  if _host_matches_any(host, [h]):