Upload app.py
Browse files
app.py
CHANGED
|
@@ -75,6 +75,23 @@ def _host_matches_any(host: str, known: List[str]) -> bool:
|
|
| 75 |
return False
|
| 76 |
|
| 77 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
def _read_urls_from_csv(path: str) -> List[str]:
|
| 79 |
urls: List[str] = []
|
| 80 |
try:
|
|
@@ -144,7 +161,7 @@ def _engineer_features(urls: List[str], feature_cols: List[str]) -> pd.DataFrame
|
|
| 144 |
out["ends_with_zip"] = s.str.endswith(".zip").astype(int)
|
| 145 |
|
| 146 |
# Host/SLD/TLD derived features used by newer models
|
| 147 |
-
hosts = s.apply(lambda x: (urlparse(x).hostname or "").lower())
|
| 148 |
out["host_len"] = hosts.str.len().fillna(0)
|
| 149 |
|
| 150 |
# Subdomain count: number of labels minus 2 (for sld.tld); never below 0
|
|
@@ -282,7 +299,8 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 282 |
url_col: str = bundle.get("url_col") or "url"
|
| 283 |
model_type: str = bundle.get("model_type") or ""
|
| 284 |
|
| 285 |
-
|
|
|
|
| 286 |
if not url_str:
|
| 287 |
return JSONResponse(status_code=400, content={"error": "Empty url"})
|
| 288 |
|
|
@@ -309,7 +327,7 @@ def predict_url(payload: PredictUrlPayload):
|
|
| 309 |
}
|
| 310 |
|
| 311 |
# Known-host override (suffix match)
|
| 312 |
-
host = (urlparse(url_str).hostname or "").lower()
|
| 313 |
if host and host_map:
|
| 314 |
for h, lbl in host_map.items():
|
| 315 |
if _host_matches_any(host, [h]):
|
|
|
|
| 75 |
return False
|
| 76 |
|
| 77 |
|
| 78 |
+
_URL_EXTRACT_RE = re.compile(r"(https?://[^\s<>\"'\)\]]+)", re.IGNORECASE)
|
| 79 |
+
|
| 80 |
+
def _sanitize_input_url(text: str) -> str:
|
| 81 |
+
v = (text or "").strip()
|
| 82 |
+
if v.startswith("@"):
|
| 83 |
+
v = v.lstrip("@").strip()
|
| 84 |
+
m = _URL_EXTRACT_RE.search(v)
|
| 85 |
+
if m:
|
| 86 |
+
v = m.group(1)
|
| 87 |
+
v = v.strip("<>[]()")
|
| 88 |
+
return v
|
| 89 |
+
|
| 90 |
+
_SCHEME_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9+\-.]*://")
|
| 91 |
+
def _ensure_scheme(u: str) -> str:
|
| 92 |
+
u = (u or "").strip()
|
| 93 |
+
return u if _SCHEME_RE.match(u) else ("http://" + u)
|
| 94 |
+
|
| 95 |
def _read_urls_from_csv(path: str) -> List[str]:
|
| 96 |
urls: List[str] = []
|
| 97 |
try:
|
|
|
|
| 161 |
out["ends_with_zip"] = s.str.endswith(".zip").astype(int)
|
| 162 |
|
| 163 |
# Host/SLD/TLD derived features used by newer models
|
| 164 |
+
hosts = s.apply(lambda x: (urlparse(_ensure_scheme(x)).hostname or "").lower())
|
| 165 |
out["host_len"] = hosts.str.len().fillna(0)
|
| 166 |
|
| 167 |
# Subdomain count: number of labels minus 2 (for sld.tld); never below 0
|
|
|
|
| 299 |
url_col: str = bundle.get("url_col") or "url"
|
| 300 |
model_type: str = bundle.get("model_type") or ""
|
| 301 |
|
| 302 |
+
raw_input = (payload.url or "").strip()
|
| 303 |
+
url_str = _sanitize_input_url(raw_input)
|
| 304 |
if not url_str:
|
| 305 |
return JSONResponse(status_code=400, content={"error": "Empty url"})
|
| 306 |
|
|
|
|
| 327 |
}
|
| 328 |
|
| 329 |
# Known-host override (suffix match)
|
| 330 |
+
host = (urlparse(_ensure_scheme(url_str)).hostname or "").lower()
|
| 331 |
if host and host_map:
|
| 332 |
for h, lbl in host_map.items():
|
| 333 |
if _host_matches_any(host, [h]):
|