Spaces:
Running
Running
File size: 2,383 Bytes
f55f92e | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 | from __future__ import annotations
import re
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
# Lower-case path suffixes (leading dot included) that mark a URL as a
# non-HTML resource. Despite the name, a few text formats (.csv, .json,
# .xml, .svg) are listed as well.
BINARY_EXTENSIONS = {
    # Archives and compressed files
    ".7z", ".bz2", ".gz", ".rar", ".tar", ".tgz", ".xz", ".zip",
    # Office documents and e-books
    ".doc", ".docx", ".epub", ".pdf", ".ppt", ".pptx", ".xls", ".xlsx",
    # Raw and structured data
    ".bin", ".csv", ".json", ".xml",
    # Images
    ".gif", ".ico", ".jpeg", ".jpg", ".png", ".svg", ".tif", ".tiff", ".webp",
    # Audio
    ".m4a", ".mp3", ".ogg", ".wav",
    # Video
    ".avi", ".m4v", ".mov", ".mp4", ".mpeg", ".webm",
}
# Lower-case query-parameter names that are dropped during URL
# normalization (in addition to any key starting with "utm_").
TRACKING_QUERY_KEYS = {
    "fbclid", "gclid", "yclid",
    "mc_cid", "mc_eid",
    "ref", "source", "spm",
}
def normalize_url(raw_url: str) -> str | None:
    """Return a canonical form of an http(s) URL, or ``None`` if unusable.

    Normalization steps:

    * surrounding whitespace is stripped from the input;
    * scheme and host are lower-cased, and leading/trailing dots are
      removed from the host;
    * any ``user:password@`` part is dropped (the netloc is rebuilt from
      host and port only);
    * the default port (80 for http, 443 for https) is omitted;
    * IPv6 literal hosts keep their square brackets;
    * an empty path becomes ``/`` and runs of slashes collapse to one;
    * query parameters whose lower-cased name starts with ``utm_`` or is
      in ``TRACKING_QUERY_KEYS`` are removed, the rest keep their order;
    * the fragment is discarded.

    Args:
        raw_url: The URL text to normalize.

    Returns:
        The normalized URL, or ``None`` when the input cannot be parsed,
        is not http/https, has no host, or has an invalid port.
    """
    try:
        parts = urlsplit(raw_url.strip())
    except ValueError:
        return None

    scheme = parts.scheme.lower()
    if scheme not in {"http", "https"}:
        return None

    host = (parts.hostname or "").lower().strip(".")
    if not host:
        return None

    try:
        # Accessing .port raises ValueError for non-numeric or
        # out-of-range ports.
        port = parts.port
    except ValueError:
        return None

    # ``hostname`` returns IPv6 literals without their brackets; restore
    # them so the rebuilt netloc stays unambiguous, with or without a port.
    if ":" in host:
        host = f"[{host}]"

    is_default_port = (scheme, port) in {("http", 80), ("https", 443)}
    netloc = f"{host}:{port}" if port and not is_default_port else host

    path = re.sub(r"/{2,}", "/", parts.path or "/")

    query_pairs: list[tuple[str, str]] = []
    for key, value in parse_qsl(parts.query, keep_blank_values=True):
        lowered = key.lower()
        if lowered.startswith("utm_") or lowered in TRACKING_QUERY_KEYS:
            continue
        query_pairs.append((key, value))
    query = urlencode(query_pairs, doseq=True)

    # The empty last component drops the fragment.
    return urlunsplit((scheme, netloc, path, query, ""))
def has_binary_extension(url: str) -> bool:
    """Report whether the URL's path ends in a known non-HTML extension.

    Only the path component is examined, case-insensitively. The candidate
    suffix is everything from the last ``.`` in the path to its end, so a
    dot inside an earlier path segment never matches.

    Args:
        url: The URL to inspect.

    Returns:
        ``True`` if that suffix is in ``BINARY_EXTENSIONS``; ``False``
        otherwise, including when the path is empty or contains no dot.
    """
    lowered_path = urlsplit(url).path.lower()
    _, dot, tail = lowered_path.rpartition(".")
    if not dot:
        # Empty path or no "." anywhere in it.
        return False
    return f".{tail}" in BINARY_EXTENSIONS
def is_html_response(content_type: str, final_url: str) -> bool:
    """Decide whether a fetched response should be treated as HTML.

    Args:
        content_type: The response's Content-Type value; may be empty.
        final_url: The URL the response was ultimately served from.

    Returns:
        ``False`` if ``final_url`` has a known non-HTML extension. Otherwise
        ``True`` when ``content_type`` is empty, or when it contains (case-
        insensitively) ``text/html``, ``application/xhtml+xml`` or
        ``text/plain``.
    """
    # The URL extension takes precedence over whatever the server declares.
    if has_binary_extension(final_url):
        return False

    # A missing Content-Type is given the benefit of the doubt.
    if not content_type:
        return True

    header = content_type.lower()
    accepted_markers = ("text/html", "application/xhtml+xml", "text/plain")
    return any(marker in header for marker in accepted_markers)
|