# AutoWS/crawler/utils.py
# Provenance: uploaded by Roman190928 ("Upload AutoWS app files without
# plan/readme", commit f55f92e, verified).
from __future__ import annotations
import re
from urllib.parse import parse_qsl, urlencode, urlsplit, urlunsplit
# File-name extensions the crawler treats as non-HTML payloads: URLs whose
# path ends in one of these are skipped (see has_binary_extension /
# is_html_response below).
# NOTE(review): the set also contains text formats (.csv, .json, .xml) —
# presumably the crawler only wants HTML pages; confirm that is intentional.
BINARY_EXTENSIONS = {
    ".7z",
    ".avi",
    ".bin",
    ".bz2",
    ".csv",
    ".doc",
    ".docx",
    ".epub",
    ".gif",
    ".gz",
    ".ico",
    ".jpeg",
    ".jpg",
    ".json",
    ".m4a",
    ".m4v",
    ".mov",
    ".mp3",
    ".mp4",
    ".mpeg",
    ".ogg",
    ".pdf",
    ".png",
    ".ppt",
    ".pptx",
    ".rar",
    ".svg",
    ".tar",
    ".tgz",
    ".tif",
    ".tiff",
    ".wav",
    ".webm",
    ".webp",
    ".xls",
    ".xlsx",
    ".xml",
    ".xz",
    ".zip",
}
TRACKING_QUERY_KEYS = {
"fbclid",
"gclid",
"mc_cid",
"mc_eid",
"ref",
"source",
"spm",
"yclid",
}
def normalize_url(raw_url: str) -> str | None:
try:
parts = urlsplit(raw_url.strip())
except ValueError:
return None
scheme = parts.scheme.lower()
if scheme not in {"http", "https"}:
return None
host = (parts.hostname or "").lower().strip(".")
if not host:
return None
try:
port = parts.port
except ValueError:
return None
if (scheme == "http" and port == 80) or (scheme == "https" and port == 443):
netloc = host
elif port:
netloc = f"{host}:{port}"
else:
netloc = host
path = parts.path or "/"
path = re.sub(r"/{2,}", "/", path)
query_pairs: list[tuple[str, str]] = []
for key, value in parse_qsl(parts.query, keep_blank_values=True):
lowered = key.lower()
if lowered.startswith("utm_") or lowered in TRACKING_QUERY_KEYS:
continue
query_pairs.append((key, value))
query = urlencode(query_pairs, doseq=True)
return urlunsplit((scheme, netloc, path, query, ""))
def has_binary_extension(url: str) -> bool:
    """Return True when the URL's path ends in a known binary extension.

    Only the substring after the last dot of the (lowercased) path is
    compared against BINARY_EXTENSIONS; paths without a dot never match.
    """
    lowered_path = urlsplit(url).path.lower()
    # rpartition yields an empty separator when there is no dot at all,
    # which also covers the empty-path case.
    _, dot, suffix = lowered_path.rpartition(".")
    return bool(dot) and f".{suffix}" in BINARY_EXTENSIONS
def is_html_response(content_type: str, final_url: str) -> bool:
    """Decide whether a fetched response should be parsed as HTML.

    A URL with a known binary extension is rejected outright; a missing
    Content-Type header is given the benefit of the doubt; otherwise the
    header must mention an HTML-ish media type (text/plain included).
    """
    if has_binary_extension(final_url):
        return False
    if not content_type:
        # No header at all — assume HTML and let the parser decide.
        return True
    declared = content_type.lower()
    return any(
        marker in declared
        for marker in ("text/html", "application/xhtml+xml", "text/plain")
    )