riskengine / src /bmc /link_resolver.py
datavorous's picture
initial commit
28779c1 verified
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from .config_loader import load_app_config
from .models import ResolvedLink
# The application configuration is loaded once, at import time; every
# constant below is derived from its "requests" section.
APP_CONFIG = load_app_config()
REQUESTS_CONFIG = APP_CONFIG["requests"]

# Headers attached to every outbound request made by this module.
HEADERS = {
    "User-Agent": REQUESTS_CONFIG["user_agent"],
}

# Redirect cap and timeout handed to the HTTP session/request.
MAX_REDIRECTS, TIMEOUT = (
    REQUESTS_CONFIG["max_redirects"],
    REQUESTS_CONFIG["timeout_seconds"],
)
def _normalize_domain(netloc: str) -> str:
    """Lower-case *netloc* and drop a single leading ``www.`` prefix."""
    domain = netloc.lower()
    # Strip the prefix only. A blanket ``replace("www.", "")`` would also
    # mangle hosts such as "awww.example.com" or "cdn.www.example.com".
    if domain.startswith("www."):
        domain = domain[len("www."):]
    return domain


def resolve_link(url: str) -> ResolvedLink:
    """Fetch *url*, following redirects, and describe where it ends up.

    Only ``http``/``https`` URLs with a network location are requested.
    The request uses the module-level ``HEADERS``, ``TIMEOUT`` and
    ``MAX_REDIRECTS`` settings.

    Args:
        url: The link to resolve.

    Returns:
        A ``ResolvedLink``. On success it carries the final URL, the
        normalised final domain, the HTTP status code, the URLs of the
        intermediate redirect responses and, for HTML responses, the
        stripped ``<title>`` text (``None`` when there is no title tag or
        the response is not HTML); ``error`` is ``None``.
        On any failure (invalid URL, network error, too many redirects,
        parse error, ...) every result field is empty and ``error`` holds
        ``str(exception)``.

    This function does not raise: all exceptions are converted into the
    ``error`` field of the returned object.
    """
    try:
        parsed = urlparse(url)
        if parsed.scheme not in {"http", "https"} or not parsed.netloc:
            raise ValueError("Invalid URL format")

        with requests.Session() as session:
            session.max_redirects = MAX_REDIRECTS
            r = session.get(
                url,
                headers=HEADERS,
                timeout=TIMEOUT,
                allow_redirects=True,
            )

            # r.history holds the intermediate redirect responses, oldest
            # first; r.url is the URL of the final response.
            chain = [h.url for h in r.history]
            final_url = r.url
            domain = _normalize_domain(urlparse(final_url).netloc)

            title = None
            # Media types are case-insensitive, so compare in lower case.
            content_type = r.headers.get("Content-Type", "").lower()
            if "text/html" in content_type:
                soup = BeautifulSoup(r.text, "html.parser")
                title_tag = soup.title
                if title_tag is not None:
                    title = title_tag.text.strip()

            return ResolvedLink(
                original_url=url,
                final_url=final_url,
                final_domain=domain,
                status_code=r.status_code,
                redirect_chain=chain,
                title=title,
                error=None,
            )
    except Exception as e:
        # Deliberate catch-all: callers receive failures through the
        # ``error`` field instead of having to handle exceptions.
        return ResolvedLink(
            original_url=url,
            final_url=None,
            final_domain=None,
            status_code=None,
            redirect_chain=[],
            title=None,
            error=str(e),
        )