File size: 1,676 Bytes
28779c1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse
from .config_loader import load_app_config
from .models import ResolvedLink

# Request settings are read from the application config once, at import time,
# so every call to resolve_link() shares the same values. A missing key here
# raises KeyError when the module is imported.
APP_CONFIG = load_app_config()
REQUESTS_CONFIG = APP_CONFIG["requests"]
# Headers sent with each outgoing request (only the User-Agent is set).
HEADERS = {"User-Agent": REQUESTS_CONFIG["user_agent"]}
# Assigned to Session.max_redirects before a URL is fetched.
MAX_REDIRECTS = REQUESTS_CONFIG["max_redirects"]
# Passed as the `timeout=` argument of the HTTP GET.
TIMEOUT = REQUESTS_CONFIG["timeout_seconds"]


def resolve_link(url: str) -> ResolvedLink:
    """Follow *url* through its redirects and describe where it ends up.

    The URL is fetched with a GET request using the module-level ``HEADERS``,
    ``TIMEOUT`` and ``MAX_REDIRECTS`` settings. This function does not raise
    for ordinary failures: any exception (invalid URL, network error, too
    many redirects, parse error, ...) is captured and reported through the
    ``error`` field of the returned object.

    Args:
        url: Absolute ``http``/``https`` URL to resolve.

    Returns:
        A ``ResolvedLink``. On success ``final_url`` is the URL of the last
        response, ``final_domain`` is its lowercased network location with a
        single leading ``www.`` removed, ``status_code`` is the final HTTP
        status, ``redirect_chain`` lists the URLs of the intermediate
        responses, ``title`` is the stripped ``<title>`` text for HTML
        responses (otherwise ``None``), and ``error`` is ``None``. On failure
        only ``original_url`` and ``error`` (the exception message) are set;
        the other fields are ``None`` / an empty list.
    """
    try:
        parsed = urlparse(url)
        if parsed.scheme not in {"http", "https"} or not parsed.netloc:
            # Raised inside the try on purpose so it is reported via `error`.
            raise ValueError("Invalid URL format")

        with requests.Session() as session:
            session.max_redirects = MAX_REDIRECTS
            r = session.get(url, headers=HEADERS, timeout=TIMEOUT, allow_redirects=True)

        chain = [h.url for h in r.history]
        final_url = r.url

        # Strip only a *leading* "www."; a blanket str.replace would also
        # mangle hosts such as "awww.example.com" or "cdn.www.example.com".
        domain = urlparse(final_url).netloc.lower()
        if domain.startswith("www."):
            domain = domain[len("www."):]

        title = None
        # Media types are case-insensitive, so normalise before matching.
        content_type = r.headers.get("Content-Type", "").lower()
        if "text/html" in content_type:
            soup = BeautifulSoup(r.text, "html.parser")
            title_tag = soup.title
            title = title_tag.text.strip() if title_tag else None

        return ResolvedLink(
            original_url=url,
            final_url=final_url,
            final_domain=domain,
            status_code=r.status_code,
            redirect_chain=chain,
            title=title,
            error=None,
        )
    except Exception as e:
        # Deliberate catch-all boundary: callers get a result object with the
        # failure described in `error` rather than an exception.
        return ResolvedLink(
            original_url=url,
            final_url=None,
            final_domain=None,
            status_code=None,
            redirect_chain=[],
            title=None,
            error=str(e),
        )