Spaces:
Sleeping
Sleeping
| import requests | |
| from bs4 import BeautifulSoup | |
| from urllib.parse import urlparse | |
| from .config_loader import load_app_config | |
| from .models import ResolvedLink | |
# Application configuration is loaded once, at import time, and the
# "requests" section drives every outbound HTTP call made by this module.
APP_CONFIG = load_app_config()
REQUESTS_CONFIG = APP_CONFIG["requests"]

# Headers sent with every request; only the User-Agent is configured.
HEADERS = {"User-Agent": REQUESTS_CONFIG["user_agent"]}

# Upper bound on the number of redirects followed for a single request.
MAX_REDIRECTS = REQUESTS_CONFIG["max_redirects"]

# Value passed as ``timeout=`` to the HTTP request (config key "timeout_seconds").
TIMEOUT = REQUESTS_CONFIG["timeout_seconds"]
def _strip_www_prefix(netloc: str) -> str:
    """Lower-case *netloc* and drop one leading ``www.`` label from its host part."""
    # Split off any "user:pass@" so the prefix test is applied to the host only.
    userinfo, at_sign, host = netloc.lower().rpartition("@")
    if host.startswith("www."):
        host = host[len("www."):]
    return f"{userinfo}{at_sign}{host}"


def resolve_link(url: str) -> ResolvedLink:
    """Follow *url* through its redirects and describe where it ends up.

    The URL must use the ``http`` or ``https`` scheme and contain a network
    location. It is fetched with a GET request using the module-level
    ``HEADERS``, ``TIMEOUT`` and ``MAX_REDIRECTS`` settings.

    Args:
        url: The link to resolve.

    Returns:
        A ``ResolvedLink``. On success it carries the final URL, the final
        network location (lower-cased, without a leading ``www.``), the HTTP
        status code of the final response, the URLs of the intermediate
        responses, and the page ``<title>`` when the response is HTML;
        ``error`` is ``None``. HTTP error statuses (4xx/5xx) are not treated
        as failures: they are reported through ``status_code``.

        On any failure (invalid URL, network error, parse error, ...) every
        field except ``original_url`` is empty and ``error`` holds the
        exception message. This function never raises.
    """
    try:
        parsed = urlparse(url)
        if parsed.scheme not in {"http", "https"} or not parsed.netloc:
            raise ValueError("Invalid URL format")

        with requests.Session() as session:
            session.max_redirects = MAX_REDIRECTS
            r = session.get(
                url,
                headers=HEADERS,
                timeout=TIMEOUT,
                allow_redirects=True,
            )

        # One entry per intermediate response that was followed.
        chain = [h.url for h in r.history]
        final_url = r.url

        # Only a *leading* "www." is removed; a blanket str.replace would also
        # mangle hosts such as "awww.example.com" or "static.www.example.com".
        domain = _strip_www_prefix(urlparse(final_url).netloc)

        title = None
        # Media types are case-insensitive, so compare in lower case.
        content_type = r.headers.get("Content-Type", "").lower()
        if "text/html" in content_type:
            soup = BeautifulSoup(r.text, "html.parser")
            title_tag = soup.title
            title = title_tag.text.strip() if title_tag else None

        return ResolvedLink(
            original_url=url,
            final_url=final_url,
            final_domain=domain,
            status_code=r.status_code,
            redirect_chain=chain,
            title=title,
            error=None,
        )
    except Exception as e:
        # Deliberate catch-all: callers get failures as data, not exceptions.
        return ResolvedLink(
            original_url=url,
            final_url=None,
            final_domain=None,
            status_code=None,
            redirect_chain=[],
            title=None,
            error=str(e),
        )