import logging
from typing import Optional
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup


class SoupClient:
    """Small HTTP client that fetches pages from one site and parses them.

    The client keeps a single ``requests.Session`` (with a browser-like
    ``User-Agent`` header) for all requests and exposes helpers for parsing
    a page into a ``BeautifulSoup`` tree and extracting a preview image.

    It can be used as a context manager so the session is always closed::

        with SoupClient("www.example.com") as client:
            soup = client.fetch_soup(client.base_url)
    """

    # User-Agent sent when the caller does not supply one.
    DEFAULT_USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
    )

    # CSS selectors tried, in order, by ``get_meta_image``.
    META_IMAGE_SELECTORS = (
        "meta[property='og:image']",
        "meta[name='og:image']",
        "meta[name='twitter:image']",
        "meta[property='twitter:image']",
    )

    def __init__(self, base_domain: str, user_agent: Optional[str] = None):
        """Create a client bound to *base_domain*.

        Args:
            base_domain: Host name of the site, e.g. ``"www.example.com"``.
                It is stored as given and also used to build ``base_url``
                as ``https://<base_domain>``.
            user_agent: Value for the ``User-Agent`` header. When ``None``
                or empty, ``DEFAULT_USER_AGENT`` is used.
        """
        self.base_domain = base_domain
        self.base_url = f"https://{base_domain}"
        self.session = requests.Session()
        self.session.headers.update(
            {"User-Agent": user_agent or self.DEFAULT_USER_AGENT}
        )
        self.log = logging.getLogger(self.__class__.__name__)

    def __enter__(self) -> "SoupClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the underlying session when leaving a ``with`` block."""
        self.close()

    @staticmethod
    def _strip_www(host: str) -> str:
        """Lower-case *host* and drop a single leading ``www.`` label."""
        host = host.lower()
        # Only a leading "www." is removed; a plain str.replace would also
        # mangle hosts such as "awww.example.com" or "cdn.www.example.com".
        return host[4:] if host.startswith("www.") else host

    def same_domain(self, url: str) -> bool:
        """Return whether *url* points at this client's domain.

        The comparison is case-insensitive and ignores one leading ``www.``
        on either side, so ``example.com`` and ``www.example.com`` are
        treated as the same site regardless of which one is the base domain.
        URLs without a network location (for example relative paths) have an
        empty host and therefore do not match a non-empty base domain.

        Args:
            url: URL to check.

        Returns:
            ``True`` if the URL's network location matches ``base_domain``.
        """
        host = urlparse(url).netloc
        return self._strip_www(host) == self._strip_www(self.base_domain)

    def fetch_soup(self, url: str, timeout: int = 30) -> BeautifulSoup:
        """Download *url* and parse the response body with the lxml parser.

        Args:
            url: Address to fetch with an HTTP GET on the shared session.
            timeout: Timeout passed to ``Session.get``.

        Returns:
            The parsed document.

        Raises:
            requests.HTTPError: If ``raise_for_status`` rejects the response.
        """
        response = self.session.get(url, timeout=timeout)
        response.raise_for_status()
        # Pass the raw bytes so the parser performs its own encoding detection.
        return BeautifulSoup(response.content, "lxml")

    def close(self):
        """Close the underlying ``requests`` session."""
        self.session.close()

    def get_meta_image(self, soup: BeautifulSoup) -> Optional[str]:
        """Return a representative image URL for a parsed page.

        The selectors in ``META_IMAGE_SELECTORS`` are tried in order and the
        ``content`` of the first matching tag with a non-empty value wins.
        If none match, the first ``<img>`` element's ``src`` (or, failing
        that, ``data-src``) is used.

        Args:
            soup: Parsed document to inspect.

        Returns:
            The image URL exactly as written in the page, or ``None`` when
            nothing suitable is found.
        """
        for selector in self.META_IMAGE_SELECTORS:
            tag = soup.select_one(selector)
            if tag and tag.get("content"):
                return tag["content"]

        first_img = soup.find("img")
        if not first_img:
            return None
        return first_img.get("src") or first_img.get("data-src")