| import logging | |
| from typing import Optional | |
| from urllib.parse import urlparse | |
| import requests | |
| from bs4 import BeautifulSoup | |
class SoupClient:
    """Small HTTP helper bound to one site that parses pages with BeautifulSoup.

    The client owns a ``requests.Session`` carrying a browser-like
    ``User-Agent`` header. It can be used as a context manager so the
    session is closed when the ``with`` block exits.
    """

    # User-Agent sent when the caller does not supply one.
    DEFAULT_USER_AGENT = (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
    )

    def __init__(self, base_domain: str, user_agent: Optional[str] = None):
        """Create a client for *base_domain*.

        Args:
            base_domain: Host name of the site, e.g. ``"www.example.com"``.
                It is stored as given and also used to build ``base_url``
                (always with the ``https`` scheme).
            user_agent: Value for the ``User-Agent`` header; a falsy value
                selects ``DEFAULT_USER_AGENT``.
        """
        self.base_domain = base_domain
        self.base_url = f"https://{base_domain}"
        self.session = requests.Session()
        self.session.headers.update(
            {"User-Agent": user_agent or self.DEFAULT_USER_AGENT}
        )
        self.log = logging.getLogger(self.__class__.__name__)

    def __enter__(self) -> "SoupClient":
        """Return the client itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc, tb) -> None:
        """Close the session on leaving the ``with`` block; never suppresses errors."""
        self.close()

    @staticmethod
    def _normalize_host(host: str) -> str:
        """Lower-case *host* and drop a single leading ``www.`` label."""
        host = host.lower()
        # Only a *leading* "www." is an alias marker; a "www." substring
        # elsewhere in the name (e.g. "awww.example.com") must be kept.
        return host[4:] if host.startswith("www.") else host

    def same_domain(self, url: str) -> bool:
        """Return True when *url* points at this client's domain.

        The comparison is case-insensitive and treats ``www.<domain>`` and
        ``<domain>`` as the same site, whichever form ``base_domain`` uses.
        URLs without a network location (relative paths) compare against an
        empty host and therefore normally return False.
        """
        host = urlparse(url).netloc
        return self._normalize_host(host) == self._normalize_host(self.base_domain)

    def fetch_soup(self, url: str, timeout: int = 30) -> "BeautifulSoup":
        """GET *url* through the shared session and parse the body with lxml.

        Args:
            url: Address to request.
            timeout: Timeout passed to ``requests``.

        Returns:
            The parsed document.

        Raises:
            requests.HTTPError: via ``raise_for_status()`` on an error status.
        """
        response = self.session.get(url, timeout=timeout)
        response.raise_for_status()
        # Hand over raw bytes so the parser performs its own charset detection.
        return BeautifulSoup(response.content, "lxml")

    def close(self):
        """Close the underlying ``requests`` session."""
        self.session.close()

    def get_meta_image(self, soup: "BeautifulSoup") -> Optional[str]:
        """Return a representative image URL for the page, or None.

        Open Graph / Twitter meta tags are tried in a fixed order and the
        first one with a non-empty ``content`` wins. Failing that, the first
        ``<img>`` element's ``src`` (or ``data-src``) is used. The value is
        returned exactly as written in the markup.
        """
        selectors = (
            "meta[property='og:image']",
            "meta[name='og:image']",
            "meta[name='twitter:image']",
            "meta[property='twitter:image']",
        )
        for selector in selectors:
            tag = soup.select_one(selector)
            if tag and tag.get("content"):
                return tag["content"]

        # No usable meta tag: fall back to the first image in the document.
        img = soup.find("img")
        if not img:
            return None
        return img.get("src") or img.get("data-src")