Spaces:
Running
Running
| from typing import Dict, List | |
| from urllib.parse import urlparse | |
| import re | |
| import requests | |
| from bs4 import BeautifulSoup | |
# Built-in User-Agent presets offered to callers of fetch_url_content().
# Each entry carries:
#   "key"   - stable machine identifier used for lookup in _resolve_user_agent()
#   "name"  - human-readable label (e.g. for a UI dropdown)
#   "value" - the literal User-Agent header string sent with requests
USER_AGENT_PRESETS: List[Dict[str, str]] = [
    {
        "key": "googlebot",
        "name": "Googlebot",
        "value": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
    },
    {
        "key": "bingbot",
        "name": "Bingbot",
        "value": "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
    },
    {
        "key": "chatgpt_user",
        "name": "ChatGPT-User",
        "value": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
    },
    {
        "key": "gptbot",
        "name": "GPTBot",
        "value": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.0; +https://openai.com/gptbot",
    },
    {
        # Default preset — _resolve_user_agent() falls back to this key.
        "key": "chrome_desktop",
        "name": "Chrome (Desktop)",
        "value": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
    },
]
def get_user_agent_presets() -> List[Dict[str, str]]:
    """Return the available User-Agent presets.

    Returns a fresh list of copied dicts rather than the module-level
    constant itself, so callers cannot accidentally mutate the shared
    USER_AGENT_PRESETS table (the original returned the live object).
    """
    return [dict(preset) for preset in USER_AGENT_PRESETS]
| def _normalize_whitespace(text: str) -> str: | |
| text = re.sub(r"[ \t]+", " ", text) | |
| text = re.sub(r"\n{3,}", "\n\n", text) | |
| return text.strip() | |
| def _normalize_url(url: str) -> str: | |
| raw = (url or "").strip() | |
| if not raw: | |
| raise ValueError("URL пустой.") | |
| if not raw.startswith(("http://", "https://")): | |
| raw = "https://" + raw | |
| parsed = urlparse(raw) | |
| if not parsed.scheme or not parsed.netloc: | |
| raise ValueError("Некорректный URL.") | |
| return raw | |
def _resolve_user_agent(user_agent_key: str) -> Dict[str, str]:
    """Look up a User-Agent preset by its key.

    Falls back to the "chrome_desktop" preset when the key is unknown or
    blank, and to the first preset if even that key is missing."""
    wanted = (user_agent_key or "").strip()
    exact = next((preset for preset in USER_AGENT_PRESETS if preset["key"] == wanted), None)
    if exact is not None:
        return exact
    default = next((preset for preset in USER_AGENT_PRESETS if preset["key"] == "chrome_desktop"), None)
    return default if default is not None else USER_AGENT_PRESETS[0]
def _extract_main_text_and_title(html: str) -> Dict[str, str]:
    """Parse *html* and return {"title": ..., "text": ...} with the
    best-guess main article text, boilerplate stripped."""
    soup = BeautifulSoup(html or "", "html.parser")

    # Remove elements that never carry article prose.
    for junk in soup(["script", "style", "noscript", "svg", "nav", "footer", "header", "aside", "form", "iframe"]):
        junk.decompose()

    page_title = _normalize_whitespace(soup.title.get_text(" ", strip=True)) if soup.title else ""

    # Priority 1: semantic containers (<article>, <main>) scored by boosted length.
    scored = []
    for selector, boost in (("article", 1.2), ("main", 1.1)):
        for node in soup.select(selector):
            content = _normalize_whitespace(node.get_text("\n", strip=True))
            if len(content) >= 200:
                scored.append((len(content) * boost, content))

    main_text = ""
    if scored:
        main_text = max(scored, key=lambda pair: pair[0])[1]
    else:
        # Priority 2: join sufficiently long <p>/<li> fragments, but only
        # when there are at least three of them.
        fragments: List[str] = []
        for node in soup.find_all(["p", "li"]):
            fragment = _normalize_whitespace(node.get_text(" ", strip=True))
            if len(fragment) >= 40:
                fragments.append(fragment)
        if len(fragments) >= 3:
            main_text = "\n\n".join(fragments)

    # Priority 3: last resort — the whole body (or document) text.
    if not main_text:
        root = soup.body if soup.body else soup
        main_text = _normalize_whitespace(root.get_text("\n", strip=True))

    return {
        "title": page_title,
        "text": main_text,
    }
def fetch_url_content(url: str, user_agent_key: str = "chrome_desktop", timeout_seconds: int = 15) -> Dict[str, object]:
    """Fetch *url* with a preset User-Agent and return extracted page content.

    Args:
        url: Target address; scheme is added by _normalize_url() if absent.
        user_agent_key: Key into USER_AGENT_PRESETS (unknown keys fall back
            to the Chrome desktop preset).
        timeout_seconds: Request timeout, clamped to the 5..40 second range;
            falsy values fall back to 15.

    Returns:
        Dict with ok/url/final_url/status_code/user_agent_key/
        user_agent_value/title/text/error entries. Note: "ok" is a bool and
        "status_code" an int, so the return type is Dict[str, object]
        (the original Dict[str, str] annotation was inaccurate).

    Raises:
        ValueError: for an empty or unparseable URL.
        requests.RequestException: on network failure or an HTTP error
            status (via raise_for_status()).
    """
    normalized_url = _normalize_url(url)
    ua = _resolve_user_agent(user_agent_key)
    headers = {
        "User-Agent": ua["value"],
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    }
    # Clamp to a sane window so callers can't hang the worker or pass 0.
    timeout = max(5, min(int(timeout_seconds or 15), 40))
    response = requests.get(
        normalized_url,
        headers=headers,
        timeout=timeout,
        allow_redirects=True,
    )
    response.raise_for_status()
    extracted = _extract_main_text_and_title(response.text or "")
    return {
        "ok": True,
        "url": normalized_url,
        "final_url": response.url,
        "status_code": response.status_code,
        "user_agent_key": ua["key"],
        "user_agent_value": ua["value"],
        "title": extracted["title"],
        "text": extracted["text"],
        "error": "",
    }