ai-seo-analyzer / url_fetcher.py
from typing import Dict, List
from urllib.parse import urlparse
import re
import requests
from bs4 import BeautifulSoup
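
# User-agent presets a caller can pick from when fetching a page:
# search-engine crawlers, OpenAI's bots, and a regular desktop browser.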
USER_AGENT_PRESETS: List[Dict[str, str]] = [
{
"key": "googlebot",
"name": "Googlebot",
"value": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
},
{
"key": "bingbot",
"name": "Bingbot",
"value": "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
},
{
"key": "chatgpt_user",
"name": "ChatGPT-User",
"value": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
},
{
"key": "gptbot",
"name": "GPTBot",
"value": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.0; +https://openai.com/gptbot",
},
{
"key": "chrome_desktop",
"name": "Chrome (Desktop)",
"value": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
},
]


def get_user_agent_presets() -> List[Dict[str, str]]:
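    """Return the list of selectable user-agent presets."""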
return USER_AGENT_PRESETS


def _normalize_whitespace(text: str) -> str:
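    """Collapse runs of spaces and tabs and cap consecutive blank lines at one."""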
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()


def _normalize_url(url: str) -> str:
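    """Validate a user-supplied URL, assuming https:// when no scheme is given."""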
raw = (url or "").strip()
if not raw:
raise ValueError("URL пустой.")
if not raw.startswith(("http://", "https://")):
raw = "https://" + raw
parsed = urlparse(raw)
if not parsed.scheme or not parsed.netloc:
raise ValueError("Некорректный URL.")
return raw


def _resolve_user_agent(user_agent_key: str) -> Dict[str, str]:
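    """Look up a preset by key, falling back to desktop Chrome, then to the first preset."""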
key = (user_agent_key or "").strip()
for ua in USER_AGENT_PRESETS:
if ua["key"] == key:
return ua
for ua in USER_AGENT_PRESETS:
if ua["key"] == "chrome_desktop":
return ua
return USER_AGENT_PRESETS[0]


def _extract_main_text_and_title(html: str) -> Dict[str, str]:
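    """Extract the page title and the most likely main-content text from raw HTML."""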
soup = BeautifulSoup(html or "", "html.parser")
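    # Drop elements that never contain article text: scripts, styles, page chrome, embeds.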
for bad in soup(["script", "style", "noscript", "svg", "nav", "footer", "header", "aside", "form", "iframe"]):
bad.decompose()
title = ""
if soup.title:
title = _normalize_whitespace(soup.title.get_text(" ", strip=True))
best_text = ""
# Priority 1: semantic containers usually containing article text.
candidates = []
for selector, boost in (("article", 1.2), ("main", 1.1)):
for node in soup.select(selector):
t = _normalize_whitespace(node.get_text("\n", strip=True))
if len(t) >= 200:
candidates.append((len(t) * boost, t))
if candidates:
best_text = max(candidates, key=lambda x: x[0])[1]
else:
# Priority 2: collect meaningful paragraphs and list content.
paragraphs: List[str] = []
for p in soup.find_all(["p", "li"]):
txt = _normalize_whitespace(p.get_text(" ", strip=True))
if len(txt) >= 40:
paragraphs.append(txt)
if len(paragraphs) >= 3:
best_text = "\n\n".join(paragraphs)
# Priority 3: fallback to body text.
if not best_text:
body = soup.body if soup.body else soup
best_text = _normalize_whitespace(body.get_text("\n", strip=True))
return {
"title": title,
"text": best_text,
}


def fetch_url_content(url: str, user_agent_key: str = "chrome_desktop", timeout_seconds: int = 15) -> Dict[str, object]:
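    """Fetch a URL with the chosen user-agent preset and return its title, extracted text, and response metadata."""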
normalized_url = _normalize_url(url)
ua = _resolve_user_agent(user_agent_key)
headers = {
"User-Agent": ua["value"],
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
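    # Clamp the timeout to the 5-40 second range so odd inputs cannot hang or disable the request.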
response = requests.get(
normalized_url,
headers=headers,
timeout=max(5, min(int(timeout_seconds or 15), 40)),
allow_redirects=True,
)
response.raise_for_status()
extracted = _extract_main_text_and_title(response.text or "")
return {
"ok": True,
"url": normalized_url,
"final_url": response.url,
"status_code": response.status_code,
"user_agent_key": ua["key"],
"user_agent_value": ua["value"],
"title": extracted["title"],
"text": extracted["text"],
"error": "",
}
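

# Minimal usage sketch (not part of the original module): fetch a page as
# Googlebot and print what was extracted. The URL below is a placeholder;
# any reachable page works.
if __name__ == "__main__":
    result = fetch_url_content("https://example.com", user_agent_key="googlebot")
    print(result["status_code"], result["final_url"])
    print(result["title"])
    print(result["text"][:300])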