ai-seo-analyzer / url_fetcher.py
from typing import Dict, List
from urllib.parse import urlparse
import re
import requests
from bs4 import BeautifulSoup
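
# User-agent presets a caller can pick from when fetching a page:
# search-engine crawlers, OpenAI's bots, and a regular desktop browser.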
USER_AGENT_PRESETS: List[Dict[str, str]] = [
{
"key": "googlebot",
"name": "Googlebot",
"value": "Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)",
},
{
"key": "bingbot",
"name": "Bingbot",
"value": "Mozilla/5.0 (compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm)",
},
{
"key": "chatgpt_user",
"name": "ChatGPT-User",
"value": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; ChatGPT-User/1.0; +https://openai.com/bot",
},
{
"key": "gptbot",
"name": "GPTBot",
"value": "Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko); compatible; GPTBot/1.0; +https://openai.com/gptbot",
},
{
"key": "chrome_desktop",
"name": "Chrome (Desktop)",
"value": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
},
]


def get_user_agent_presets() -> List[Dict[str, str]]:
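    """Return the list of selectable user-agent presets."""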
return USER_AGENT_PRESETS


def _normalize_whitespace(text: str) -> str:
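    """Collapse runs of spaces and tabs and cap consecutive blank lines at one."""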
text = re.sub(r"[ \t]+", " ", text)
text = re.sub(r"\n{3,}", "\n\n", text)
return text.strip()


def _normalize_url(url: str) -> str:
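    """Validate a user-supplied URL, assuming https:// when no scheme is given."""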
raw = (url or "").strip()
if not raw:
raise ValueError("URL пустой.")
if not raw.startswith(("http://", "https://")):
raw = "https://" + raw
parsed = urlparse(raw)
if not parsed.scheme or not parsed.netloc:
raise ValueError("Некорректный URL.")
return raw


def _resolve_user_agent(user_agent_key: str) -> Dict[str, str]:
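    """Look up a preset by key, falling back to desktop Chrome, then to the first preset."""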
key = (user_agent_key or "").strip()
for ua in USER_AGENT_PRESETS:
if ua["key"] == key:
return ua
for ua in USER_AGENT_PRESETS:
if ua["key"] == "chrome_desktop":
return ua
return USER_AGENT_PRESETS[0]


def _extract_main_text_and_title(html: str) -> Dict[str, str]:
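    """Extract the page title and the most likely main-content text from raw HTML."""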
soup = BeautifulSoup(html or "", "html.parser")
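    # Drop elements that never contain article text: scripts, styles, page chrome, embeds.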
for bad in soup(["script", "style", "noscript", "svg", "nav", "footer", "header", "aside", "form", "iframe"]):
bad.decompose()
title = ""
if soup.title:
title = _normalize_whitespace(soup.title.get_text(" ", strip=True))
best_text = ""
# Priority 1: semantic containers usually containing article text.
candidates = []
for selector, boost in (("article", 1.2), ("main", 1.1)):
for node in soup.select(selector):
t = _normalize_whitespace(node.get_text("\n", strip=True))
if len(t) >= 200:
candidates.append((len(t) * boost, t))
if candidates:
best_text = max(candidates, key=lambda x: x[0])[1]
else:
# Priority 2: collect meaningful paragraphs and list content.
paragraphs: List[str] = []
for p in soup.find_all(["p", "li"]):
txt = _normalize_whitespace(p.get_text(" ", strip=True))
if len(txt) >= 40:
paragraphs.append(txt)
if len(paragraphs) >= 3:
best_text = "\n\n".join(paragraphs)
# Priority 3: fallback to body text.
if not best_text:
body = soup.body if soup.body else soup
best_text = _normalize_whitespace(body.get_text("\n", strip=True))
return {
"title": title,
"text": best_text,
}


def fetch_url_content(url: str, user_agent_key: str = "chrome_desktop", timeout_seconds: int = 15) -> Dict[str, object]:
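    """Fetch a URL with the chosen user-agent preset and return its title, extracted text, and response metadata."""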
normalized_url = _normalize_url(url)
ua = _resolve_user_agent(user_agent_key)
headers = {
"User-Agent": ua["value"],
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}
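    # Clamp the timeout to the 5-40 second range so odd inputs cannot hang or disable the request.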
response = requests.get(
normalized_url,
headers=headers,
timeout=max(5, min(int(timeout_seconds or 15), 40)),
allow_redirects=True,
)
response.raise_for_status()
extracted = _extract_main_text_and_title(response.text or "")
return {
"ok": True,
"url": normalized_url,
"final_url": response.url,
"status_code": response.status_code,
"user_agent_key": ua["key"],
"user_agent_value": ua["value"],
"title": extracted["title"],
"text": extracted["text"],
"error": "",
}
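

# Minimal usage sketch (not part of the original module): fetch a page as
# Googlebot and print what was extracted. The URL below is a placeholder;
# any reachable page works.
if __name__ == "__main__":
    result = fetch_url_content("https://example.com", user_agent_key="googlebot")
    print(result["status_code"], result["final_url"])
    print(result["title"])
    print(result["text"][:300])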