Spaces:

nikkAnshul
/

RightINFO

Running

RightINFO / url_handler.py

anshul-rohilla4

Initial clean upload

d08010b 9 days ago

2.59 kB

	import requests
	from bs4 import BeautifulSoup
	from urllib.parse import urljoin, urlparse


	def scrape_url(url: str) -> dict:
	headers = {
	"User-Agent": (
	"Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
	"AppleWebKit/537.36 (KHTML, like Gecko) "
	"Chrome/120.0.0.0 Safari/537.36"
	),
	"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,/;q=0.8",
	"Accept-Language": "en-US,en;q=0.5",
	"Accept-Encoding": "gzip, deflate",
	"Connection": "keep-alive",
	}

	resp = requests.get(url, headers=headers, timeout=15)
	resp.raise_for_status()
	soup = BeautifulSoup(resp.text, "html.parser")

	# ── Title ─────────────────────────────────────────────
	title = ""
	if soup.find("h1"):
	title = soup.find("h1").get_text(strip=True)
	elif soup.title:
	title = soup.title.get_text(strip=True)

	# ── Text ──────────────────────────────────────────────
	# Remove nav, footer, script, style tags first
	for tag in soup(["script", "style", "nav", "footer", "header", "aside"]):
	tag.decompose()

	paragraphs = [
	p.get_text(strip=True)
	for p in soup.find_all("p")
	if len(p.get_text(strip=True)) > 40 # skip short/nav paragraphs
	]
	text = " ".join(paragraphs)

	if not text or len(text) < 100:
	# Fallback — grab all visible text
	text = soup.get_text(separator=" ", strip=True)

	# Limit to 10K words
	text = " ".join(text.split()[:10000])

	# ── Images ────────────────────────────────────────────
	imgs = []
	for img in soup.find_all("img"):
	if len(imgs) >= 5:
	break
	src = img.get("src") or img.get("data-src") or ""
	if not src or src.startswith("data:"):
	continue

	# Resolve relative URLs
	src = urljoin(url, src)
	if not src.startswith("http"):
	continue

	# Skip tiny icons
	try:
	w = int(img.get("width", 0))
	h = int(img.get("height", 0))
	if w and h and (w < 100 or h < 100):
	continue
	except (ValueError, TypeError):
	pass

	imgs.append(src)

	return {
	"title": title,
	"text": text,
	"images": imgs
	}