# New_Aggregator_v1 / fetcher.py
# Author: saran14
# Initial commit: Pakistani News Aggregator Flask App (796e8b5)
import requests
import feedparser
from bs4 import BeautifulSoup
from readability import Document
from dateutil import parser as dateparser
from datetime import datetime, timezone
import time
from typing import Optional
from config import USER_AGENT
from urllib.parse import urljoin
def http_get(url, timeout=30):
    """GET *url* with browser-like headers.

    Retries once without certificate verification if the TLS handshake fails.
    Returns the `requests.Response`; other request errors propagate.
    """
    request_headers = {
        "User-Agent": USER_AGENT,
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://duckduckgo.com/",
        "Connection": "keep-alive",
        "Accept": "application/rss+xml, application/xml, text/xml",
    }
    try:
        return requests.get(url, headers=request_headers, timeout=timeout, verify=True)
    except requests.exceptions.SSLError:
        # Best-effort fallback for feeds behind broken certificate chains.
        # NOTE(review): verify=False disables TLS validation — tolerable here
        # only because this fetches public news content; confirm that holds.
        return requests.get(url, headers=request_headers, timeout=timeout, verify=False)
def parse_rss(rss_url: str):
    """Download an RSS/Atom feed and parse it with feedparser.

    Fetches via http_get (custom headers + SSL fallback) rather than letting
    feedparser download the URL itself; raises for non-2xx responses.
    """
    response = http_get(rss_url)
    response.raise_for_status()
    return feedparser.parse(response.content)
def iso_dt(s: Optional[str]) -> str:
    """Normalize a date string to an ISO-8601 timestamp in UTC.

    Naive datetimes are assumed to already be UTC. When *s* is empty/None or
    cannot be parsed, the current UTC time is returned instead.
    """
    if s:
        try:
            parsed = dateparser.parse(s)
            if parsed.tzinfo is None:
                # No zone info: treat as UTC rather than local time.
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed.astimezone(timezone.utc).isoformat()
        except Exception:
            pass  # unparseable input falls through to the "now" fallback
    return datetime.now(timezone.utc).isoformat()
def _strip_boilerplate(tag):
    """Remove scripts, page chrome, ads, and share-widget noise from *tag* in place."""
    # Structural elements that never belong to article body text.
    for unwanted in tag.find_all(["script", "style", "nav", "header", "footer", "aside", "button", "form"]):
        unwanted.decompose()
    # Class names commonly used for social widgets, ads, and promos.
    for unwanted in tag.find_all(class_=["share", "social", "btn", "button", "ad", "advertisement", "promo", "read-more", "email", "subscribe"]):
        unwanted.decompose()
    # Stray call-to-action text nodes: drop the element containing them.
    # (`string=` is the modern spelling of bs4's deprecated `text=` argument.)
    for unwanted in tag.find_all(string=["Read more", "Email", "Subscribe", "Share", "Tweet", "Like"]):
        if unwanted.parent:
            unwanted.parent.decompose()


def extract_article_content(url: str):
    """
    Fetch article page and extract readable HTML + a top image if possible.
    Uses readability + BeautifulSoup cleanup. Works for most publisher pages.

    Returns:
        dict with keys "content_html", "top_image", "title"; each value is
        None when that piece could not be extracted.
    """
    try:
        resp = http_get(url, timeout=25)
        resp.raise_for_status()
    except Exception:
        # Best-effort fetch: any network/HTTP error yields an empty result.
        return {"content_html": None, "top_image": None, "title": None}
    html = resp.text
    content_html = None
    title = None
    # First choice: readability's article extraction.
    try:
        doc = Document(html)
        content_html = doc.summary(html_partial=True)
        title = doc.short_title()
    except Exception:
        content_html = None
        title = None
    # Fallback: pull <article>, else the text-heaviest <div>, else <body>.
    if not content_html:
        soup = BeautifulSoup(html, "lxml")
        article_tag = soup.find("article")
        if article_tag:
            _strip_boilerplate(article_tag)
            content_html = str(article_tag)
        else:
            # Heuristic: the <div> carrying the most text is usually the body copy.
            divs = soup.find_all("div")
            largest = max(divs, key=lambda d: len(d.get_text(strip=True)), default=None)
            if largest and len(largest.get_text(strip=True)) > 200:
                _strip_boilerplate(largest)
                content_html = str(largest)
            else:
                # Last resort: ship the whole <body> as-is.
                body = soup.find("body")
                if body:
                    content_html = str(body)
    # Find a representative image: og:image metadata first, then the first
    # <img> inside <article> (or anywhere on the page).
    top_image = None
    try:
        # Re-parse the raw HTML so decompose() calls above cannot have
        # removed candidate images.
        soup = BeautifulSoup(html, "lxml")
        og = soup.find("meta", property="og:image") or soup.find("meta", attrs={"name": "og:image"})
        if og and og.get("content"):
            top_image = urljoin(url, og["content"])
        else:
            img = soup.find("article")
            img = img.find("img") if img else soup.find("img")
            if img and img.get("src"):
                top_image = urljoin(url, img["src"])
    except Exception:
        pass
    return {"content_html": content_html, "top_image": top_image, "title": title}