# New_Aggregator_v1 / fetcher.py
# Author: saran14
# Initial commit: Pakistani News Aggregator Flask App (796e8b5)
import requests
import feedparser
from bs4 import BeautifulSoup
from readability import Document
from dateutil import parser as dateparser
from datetime import datetime, timezone
import time
from typing import Optional
from config import USER_AGENT
from urllib.parse import urljoin
def http_get(url, timeout=30):
    """GET *url* with browser-like headers.

    Retries once without certificate verification if the TLS handshake fails.
    Returns the `requests.Response`; other request errors propagate.
    """
    request_headers = {
        "User-Agent": USER_AGENT,
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://duckduckgo.com/",
        "Connection": "keep-alive",
        "Accept": "application/rss+xml, application/xml, text/xml",
    }
    try:
        return requests.get(url, headers=request_headers, timeout=timeout, verify=True)
    except requests.exceptions.SSLError:
        # Best-effort fallback for feeds behind broken certificate chains.
        # NOTE(review): verify=False disables TLS validation — tolerable here
        # only because this fetches public news content; confirm that holds.
        return requests.get(url, headers=request_headers, timeout=timeout, verify=False)
def parse_rss(rss_url: str):
    """Download an RSS/Atom feed and parse it with feedparser.

    Fetches via http_get (custom headers + SSL fallback) rather than letting
    feedparser download the URL itself; raises for non-2xx responses.
    """
    response = http_get(rss_url)
    response.raise_for_status()
    return feedparser.parse(response.content)
def iso_dt(s: Optional[str]) -> str:
    """Normalize a date string to an ISO-8601 timestamp in UTC.

    Naive datetimes are assumed to already be UTC. When *s* is empty/None or
    cannot be parsed, the current UTC time is returned instead.
    """
    if s:
        try:
            parsed = dateparser.parse(s)
            if parsed.tzinfo is None:
                # No zone info: treat as UTC rather than local time.
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed.astimezone(timezone.utc).isoformat()
        except Exception:
            pass  # unparseable input falls through to the "now" fallback
    return datetime.now(timezone.utc).isoformat()
def _strip_boilerplate(tag):
    """Remove scripts, page chrome, ads, and share-widget noise from *tag* in place."""
    # Structural elements that never belong to article body text.
    for unwanted in tag.find_all(["script", "style", "nav", "header", "footer", "aside", "button", "form"]):
        unwanted.decompose()
    # Class names commonly used for social widgets, ads, and promos.
    for unwanted in tag.find_all(class_=["share", "social", "btn", "button", "ad", "advertisement", "promo", "read-more", "email", "subscribe"]):
        unwanted.decompose()
    # Stray call-to-action text nodes: drop the element containing them.
    # (`string=` is the modern spelling of bs4's deprecated `text=` argument.)
    for unwanted in tag.find_all(string=["Read more", "Email", "Subscribe", "Share", "Tweet", "Like"]):
        if unwanted.parent:
            unwanted.parent.decompose()


def extract_article_content(url: str):
    """
    Fetch article page and extract readable HTML + a top image if possible.
    Uses readability + BeautifulSoup cleanup. Works for most publisher pages.

    Returns:
        dict with keys "content_html", "top_image", "title"; each value is
        None when that piece could not be extracted.
    """
    try:
        resp = http_get(url, timeout=25)
        resp.raise_for_status()
    except Exception:
        # Best-effort fetch: any network/HTTP error yields an empty result.
        return {"content_html": None, "top_image": None, "title": None}
    html = resp.text
    content_html = None
    title = None
    # First choice: readability's article extraction.
    try:
        doc = Document(html)
        content_html = doc.summary(html_partial=True)
        title = doc.short_title()
    except Exception:
        content_html = None
        title = None
    # Fallback: pull <article>, else the text-heaviest <div>, else <body>.
    if not content_html:
        soup = BeautifulSoup(html, "lxml")
        article_tag = soup.find("article")
        if article_tag:
            _strip_boilerplate(article_tag)
            content_html = str(article_tag)
        else:
            # Heuristic: the <div> carrying the most text is usually the body copy.
            divs = soup.find_all("div")
            largest = max(divs, key=lambda d: len(d.get_text(strip=True)), default=None)
            if largest and len(largest.get_text(strip=True)) > 200:
                _strip_boilerplate(largest)
                content_html = str(largest)
            else:
                # Last resort: ship the whole <body> as-is.
                body = soup.find("body")
                if body:
                    content_html = str(body)
    # Find a representative image: og:image metadata first, then the first
    # <img> inside <article> (or anywhere on the page).
    top_image = None
    try:
        # Re-parse the raw HTML so decompose() calls above cannot have
        # removed candidate images.
        soup = BeautifulSoup(html, "lxml")
        og = soup.find("meta", property="og:image") or soup.find("meta", attrs={"name": "og:image"})
        if og and og.get("content"):
            top_image = urljoin(url, og["content"])
        else:
            img = soup.find("article")
            img = img.find("img") if img else soup.find("img")
            if img and img.get("src"):
                top_image = urljoin(url, img["src"])
    except Exception:
        pass
    return {"content_html": content_html, "top_image": top_image, "title": title}