"""Plain-text extraction helpers for PDF byte streams and web pages."""

import io
import logging
from typing import Optional

import requests
import trafilatura
from bs4 import BeautifulSoup
from pypdf import PdfReader

_logger = logging.getLogger(__name__)

USER_AGENT = "Mozilla/5.0 (compatible; PRIRBot/1.0)"


def extract_from_pdf(file_bytes: bytes) -> str:
    """Return the text of every page of a PDF, joined by newlines.

    Extraction is best-effort per page: a page whose text extraction raises
    is logged and skipped, so one bad page does not discard the rest of the
    document. A page that yields no text contributes an empty string.

    Args:
        file_bytes: Raw contents of the PDF file.

    Returns:
        The extracted page texts joined with ``"\\n"``.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    texts = []
    for page_index, page in enumerate(reader.pages):
        try:
            texts.append(page.extract_text() or "")
        except Exception:
            # Deliberately broad: keep going on any per-page failure, but
            # leave a trace instead of dropping the page silently.
            _logger.warning(
                "Skipping PDF page %d: text extraction failed",
                page_index,
                exc_info=True,
            )
    return "\n".join(texts)


def extract_from_url(url: str) -> str:
    """Return the readable text content of the web page at *url*.

    trafilatura is tried first (comments excluded, tables included). If its
    download returns nothing, or extraction yields no text, the page is
    fetched again with ``requests`` and all text nodes are returned via
    BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The extracted text. In the fallback path this is the text of every
        node in the document, separated by newlines.
    """
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        txt = trafilatura.extract(
            downloaded, include_comments=False, include_tables=True
        )
        if txt:
            return txt

    # Fallback: plain download and tag stripping.
    # NOTE(review): the HTTP status is not checked here, so the body of an
    # error page (e.g. 404) is returned as text - confirm this is intended.
    resp = requests.get(url, headers={"User-Agent": USER_AGENT}, timeout=20)
    soup = BeautifulSoup(resp.text, "html.parser")
    return soup.get_text("\n")