"""Plain-text extraction helpers for PDF byte strings and web pages."""

import io
import logging

import requests
import trafilatura
from bs4 import BeautifulSoup
from pypdf import PdfReader

_logger = logging.getLogger(__name__)

# User-Agent header sent by the requests-based fallback in extract_from_url.
UA = "Mozilla/5.0 (compatible; PRIRBot/1.0)"


def extract_from_pdf(file_bytes: bytes) -> str:
    """Return the text of every page of a PDF, joined with newlines.

    Extraction is best-effort per page: a page whose text extraction raises
    is skipped (and logged) so that one bad page does not discard the rest
    of the document. A page that yields no text contributes an empty string.

    Args:
        file_bytes: Raw bytes of the PDF document.

    Returns:
        The extracted page texts joined with "\\n"; an empty string when no
        page produced any text.

    Errors raised while constructing the ``PdfReader`` itself are not caught
    here and propagate to the caller.
    """
    reader = PdfReader(io.BytesIO(file_bytes))
    texts = []
    for page_number, page in enumerate(reader.pages, start=1):
        try:
            texts.append(page.extract_text() or "")
        except Exception:
            # Deliberately broad: keep going with the remaining pages, but
            # record the failure instead of hiding it.
            _logger.warning(
                "Skipping PDF page %d: text extraction failed",
                page_number,
                exc_info=True,
            )
    return "\n".join(texts)


def extract_from_url(url: str) -> str:
    """Return the readable text of the web page at *url*.

    The page is first fetched and extracted with trafilatura (comments
    excluded, tables included). If trafilatura returns nothing for either
    the download or the extraction, the page is fetched again with
    ``requests`` and all text nodes are returned via BeautifulSoup.

    Args:
        url: Address of the page to fetch.

    Returns:
        The trafilatura extraction when it is non-empty, otherwise the
        BeautifulSoup text of the response with "\\n" as the separator.

    Exceptions raised by ``requests.get`` are not caught here and propagate
    to the caller.
    """
    downloaded = trafilatura.fetch_url(url)
    if downloaded:
        txt = trafilatura.extract(
            downloaded,
            include_comments=False,
            include_tables=True,
        )
        if txt:
            return txt

    # Fallback: plain HTTP fetch with a 20-second timeout and our own UA.
    # NOTE(review): the HTTP status is not checked, so the body of an error
    # response (e.g. a 404 page) is returned as text — confirm this is intended.
    resp = requests.get(url, headers={"User-Agent": UA}, timeout=20)
    soup = BeautifulSoup(resp.text, "html.parser")
    return soup.get_text("\n")