"""Plain-text extraction helpers for HTML markup and PDF byte payloads."""

from io import BytesIO

from bs4 import BeautifulSoup
from pypdf import PdfReader


def html_to_text(html: str) -> str:
    """Return the text content of an HTML document.

    The markup is parsed with Beautiful Soup using the ``"html.parser"``
    backend, and the text is collected with a newline as the separator.

    Args:
        html: HTML markup to parse.

    Returns:
        The result of ``get_text(separator="\\n")`` on the parsed document.
    """
    soup = BeautifulSoup(html, "html.parser")
    return soup.get_text(separator="\n")


def pdf_bytes_to_text(b: bytes) -> str:
    """Return the text of every page of an in-memory PDF, newline-joined.

    Args:
        b: Raw bytes of a PDF document.

    Returns:
        The per-page extracted text joined with ``"\\n"``. A page whose
        extraction yields a falsy value (``None`` or ``""``) contributes an
        empty string, so the number of joined segments always equals the
        number of pages.
    """
    # PdfReader is given a file-like object here, so wrap the bytes in BytesIO.
    reader = PdfReader(BytesIO(b))
    # `or ""` guards the join against a page that returns None.
    pages = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(pages)