Spaces:
Sleeping
Sleeping
| from bs4 import BeautifulSoup | |
| from pypdf import PdfReader | |
| from io import BytesIO | |
def html_to_text(html: str) -> str:
    """Return the text content of an HTML string.

    The markup is parsed with Beautiful Soup using the ``"html.parser"``
    backend, and the text is collected via ``get_text`` with a newline
    as the separator between text fragments.

    Args:
        html: The HTML markup to convert.

    Returns:
        The text produced by ``get_text(separator="\\n")``. No ``strip``
        argument is passed, so whitespace handling is left at the
        library default.
    """
    document = BeautifulSoup(markup=html, features="html.parser")
    text = document.get_text(separator="\n")
    return text
def pdf_bytes_to_text(b: bytes) -> str:
    """Extract text from an in-memory PDF.

    The bytes are wrapped in a ``BytesIO`` stream and handed to
    ``PdfReader``; ``extract_text()`` is then called on each page in
    order.

    Args:
        b: The raw bytes of the PDF.

    Returns:
        The per-page text joined with ``"\\n"``. A page whose
        ``extract_text()`` result is falsy (e.g. ``None`` or ``""``)
        contributes an empty string, so it still occupies a slot in the
        joined output.
    """
    stream = BytesIO(b)
    document = PdfReader(stream)

    page_texts = []
    for page in document.pages:
        extracted = page.extract_text()
        # Substitute an empty string for falsy results so the join below
        # only ever sees str values.
        page_texts.append(extracted if extracted else "")

    return "\n".join(page_texts)