Spaces:
Sleeping
Sleeping
File size: 368 Bytes
94b06be |
1 2 3 4 5 6 7 8 9 10 11 12 13 |
from bs4 import BeautifulSoup
from pypdf import PdfReader
from io import BytesIO
def html_to_text(html: str) -> str:
    """Return the text content of an HTML string with the markup removed.

    The input is parsed with BeautifulSoup using the ``html.parser``
    backend, and the extracted text fragments are joined with a newline
    separator.

    Args:
        html: HTML source to convert.

    Returns:
        The text extracted from the parsed document.
    """
    return BeautifulSoup(html, "html.parser").get_text(separator="\n")
def pdf_bytes_to_text(b: bytes) -> str:
    """Return the text of every page of an in-memory PDF, newline-joined.

    The raw bytes are wrapped in a ``BytesIO`` and handed to ``PdfReader``.
    Each page contributes the result of ``extract_text()``; a falsy result
    (for example ``None`` or an empty string) is replaced by ``""`` so the
    page still occupies its slot in the joined output.

    Args:
        b: Raw bytes of the PDF document.

    Returns:
        The per-page text joined with ``"\\n"``.
    """
    document = PdfReader(BytesIO(b))
    page_texts = []
    for page in document.pages:
        extracted = page.extract_text()
        # Keep one entry per page even when nothing could be extracted.
        page_texts.append(extracted if extracted else "")
    return "\n".join(page_texts)
|