File size: 368 Bytes
94b06be
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
from bs4 import BeautifulSoup
from pypdf import PdfReader
from io import BytesIO

def html_to_text(html: str) -> str:
    """Return the text content of an HTML document.

    The markup is parsed with the ``html.parser`` backend, and the
    extracted text pieces are joined with a newline separator.
    """
    document = BeautifulSoup(html, features="html.parser")
    text = document.get_text("\n")
    return text

def pdf_bytes_to_text(b: bytes) -> str:
    """Return the text extracted from an in-memory PDF.

    The raw bytes are wrapped in a ``BytesIO`` buffer and handed to
    ``PdfReader``. Each page's extracted text becomes one entry in the
    result, and entries are joined with newlines.
    """
    pdf = PdfReader(BytesIO(b))
    chunks = []
    for page in pdf.pages:
        extracted = page.extract_text()
        # A falsy extraction result still contributes an (empty) entry so
        # the number of joined entries matches the number of pages.
        chunks.append(extracted if extracted else "")
    return "\n".join(chunks)