deepdive-IR / ingest /parse.py
Ritabanm's picture
Upload 25 files
94b06be verified
raw
history blame contribute delete
368 Bytes
from bs4 import BeautifulSoup
from pypdf import PdfReader
from io import BytesIO
def html_to_text(html: str) -> str:
    """Return the text content of an HTML document.

    The markup is parsed with the ``html.parser`` backend, and the text
    fragments found in the tree are joined with newline characters.

    Args:
        html: HTML markup to parse.

    Returns:
        The document's text, with a newline between adjacent fragments.
    """
    return BeautifulSoup(html, "html.parser").get_text(separator="\n")
def pdf_bytes_to_text(b: bytes) -> str:
    """Return the text of an in-memory PDF, one page after another.

    Args:
        b: Raw bytes of the PDF file.

    Returns:
        The per-page text joined with newline characters. A page whose
        extraction yields nothing (``None`` or an empty string) contributes
        an empty string, so the page count is still reflected in the
        number of separators.
    """
    # PdfReader is given a file-like object, so wrap the bytes in a buffer.
    document = PdfReader(BytesIO(b))

    page_texts = []
    for page in document.pages:
        extracted = page.extract_text()
        # Normalise a falsy extraction result to "" so the join cannot fail.
        page_texts.append(extracted if extracted else "")

    return "\n".join(page_texts)