NotebookLM / ingestion_engine /pdf_extractor.py
internomega-terrablue
ingestion changes
9f911b3
raw
history blame contribute delete
345 Bytes
"""PDF text extraction using PyMuPDF."""
import fitz
def extract(file_path: str) -> str:
    """Extract text from all pages of a PDF file.

    Args:
        file_path: Path of the PDF document to open with PyMuPDF.

    Returns:
        The text of every page that contains at least one non-whitespace
        character, in page order, joined by a blank line ("\\n\\n").
        Pages whose text is empty or whitespace-only are skipped, so the
        result is an empty string when no page yields text.
    """
    # Open the document as a context manager so it is closed even when
    # text extraction fails part-way through (the previous explicit
    # close() was skipped whenever get_text() raised).
    with fitz.open(file_path) as doc:
        page_texts = (page.get_text() for page in doc)
        # Materialize inside the `with` block: pages cannot be read once
        # the document has been closed.
        pages = [text for text in page_texts if text.strip()]
    return "\n\n".join(pages)