TenderIQ / core /pdf_utils.py
JaydeepR's picture
Step 5: pdf_utils and chunker — PyMuPDF extraction and text chunking
f42bfb0
raw
history blame contribute delete
942 Bytes
from pathlib import Path
import fitz
import PIL.Image
def extract_pages(path: Path) -> list[dict]:
    """Extract the plain text of every page in a PDF.

    Args:
        path: Location of the PDF file to read.

    Returns:
        One dict per page, in document order, of the form
        ``{"page": <1-based page number>, "text": <extracted text>}``.
    """
    # The context manager closes the document even when text extraction
    # raises part-way through, so the file handle is never leaked.
    with fitz.open(str(path)) as doc:
        return [
            {"page": number, "text": page.get_text("text")}
            for number, page in enumerate(doc, start=1)
        ]
def is_text_pdf(path: Path, *, min_avg_chars: float = 50) -> bool:
    """Report whether a PDF carries an extractable text layer.

    The decision is based on the average number of characters that
    ``get_text("text")`` yields per page.

    Args:
        path: Location of the PDF file to inspect.
        min_avg_chars: Minimum average character count per page for the
            document to count as a text PDF. Defaults to 50.

    Returns:
        ``True`` when the per-page average reaches ``min_avg_chars``;
        ``False`` otherwise, including for a document with no pages.
    """
    # The context manager closes the document on every exit path,
    # including exceptions raised while extracting text.
    with fitz.open(str(path)) as doc:
        page_count = doc.page_count
        if not page_count:
            # Guard against dividing by zero on an empty document.
            return False
        total_chars = sum(len(page.get_text("text")) for page in doc)
    return total_chars / page_count >= min_avg_chars
def render_page_to_image(path: Path, page_no: int, dpi: int = 200) -> PIL.Image.Image:
    """Rasterize a single PDF page into an RGB Pillow image.

    Args:
        path: Location of the PDF file to read.
        page_no: 1-based number of the page to render.
        dpi: Target resolution. The page is scaled by ``dpi / 72`` on both
            axes. Defaults to 200.

    Returns:
        The rendered page as an ``"RGB"`` mode image.

    Raises:
        IndexError: If ``page_no`` is less than 1. A ``page_no`` beyond the
            last page is rejected by the document's own index lookup.
    """
    # page_no is 1-based; without this check 0 would turn into index -1 and
    # silently render the last page instead of failing.
    if page_no < 1:
        raise IndexError(f"page_no must be >= 1, got {page_no}")

    # The context manager closes the document even if the lookup or the
    # rendering step raises.
    with fitz.open(str(path)) as doc:
        page = doc[page_no - 1]
        zoom = dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), colorspace=fitz.csRGB)
        return PIL.Image.frombytes("RGB", (pix.width, pix.height), pix.samples)