Pundit_Feynman / utils /pdf_processor.py
Wckd314's picture
Upload 7 files
fd1afd0 verified
raw
history blame contribute delete
807 Bytes
import base64
import fitz # PyMuPDF
def process_pdf_to_base64(pdf_path: str, dpi: int = 150) -> list[str]:
    """Render every page of a PDF to a base64-encoded JPEG string.

    Each page is rasterised with PyMuPDF at the requested resolution using
    ``get_pixmap``'s default colorspace (RGB per the PyMuPDF docs), so
    color-coded graphs in papers keep their colors.

    Args:
        pdf_path: Path of the PDF file to open.
        dpi: Rendering resolution passed to ``page.get_pixmap``.

    Returns:
        One base64 string (UTF-8 decoded) per page, in page order.

    Raises:
        Exception: Any error raised while opening or rendering the document
            is printed and then re-raised unchanged.
    """
    try:
        base64_images = []
        # The context manager closes the document even when rendering a page
        # fails part-way through, so the file handle is never leaked.
        with fitz.open(pdf_path) as doc:
            for page in doc:
                pix = page.get_pixmap(dpi=dpi)
                img_bytes = pix.tobytes("jpeg")
                base64_images.append(base64.b64encode(img_bytes).decode("utf-8"))
        print(f"Extracted {len(base64_images)} pages at {dpi} DPI (color preserved)")
        return base64_images
    except Exception as e:
        print(f"Error processing PDF: {e}")
        # Bare raise re-raises the active exception with its original traceback.
        raise