ocr-engine-2 / src /utils /_preprocessor.py
kanha-upadhyay's picture
Enhance PDF processing: validate file type, implement temporary file handling, and add preprocessing functionality
4994e6b
import io
import fitz
from fastapi import UploadFile
from PIL import Image, ImageEnhance, ImageFilter
class PDFPreprocessor:
    """Rasterize a PDF and rebuild it from sharpened, contrast-boosted pages.

    Every page of the uploaded PDF is rendered to a grayscale bitmap,
    run through an unsharp mask and a contrast enhancement, and placed on a
    new page of the same size in a fresh PDF.
    """

    # Rendering resolution passed to ``page.get_pixmap``.
    RENDER_DPI = 300
    # Arguments for ``ImageFilter.UnsharpMask``.
    SHARPEN_RADIUS = 1
    SHARPEN_PERCENT = 150
    SHARPEN_THRESHOLD = 3
    # Factor passed to ``ImageEnhance.Contrast(...).enhance``.
    CONTRAST_FACTOR = 1.5

    @staticmethod
    def _enhance_page_image(page) -> bytes:
        """Render *page* in grayscale and return the enhanced image as PNG bytes."""
        pix = page.get_pixmap(
            dpi=PDFPreprocessor.RENDER_DPI, colorspace=fitz.csGRAY
        )
        # NOTE(review): relies on ``Pixmap.tobytes()`` emitting an image format
        # Pillow can decode (PNG by default per PyMuPDF docs) - confirm for the
        # pinned PyMuPDF version.
        img = Image.open(io.BytesIO(pix.tobytes()))
        img = img.filter(
            ImageFilter.UnsharpMask(
                radius=PDFPreprocessor.SHARPEN_RADIUS,
                percent=PDFPreprocessor.SHARPEN_PERCENT,
                threshold=PDFPreprocessor.SHARPEN_THRESHOLD,
            )
        )
        img = ImageEnhance.Contrast(img).enhance(PDFPreprocessor.CONTRAST_FACTOR)
        buf = io.BytesIO()
        img.save(buf, format="PNG")
        return buf.getvalue()

    @staticmethod
    async def preprocess(file: UploadFile) -> UploadFile:
        """Return a new upload whose PDF pages are enhanced raster images.

        Args:
            file: Uploaded PDF. Its content is read in full; the upload is
                not rewound afterwards.

        Returns:
            A new ``UploadFile`` wrapping an in-memory PDF (positioned at the
            start), carrying the original filename and headers.

        Raises:
            Any exception raised by PyMuPDF or Pillow while opening,
            rendering, enhancing or saving propagates unchanged; both
            PyMuPDF documents are closed before it leaves this method.
        """
        content = await file.read()
        output_buf = io.BytesIO()
        # Context managers guarantee both documents are closed even when a
        # page fails to render or the save fails, instead of leaking handles.
        with fitz.open() as processed_doc:
            with fitz.open(stream=content, filetype="pdf") as doc:
                for page in doc:
                    new_page = processed_doc.new_page(
                        width=page.rect.width, height=page.rect.height
                    )
                    # Stretch the enhanced bitmap over the whole new page.
                    new_page.insert_image(
                        new_page.rect,
                        stream=PDFPreprocessor._enhance_page_image(page),
                    )
            processed_doc.save(output_buf)
        # Rewind so the consumer reads the PDF from its first byte.
        output_buf.seek(0)
        return UploadFile(
            file=output_buf,
            filename=file.filename,
            headers=file.headers,
        )