Spaces:
Running
Running
Enhance PDF processing: validate file type, implement temporary file handling, and add preprocessing functionality
4994e6b
| import io | |
| import fitz | |
| from fastapi import UploadFile | |
| from PIL import Image, ImageEnhance, ImageFilter | |
class PDFPreprocessor:
    """Rebuild an uploaded PDF from sharpened, contrast-boosted page renders.

    Every page of the input is rendered to a grayscale bitmap, run through
    an unsharp mask and a contrast enhancement, and placed as a full-page
    image on a fresh page of the same size. Each output page therefore
    contains only that rendered image.
    """

    # Rendering / enhancement settings applied to every page.
    RENDER_DPI = 300
    UNSHARP_RADIUS = 1
    UNSHARP_PERCENT = 150
    UNSHARP_THRESHOLD = 3
    CONTRAST_FACTOR = 1.5

    @staticmethod
    async def preprocess(file: UploadFile) -> UploadFile:
        """Return a new upload holding the enhanced, image-only PDF.

        Declared as a static method so it can be called either on the class
        (``PDFPreprocessor.preprocess(file)``) or on an instance.

        Args:
            file: The uploaded PDF. Its content is read in full from the
                current stream position.

        Returns:
            An ``UploadFile`` backed by an in-memory buffer positioned at
            the start, carrying the original filename and headers.

        Raises:
            Whatever ``fitz`` or ``PIL`` raise when the content cannot be
            opened, rendered or saved; both documents are closed first.
        """
        content = await file.read()
        output_buf = io.BytesIO()

        # Context managers guarantee both documents are closed even when
        # rendering or saving fails part-way through.
        with fitz.open(stream=content, filetype="pdf") as doc:
            with fitz.open() as processed_doc:
                for page in doc:
                    pix = page.get_pixmap(
                        dpi=PDFPreprocessor.RENDER_DPI,
                        colorspace=fitz.csGRAY,
                    )
                    img = Image.open(io.BytesIO(pix.tobytes("png")))
                    img = img.filter(
                        ImageFilter.UnsharpMask(
                            radius=PDFPreprocessor.UNSHARP_RADIUS,
                            percent=PDFPreprocessor.UNSHARP_PERCENT,
                            threshold=PDFPreprocessor.UNSHARP_THRESHOLD,
                        )
                    )
                    img = ImageEnhance.Contrast(img).enhance(
                        PDFPreprocessor.CONTRAST_FACTOR
                    )

                    buf = io.BytesIO()
                    img.save(buf, format="PNG")

                    # Keep the source page dimensions and fill the whole
                    # new page with the enhanced render.
                    new_page = processed_doc.new_page(
                        width=page.rect.width,
                        height=page.rect.height,
                    )
                    new_page.insert_image(new_page.rect, stream=buf.getvalue())

                processed_doc.save(output_buf)

        # Rewind so the consumer reads the PDF from its first byte.
        output_buf.seek(0)
        return UploadFile(
            file=output_buf,
            filename=file.filename,
            headers=file.headers,
        )