Spaces:
Sleeping
Sleeping
| from fastapi import FastAPI, UploadFile, File | |
| from fastapi.responses import StreamingResponse | |
| import fitz # PyMuPDF | |
| import pytesseract | |
| from PIL import Image | |
| from io import BytesIO | |
| app = FastAPI() | |
def home():
    """Return a static description of the service and how to call it."""
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file - confirm how it is registered with ``app``.
    service_info = dict(
        message="OCR API is running",
        endpoint="/ocr",
        method="POST",
        upload="PNG, JPG, JPEG, PDF",
        output="Searchable PDF",
    )
    return service_info
def ocr_image_to_pdf(image_bytes: bytes) -> bytes:
    """Run OCR on an encoded image and return the result as a PDF.

    Args:
        image_bytes: Raw contents of an image file in a format Pillow can
            open.

    Returns:
        The PDF bytes produced by ``pytesseract.image_to_pdf_or_hocr``.

    Raises:
        PIL.UnidentifiedImageError: If Pillow cannot identify the bytes as
            an image.
    """
    # The context manager releases the decoded image as soon as Tesseract is
    # done with it, instead of leaving that to garbage collection.
    with Image.open(BytesIO(image_bytes)) as img:
        return pytesseract.image_to_pdf_or_hocr(img, extension="pdf")
def ocr_pdf_to_searchable(pdf_bytes: bytes) -> bytes:
    """Rebuild a PDF from the OCR output of each of its pages.

    Every page is rasterised at 300 DPI, passed through Tesseract, and the
    resulting single-page PDFs are appended to a new document in page order.

    Args:
        pdf_bytes: Raw contents of the source PDF.

    Returns:
        Bytes of the new PDF assembled from the per-page OCR output.
    """
    # Context managers close every document even if OCR fails part-way
    # through, so neither the source nor the per-page documents leak.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as source_pdf:
        with fitz.open() as ocr_output:
            for page in source_pdf:
                png_bytes = page.get_pixmap(dpi=300).tobytes("png")
                # pytesseract accepts a PIL image, a NumPy array, or a file
                # path - not raw encoded bytes - so decode the PNG first.
                with Image.open(BytesIO(png_bytes)) as page_image:
                    page_pdf = pytesseract.image_to_pdf_or_hocr(
                        page_image, extension="pdf"
                    )
                with fitz.open(stream=page_pdf, filetype="pdf") as page_doc:
                    ocr_output.insert_pdf(page_doc)
            return ocr_output.tobytes()
async def upload_and_ocr(file: UploadFile = File(...)):
    """OCR an uploaded image or PDF and return it as a PDF download.

    Args:
        file: The uploaded file. The extension of its filename
            (case-insensitive) selects the image or the PDF pipeline.

    Returns:
        A ``StreamingResponse`` carrying the PDF as an attachment, or the
        dict ``{"error": "Unsupported file type"}`` when the filename does
        not end in .png, .jpg, .jpeg, or .pdf.
    """
    # A client may omit the filename; treat that as an unsupported type
    # instead of failing on ``None.lower()``.
    original_name = file.filename or ""
    lowered_name = original_name.lower()

    if lowered_name.endswith((".png", ".jpg", ".jpeg")):
        convert = ocr_image_to_pdf
    elif lowered_name.endswith(".pdf"):
        convert = ocr_pdf_to_searchable
    else:
        return {"error": "Unsupported file type"}

    # Read the upload only once its type is known to be supported.
    # NOTE(review): the conversion below is synchronous and runs inside this
    # coroutine, so the event loop is blocked while OCR is in progress -
    # consider offloading it to a worker thread.
    pdf_bytes = convert(await file.read())

    download_name = _download_name(original_name)
    return StreamingResponse(
        BytesIO(pdf_bytes),
        media_type="application/pdf",
        headers={
            # Quoted and sanitised: the raw client filename may contain
            # spaces, quotes, or characters that cannot be sent in a header.
            "Content-Disposition": f'attachment; filename="{download_name}"'
        },
    )


def _download_name(original_name):
    """Return a header-safe ``searchable_<stem>.pdf`` name for *original_name*."""
    # Keep only the final path component in case a full path was supplied.
    base = original_name.replace("\\", "/").rsplit("/", 1)[-1]
    # Drop the source extension so "scan.pdf" does not become "scan.pdf.pdf".
    stem = base.rsplit(".", 1)[0] if "." in base else base
    # Restrict to a conservative ASCII set; anything else becomes "_".
    safe_stem = "".join(
        ch if ch.isascii() and (ch.isalnum() or ch in "-_.") else "_"
        for ch in stem
    )
    return f"searchable_{safe_stem or 'document'}.pdf"