Spaces:
Sleeping
Sleeping
File size: 1,763 Bytes
b3dc3f5 46467f1 b3dc3f5 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 |
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import StreamingResponse
import fitz # PyMuPDF
import pytesseract
from PIL import Image
from io import BytesIO
# Application instance; the route handlers in this file register on it
# through the @app.get / @app.post decorators.
app = FastAPI()
@app.get("/")
def home():
    """Return a static description of the service.

    The payload names the OCR endpoint, the HTTP method it expects, the
    upload formats it accepts and the kind of document it produces.
    """
    service_info = dict(
        message="OCR API is running",
        endpoint="/ocr",
        method="POST",
        upload="PNG, JPG, JPEG, PDF",
        output="Searchable PDF",
    )
    return service_info
def ocr_image_to_pdf(image_bytes: bytes):
    """OCR an encoded image and return the result as PDF data.

    Args:
        image_bytes: Raw content of an image file, decoded with Pillow.

    Returns:
        Whatever ``pytesseract.image_to_pdf_or_hocr`` produces for
        ``extension='pdf'`` (the PDF document data).
    """
    return pytesseract.image_to_pdf_or_hocr(
        Image.open(BytesIO(image_bytes)),
        extension='pdf',
    )
def ocr_pdf_to_searchable(pdf_bytes: bytes, dpi: int = 300) -> bytes:
    """Rebuild a PDF as a searchable PDF by OCR-ing a rendering of each page.

    Every page is rasterized with PyMuPDF, passed through pytesseract, and
    the resulting per-page PDFs are appended to a new document in the
    original page order.

    Args:
        pdf_bytes: Raw content of the input PDF.
        dpi: Resolution used to rasterize each page before OCR.

    Returns:
        The bytes of the assembled PDF.
    """
    # Context managers make sure every document is closed, including on error.
    with fitz.open(stream=pdf_bytes, filetype="pdf") as original_pdf:
        with fitz.open() as ocr_output:
            for page in original_pdf:
                pix = page.get_pixmap(dpi=dpi)
                # pytesseract takes a PIL image (or array / file path), not
                # raw encoded bytes, so decode the rendered PNG first.
                page_image = Image.open(BytesIO(pix.tobytes("png")))
                ocr_pdf_page = pytesseract.image_to_pdf_or_hocr(
                    page_image, extension="pdf"
                )
                with fitz.open(stream=ocr_pdf_page, filetype="pdf") as ocr_page_doc:
                    ocr_output.insert_pdf(ocr_page_doc)
            return ocr_output.tobytes()
def _searchable_pdf_name(original_name: str) -> str:
    """Build a header-safe download name ``searchable_<stem>.pdf``."""
    # Keep only the last path component, then drop its final extension.
    base = original_name.replace("\\", "/").rsplit("/", 1)[-1]
    stem = base.rsplit(".", 1)[0] if "." in base else base
    # Restrict to ASCII letters/digits and "-_." so the value is always a
    # valid quoted-string inside the Content-Disposition header.
    safe_stem = "".join(
        ch if (ch.isascii() and ch.isalnum()) or ch in "-_." else "_"
        for ch in stem
    )
    return f"searchable_{safe_stem or 'document'}.pdf"


@app.post("/ocr")
async def upload_and_ocr(file: UploadFile = File(...)):
    """Run OCR on an uploaded image or PDF and return a PDF download.

    The converter is chosen from the upload's file extension
    (case-insensitive): ``.png``/``.jpg``/``.jpeg`` go through
    ``ocr_image_to_pdf`` and ``.pdf`` goes through ``ocr_pdf_to_searchable``.

    Args:
        file: The uploaded file.

    Returns:
        A ``StreamingResponse`` with media type ``application/pdf`` served as
        an attachment, or ``{"error": "Unsupported file type"}`` when the
        extension is not one of the supported ones.
    """
    # Guard against an upload without a filename; it falls through to the
    # unsupported-type branch instead of raising AttributeError.
    original_name = file.filename or ""
    filename = original_name.lower()

    # Pick the converter before reading, so unsupported uploads are rejected
    # without loading their content into memory.
    if filename.endswith((".png", ".jpg", ".jpeg")):
        convert = ocr_image_to_pdf
    elif filename.endswith(".pdf"):
        convert = ocr_pdf_to_searchable
    else:
        # NOTE(review): this is returned with the default success status;
        # consider a 4xx response if clients do not rely on the current shape.
        return {"error": "Unsupported file type"}

    # NOTE(review): the converters are synchronous calls inside an async
    # handler; consider offloading them to a worker thread if request
    # concurrency matters.
    pdf_bytes = convert(await file.read())

    download_name = _searchable_pdf_name(original_name)
    return StreamingResponse(
        BytesIO(pdf_bytes),
        media_type="application/pdf",
        headers={
            "Content-Disposition": f'attachment; filename="{download_name}"'
        },
    )
|