PDF-Reader-OCR / app.py
aursalan's picture
Added OCR
4e3c340
import asyncio
import io
import logging

import fitz  # PyMuPDF
import pytesseract
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from PIL import Image
# --- Initialize the FastAPI app ---
# Application object; the metadata below is what FastAPI is given for the
# generated API documentation.
app = FastAPI(
    version="1.0.0",
    title="PDF OCR Extractor API",
    description="An API that uses Tesseract OCR to extract text from PDF files.",
)
# --- Configure CORS ---
# Allows your frontend web page to communicate with this API.
# Explicit allow-list of browser origins that may call this API.
#
# A "*" wildcard is intentionally NOT listed: this list is handed to
# CORSMiddleware together with allow_credentials=True, and a wildcard there
# would make the explicit entries meaningless and let any website issue
# credentialed cross-origin requests. Add further origins here explicitly.
origins = [
    "https://clarifyai.pages.dev",  # Production frontend
    "http://127.0.0.1:5500",  # Local development server
    "http://localhost:5500",  # Local development server (hostname form)
]
# Register CORS handling so browser frontends served from the origins listed
# in `origins` can call this API.
# NOTE(review): credentials are enabled here; confirm `origins` only ever
# contains the hosts that should be allowed to send credentialed requests.
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_methods=["*"],  # Every HTTP method (GET, POST, etc.)
    allow_headers=["*"],  # Every request header
    allow_credentials=True,
)
# --- Define the API Endpoint ---
@app.post("/extract-text")
async def extract_text_from_pdf_ocr(file: UploadFile = File(...)):
    """
    Accepts a PDF file, extracts its text content using OCR, and returns it.

    Each page is rendered to a 300 DPI image and passed to Tesseract. Pages
    that yield text are joined with a "--- Page Break ---" separator.

    Args:
        file: The uploaded file; its content type must be "application/pdf".

    Returns:
        A JSON response of the form {"text": "<extracted text>"}.

    Raises:
        HTTPException: 400 if the upload is not declared as a PDF or is
            empty; 500 if reading, rendering or OCR fails.
    """
    # Ensure the uploaded file is a PDF
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    try:
        # Read the uploaded file into memory
        pdf_data = await file.read()
        if not pdf_data:
            # Reject early with a clear client error instead of letting the
            # PDF parser fail on an empty stream.
            raise HTTPException(status_code=400, detail="Uploaded file is empty.")

        loop = asyncio.get_running_loop()
        full_text = []

        # Open the PDF from the in-memory data
        with fitz.open(stream=pdf_data, filetype="pdf") as doc:
            for page in doc:
                # 1. Render the page to a high-resolution image (pixmap).
                #    DPI is critical for OCR accuracy. 300 is a good standard.
                pix = page.get_pixmap(dpi=300)

                # 2. Convert the pixmap to a PIL Image object; the context
                #    manager releases the image as soon as OCR is done.
                with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                    # 3. Use Tesseract to extract text from the image.
                    #    OCR is slow and blocking, so it runs in the default
                    #    executor to keep the event loop free for other
                    #    requests. Only this call is offloaded: the PyMuPDF
                    #    objects stay on the event-loop thread, since PyMuPDF
                    #    documents that it does not support multi-threading.
                    page_text = await loop.run_in_executor(
                        None, pytesseract.image_to_string, image
                    )

                if page_text:
                    full_text.append(page_text)

        # Join all pages' text with a clear separator
        final_text = "\n\n--- Page Break ---\n\n".join(full_text)

        # Return the extracted text in a JSON response
        return JSONResponse(content={"text": final_text})
    except HTTPException:
        # Deliberate HTTP errors raised above must not be rewrapped as 500s.
        raise
    except Exception as e:
        # Handle potential errors during OCR processing; log with traceback.
        logging.getLogger(__name__).exception(
            "An error occurred during OCR processing: %s", e
        )
        raise HTTPException(status_code=500, detail=f"Failed to process PDF file: {e}") from e
# A simple root endpoint to confirm the server is running
@app.get("/")
def read_root():
    """Return a static status payload confirming the server is running."""
    status_message = "PDF OCR extraction service is running."
    return {"status": status_message}