Spaces:

aursalan
/

PDF-Reader-OCR

Sleeping

App Files Files Community

PDF-Reader-OCR / app.py

aursalan

Added OCR

4e3c340 5 months ago

raw

history blame contribute delete

3.09 kB

	import io
	import fitz # PyMuPDF
	import pytesseract
	from PIL import Image
	from fastapi import FastAPI, UploadFile, File, HTTPException
	from fastapi.responses import JSONResponse
	from fastapi.middleware.cors import CORSMiddleware

	# --- Create the application object ---
	# The metadata is collected in one mapping and handed to the FastAPI
	# constructor as keyword arguments.
	_app_metadata = {
	    "title": "PDF OCR Extractor API",
	    "description": "An API that uses Tesseract OCR to extract text from PDF files.",
	    "version": "1.0.0",
	}
	app = FastAPI(**_app_metadata)

	# --- Configure CORS ---
	# Only the origins listed here are allowed to call this API from a browser.
	origins = [
	    "https://clarifyai.pages.dev",  # Production frontend
	    "http://127.0.0.1:5500",  # Local development server
	    "http://localhost:5500",
	    # NOTE: there is deliberately no "*" entry. A wildcard would make the
	    # explicit entries above redundant, and FastAPI's CORS documentation
	    # states that allow_origins must not be a wildcard when
	    # allow_credentials=True (as it is below). Add further origins here
	    # explicitly when they are needed.
	]

	app.add_middleware(
	    CORSMiddleware,
	    allow_origins=origins,
	    allow_credentials=True,
	    allow_methods=["*"],  # Allows all methods (GET, POST, etc.)
	    allow_headers=["*"],  # Allows all headers
	)

	# --- Define the API Endpoint ---
	def _ocr_pdf_bytes(pdf_data):
	    """Run Tesseract OCR over every page of an in-memory PDF.

	    Args:
	        pdf_data: Raw bytes of the PDF document.

	    Returns:
	        The OCR output of every page that produced a non-empty string,
	        joined with a "--- Page Break ---" separator.
	    """
	    page_texts = []
	    with fitz.open(stream=pdf_data, filetype="pdf") as doc:
	        for page in doc:
	            # Render at 300 DPI; the rendering resolution matters for OCR
	            # accuracy.
	            pix = page.get_pixmap(dpi=300)

	            # Hand the rendered page to PIL through an in-memory PNG and
	            # close the image as soon as Tesseract has consumed it.
	            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
	                # Specify language if known, e.g., lang='eng'
	                page_text = pytesseract.image_to_string(image)

	            if page_text:
	                page_texts.append(page_text)

	    return "\n\n--- Page Break ---\n\n".join(page_texts)


	@app.post("/extract-text")
	async def extract_text_from_pdf_ocr(file: UploadFile = File(...)):
	    """
	    Accept an uploaded PDF, OCR every page, and return the text as JSON.

	    Args:
	        file: The uploaded file; its declared content type must be
	            "application/pdf".

	    Returns:
	        A JSONResponse of the form {"text": "<extracted text>"}.

	    Raises:
	        HTTPException: 400 if the content type is not "application/pdf" or
	            the upload contains no bytes; 500 if reading the upload or the
	            OCR pipeline fails for any other reason.
	    """
	    # Ensure the uploaded file is a PDF
	    if file.content_type != "application/pdf":
	        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

	    try:
	        # Read the uploaded file into memory
	        pdf_data = await file.read()

	        # An empty body is a client error, not a server failure, so reject
	        # it before it reaches the PDF parser.
	        if not pdf_data:
	            raise HTTPException(status_code=400, detail="Uploaded file is empty.")

	        # NOTE(review): the OCR helper is synchronous, so this coroutine does
	        # not yield while a document is being processed. If concurrent
	        # requests matter, consider offloading it to a worker -- confirm
	        # PyMuPDF's threading constraints before doing so.
	        final_text = _ocr_pdf_bytes(pdf_data)

	        # Return the extracted text in a JSON response
	        return JSONResponse(content={"text": final_text})

	    except HTTPException:
	        # Deliberate client-error responses must not be rewritten as 500s.
	        raise
	    except Exception as e:
	        # Handle potential errors during OCR processing
	        print(f"An error occurred during OCR processing: {e}")
	        raise HTTPException(status_code=500, detail=f"Failed to process PDF file: {e}") from e

	# Root route: a GET on "/" answers with a fixed status message
	@app.get("/")
	def read_root():
	    """Return a constant payload confirming that the service responds."""
	    status_payload = {"status": "PDF OCR extraction service is running."}
	    return status_payload