"""FastAPI service that extracts text from uploaded PDF files using Tesseract OCR."""

import io
import logging

import fitz  # PyMuPDF
import pytesseract
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.concurrency import run_in_threadpool
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import JSONResponse
from PIL import Image

logger = logging.getLogger(__name__)

# Resolution used when rasterizing each page. DPI is critical for OCR
# accuracy; 300 is a good standard.
OCR_DPI = 300

# Text inserted between the OCR output of consecutive pages.
PAGE_SEPARATOR = "\n\n--- Page Break ---\n\n"

# --- Initialize the FastAPI app ---
app = FastAPI(
    title="PDF OCR Extractor API",
    description="An API that uses Tesseract OCR to extract text from PDF files.",
    version="1.0.0",
)

# --- Configure CORS ---
# Allows the frontend web page to communicate with this API.
# NOTE(review): the "*" entry makes the explicit origins redundant, and the
# FastAPI CORS documentation states that wildcards must not be combined with
# allow_credentials=True. Behaviour is left unchanged here; before production,
# either drop the wildcard or disable credentials.
origins = [
    "https://clarifyai.pages.dev",  # Production frontend
    "http://127.0.0.1:5500",  # Local development server
    "http://localhost:5500",
    "*",  # Development convenience only; be more specific in production.
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods (GET, POST, etc.)
    allow_headers=["*"],  # Allows all headers
)


def _ocr_pdf(pdf_data: bytes) -> str:
    """Render every page of an in-memory PDF and OCR it with Tesseract.

    Args:
        pdf_data: Raw bytes of the PDF document.

    Returns:
        The recognized text of all non-blank pages, joined with
        ``PAGE_SEPARATOR``. An empty string if no page yielded any text.
    """
    page_texts = []
    with fitz.open(stream=pdf_data, filetype="pdf") as doc:
        for page in doc:
            # 1. Render the page to a high-resolution image (pixmap).
            pix = page.get_pixmap(dpi=OCR_DPI)

            # 2. Convert the pixmap to a PIL image via an in-memory PNG and
            #    close it as soon as OCR for this page is done.
            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                # 3. Extract the text. Pass lang="..." here if the document
                #    language is known.
                page_text = pytesseract.image_to_string(image)

            # Skip pages whose OCR output is empty or whitespace only, so
            # blank pages do not add empty sections to the result.
            if page_text.strip():
                page_texts.append(page_text)

    return PAGE_SEPARATOR.join(page_texts)


# --- Define the API Endpoint ---
@app.post("/extract-text")
async def extract_text_from_pdf_ocr(file: UploadFile = File(...)):
    """Accept a PDF upload, extract its text content using OCR, and return it.

    Args:
        file: The uploaded file; its content type must be ``application/pdf``.

    Returns:
        A JSON response of the form ``{"text": "<extracted text>"}``.

    Raises:
        HTTPException: 400 if the upload is not declared as a PDF or is
            empty; 500 if reading the upload or the OCR processing fails.
    """
    # Ensure the uploaded file is a PDF
    if file.content_type != "application/pdf":
        raise HTTPException(
            status_code=400, detail="Invalid file type. Please upload a PDF."
        )

    try:
        # Read the uploaded file into memory
        pdf_data = await file.read()
        if not pdf_data:
            raise HTTPException(status_code=400, detail="Uploaded file is empty.")

        # Rendering and OCR are blocking, CPU-heavy calls; running them in a
        # worker thread keeps the event loop free to serve other requests.
        final_text = await run_in_threadpool(_ocr_pdf, pdf_data)
    except HTTPException:
        # Deliberate client errors raised above must not be turned into 500s.
        raise
    except Exception as e:
        # Boundary handler: log the full traceback and report a server error.
        logger.exception("An error occurred during OCR processing")
        raise HTTPException(
            status_code=500, detail=f"Failed to process PDF file: {e}"
        ) from e

    # Return the extracted text in a JSON response
    return JSONResponse(content={"text": final_text})


# A simple root endpoint to confirm the server is running
@app.get("/")
def read_root():
    """Return a static status payload confirming the service is up."""
    return {"status": "PDF OCR extraction service is running."}