Spaces:
Sleeping
Sleeping
| import io | |
| import fitz # PyMuPDF | |
| import pytesseract | |
| from PIL import Image | |
| from fastapi import FastAPI, UploadFile, File, HTTPException | |
| from fastapi.responses import JSONResponse | |
| from fastapi.middleware.cors import CORSMiddleware | |
| # --- Initialize the FastAPI app --- | |
# Metadata passed to the FastAPI constructor, kept together in one mapping.
_APP_METADATA = {
    "title": "PDF OCR Extractor API",
    "description": "An API that uses Tesseract OCR to extract text from PDF files.",
    "version": "1.0.0",
}

# The application object that the middleware and handlers in this module use.
app = FastAPI(**_APP_METADATA)
| # --- Configure CORS --- | |
| # Allows your frontend web page to communicate with this API. | |
# Browser origins that are allowed to call this API.
#
# No "*" wildcard is listed on purpose: the middleware below is configured with
# allow_credentials=True, and the CORS specification does not permit a wildcard
# origin on credentialed requests. A wildcard entry would also make the explicit
# entries in this list meaningless. Add any further origin here explicitly.
origins = [
    "https://clarifyai.pages.dev",  # Production frontend
    "http://127.0.0.1:5500",  # Local development server
    "http://localhost:5500",
]

app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods (GET, POST, etc.)
    allow_headers=["*"],  # Allows all headers
)
| # --- Define the API Endpoint --- | |
def _ocr_pdf_bytes(pdf_data: bytes, dpi: int = 300) -> str:
    """Render every page of an in-memory PDF and OCR it with Tesseract.

    Args:
        pdf_data: Raw bytes of the PDF document.
        dpi: Resolution used when rendering each page to an image.

    Returns:
        The text of all pages that produced non-blank OCR output, joined with
        a "--- Page Break ---" separator line.
    """
    page_texts = []
    with fitz.open(stream=pdf_data, filetype="pdf") as doc:
        for page in doc:
            # Render the page to a pixmap, then hand it to PIL as PNG bytes.
            pix = page.get_pixmap(dpi=dpi)
            with Image.open(io.BytesIO(pix.tobytes("png"))) as image:
                page_text = pytesseract.image_to_string(image)
            # Skip pages whose OCR output is empty or whitespace-only
            # (presumably what Tesseract emits for a blank page), so they do
            # not add empty sections between separators.
            if page_text.strip():
                page_texts.append(page_text)
    return "\n\n--- Page Break ---\n\n".join(page_texts)


# NOTE(review): no route decorator is visible on this handler in this view of
# the file; confirm it is registered on `app`.
async def extract_text_from_pdf_ocr(file: UploadFile = File(...)):
    """Accept an uploaded PDF, extract its text with OCR, and return it.

    Args:
        file: The uploaded file. Its content type must be exactly
            "application/pdf".

    Returns:
        A JSONResponse whose body is {"text": <extracted text>}.

    Raises:
        HTTPException: 400 if the upload is not declared as a PDF; 500 if
            reading the upload or the OCR processing fails.
    """
    # Imported here so this block does not depend on edits to the file's
    # import section; fastapi is already a dependency of this module.
    from fastapi.concurrency import run_in_threadpool

    # Ensure the uploaded file is a PDF
    if file.content_type != "application/pdf":
        raise HTTPException(status_code=400, detail="Invalid file type. Please upload a PDF.")

    try:
        # Read the uploaded file into memory
        pdf_data = await file.read()
        # Page rendering and OCR are blocking calls; run them in a worker
        # thread so the event loop stays free to serve other requests.
        final_text = await run_in_threadpool(_ocr_pdf_bytes, pdf_data)
    except Exception as e:
        # Handle potential errors during OCR processing
        print(f"An error occurred during OCR processing: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to process PDF file: {e}") from e

    # Return the extracted text in a JSON response
    return JSONResponse(content={"text": final_text})
| # A simple root endpoint to confirm the server is running | |
# NOTE(review): no route decorator is visible on this handler in this view of
# the file; confirm it is registered on `app`.
def read_root():
    """Report that the service is up.

    Returns:
        A one-key mapping whose "status" value is a fixed status message.
    """
    status_message = "PDF OCR extraction service is running."
    return {"status": status_message}