"""FastAPI service exposing PaddleOCR plain-text and table extraction."""

import logging
import os
import shutil
import uuid
from contextlib import contextmanager, suppress
from typing import Iterator

import cv2
import fitz  # PyMuPDF
import numpy as np
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from paddleocr import PaddleOCR, PPStructure

logger = logging.getLogger(__name__)

# Directory (relative to the working directory) where uploads are staged.
UPLOAD_DIR = "uploads"
# Only the first pages of a document are scanned for tables, for speed.
MAX_TABLE_PAGES = 2
# Resolution used when rasterizing pages for the table engine.
RENDER_DPI = 200

app = FastAPI()

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["POST", "GET"],
    allow_headers=["*"],
)

# Initialize PPStructure (the table recognition engine).
table_engine = PPStructure(show_log=False, lang="en")

# Initialize basic PaddleOCR (fast, for plain text extraction).
basic_ocr = PaddleOCR(use_angle_cls=True, lang="en", show_log=False)


@contextmanager
def _saved_upload(file: UploadFile, suffix: str) -> Iterator[str]:
    """Stage an upload on disk and yield its path; always delete it on exit.

    Args:
        file: The incoming upload whose byte stream is copied to disk.
        suffix: File extension (including the leading dot) for the temp file.

    Yields:
        Path of the staged file inside ``UPLOAD_DIR``.
    """
    os.makedirs(UPLOAD_DIR, exist_ok=True)
    path = os.path.join(UPLOAD_DIR, f"{uuid.uuid4()}{suffix}")
    try:
        with open(path, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        yield path
    finally:
        # Runs on success, on a failed copy, and on a failure in the caller's
        # body, so no partial or processed upload is left behind.
        with suppress(FileNotFoundError):
            os.remove(path)


@app.post("/ocr-text")
async def run_ocr_text(file: UploadFile = File(...)):
    """Run plain-text OCR on an uploaded image.

    The upload is always staged with a ``.jpg`` suffix regardless of its
    original name, then passed to ``basic_ocr.ocr`` by path.

    Returns:
        ``{"text": ...}`` where the value is every recognized line followed
        by a newline, in the order the OCR engine reported them.
    """
    with _saved_upload(file, ".jpg") as temp_file:
        result = basic_ocr.ocr(temp_file)

    # Pages with no detections are falsy and are skipped. Each line is
    # indexed as line[1][0] for its text (presumably [box, (text, score)]).
    text = "".join(
        line[1][0] + "\n"
        for page in (result or [])
        if page
        for line in page
    )
    return {"text": text}


@app.get("/health")
def health():
    """Liveness probe; always reports ``{"ok": True}``."""
    return {"ok": True}


@app.post("/ocr")
async def run_ocr(file: UploadFile = File(...)):
    """Extract HTML tables from the first pages of an uploaded document.

    The upload is staged under its own extension (``.pdf`` when the client
    supplies no filename or no extension), opened with PyMuPDF, and up to
    ``MAX_TABLE_PAGES`` pages are rendered and fed to the table engine.

    Processing is best-effort: any error is logged with its traceback and
    the tables collected so far (possibly none) are still returned.

    Returns:
        ``{"tables": [...]}`` with one HTML string per detected table.
    """
    # ``filename`` may be None when the client omits it; splitext(None)
    # would raise TypeError before any cleanup logic could run.
    file_ext = os.path.splitext(file.filename or "")[1] or ".pdf"

    tables_html = []
    with _saved_upload(file, file_ext) as temp_file:
        try:
            # The context manager closes the document even if rendering or
            # table recognition raises midway.
            with fitz.open(temp_file) as doc:
                for page_num in range(min(len(doc), MAX_TABLE_PAGES)):
                    page = doc.load_page(page_num)

                    # Render the page to a PNG image in memory.
                    pix = page.get_pixmap(dpi=RENDER_DPI)
                    img_bytes = pix.tobytes("png")

                    # Decode into an OpenCV array, which PPStructure needs.
                    nparr = np.frombuffer(img_bytes, np.uint8)
                    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

                    # Keep only table regions that carry an HTML rendering.
                    for item in table_engine(img):
                        if item.get("type") != "table":
                            continue
                        html = item.get("res", {}).get("html")
                        if html:
                            tables_html.append(html)
        except Exception:
            # Deliberate best-effort boundary: report what was extracted
            # rather than failing the request, but keep the traceback.
            logger.exception("Error processing file")

    return {"tables": tables_html}