Spaces:
Running
Running
| import os | |
| import shutil | |
| import uuid | |
| import cv2 | |
| import fitz # PyMuPDF | |
| import numpy as np | |
| from fastapi import FastAPI, File, UploadFile | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from paddleocr import PaddleOCR, PPStructure # <-- ADDED PaddleOCR IMPORT HERE | |
# Application instance. The CORS middleware accepts any origin and any
# request header, and permits only the POST and GET methods.
# NOTE(review): a wildcard origin lets any website call this API from a
# browser — confirm that this is intended for the deployment.
app = FastAPI()
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["POST", "GET"],
    allow_origins=["*"],
)
# Both engines are created once at import time and shared by the handlers
# below, with English as the recognition language and library logging off.

# Table-recognition engine: called directly on a decoded page image.
table_engine = PPStructure(lang="en", show_log=False)

# Plain-text OCR engine (angle classifier enabled): used for text extraction.
basic_ocr = PaddleOCR(lang="en", show_log=False, use_angle_cls=True)
async def run_ocr_text(file: UploadFile = File(...)):
    """Run plain-text OCR on an uploaded file and return the recognised text.

    The upload is written to a uniquely named temporary file under
    ``uploads/`` and handed to the module-level ``basic_ocr`` engine. The
    temporary file is removed afterwards, whether or not the write or the
    OCR call succeeded.

    Args:
        file: The uploaded file. Its bytes are stored unchanged under a
            ``.jpg`` name regardless of the original filename.

    Returns:
        A dict ``{"text": ...}`` whose value is every recognised line in
        engine order, each followed by a newline. The value is an empty
        string when nothing was recognised.

    Raises:
        Any exception raised while writing the upload or running OCR is
        propagated to the caller after the temporary file is cleaned up.
    """
    os.makedirs("uploads", exist_ok=True)
    temp_file = f"uploads/{uuid.uuid4()}.jpg"
    try:
        with open(temp_file, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)
        result = basic_ocr.ocr(temp_file)
    finally:
        # Clean up even when the write or the OCR call fails, so failed
        # requests do not accumulate files in uploads/.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    # The result is iterated per page; falsy pages (and a falsy result as a
    # whole) are skipped. line[1][0] is the string appended for each line —
    # presumably the recognised text of a (box, (text, score)) entry; verify
    # against the installed PaddleOCR version.
    recognised = []
    for page in result or []:
        if page:
            recognised.extend(line[1][0] for line in page)
    text = "".join(entry + "\n" for entry in recognised)
    return {"text": text}
def health():
    """Return a fixed ``{"ok": True}`` payload showing the service responds."""
    status = {"ok": True}
    return status
async def run_ocr(file: UploadFile = File(...)):
    """Extract HTML tables from the first pages of an uploaded document.

    The upload is saved under ``uploads/`` with a unique name, opened with
    PyMuPDF, and at most the first two pages are rendered at 200 DPI. Each
    rendered page is PNG-encoded, decoded with OpenCV and passed to the
    module-level ``table_engine``. The HTML of every result item whose
    ``type`` is ``"table"`` is collected.

    Args:
        file: The uploaded document. The temporary file keeps the upload's
            extension, falling back to ``.pdf`` when the filename is
            missing or has no extension.

    Returns:
        A dict ``{"tables": [...]}`` holding the HTML strings in page
        order. If processing fails part-way, the error is printed and the
        tables collected before the failure are still returned.

    Raises:
        Errors raised while writing the upload to disk are propagated;
        errors raised while opening or processing the document are not.
    """
    os.makedirs("uploads", exist_ok=True)
    # The upload's filename can be None; treat that like "no extension".
    file_ext = os.path.splitext(file.filename or "")[1] or ".pdf"
    temp_file = f"uploads/{uuid.uuid4()}{file_ext}"
    tables_html = []
    try:
        with open(temp_file, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        try:
            # The context manager closes the document even when a page
            # fails to render or the table engine raises.
            with fitz.open(temp_file) as doc:
                # Limited to the first 2 pages for speed.
                for page_num in range(min(len(doc), 2)):
                    page = doc.load_page(page_num)
                    # Render the page to an image at 200 DPI.
                    pix = page.get_pixmap(dpi=200)
                    img_bytes = pix.tobytes("png")
                    # Decode the PNG bytes into an OpenCV image array,
                    # which is what the table engine is called with.
                    nparr = np.frombuffer(img_bytes, np.uint8)
                    img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)
                    result = table_engine(img)
                    # Keep only table items that actually carry HTML.
                    for item in result:
                        if item.get("type") == "table":
                            html = item.get("res", {}).get("html")
                            if html:
                                tables_html.append(html)
        except Exception as e:
            # Deliberate best-effort: report the failure and fall through
            # so the caller still receives whatever was extracted.
            print(f"Error processing file: {e}")
    finally:
        # Always clean up the uploaded file, including after a failed write.
        if os.path.exists(temp_file):
            os.remove(temp_file)
    return {"tables": tables_html}