File size: 2,677 Bytes
0f3d108
 
 
 
fc6f38d
 
 
0f3d108
 
f139115
0f3d108
 
 
 
 
 
 
 
 
 
38c6c72
 
0f3d108
f139115
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f3d108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38c6c72
fc6f38d
 
 
 
 
f139115
fc6f38d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f3d108
38c6c72
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
import shutil
import uuid

import cv2
import fitz  # PyMuPDF
import numpy as np
from fastapi import FastAPI, File, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from paddleocr import PaddleOCR, PPStructure  # <-- ADDED PaddleOCR IMPORT HERE

# Application object; the route handlers in this module register themselves on it.
app = FastAPI()

# CORS policy: any origin and any request header are accepted, but only the
# POST and GET methods are allowed.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["POST", "GET"],
    allow_origins=["*"],
)

# Both engines are constructed once, at import time, and reused by the handlers.

# Table recognition engine (PPStructure), English models, library logging off.
table_engine = PPStructure(lang="en", show_log=False)

# Plain-text OCR engine (PaddleOCR), English models with the angle classifier
# enabled, library logging off.
basic_ocr = PaddleOCR(lang="en", use_angle_cls=True, show_log=False)


@app.post("/ocr-text")
async def run_ocr_text(file: UploadFile = File(...)):
    """Run plain-text OCR on an uploaded file and return the recognised text.

    The upload is copied to a uniquely named temporary file under
    ``uploads/``, handed to ``basic_ocr``, and removed again before the
    function returns -- also when copying or OCR raises.

    Args:
        file: The uploaded file (multipart form field ``file``).

    Returns:
        ``{"text": ...}`` where the value is every recognised line, each
        followed by a newline, in the order the engine reported them. The
        string is empty when nothing was recognised.

    Raises:
        Any exception raised while writing the temporary file or running
        OCR propagates to the caller; the temporary file is still removed.
    """
    os.makedirs("uploads", exist_ok=True)
    # A random UUID keeps simultaneous uploads from overwriting each other.
    # The ".jpg" suffix is fixed regardless of the uploaded file's real type.
    temp_file = f"uploads/{uuid.uuid4()}.jpg"

    try:
        with open(temp_file, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        result = basic_ocr.ocr(temp_file)
    finally:
        # Clean up on every path so a failed OCR run does not leave files behind.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    # NOTE(review): each entry is presumably [box, (text, confidence)], so
    # line[1][0] is the text -- confirm against the PaddleOCR version in use.
    lines = []
    for page in result or []:
        if not page:
            # Skip empty/None page results.
            continue
        lines.extend(line[1][0] + "\n" for line in page)

    return {"text": "".join(lines)}


@app.get("/health")
def health():
    """Answer ``GET /health`` with a constant payload; no checks are performed.

    Returns:
        The dictionary ``{"ok": True}``.
    """
    payload = {"ok": True}
    return payload


def _iter_table_html(doc_path, max_pages=2, dpi=200):
    """Yield the HTML of every table recognised on the leading pages of a document.

    Args:
        doc_path: Path of the document to open with PyMuPDF.
        max_pages: Upper bound on the number of pages processed, counted
            from the first page.
        dpi: Resolution used when rendering each page to an image.

    Yields:
        One HTML string per result item whose ``type`` is ``"table"`` and
        whose ``res`` carries a non-empty ``html`` entry.
    """
    # The context manager closes the document even when rendering or
    # recognition raises part-way through.
    with fitz.open(doc_path) as doc:
        for page_num in range(min(len(doc), max_pages)):
            page = doc.load_page(page_num)
            pix = page.get_pixmap(dpi=dpi)
            img_bytes = pix.tobytes("png")

            # Decode the PNG bytes into an OpenCV image array for the table engine.
            nparr = np.frombuffer(img_bytes, np.uint8)
            img = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

            for item in table_engine(img):
                if item.get("type") != "table":
                    continue
                html = item.get("res", {}).get("html")
                if html:
                    yield html


@app.post("/ocr")
async def run_ocr(file: UploadFile = File(...)):
    """Extract tables from an uploaded document and return them as HTML.

    The upload is copied to a uniquely named temporary file under
    ``uploads/``, its first two pages are rendered at 200 DPI and passed
    through ``table_engine``, and the temporary file is removed afterwards.

    Args:
        file: The uploaded document (multipart form field ``file``). The
            temporary file keeps the upload's extension, or ``.pdf`` when
            the upload has no filename or no extension.

    Returns:
        ``{"tables": [...]}`` -- a list of HTML strings, one per recognised
        table. Processing is best-effort: if opening, rendering or
        recognition fails, the error is printed and the tables collected up
        to that point (possibly none) are returned.

    Raises:
        Any exception raised while writing the temporary file propagates to
        the caller; the temporary file is still removed.
    """
    os.makedirs("uploads", exist_ok=True)
    # ``filename`` can be None; splitext(None) would raise TypeError.
    file_ext = os.path.splitext(file.filename or "")[1] or ".pdf"
    temp_file = f"uploads/{uuid.uuid4()}{file_ext}"

    tables_html = []

    try:
        with open(temp_file, "wb") as buffer:
            shutil.copyfileobj(file.file, buffer)

        try:
            # Append one item at a time so tables found before a failure are kept.
            for html in _iter_table_html(temp_file):
                tables_html.append(html)
        except Exception as e:
            print(f"Error processing file: {e}")
    finally:
        # Always clean up the uploaded file, including after a failed write.
        if os.path.exists(temp_file):
            os.remove(temp_file)

    return {"tables": tables_html}