File size: 4,597 Bytes
8e08792
11c7f99
8e08792
 
 
 
 
 
 
 
 
 
 
346fc60
11c7f99
346fc60
 
 
11c7f99
 
 
 
 
346fc60
11c7f99
 
8e08792
 
 
346fc60
 
8e08792
 
11c7f99
 
 
 
8e08792
 
 
 
 
 
 
 
 
 
 
 
 
346fc60
 
8e08792
 
 
 
 
 
 
 
 
 
 
 
 
 
 
346fc60
 
 
 
 
 
 
 
 
 
 
 
11c7f99
 
8e08792
 
346fc60
11c7f99
8e08792
11c7f99
 
 
 
 
346fc60
 
11c7f99
8e08792
 
 
346fc60
8e08792
 
11c7f99
8e08792
346fc60
11c7f99
8e08792
346fc60
 
 
 
 
 
 
8e08792
346fc60
 
 
 
 
 
8e08792
11c7f99
346fc60
 
 
 
 
 
 
 
11c7f99
 
8e08792
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
# app.py
import os

# --- Space-safe flags (must be applied BEFORE importing paddle/paddleocr) ---
# Every entry is only a default: a value already present in the process
# environment is left untouched (os.environ.setdefault semantics).
_ENV_DEFAULTS = (
    ("FLAGS_use_mkldnn", "0"),
    ("FLAGS_enable_mkldnn", "0"),
    ("OMP_NUM_THREADS", "1"),
    ("KMP_BLOCKTIME", "0"),
    # Gradio on Spaces uses these
    ("GRADIO_SERVER_NAME", "0.0.0.0"),
    ("GRADIO_ANALYTICS_ENABLED", "False"),
)
for _env_name, _env_default in _ENV_DEFAULTS:
    os.environ.setdefault(_env_name, _env_default)

import io
import sys
import json
import traceback
from typing import List, Tuple

import numpy as np
from PIL import Image
import fitz  # PyMuPDF
import cv2
import gradio as gr
from paddleocr import PaddleOCR

# --------- Config knobs ----------
# Each OCR_* variable is read once, at import time; when it is unset the
# literal default next to it is used.
LANG = os.environ.get("OCR_LANG", "en")
# Only the word "true" (in any letter case) enables the GPU.
_gpu_flag = os.environ.get("OCR_USE_GPU", "false")
USE_GPU = _gpu_flag.lower() == "true"  # Spaces CPU → keep false
# NOTE(review): DET and REC are not referenced by the visible code —
# _build_ocr passes None for both model directories. Confirm whether they
# are still needed.
DET = os.environ.get("OCR_DET_MODEL", "ch_PP-OCRv4_det")
REC = os.environ.get("OCR_REC_MODEL", "en_PP-OCRv4")
# Angle-classifier switch handed to the primary OCR instance.
CLS = True
# Lines with a confidence below this value are dropped by ocr_image().
_threshold_text = os.environ.get("OCR_CONF_THRESHOLD", "0.0")
CONF_THRESHOLD = float(_threshold_text)

def _pil_to_cv(img: Image.Image) -> np.ndarray:
    """Return *img* as a NumPy array with its channels reordered RGB -> BGR.

    The image is first copied into an array with ``np.array`` and then passed
    through ``cv2.cvtColor`` with ``cv2.COLOR_RGB2BGR``.

    NOTE(review): assumes *img* is already in RGB mode — confirm against
    callers.
    """
    rgb_pixels = np.array(img)
    bgr_pixels = cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
    return bgr_pixels

def _build_ocr(use_cls: bool) -> PaddleOCR:
    """Create a PaddleOCR engine configured from the module-level knobs.

    Args:
        use_cls: Forwarded as ``use_angle_cls``; turns the angle classifier
            on or off for the new engine.

    Returns:
        A new ``PaddleOCR`` instance using ``LANG`` and ``USE_GPU``, with
        both model directories left as ``None`` and logging disabled.
    """
    options = {
        "use_angle_cls": use_cls,
        "lang": LANG,
        "use_gpu": USE_GPU,
        # No explicit model directories are supplied.
        "det_model_dir": None,
        "rec_model_dir": None,
        "show_log": False,
    }
    return PaddleOCR(**options)

# Primary OCR instance (CLS on). If CLS crashes, we'll rebuild w/o CLS just-in-time.
# It is built once, at import time, and is the engine ocr_image() tries first.
_OCR = _build_ocr(CLS)

# OCR engine without the angle classifier. It is built lazily the first time
# the primary engine fails with a known MKLDNN/primitive error and is then
# reused, so the models are not reloaded on every failing call.
_FALLBACK_OCR = None


def ocr_image(pil_img: Image.Image) -> List[Tuple[str, float]]:
    """Run OCR on a single image and return the recognised text lines.

    Args:
        pil_img: Image to recognise. It is converted with ``_pil_to_cv``
            before being handed to PaddleOCR.

    Returns:
        ``(text, confidence)`` pairs in the order PaddleOCR reported them,
        restricted to lines whose confidence is at least ``CONF_THRESHOLD``.
        The list is empty when no text was detected.

    Raises:
        RuntimeError: When inference fails with a message that does not
            mention "primitive", "mkldnn" or "predictor.run", or when the
            fallback engine fails as well.
    """
    global _FALLBACK_OCR
    img_cv = _pil_to_cv(pil_img)

    try:
        result = _OCR.ocr(img_cv, cls=CLS)
    except RuntimeError as e:
        msg = str(e).lower()
        known_failure = (
            "primitive" in msg or "mkldnn" in msg or "predictor.run" in msg
        )
        if not known_failure:
            raise
        # Retry without the angle classifier, building that engine only once.
        if _FALLBACK_OCR is None:
            _FALLBACK_OCR = _build_ocr(False)
        result = _FALLBACK_OCR.ocr(img_cv, cls=False)

    # The result holds one entry per input image. That entry may be None
    # (not just an empty list) when nothing was detected, so guard both
    # levels before iterating.
    if not result or not result[0]:
        return []

    lines: List[Tuple[str, float]] = []
    for line in result[0]:
        # Each detection is [box, (text, confidence)].
        txt = line[1][0]
        conf = float(line[1][1])
        if conf >= CONF_THRESHOLD:
            lines.append((txt, conf))
    return lines

def read_image(filepath: str) -> Image.Image:
    """Open the image stored at *filepath* and return it converted to RGB.

    The file handle opened by ``Image.open`` is closed when the ``with``
    block exits; the converted image is returned to the caller.
    """
    with Image.open(filepath) as source:
        rgb_copy = source.convert("RGB")
    return rgb_copy

def read_pdf_pages(filepath: str, zoom: float = 2.0) -> List[Image.Image]:
    """Render every page of a PDF into an RGB PIL image.

    Args:
        filepath: Path of the PDF, opened with PyMuPDF (``fitz``).
        zoom: Scale factor applied to both axes when rasterising a page.
            The default of 2.0 matches the previously hard-coded
            ``fitz.Matrix(2, 2)``.

    Returns:
        One image per page, in document order.

    Raises:
        ValueError: If *zoom* is not a positive number.
    """
    if zoom <= 0:
        raise ValueError("zoom must be a positive number.")

    # The scaling matrix is the same for every page, so build it once.
    mat = fitz.Matrix(zoom, zoom)
    pages: List[Image.Image] = []
    with fitz.open(filepath) as doc:
        for page in doc:
            # alpha=False keeps the pixmap without an alpha channel, which is
            # what the "RGB" mode passed to Image.frombytes expects.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            pages.append(img)
    return pages

def extract_text_from_file(filepath: str) -> str:
    """OCR a PDF or image file and return the recognised text.

    The file type is decided from the lower-cased file name suffix.

    Args:
        filepath: Path of the file to process.

    Returns:
        For a PDF, one ``--- Page N ---`` section per page separated by blank
        lines; for an image, the recognised lines joined by newlines.

    Raises:
        ValueError: If the suffix is neither ``.pdf`` nor one of the
            supported image suffixes.
    """
    name = filepath.lower()
    image_suffixes = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp")

    if name.endswith(".pdf"):
        sections = []
        for page_no, page_img in enumerate(read_pdf_pages(filepath), start=1):
            body = "\n".join(text for text, _ in ocr_image(page_img))
            # strip() removes the trailing newline left by a page without text.
            sections.append(f"--- Page {page_no} ---\n{body}".strip())
        return "\n\n".join(section for section in sections if section)

    if name.endswith(image_suffixes):
        recognised = ocr_image(read_image(filepath))
        return "\n".join(text for text, _ in recognised).strip()

    raise ValueError("Unsupported file type. Upload an image or a PDF.")

def infer(file_obj) -> str:
    """Gradio callback: OCR the uploaded file and return text for display.

    Args:
        file_obj: The uploaded file. Its ``name`` attribute is used as the
            path when present; otherwise ``str(file_obj)`` is used. ``None``
            means nothing was uploaded.

    Returns:
        The extracted text, ``"[No text detected]"`` when it is empty,
        ``"No file uploaded."`` for ``None``, or an ``"Error during OCR: ..."``
        message if anything raised. Exceptions are never propagated; the
        traceback is printed instead.
    """
    if file_obj is None:
        return "No file uploaded."

    try:
        if hasattr(file_obj, "name"):
            filepath = file_obj.name
        else:
            filepath = str(file_obj)
        text = extract_text_from_file(filepath)
        # Echo the raw result to stdout so it shows up in the process logs.
        print("\n===== OCR RAW TEXT =====\n", text, "\n===== END =====\n", flush=True)
        if not text:
            return "[No text detected]"
        return text
    except Exception as e:
        traceback.print_exc()
        return f"Error during OCR: {e}"

# Heading and description rendered at the top of the page; TITLE is also
# passed to gr.Blocks as its title.
TITLE = "PaddleOCR Text Extractor (Images & PDFs)"
DESC = "Upload an image or PDF. Runs PP-OCRv4 on CPU with Space-safe settings."

# UI definition: a single-file upload, a textbox for the result and a button.
with gr.Blocks(title=TITLE) as demo:
    gr.Markdown(f"# {TITLE}\n{DESC}")
    with gr.Row():
        file_in = gr.File(label="Upload Image or PDF", file_count="single", file_types=["image", ".pdf"])
    out = gr.Textbox(label="Extracted Text", lines=25, show_copy_button=True)
    run_btn = gr.Button("Run OCR", variant="primary")
    # infer() is wired to both the button click and the file input's change
    # event, writing its return value into the textbox.
    # NOTE(review): presumably an upload followed by a click runs OCR twice
    # on the same file — confirm this is intended.
    run_btn.click(fn=infer, inputs=[file_in], outputs=[out])
    file_in.change(fn=infer, inputs=[file_in], outputs=[out])

# Launch the app only when this file is executed as a script. The host comes
# from GRADIO_SERVER_NAME (default "0.0.0.0") and the port from PORT
# (default 7860).
if __name__ == "__main__":
    demo.launch(server_name=os.getenv("GRADIO_SERVER_NAME", "0.0.0.0"),
                server_port=int(os.getenv("PORT", "7860")),
                show_error=True)