File size: 4,953 Bytes
11c7f99
346fc60
11c7f99
346fc60
 
 
11c7f99
 
 
 
 
346fc60
11c7f99
 
346fc60
 
11c7f99
346fc60
 
 
 
11c7f99
346fc60
 
11c7f99
 
 
 
346fc60
 
11c7f99
 
 
 
346fc60
11c7f99
 
346fc60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11c7f99
 
346fc60
 
 
 
11c7f99
346fc60
11c7f99
346fc60
 
11c7f99
 
 
 
 
346fc60
 
 
 
 
11c7f99
346fc60
 
 
 
 
 
 
 
11c7f99
346fc60
 
 
11c7f99
346fc60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11c7f99
346fc60
 
 
 
 
 
11c7f99
346fc60
 
 
11c7f99
 
346fc60
 
402ad7d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
import io
import sys
import json
import traceback
from typing import List, Tuple

import numpy as np
from PIL import Image
import fitz  # PyMuPDF
import cv2
import gradio as gr
from paddleocr import PaddleOCR

# --------- Config knobs (safe defaults) ----------
# All knobs are read once from environment variables at import time.
LANG = os.getenv("OCR_LANG", "en")          # e.g., "en", "ar", "en_number", "en_PP-OCRv3"
USE_GPU = os.getenv("OCR_USE_GPU", "false").lower() == "true"
# NOTE(review): DET and REC are read from the environment but never passed to
# the PaddleOCR constructor below (det_model_dir/rec_model_dir are hard-coded
# to None) — confirm whether OCR_DET_MODEL/OCR_REC_MODEL are meant to take effect.
DET = os.getenv("OCR_DET_MODEL", "ch_PP-OCRv4_det")
REC = os.getenv("OCR_REC_MODEL", "en_PP-OCRv4")
CLS = True                                  # angle classification
CONF_THRESHOLD = float(os.getenv("OCR_CONF_THRESHOLD", "0.0"))  # 0.0 → keep everything

# Initialize once (download models once, reuse across requests)
# Tip: If you want Arabic/English mixed, set LANG="ar" or "en" variants per PaddleOCR docs
OCR = PaddleOCR(
    use_angle_cls=CLS,
    lang=LANG,
    use_gpu=USE_GPU,
    det_model_dir=None,   # use default
    rec_model_dir=None,   # use default
    show_log=False
)

def _pil_to_cv(img: Image.Image) -> np.ndarray:
    """PIL RGB -> OpenCV BGR ndarray"""
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

def ocr_image(pil_img: "Image.Image") -> List[Tuple[str, float]]:
    """
    Run OCR on a PIL image and return a list of (text, confidence) pairs.

    Lines whose confidence is below CONF_THRESHOLD are dropped
    (threshold 0.0 keeps everything).
    """
    img_cv = _pil_to_cv(pil_img)
    result = OCR.ocr(img_cv, cls=CLS)
    lines: List[Tuple[str, float]] = []
    # PaddleOCR returns one entry per input image. For a page with no
    # detected text, newer PaddleOCR versions return [None] rather than
    # an empty list, so `if not result` alone is not enough — iterating
    # result[0] would raise TypeError. Guard both cases.
    if not result or result[0] is None:
        return lines
    # Each per-image entry has the shape [ [box, (text, conf)], ... ].
    for line in result[0]:
        txt = line[1][0]
        conf = float(line[1][1])
        if conf >= CONF_THRESHOLD:
            lines.append((txt, conf))
    return lines

def read_image(filepath: str) -> Image.Image:
    """Load an image file with PIL and return it as an RGB image.

    Handles any format PIL understands (PNG, JPG, TIFF, ...). The context
    manager closes the underlying file handle; convert() performs a full
    decode and returns an independent image, so the result stays valid
    after the file is closed.
    """
    with Image.open(filepath) as source:
        rgb = source.convert("RGB")
    return rgb

def read_pdf_pages(filepath: str) -> List[Image.Image]:
    """Rasterize every page of a PDF into a PIL RGB image using PyMuPDF."""
    # 2x zoom on both axes gives the OCR engine more pixels to work with.
    zoom = fitz.Matrix(2, 2)
    rendered: List[Image.Image] = []
    with fitz.open(filepath) as document:
        for page in document:
            pixmap = page.get_pixmap(matrix=zoom, alpha=False)
            pil_page = Image.frombytes(
                "RGB", [pixmap.width, pixmap.height], pixmap.samples
            )
            rendered.append(pil_page)
    return rendered

def extract_text_from_file(filepath: str) -> str:
    """
    Route the file to the appropriate reader based on its extension and
    return the OCR'd plain text.

    Raises:
        ValueError: if the extension is neither .pdf nor a known image type.
    """
    image_exts = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp")
    name = filepath.lower()

    if name.endswith(".pdf"):
        chunks: List[str] = []
        for page_no, page_img in enumerate(read_pdf_pages(filepath), start=1):
            body = "\n".join(t for t, _ in ocr_image(page_img))
            # A per-page header keeps multi-page output readable.
            chunks.append(f"--- Page {page_no} ---\n{body}".strip())
        return "\n\n".join(c for c in chunks if c)

    if name.endswith(image_exts):
        detected = ocr_image(read_image(filepath))
        return "\n".join(t for t, _ in detected).strip()

    raise ValueError("Unsupported file type. Please upload an image (PNG/JPG/TIFF/WEBP/BMP) or a PDF.")

def infer(file_obj) -> str:
    """
    Gradio handler: OCR the uploaded file and return its text.

    Never raises — any failure is returned as an error string so the UI
    stays responsive, while the full traceback still goes to the console.
    """
    if file_obj is None:
        return "No file uploaded."
    try:
        # Gradio file objects expose .name (a temp path); fall back to str().
        filepath = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
        text = extract_text_from_file(filepath)
        # 🔊 Console telemetry: dump raw text to terminal
        print("\n================ OCR RAW TEXT ================\n")
        print(text)
        print("\n==================== END =====================\n", flush=True)
        return text or "[No text detected]"
    except Exception as e:
        traceback.print_exc()
        return f"Error during OCR: {e}"

# ------------- Gradio UI ----------------
TITLE = "PaddleOCR Text Extractor (Images & PDFs)"
DESC = (
    "Upload an image or PDF. The app runs PaddleOCR (PP-OCRv4 pipeline) and returns plain text. "
    "Set `OCR_LANG`, `OCR_USE_GPU`, and `OCR_CONF_THRESHOLD` as env vars to tune."
)

# Layout: one file input, a read-only textbox for the extracted text, and a
# run button. `demo` is launched under the __main__ guard at the bottom.
with gr.Blocks(title=TITLE) as demo:
    gr.Markdown(f"# {TITLE}\n{DESC}")
    with gr.Row():
        # file_types mixes the generic "image" category with the ".pdf" extension.
        file_in = gr.File(label="Upload Image or PDF", file_count="single", file_types=["image", ".pdf"])
    out = gr.Textbox(label="Extracted Text", lines=25, show_copy_button=True)
    run_btn = gr.Button("Run OCR", variant="primary")

    # Both the button click and the file-change event run the same handler,
    # so selecting a file OCRs it immediately without pressing the button.
    run_btn.click(fn=infer, inputs=[file_in], outputs=[out])
    # Also trigger on file change for convenience
    file_in.change(fn=infer, inputs=[file_in], outputs=[out])

if __name__ == "__main__":
    # Tip: Set server_name="0.0.0.0" for containers; share=True for quick external testing
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)