File size: 4,953 Bytes
11c7f99
346fc60
11c7f99
346fc60
 
 
11c7f99
 
 
 
 
346fc60
11c7f99
 
346fc60
 
11c7f99
346fc60
 
 
 
11c7f99
346fc60
 
11c7f99
 
 
 
346fc60
 
11c7f99
 
 
 
346fc60
11c7f99
 
346fc60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11c7f99
 
346fc60
 
 
 
11c7f99
346fc60
11c7f99
346fc60
 
11c7f99
 
 
 
 
346fc60
 
 
 
 
11c7f99
346fc60
 
 
 
 
 
 
 
11c7f99
346fc60
 
 
11c7f99
346fc60
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11c7f99
346fc60
 
 
 
 
 
11c7f99
346fc60
 
 
11c7f99
 
346fc60
 
402ad7d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
import os
import io
import sys
import json
import traceback
from typing import List, Tuple

import numpy as np
from PIL import Image
import fitz  # PyMuPDF
import cv2
import gradio as gr
from paddleocr import PaddleOCR

# --------- Config knobs (safe defaults) ----------
# All knobs are read once from environment variables at import time.
LANG = os.getenv("OCR_LANG", "en")          # e.g., "en", "ar", "en_number", "en_PP-OCRv3"
USE_GPU = os.getenv("OCR_USE_GPU", "false").lower() == "true"
# NOTE(review): DET and REC are read from the environment but never passed to
# the PaddleOCR constructor below (det_model_dir/rec_model_dir are hard-coded
# to None) — confirm whether OCR_DET_MODEL/OCR_REC_MODEL are meant to take effect.
DET = os.getenv("OCR_DET_MODEL", "ch_PP-OCRv4_det")
REC = os.getenv("OCR_REC_MODEL", "en_PP-OCRv4")
CLS = True                                  # angle classification
CONF_THRESHOLD = float(os.getenv("OCR_CONF_THRESHOLD", "0.0"))  # 0.0 → keep everything

# Initialize once (download models once, reuse across requests)
# Tip: If you want Arabic/English mixed, set LANG="ar" or "en" variants per PaddleOCR docs
OCR = PaddleOCR(
    use_angle_cls=CLS,
    lang=LANG,
    use_gpu=USE_GPU,
    det_model_dir=None,   # use default
    rec_model_dir=None,   # use default
    show_log=False
)

def _pil_to_cv(img: Image.Image) -> np.ndarray:
    """PIL RGB -> OpenCV BGR ndarray"""
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)

def ocr_image(pil_img: "Image.Image") -> List[Tuple[str, float]]:
    """
    Run OCR on a PIL image and return a list of (text, confidence) pairs.

    Lines whose confidence is below CONF_THRESHOLD are dropped
    (threshold 0.0 keeps everything).
    """
    img_cv = _pil_to_cv(pil_img)
    result = OCR.ocr(img_cv, cls=CLS)
    lines: List[Tuple[str, float]] = []
    # PaddleOCR returns one entry per input image. For a page with no
    # detected text, newer PaddleOCR versions return [None] rather than
    # an empty list, so `if not result` alone is not enough — iterating
    # result[0] would raise TypeError. Guard both cases.
    if not result or result[0] is None:
        return lines
    # Each per-image entry has the shape [ [box, (text, conf)], ... ].
    for line in result[0]:
        txt = line[1][0]
        conf = float(line[1][1])
        if conf >= CONF_THRESHOLD:
            lines.append((txt, conf))
    return lines

def read_image(filepath: str) -> Image.Image:
    """Load an image file with PIL and return it as an RGB image.

    Handles any format PIL understands (PNG, JPG, TIFF, ...). The context
    manager closes the underlying file handle; convert() performs a full
    decode and returns an independent image, so the result stays valid
    after the file is closed.
    """
    with Image.open(filepath) as source:
        rgb = source.convert("RGB")
    return rgb

def read_pdf_pages(filepath: str) -> List[Image.Image]:
    """Rasterize every page of a PDF into a PIL RGB image using PyMuPDF."""
    # 2x zoom on both axes gives the OCR engine more pixels to work with.
    zoom = fitz.Matrix(2, 2)
    rendered: List[Image.Image] = []
    with fitz.open(filepath) as document:
        for page in document:
            pixmap = page.get_pixmap(matrix=zoom, alpha=False)
            pil_page = Image.frombytes(
                "RGB", [pixmap.width, pixmap.height], pixmap.samples
            )
            rendered.append(pil_page)
    return rendered

def extract_text_from_file(filepath: str) -> str:
    """
    Route the file to the appropriate reader based on its extension and
    return the OCR'd plain text.

    Raises:
        ValueError: if the extension is neither .pdf nor a known image type.
    """
    image_exts = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp")
    name = filepath.lower()

    if name.endswith(".pdf"):
        chunks: List[str] = []
        for page_no, page_img in enumerate(read_pdf_pages(filepath), start=1):
            body = "\n".join(t for t, _ in ocr_image(page_img))
            # A per-page header keeps multi-page output readable.
            chunks.append(f"--- Page {page_no} ---\n{body}".strip())
        return "\n\n".join(c for c in chunks if c)

    if name.endswith(image_exts):
        detected = ocr_image(read_image(filepath))
        return "\n".join(t for t, _ in detected).strip()

    raise ValueError("Unsupported file type. Please upload an image (PNG/JPG/TIFF/WEBP/BMP) or a PDF.")

def infer(file_obj) -> str:
    """
    Gradio handler: OCR the uploaded file and return its text.

    Never raises — any failure is returned as an error string so the UI
    stays responsive, while the full traceback still goes to the console.
    """
    if file_obj is None:
        return "No file uploaded."
    try:
        # Gradio file objects expose .name (a temp path); fall back to str().
        filepath = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
        text = extract_text_from_file(filepath)
        # 🔊 Console telemetry: dump raw text to terminal
        print("\n================ OCR RAW TEXT ================\n")
        print(text)
        print("\n==================== END =====================\n", flush=True)
        return text or "[No text detected]"
    except Exception as e:
        traceback.print_exc()
        return f"Error during OCR: {e}"

# ------------- Gradio UI ----------------
TITLE = "PaddleOCR Text Extractor (Images & PDFs)"
DESC = (
    "Upload an image or PDF. The app runs PaddleOCR (PP-OCRv4 pipeline) and returns plain text. "
    "Set `OCR_LANG`, `OCR_USE_GPU`, and `OCR_CONF_THRESHOLD` as env vars to tune."
)

# Layout: one file input, a read-only textbox for the extracted text, and a
# run button. `demo` is launched under the __main__ guard at the bottom.
with gr.Blocks(title=TITLE) as demo:
    gr.Markdown(f"# {TITLE}\n{DESC}")
    with gr.Row():
        # file_types mixes the generic "image" category with the ".pdf" extension.
        file_in = gr.File(label="Upload Image or PDF", file_count="single", file_types=["image", ".pdf"])
    out = gr.Textbox(label="Extracted Text", lines=25, show_copy_button=True)
    run_btn = gr.Button("Run OCR", variant="primary")

    # Both the button click and the file-change event run the same handler,
    # so selecting a file OCRs it immediately without pressing the button.
    run_btn.click(fn=infer, inputs=[file_in], outputs=[out])
    # Also trigger on file change for convenience
    file_in.change(fn=infer, inputs=[file_in], outputs=[out])

if __name__ == "__main__":
    # Tip: Set server_name="0.0.0.0" for containers; share=True for quick external testing
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)