import os
import io
import sys
import json
import traceback
from typing import List, Tuple

import numpy as np
from PIL import Image
import fitz  # PyMuPDF
import cv2
import gradio as gr
from paddleocr import PaddleOCR

# --------- Config knobs (safe defaults) ----------
LANG = os.getenv("OCR_LANG", "en")  # e.g., "en", "ar", "en_number", "en_PP-OCRv3"
USE_GPU = os.getenv("OCR_USE_GPU", "false").lower() == "true"
# NOTE: DET and REC are read from the environment but are not passed to the
# PaddleOCR constructor below (det_model_dir / rec_model_dir stay None).
DET = os.getenv("OCR_DET_MODEL", "ch_PP-OCRv4_det")
REC = os.getenv("OCR_REC_MODEL", "en_PP-OCRv4")
CLS = True  # angle classification
CONF_THRESHOLD = float(os.getenv("OCR_CONF_THRESHOLD", "0.0"))  # 0.0 → keep everything

# Initialize once at import time so the engine is reused across requests.
# Tip: If you want Arabic/English mixed, set LANG="ar" or "en" variants per PaddleOCR docs
OCR = PaddleOCR(
    use_angle_cls=CLS,
    lang=LANG,
    use_gpu=USE_GPU,
    det_model_dir=None,  # use default
    rec_model_dir=None,  # use default
    show_log=False,
)


def _pil_to_cv(img: Image.Image) -> np.ndarray:
    """Convert a PIL image to an OpenCV-style BGR ndarray.

    Images that are not already in RGB mode (grayscale, palette, ...) are
    converted to RGB first so the colour conversion always receives a
    multi-channel array.
    """
    if img.mode != "RGB":
        img = img.convert("RGB")
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)


def ocr_image(pil_img: Image.Image) -> List[Tuple[str, float]]:
    """Run OCR on a PIL image.

    Returns:
        A list of ``(text, confidence)`` tuples, one per recognised line whose
        confidence is at least ``CONF_THRESHOLD``. The list is empty when
        nothing was recognised.
    """
    img_cv = _pil_to_cv(pil_img)
    result = OCR.ocr(img_cv, cls=CLS)

    lines: List[Tuple[str, float]] = []
    # PaddleOCR returns one entry per input image. That entry can be None
    # (i.e. the result is [None]) when no text is detected, so guard against
    # both an empty result and an empty per-image entry before iterating.
    if not result or result[0] is None:
        return lines

    # Each item of the per-image entry has the shape [box, (text, conf)].
    for line in result[0]:
        txt = line[1][0]
        conf = float(line[1][1])
        if conf >= CONF_THRESHOLD:
            lines.append((txt, conf))
    return lines


def read_image(filepath: str) -> Image.Image:
    """Open an image file via PIL and return it converted to RGB."""
    with Image.open(filepath) as im:
        # convert() returns a new image, so the result is returned from
        # inside the context manager that closes the source file.
        return im.convert("RGB")


def read_pdf_pages(filepath: str) -> List[Image.Image]:
    """Render each page of a PDF to a PIL RGB image using PyMuPDF.

    Pages are rendered with a 2x scale factor for better OCR accuracy.
    """
    pages: List[Image.Image] = []
    mat = fitz.Matrix(2, 2)  # 2x upscaling; identical for every page
    with fitz.open(filepath) as doc:
        for page in doc:
            pix = page.get_pixmap(matrix=mat, alpha=False)
            img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
            pages.append(img)
    return pages


def extract_text_from_file(filepath: str) -> str:
    """Dispatch on the file extension and return the recognised plain text.

    PDFs are OCR'd page by page, each page prefixed with a
    ``--- Page N ---`` header. Supported image types are OCR'd directly.

    Raises:
        ValueError: If the file extension is neither ``.pdf`` nor one of the
            supported image extensions.
    """
    lower = filepath.lower()

    if lower.endswith(".pdf"):
        all_text: List[str] = []
        for i, pil_img in enumerate(read_pdf_pages(filepath), start=1):
            page_text = "\n".join(t for t, _ in ocr_image(pil_img))
            # Page header for clarity on multi-page docs. The header keeps
            # every entry non-empty, so no empty-string filtering is needed.
            all_text.append(f"--- Page {i} ---\n{page_text}".strip())
        return "\n\n".join(all_text)

    if lower.endswith((".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp")):
        lines = ocr_image(read_image(filepath))
        return "\n".join(t for t, _ in lines).strip()

    raise ValueError("Unsupported file type. Please upload an image (PNG/JPG/TIFF/WEBP/BMP) or a PDF.")


def infer(file_obj) -> str:
    """Gradio callback: OCR the uploaded file and return its text.

    ``file_obj`` may be ``None`` (nothing uploaded), an object exposing a
    ``name`` attribute holding the path, or anything whose ``str()`` is the
    path. Any exception is printed with its traceback and reported to the
    caller as an error string instead of being raised.
    """
    try:
        if file_obj is None:
            return "No file uploaded."
        filepath = file_obj.name if hasattr(file_obj, "name") else str(file_obj)
        text = extract_text_from_file(filepath)

        # Console telemetry: dump raw text to terminal
        print("\n================ OCR RAW TEXT ================\n")
        print(text)
        print("\n==================== END =====================\n", flush=True)

        return text or "[No text detected]"
    except Exception as e:
        # UI boundary: log the full traceback, show a short message in the UI.
        traceback.print_exc()
        return f"Error during OCR: {e}"


# ------------- Gradio UI ----------------
TITLE = "PaddleOCR Text Extractor (Images & PDFs)"
DESC = (
    "Upload an image or PDF. The app runs PaddleOCR (PP-OCRv4 pipeline) and returns plain text. "
    "Set `OCR_LANG`, `OCR_USE_GPU`, and `OCR_CONF_THRESHOLD` as env vars to tune."
)

with gr.Blocks(title=TITLE) as demo:
    gr.Markdown(f"# {TITLE}\n{DESC}")
    with gr.Row():
        file_in = gr.File(label="Upload Image or PDF", file_count="single", file_types=["image", ".pdf"])
        out = gr.Textbox(label="Extracted Text", lines=25, show_copy_button=True)
    run_btn = gr.Button("Run OCR", variant="primary")
    run_btn.click(fn=infer, inputs=[file_in], outputs=[out])

    # Also trigger on file change for convenience
    file_in.change(fn=infer, inputs=[file_in], outputs=[out])

if __name__ == "__main__":
    # Tip: Set server_name="0.0.0.0" for containers; share=True for quick external testing
    demo.launch(server_name="0.0.0.0", server_port=7860, show_error=True)