KarthiEz committed on
Commit
11c7f99
·
verified ·
1 Parent(s): 769fd42

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +132 -0
app.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import sys
4
+ import json
5
+ import traceback
6
+ from typing import List, Tuple
7
+
8
+ import numpy as np
9
+ from PIL import Image
10
+ import fitz # PyMuPDF
11
+ import cv2
12
+ import gradio as gr
13
+ from paddleocr import PaddleOCR
14
+
15
# --------- Config knobs (safe defaults) ----------
# All knobs except CLS are read from environment variables, with the defaults shown.
LANG = os.environ.get("OCR_LANG", "en")  # e.g., "en", "ar", "en_number", "en_PP-OCRv3"
USE_GPU = os.environ.get("OCR_USE_GPU", "false").lower() == "true"
# NOTE(review): DET and REC are read here but are not passed to PaddleOCR
# below (both model dirs are None) -- confirm whether they should be wired in.
DET = os.environ.get("OCR_DET_MODEL", "ch_PP-OCRv4_det")
REC = os.environ.get("OCR_REC_MODEL", "en_PP-OCRv4")
CLS = True  # angle classification
CONF_THRESHOLD = float(os.environ.get("OCR_CONF_THRESHOLD", "0.0"))  # 0.0 → keep everything

# Initialize once (download models once, reuse across requests)
# Tip: If you want Arabic/English mixed, set LANG="ar" or "en" variants per PaddleOCR docs
OCR = PaddleOCR(
    lang=LANG,
    use_gpu=USE_GPU,
    use_angle_cls=CLS,
    det_model_dir=None,  # use default
    rec_model_dir=None,  # use default
    show_log=False,
)
33
+
34
def _pil_to_cv(img: Image.Image) -> np.ndarray:
    """Convert a PIL image to an OpenCV-style BGR ndarray.

    Images that are not already in RGB mode (grayscale, palette, RGBA, ...)
    are converted to RGB first, so the RGB->BGR swap always receives a
    three-channel array instead of failing on a 2-D grayscale array.
    """
    if img.mode != "RGB":
        img = img.convert("RGB")
    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
37
+
38
def ocr_image(pil_img: Image.Image) -> List[Tuple[str, float]]:
    """
    Run OCR on a PIL image and return list of (text, confidence).

    Entries whose confidence is below CONF_THRESHOLD are dropped. An empty
    list is returned when nothing is recognised.
    """
    img_cv = _pil_to_cv(pil_img)
    result = OCR.ocr(img_cv, cls=CLS)
    # PaddleOCR returns a list per image; each item has [ [box, (text, conf)], ... ].
    # When no text is found the per-image entry can be None (i.e. result == [None]),
    # so guard the entry itself and not only the outer list.
    if not result or not result[0]:
        return []
    lines: List[Tuple[str, float]] = []
    for line in result[0]:
        txt = line[1][0]
        conf = float(line[1][1])
        if conf >= CONF_THRESHOLD:
            lines.append((txt, conf))
    return lines
54
+
55
def read_image(filepath: str) -> Image.Image:
    """
    Open an image via PIL (handles TIFF, JPG, PNG, ...) and return it as RGB.

    The EXIF orientation tag, when present, is applied before conversion so
    that camera photos stored rotated are handed to OCR upright. No frame
    seeking is done, so only the frame PIL opens initially is returned.
    """
    # Local import keeps this block self-contained; PIL is already a
    # dependency of this file.
    from PIL import ImageOps

    with Image.open(filepath) as im:
        # exif_transpose returns a new image, so the result stays valid
        # after the file handle is closed by the context manager.
        upright = ImageOps.exif_transpose(im)
        return upright.convert("RGB")
61
+
62
def read_pdf_pages(filepath: str, zoom: float = 2.0) -> List[Image.Image]:
    """
    Render each PDF page to a PIL image (RGB) using PyMuPDF.

    Args:
        filepath: Path of the PDF to render.
        zoom: Scale factor applied on both axes. The default of 2.0 keeps
            the 2x upscaling previously hard-coded for better OCR accuracy.

    Returns:
        One RGB image per page, in document order.
    """
    # The render transform is the same for every page, so build it once.
    mat = fitz.Matrix(zoom, zoom)
    pages: List[Image.Image] = []
    with fitz.open(filepath) as doc:
        for page in doc:
            # alpha=False -> no alpha channel, matching the "RGB" mode below.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            pages.append(Image.frombytes("RGB", (pix.width, pix.height), pix.samples))
    return pages
75
+
76
def extract_text_from_file(filepath: str) -> str:
    """
    Choose a reader from the file extension and return the OCR'd plain text.

    PDFs are rendered page by page; each page's text is prefixed with a
    "--- Page N ---" header and pages are separated by a blank line.
    Supported image files yield their recognised lines joined by newlines.

    Raises:
        ValueError: if the extension is neither ".pdf" nor a supported image type.
    """
    name = filepath.lower()
    image_suffixes = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp")

    if name.endswith(".pdf"):
        rendered = read_pdf_pages(filepath)
        sections: List[str] = []
        for page_no, page_img in enumerate(rendered, start=1):
            body = "\n".join(text for text, _conf in ocr_image(page_img))
            # Add a page header for clarity on multi-page docs
            sections.append(f"--- Page {page_no} ---\n{body}".strip())
        return "\n\n".join(filter(None, sections))

    if name.endswith(image_suffixes):
        recognised = ocr_image(read_image(filepath))
        return "\n".join(text for text, _conf in recognised).strip()

    raise ValueError("Unsupported file type. Please upload an image (PNG/JPG/TIFF/WEBP/BMP) or a PDF.")
96
+
97
def infer(file_obj) -> str:
    """
    UI callback: OCR the uploaded file and return the text to display.

    Accepts either an object exposing a ``name`` attribute (used as the path)
    or anything else, which is turned into a path with ``str()``. Returns a
    placeholder message when nothing was uploaded or no text was found, and an
    "Error during OCR: ..." message (after printing the traceback) on failure.
    """
    if file_obj is None:
        return "No file uploaded."
    try:
        try:
            path = file_obj.name
        except AttributeError:
            path = str(file_obj)
        extracted = extract_text_from_file(path)
        # 🔊 Console telemetry: dump raw text to terminal
        print(
            "\n================ OCR RAW TEXT ================\n",
            extracted,
            "\n==================== END =====================\n",
            sep="\n",
            flush=True,
        )
        if extracted:
            return extracted
        return "[No text detected]"
    except Exception as e:
        traceback.print_exc()
        return f"Error during OCR: {e}"
111
+
112
# ------------- Gradio UI ----------------
# TITLE is used both as the Blocks page title and as the Markdown heading;
# DESC is rendered directly under that heading.
TITLE = "PaddleOCR Text Extractor (Images & PDFs)"
DESC = (
    "Upload an image or PDF. The app runs PaddleOCR (PP-OCRv4 pipeline) and returns plain text. "
    "Set `OCR_LANG`, `OCR_USE_GPU`, and `OCR_CONF_THRESHOLD` as env vars to tune."
)

# Build the app: a heading, a single-file upload, a text output and a run button.
# NOTE(review): the original indentation was lost in extraction; placing only
# the file input inside gr.Row() is a reconstruction -- confirm against the repo.
with gr.Blocks(title=TITLE) as demo:
    gr.Markdown(f"# {TITLE}\n{DESC}")
    with gr.Row():
        file_in = gr.File(label="Upload Image or PDF", file_count="single", file_types=["image", ".pdf"])
    out = gr.Textbox(label="Extracted Text", lines=25, show_copy_button=True)
    run_btn = gr.Button("Run OCR", variant="primary")

    # Both events call infer() with the uploaded file and write into the textbox.
    run_btn.click(fn=infer, inputs=[file_in], outputs=[out])
    # Also trigger on file change for convenience
    file_in.change(fn=infer, inputs=[file_in], outputs=[out])
129
+
130
if __name__ == "__main__":
    # Tip: Set server_name="0.0.0.0" for containers; share=True for quick external testing
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**launch_options)