KarthiEz committed on
Commit
346fc60
·
verified ·
1 Parent(s): 6ad9a40

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +95 -119
app.py CHANGED
@@ -1,156 +1,132 @@
1
- # raw_paddleocr.py
2
- # Standalone raw-text extractor using PaddleOCR (no changes to your app).
3
- # Modes:
4
- # OCR_RAW_MODE = "block" (default) | "paragraph" | "lines"
5
- # OCR_CONF_THRESHOLD = 0.0..1.0 (default 0.0)
6
- # OCR_LANG = "en" (default) or other PaddleOCR langs like "ar", "en_number"
7
- # OCR_USE_GPU = "true" | "false" (default "false")
8
-
9
  import os
 
10
  import sys
11
- from typing import List, Tuple, Dict, Any
 
 
12
 
13
  import numpy as np
14
  from PIL import Image
15
  import fitz # PyMuPDF
16
  import cv2
17
-
18
  from paddleocr import PaddleOCR
19
 
20
# -------- Environment-driven settings ----------
# Language pack handed to PaddleOCR.
LANG = os.environ.get("OCR_LANG", "en")
# Only the (case-insensitive) string "true" enables the GPU.
USE_GPU = os.environ.get("OCR_USE_GPU", "false").lower() == "true"
# Angle classification is always on.
CLS = True
# Detections scoring below this confidence are discarded.
CONF_THRESHOLD = float(os.environ.get("OCR_CONF_THRESHOLD", "0.0"))

# Output layout: "block" | "paragraph" | "lines".
RAW_MODE = os.environ.get("OCR_RAW_MODE", "block")
# Paragraph-break threshold, expressed as a fraction of the median token height.
LINE_GAP_RATIO = float(os.environ.get("OCR_LINE_GAP_RATIO", "0.6"))

# -------- Shared OCR engine (built once at import time) ----------
OCR = PaddleOCR(
    use_angle_cls=CLS,
    lang=LANG,
    use_gpu=USE_GPU,
    det_model_dir=None,
    rec_model_dir=None,
    show_log=False,
)
38
 
39
# -------- Helpers ---------------------
def _pil_to_cv(img: Image.Image) -> np.ndarray:
    """Convert a PIL image to an ndarray with RGB swapped to OpenCV's BGR order."""
    rgb_pixels = np.array(img)
    return cv2.cvtColor(rgb_pixels, cv2.COLOR_RGB2BGR)
42
 
43
- def _bbox_center(bbox):
44
- xs = [p[0] for p in bbox]; ys = [p[1] for p in bbox]
45
- return (sum(xs) / 4.0, sum(ys) / 4.0)
46
-
47
def read_image(path: str) -> Image.Image:
    """Open the image at *path* and return it converted to RGB.

    The conversion happens while the file is still open, so the returned
    image is a separate object from the one bound to the file handle.
    """
    with Image.open(path) as source:
        rgb_image = source.convert("RGB")
    return rgb_image
50
 
51
def read_pdf_pages(path: str) -> List[Image.Image]:
    """Render every page of the PDF at *path* to an RGB PIL image.

    Pages are rasterised with a 2x scale matrix and no alpha channel.
    """
    zoom = fitz.Matrix(2, 2)  # scale up for better OCR

    def _render(page) -> Image.Image:
        pixmap = page.get_pixmap(matrix=zoom, alpha=False)
        return Image.frombytes("RGB", [pixmap.width, pixmap.height], pixmap.samples)

    with fitz.open(path) as pdf:
        return [_render(page) for page in pdf]
60
 
61
# -------- Core OCR --------------------
def ocr_tokens(pil_img: Image.Image) -> List[Dict[str, Any]]:
    """Run OCR on one page image and return positioned text tokens.

    Each token is a dict with the keys ``text`` (stripped string), ``conf``
    (float), ``bbox`` (the detection box as returned by the engine), ``cx`` /
    ``cy`` (box centre) and ``h`` (vertical extent of the box, never zero).
    Detections with empty text or a confidence below ``CONF_THRESHOLD`` are
    dropped.

    Returns an empty list when nothing is detected.
    """
    result = OCR.ocr(_pil_to_cv(pil_img), cls=CLS)
    # The engine yields one entry per input image; for an image without any
    # detection that entry can be None (or empty), so guard it before iterating.
    if not result or not result[0]:
        return []

    tokens: List[Dict[str, Any]] = []
    for box, (txt, conf) in result[0]:
        conf = float(conf)
        if not txt or conf < CONF_THRESHOLD:
            continue
        cx, cy = _bbox_center(box)
        ys = [p[1] for p in box]
        # The epsilon keeps the height strictly positive for degenerate boxes.
        h = max(ys) - min(ys) + 1e-6
        tokens.append(
            {"text": txt.strip(), "conf": conf, "bbox": box, "cx": cx, "cy": cy, "h": h}
        )
    return tokens
77
-
78
- def _sort_reading_order(tokens: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
79
- return sorted(tokens, key=lambda t: (round(t["cy"], 1), t["cx"]))
80
-
81
- def _group_paragraphs(sorted_tokens: List[Dict[str, Any]], gap_ratio: float = 0.6):
82
- if not sorted_tokens:
83
- return []
84
- heights = sorted(t["h"] for t in sorted_tokens)
85
- median_h = heights[len(heights)//2] or 1.0
86
-
87
- paras, cur = [], [sorted_tokens[0]]
88
- for prev, cur_tok in zip(sorted_tokens, sorted_tokens[1:]):
89
- vertical_gap = cur_tok["cy"] - prev["cy"]
90
- if vertical_gap > gap_ratio * median_h:
91
- paras.append(cur)
92
- cur = [cur_tok]
93
- else:
94
- cur.append(cur_tok)
95
- paras.append(cur)
96
- return paras
97
-
98
- def _post_clean(text: str) -> str:
99
- text = " ".join(text.split())
100
- text = text.replace("- ", "")
101
- return text
102
-
103
def tokens_to_text(tokens: List[Dict[str, Any]], mode: str = "block", gap_ratio: float = 0.6) -> str:
    """Render OCR tokens as plain text.

    ``mode`` selects the layout: ``"block"`` joins everything with spaces,
    ``"paragraph"`` separates paragraph groups with blank lines, and any
    other value produces one output line per detected text row. Empty
    chunks are skipped; empty input yields ``""``.
    """
    if not tokens:
        return ""

    ordered = _sort_reading_order(tokens)

    def _joined(group) -> str:
        return _post_clean(" ".join(tok["text"] for tok in group))

    if mode == "block":
        return _joined(ordered)

    if mode == "paragraph":
        cleaned = [_joined(para) for para in _group_paragraphs(ordered, gap_ratio=gap_ratio)]
        return "\n\n".join(chunk for chunk in cleaned if chunk)

    # Fallback ("lines"): consecutive tokens share a row when their centres
    # differ by at most 0.35 of the taller token's height.
    rows = [[ordered[0]]]
    for before, after in zip(ordered, ordered[1:]):
        tolerance = 0.35 * max(before["h"], after["h"])
        if abs(after["cy"] - before["cy"]) <= tolerance:
            rows[-1].append(after)
        else:
            rows.append([after])
    rendered = [_joined(row) for row in rows]
    return "\n".join(line for line in rendered if line)
128
-
129
def extract_raw_text(path: str) -> str:
    """OCR the image or PDF at *path* and return its text.

    PDFs are processed page by page; non-empty page texts are joined with a
    blank line. Returns ``"[No text detected]"`` when nothing was recognised.

    Raises:
        ValueError: if the file extension is neither ``.pdf`` nor one of the
            supported image extensions.
    """
    image_suffixes = (".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp")
    lowered = path.lower()

    if lowered.endswith(".pdf"):
        pages = read_pdf_pages(path)
    elif lowered.endswith(image_suffixes):
        pages = [read_image(path)]
    else:
        raise ValueError("Unsupported file type. Provide an image or PDF.")

    per_page = [
        tokens_to_text(ocr_tokens(page), mode=RAW_MODE, gap_ratio=LINE_GAP_RATIO)
        for page in pages
    ]
    combined = "\n\n".join(filter(None, per_page)).strip()
    return combined if combined else "[No text detected]"
 
 
 
 
145
 
146
# -------- Command-line entry point ----
def main():
    """Print the OCR text of the file named by the first CLI argument.

    Exits with status 2 and a usage line when no argument is given.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python raw_paddleocr.py <path-to-image-or-pdf>")
        sys.exit(2)
    print(extract_raw_text(cli_args[0]))


if __name__ == "__main__":
    main()
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import io
3
  import sys
4
+ import json
5
+ import traceback
6
+ from typing import List, Tuple
7
 
8
  import numpy as np
9
  from PIL import Image
10
  import fitz # PyMuPDF
11
  import cv2
12
+ import gradio as gr
13
  from paddleocr import PaddleOCR
14
 
15
# --------- Runtime settings (environment overrides, safe defaults) ----------
# Language pack handed to PaddleOCR, e.g. "en", "ar", "en_number".
LANG = os.environ.get("OCR_LANG", "en")
# Only the (case-insensitive) string "true" enables the GPU.
USE_GPU = os.environ.get("OCR_USE_GPU", "false").lower() == "true"
# NOTE(review): DET and REC are read from the environment, but the PaddleOCR
# call below passes None for both model dirs and nothing else in this file
# uses them - confirm whether they are meant to be wired in.
DET = os.environ.get("OCR_DET_MODEL", "ch_PP-OCRv4_det")
REC = os.environ.get("OCR_REC_MODEL", "en_PP-OCRv4")
# Angle classification is always on.
CLS = True
# Minimum confidence for a detection to be kept; 0.0 keeps everything.
CONF_THRESHOLD = float(os.environ.get("OCR_CONF_THRESHOLD", "0.0"))

# Shared engine: built once at import time and reused by every request.
OCR = PaddleOCR(
    use_angle_cls=CLS,
    lang=LANG,
    use_gpu=USE_GPU,
    det_model_dir=None,  # library default
    rec_model_dir=None,  # library default
    show_log=False,
)
33
 
 
34
  def _pil_to_cv(img: Image.Image) -> np.ndarray:
35
+ """PIL RGB -> OpenCV BGR ndarray"""
36
  return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
37
 
38
def ocr_image(pil_img: Image.Image) -> List[Tuple[str, float]]:
    """Run OCR on a PIL image and return ``(text, confidence)`` pairs.

    Only detections whose confidence is at least ``CONF_THRESHOLD`` are
    returned, in the order the engine produced them. Returns an empty list
    when nothing is detected.
    """
    result = OCR.ocr(_pil_to_cv(pil_img), cls=CLS)
    # The engine yields one entry per input image; for an image without any
    # detection that entry can be None (or empty), so guard it before iterating.
    if not result or not result[0]:
        return []

    lines: List[Tuple[str, float]] = []
    # Each detection is [box, (text, conf)]; only the second element is used.
    for detection in result[0]:
        txt = detection[1][0]
        conf = float(detection[1][1])
        if conf >= CONF_THRESHOLD:
            lines.append((txt, conf))
    return lines
54
+
55
def read_image(filepath: str) -> Image.Image:
    """Load the image at *filepath* with PIL and return an RGB copy.

    The conversion runs while the file is open; the handle is closed before
    the converted image is returned.
    """
    with Image.open(filepath) as source:
        rgb_image = source.convert("RGB")
    return rgb_image
61
 
62
def read_pdf_pages(filepath: str) -> List[Image.Image]:
    """Rasterise each page of the PDF at *filepath* into an RGB PIL image.

    Uses PyMuPDF with a 2x scale matrix and no alpha channel; pages are
    returned in document order.
    """
    upscale = fitz.Matrix(2, 2)  # 2x zoom for better OCR accuracy
    rendered: List[Image.Image] = []
    with fitz.open(filepath) as pdf:
        for page in pdf:
            pixmap = page.get_pixmap(matrix=upscale, alpha=False)
            size = [pixmap.width, pixmap.height]
            rendered.append(Image.frombytes("RGB", size, pixmap.samples))
    return rendered
75
 
76
def extract_text_from_file(filepath: str) -> str:
    """Dispatch on the file extension, run OCR, and return plain text.

    For PDFs, every page that yields text is emitted as a
    ``--- Page N ---`` header followed by its lines, with pages separated by
    a blank line. Pages without any recognised text are skipped (page numbers
    of the remaining pages are unchanged), so a PDF with no text at all
    returns ``""`` just like an empty image does. For images, the recognised
    lines are joined with newlines.

    Raises:
        ValueError: if the extension is neither ``.pdf`` nor a supported
            image extension.
    """
    lower = filepath.lower()

    if lower.endswith(".pdf"):
        sections: List[str] = []
        for page_no, pil_img in enumerate(read_pdf_pages(filepath), start=1):
            page_text = "\n".join(t for t, _ in ocr_image(pil_img))
            # Skip empty pages: a header-only section would make the result
            # non-empty and hide the "no text" case from callers.
            if not page_text.strip():
                continue
            sections.append(f"--- Page {page_no} ---\n{page_text}".strip())
        return "\n\n".join(sections)

    if lower.endswith((".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp")):
        lines = ocr_image(read_image(filepath))
        return "\n".join(t for t, _ in lines).strip()

    raise ValueError("Unsupported file type. Please upload an image (PNG/JPG/TIFF/WEBP/BMP) or a PDF.")
96
+
97
def infer(file_obj) -> str:
    """UI callback: OCR the uploaded file and return text for display.

    ``file_obj`` may be ``None`` (nothing uploaded), an object with a
    ``name`` attribute holding the path, or anything whose ``str()`` is the
    path. The extracted text is also echoed to stdout. Any exception is
    printed with its traceback and reported as an ``"Error during OCR: ..."``
    string instead of propagating.
    """
    if file_obj is None:
        return "No file uploaded."

    try:
        if hasattr(file_obj, "name"):
            filepath = file_obj.name
        else:
            filepath = str(file_obj)
        text = extract_text_from_file(filepath)
        # Console telemetry: dump the raw text to the terminal.
        print(
            "\n================ OCR RAW TEXT ================\n",
            text,
            "\n==================== END =====================\n",
            sep="\n",
            flush=True,
        )
        return text if text else "[No text detected]"
    except Exception as e:
        # Boundary handler: surface the failure in the UI, keep the trace in the log.
        traceback.print_exc()
        return f"Error during OCR: {e}"
111
+
112
# ------------- Gradio front end ----------------
TITLE = "PaddleOCR Text Extractor (Images & PDFs)"
DESC = (
    "Upload an image or PDF. The app runs PaddleOCR (PP-OCRv4 pipeline) and returns plain text. "
    + "Set `OCR_LANG`, `OCR_USE_GPU`, and `OCR_CONF_THRESHOLD` as env vars to tune."
)

with gr.Blocks(title=TITLE) as demo:
    gr.Markdown("# " + TITLE + "\n" + DESC)
    # NOTE(review): the original nesting was lost in the diff rendering; the
    # upload and output widgets are assumed to share the row, with the button
    # below it - confirm against the real app.py.
    with gr.Row():
        file_in = gr.File(
            label="Upload Image or PDF",
            file_count="single",
            file_types=["image", ".pdf"],
        )
        out = gr.Textbox(label="Extracted Text", lines=25, show_copy_button=True)
    run_btn = gr.Button("Run OCR", variant="primary")

    # Run OCR on an explicit click and also whenever the uploaded file changes.
    for _bind in (run_btn.click, file_in.change):
        _bind(fn=infer, inputs=[file_in], outputs=[out])

if __name__ == "__main__":
    # Binds to all interfaces (suits containers) on port 7860.
    _launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "show_error": True,
    }
    demo.launch(**_launch_options)