imperiusrex committed on
Commit
e0e0134
Β·
verified Β·
1 Parent(s): b504595

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -21
app.py CHANGED
@@ -3,10 +3,9 @@ import torch
3
  import numpy as np
4
  import cv2
5
  import os
6
- import json
7
  from PIL import Image
8
  from transformers import CLIPProcessor, CLIPModel
9
- from paddleocr import PaddleOCR, TextDetection
10
  from spaces import GPU # Required for ZeroGPU on Hugging Face
11
 
12
  # Setup
@@ -27,22 +26,19 @@ candidates = [
27
  "This is Korean text"
28
  ]
29
 
30
- text_detector = TextDetection(model_name="PP-OCRv5_server_det")
31
 
32
  @GPU
33
  def ocr_pipeline(image_np):
34
  image_pil = Image.fromarray(image_np).convert("RGB")
35
- width, height = image_pil.size
36
  img_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
37
 
38
- output = text_detector.predict(image_np, batch_size=1)
39
 
40
- arr = []
41
- for res in output:
42
- polys = res.get("dt_polys", [])
43
- if polys is not None:
44
- arr.extend(polys.tolist())
45
 
 
46
  arr = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
47
 
48
  cropped_images = []
@@ -64,21 +60,19 @@ def ocr_pipeline(image_np):
64
  if crop.shape[0] < 10 or crop.shape[1] < 10:
65
  continue
66
 
67
- # Language detection
68
- clip_inputs = clip_processor(text=candidates, images=crop, return_tensors="pt", padding=True)
69
  with torch.no_grad():
70
  probs = clip_model(**clip_inputs).logits_per_image.softmax(dim=1)
71
  lang_index = probs.argmax().item()
72
  lang_detected = candidates[lang_index].split()[-2].lower()
73
  lang_code = lang_map.get(lang_detected, "en")
74
 
75
- ocr = PaddleOCR(lang=lang_code, use_doc_orientation_classify=False,
76
- use_doc_unwarping=False, use_textline_orientation=False, device='cpu')
77
-
78
  result = ocr.ocr(crop)
79
  if not result or not result[0]:
80
  continue
81
-
82
  for line in result[0]:
83
  text = line[1][0]
84
  box = line[0]
@@ -89,7 +83,6 @@ def ocr_pipeline(image_np):
89
  if not final_output_lines:
90
  return "❌ No text detected."
91
 
92
- # Grouping by line
93
  sorted_blocks = sorted(final_output_lines, key=lambda b: (b["cy"], b["cx"]))
94
  lines = []
95
  current_line = [sorted_blocks[0]]
@@ -105,16 +98,15 @@ def ocr_pipeline(image_np):
105
  return "\n".join(lines)
106
 
107
 
108
- # Gradio Interface
109
  def build_interface():
110
  return gr.Interface(
111
  fn=ocr_pipeline,
112
  inputs=gr.Image(type="numpy", label="Upload Handwritten Image"),
113
  outputs="text",
114
- title="🌐 Multilingual Handwritten OCR with CLIP + PaddleOCR",
115
- description="πŸ“„ Upload a handwritten document image. Detects language using CLIP and performs text detection + recognition with PaddleOCR."
116
  )
117
 
118
  if __name__ == "__main__":
119
  iface = build_interface()
120
- iface.launch()
 
3
  import numpy as np
4
  import cv2
5
  import os
 
6
  from PIL import Image
7
  from transformers import CLIPProcessor, CLIPModel
8
+ from paddleocr import PaddleOCR
9
  from spaces import GPU # Required for ZeroGPU on Hugging Face
10
 
11
  # Setup
 
26
  "This is Korean text"
27
  ]
28
 
29
+ ocr_detector = PaddleOCR(use_angle_cls=False, lang='en', det=True, rec=False, use_gpu=True)
30
 
31
  @GPU
32
  def ocr_pipeline(image_np):
33
  image_pil = Image.fromarray(image_np).convert("RGB")
 
34
  img_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
35
 
36
+ detection_result = ocr_detector.ocr(image_np, det=True, rec=False)
37
 
38
+ if not detection_result or not detection_result[0]:
39
+ return "❌ No text detected."
 
 
 
40
 
41
+ arr = [line[0] for line in detection_result[0]]
42
  arr = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
43
 
44
  cropped_images = []
 
60
  if crop.shape[0] < 10 or crop.shape[1] < 10:
61
  continue
62
 
63
+ clip_inputs = clip_processor(text=candidates, images=Image.fromarray(crop), return_tensors="pt", padding=True)
 
64
  with torch.no_grad():
65
  probs = clip_model(**clip_inputs).logits_per_image.softmax(dim=1)
66
  lang_index = probs.argmax().item()
67
  lang_detected = candidates[lang_index].split()[-2].lower()
68
  lang_code = lang_map.get(lang_detected, "en")
69
 
70
+ ocr = PaddleOCR(lang=lang_code, use_angle_cls=False, det=False, rec=True, use_gpu=False)
71
+
 
72
  result = ocr.ocr(crop)
73
  if not result or not result[0]:
74
  continue
75
+
76
  for line in result[0]:
77
  text = line[1][0]
78
  box = line[0]
 
83
  if not final_output_lines:
84
  return "❌ No text detected."
85
 
 
86
  sorted_blocks = sorted(final_output_lines, key=lambda b: (b["cy"], b["cx"]))
87
  lines = []
88
  current_line = [sorted_blocks[0]]
 
98
  return "\n".join(lines)
99
 
100
 
 
101
  def build_interface():
102
  return gr.Interface(
103
  fn=ocr_pipeline,
104
  inputs=gr.Image(type="numpy", label="Upload Handwritten Image"),
105
  outputs="text",
106
+ title="\U0001F310 Multilingual Handwritten OCR with CLIP + PaddleOCR",
107
+ description="\U0001F4C4 Upload a handwritten document image. Detects language using CLIP and performs text detection + recognition with PaddleOCR."
108
  )
109
 
110
  if __name__ == "__main__":
111
  iface = build_interface()
112
+ iface.launch()