imperiusrex committed on
Commit
fe770f9
·
verified ·
1 Parent(s): 24e6226

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +28 -121
app.py CHANGED
@@ -1,7 +1,6 @@
1
  import os
2
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
3
 
4
-
5
  import gradio as gr
6
  import torch
7
  import numpy as np
@@ -9,10 +8,9 @@ import cv2
9
  from PIL import Image
10
  from transformers import CLIPProcessor, CLIPModel
11
  from paddleocr import PaddleOCR, TextDetection
12
- from functools import lru_cache
13
-
14
 
15
  MODEL_HUB_ID = "imperiusrex/printedpaddle"
 
16
  # Setup
17
  clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
18
  clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
@@ -21,115 +19,27 @@ clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
21
  device = "cpu"
22
  clip_model.to(device)
23
 
24
- # Language map for OCR models
25
  def process_image(img_path):
26
- """
27
- Processes an image to detect, crop, and OCR text, returning it in reading order.
28
-
29
- Args:
30
- img_path: The path to the image file.
31
-
32
- Returns:
33
- A string containing the reconstructed text.
34
- """
35
- # Load CLIP model and processor
36
- clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
37
- processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
38
-
39
  # Candidate language phrases for detection
40
  candidates = [
41
  "This is English text",
42
- # "This is Hindi text",
43
- # "This is Tamil text",
44
  "This is Telugu text",
45
- # "This is Bengali text",
46
- # "This is Arabic text",
47
  "This is Chinese text",
48
- # "This is Japanese text",
49
  "This is Korean text",
50
  "This is Russian text",
51
- # "This is Kannada text",
52
- # "This is Malayalam text",
53
- # "This is Marathi text",
54
- # "This is Urdu text",
55
  "This is French text",
56
- # "This is Spanish text",
57
- # "This is Italian text",
58
- # "This is Portuguese text",
59
- # "This is Romanian text",
60
- # "This is Hungarian text",
61
- # "This is Indonesian text",
62
- # "This is Lithuanian text",
63
- # "This is Chinese Traditional text",
64
- # "This is Malay text",
65
- # "This is Dutch text",
66
- # "This is Norwegian text",
67
- # "This is Bosnian text",
68
- # "This is Polish text",
69
- # "This is Czech text",
70
- # "This is Slovak text",
71
- # "This is Welsh text",
72
- # "This is Slovenian text",
73
- # "This is Danish text",
74
- # "This is Albanian text",
75
- # "This is Estonian text",
76
- # "This is Swedish text",
77
- # "This is Irish text",
78
- # "This is Swahili text",
79
- # "This is Croatian text",
80
- # "This is Uzbek text",
81
- # "This is Turkish text",
82
  "This is Latin text",
83
- # "This is Belarusian text",
84
- # "This is Ukrainian text"
85
  ]
86
 
87
  # Map detected languages to PaddleOCR language codes
88
  lang_map = {
89
  "english": "en",
90
- # "hindi": "hi",
91
- # "tamil": "ta",
92
  "telugu": "te",
93
- # "bengali": "bn",
94
- # "arabic": "ar",
95
  "chinese": "ch",
96
- # "japanese": "japan",
97
  "korean": "korean",
98
  "russian": "ru",
99
- # "kannada": "kn",
100
- # "malayalam": "ml",
101
- # "marathi": "mr",
102
- # "urdu": "ur",
103
  "french": "fr",
104
- # "spanish": "es",
105
- # "italian": "it",
106
- # "portuguese": "pt",
107
- # "romanian": "ro",
108
- # "hungarian": "hu",
109
- # "indonesian": "id",
110
- # "lithuanian": "lt",
111
- # "chinese traditional": "chinese_cht",
112
- # "malay": "ms",
113
- # "dutch": "nl",
114
- # "norwegian": "no",
115
- # "bosnian": "bs",
116
- # "polish": "pl",
117
- # "czech": "cs",
118
- # "slovak": "sk",
119
- # "welsh": "cy",
120
- # "slovenian": "sl",
121
- # "danish": "da",
122
- # "albanian": "sq",
123
- # "estonian": "et",
124
- # "swedish": "sv",
125
- # "irish": "ga",
126
- # "swahili": "sw",
127
- # "croatian": "hr",
128
- # "uzbek": "uz",
129
- # "turkish": "tr",
130
  "latin": "la",
131
- # "belarusian": "be",
132
- # "ukrainian": "uk"
133
  }
134
 
135
  # Text Detection
@@ -160,9 +70,11 @@ def process_image(img_path):
160
 
161
  # Perform language detection for each cropped image and then OCR
162
  predicted_texts = []
 
 
163
  for i, cropped_img in enumerate(cropped_images):
164
  # Get probabilities
165
- inputs = processor(text=candidates, images=cropped_img, return_tensors="pt", padding=True)
166
  with torch.no_grad():
167
  logits_per_image = clip_model(**inputs).logits_per_image
168
  probs = logits_per_image.softmax(dim=1)
@@ -171,6 +83,8 @@ def process_image(img_path):
171
  best = probs.argmax().item()
172
  detected_lang_phrase = candidates[best]
173
  detected_lang = detected_lang_phrase.split()[-2].lower()
 
 
174
  lang_code = lang_map.get(detected_lang, "en")
175
 
176
  # Perform OCR for the current cropped image with the detected language
@@ -186,25 +100,22 @@ def process_image(img_path):
186
 
187
  text_for_this_image = ""
188
  if result and result[0] and 'rec_texts' in result[0]:
189
- text_for_this_image = " ".join(result[0]['rec_texts'])
190
 
191
  predicted_texts.append(text_for_this_image)
192
 
193
-
194
  def get_box_center(box):
195
- """Calculates the center of a bounding box."""
196
- x_coords = [p[0] for p in box]
197
- y_coords = [p[1] for p in box]
198
- center_x = sum(x_coords) / len(x_coords)
199
- center_y = sum(y_coords) / len(y_coords)
200
- return center_x, center_y
201
-
202
- # --- Step 1: Read all text and their centroid coordinates ---
203
  all_text_blocks = []
204
  for i, box in enumerate(arr):
205
- # Use the predicted text from the list
206
  text = predicted_texts[i]
207
- if text: # Only add if text is not empty
208
  center_x, center_y = get_box_center(box)
209
  all_text_blocks.append({
210
  "text": text,
@@ -212,44 +123,40 @@ def process_image(img_path):
212
  "center_y": center_y
213
  })
214
 
215
-
216
- # --- Step 2: Sort by y-coordinate, then by x-coordinate, and group into lines ---
217
  reconstructed_text = ""
218
  if all_text_blocks:
219
- # Sort by center_y, then by center_x
220
  sorted_blocks = sorted(all_text_blocks, key=lambda item: (item["center_y"], item["center_x"]))
221
-
222
  lines = []
223
  if sorted_blocks:
224
  current_line = [sorted_blocks[0]]
225
  for block in sorted_blocks[1:]:
226
- # Check if the vertical centers are close enough to be on the same line
227
- if abs(block["center_y"] - current_line[-1]["center_y"]) < 40: # Y-threshold
228
  current_line.append(block)
229
  else:
230
- # Sort the current line by x-coordinate and add it to the lines list
231
  current_line.sort(key=lambda item: item["center_x"])
232
  lines.append(" ".join([item["text"] for item in current_line]))
233
  current_line = [block]
234
-
235
- # Add the last line
236
  if current_line:
237
  current_line.sort(key=lambda item: item["center_x"])
238
  lines.append(" ".join([item["text"] for item in current_line]))
239
-
240
- # --- Step 3: Join the lines into a single string ---
241
  reconstructed_text = "\n".join(lines)
242
 
243
- return reconstructed_text
 
 
 
244
 
245
  iface = gr.Interface(
246
  fn=process_image,
247
  inputs=gr.Image(type="filepath"),
248
- outputs=gr.Text(),
249
- title="Image OCR and Text Reconstruction",
250
- description="Upload an image to perform text detection, cropping, language detection, OCR, and reconstruct the text in reading order."
 
 
 
251
  )
252
 
253
- if __name__== "__main__":
254
  iface.launch(debug=True)
255
-
 
1
# Force CPU execution: hide all CUDA devices *before* torch/paddle import.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from paddleocr import PaddleOCR, TextDetection

MODEL_HUB_ID = "imperiusrex/printedpaddle"

# Setup: load CLIP once at import time so every request reuses the same
# model/processor pair, and pin the model to CPU explicitly.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

device = "cpu"
clip_model.to(device)
21
 
 
22
  def process_image(img_path):
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  # Candidate language phrases for detection
24
  candidates = [
25
  "This is English text",
 
 
26
  "This is Telugu text",
 
 
27
  "This is Chinese text",
 
28
  "This is Korean text",
29
  "This is Russian text",
 
 
 
 
30
  "This is French text",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  "This is Latin text",
 
 
32
  ]
33
 
34
  # Map detected languages to PaddleOCR language codes
35
  lang_map = {
36
  "english": "en",
 
 
37
  "telugu": "te",
 
 
38
  "chinese": "ch",
 
39
  "korean": "korean",
40
  "russian": "ru",
 
 
 
 
41
  "french": "fr",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  "latin": "la",
 
 
43
  }
44
 
45
  # Text Detection
 
70
 
71
  # Perform language detection for each cropped image and then OCR
72
  predicted_texts = []
73
+ detected_languages_list = [] # store detected languages
74
+
75
  for i, cropped_img in enumerate(cropped_images):
76
  # Get probabilities
77
+ inputs = clip_processor(text=candidates, images=cropped_img, return_tensors="pt", padding=True)
78
  with torch.no_grad():
79
  logits_per_image = clip_model(**inputs).logits_per_image
80
  probs = logits_per_image.softmax(dim=1)
 
83
  best = probs.argmax().item()
84
  detected_lang_phrase = candidates[best]
85
  detected_lang = detected_lang_phrase.split()[-2].lower()
86
+ detected_languages_list.append(detected_lang) # store detected language
87
+
88
  lang_code = lang_map.get(detected_lang, "en")
89
 
90
  # Perform OCR for the current cropped image with the detected language
 
100
 
101
  text_for_this_image = ""
102
  if result and result[0] and 'rec_texts' in result[0]:
103
+ text_for_this_image = " ".join(result[0]['rec_texts'])
104
 
105
  predicted_texts.append(text_for_this_image)
106
 
 
107
def get_box_center(box):
    """Return the (x, y) centroid of a bounding box given as point pairs."""
    xs = [point[0] for point in box]
    ys = [point[1] for point in box]
    return sum(xs) / len(xs), sum(ys) / len(ys)
113
+
114
+ # Step 1: Read all text and their centroid coordinates
 
115
  all_text_blocks = []
116
  for i, box in enumerate(arr):
 
117
  text = predicted_texts[i]
118
+ if text:
119
  center_x, center_y = get_box_center(box)
120
  all_text_blocks.append({
121
  "text": text,
 
123
  "center_y": center_y
124
  })
125
 
126
+ # Step 2: Sort and group into lines
 
127
  reconstructed_text = ""
128
  if all_text_blocks:
 
129
  sorted_blocks = sorted(all_text_blocks, key=lambda item: (item["center_y"], item["center_x"]))
 
130
  lines = []
131
  if sorted_blocks:
132
  current_line = [sorted_blocks[0]]
133
  for block in sorted_blocks[1:]:
134
+ if abs(block["center_y"] - current_line[-1]["center_y"]) < 40:
 
135
  current_line.append(block)
136
  else:
 
137
  current_line.sort(key=lambda item: item["center_x"])
138
  lines.append(" ".join([item["text"] for item in current_line]))
139
  current_line = [block]
 
 
140
  if current_line:
141
  current_line.sort(key=lambda item: item["center_x"])
142
  lines.append(" ".join([item["text"] for item in current_line]))
 
 
143
  reconstructed_text = "\n".join(lines)
144
 
145
+ # Step 3: Get unique detected languages
146
+ unique_languages = sorted(set(detected_languages_list))
147
+
148
+ return reconstructed_text, ", ".join(unique_languages)
149
 
150
# Gradio UI: single image-path input, two text outputs — the text
# reconstructed in reading order, and the unique detected languages.
iface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="filepath"),
    outputs=[
        gr.Textbox(label="Reconstructed Text"),
        gr.Textbox(label="Detected Languages"),
    ],
    title="Image OCR with Language Detection",
    description="Upload an image to detect text, OCR it, reconstruct in reading order, and list unique detected languages.",
)

if __name__ == "__main__":
    iface.launch(debug=True)