File size: 8,558 Bytes
bc5fb2a
 
 
9e961a7
5828679
3e0219f
b89622d
5828679
4b6aee6
 
827b330
9e961a7
03d24f8
fe770f9
9e961a7
4b6aee6
 
 
051ce33
fa96b42
 
 
 
9e961a7
7ed45dc
9e961a7
 
 
 
 
 
 
 
 
 
 
7ed45dc
 
 
9e961a7
 
7ed45dc
9e961a7
 
7ed45dc
9e961a7
7ed45dc
24e6226
9e961a7
 
 
 
 
 
 
 
 
 
3fc583f
9e961a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ed45dc
 
 
 
 
9e961a7
 
7ed45dc
9e961a7
 
7ed45dc
9e961a7
7ed45dc
24e6226
9e961a7
 
 
 
 
 
 
 
 
 
3fc583f
9e961a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ed45dc
 
 
 
 
 
 
 
 
 
4b6aee6
5828679
7ed45dc
 
5828679
7ed45dc
5828679
7ed45dc
 
 
 
 
 
4b6aee6
5828679
7ed45dc
5828679
388a5c9
7ed45dc
 
 
 
9e961a7
5828679
7ed45dc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9e961a7
7ed45dc
 
 
fe770f9
9e961a7
 
 
 
 
 
 
 
 
7ed45dc
 
9e961a7
7ed45dc
9e961a7
7ed45dc
 
 
 
 
 
 
9e961a7
 
7ed45dc
 
9e961a7
7ed45dc
9e961a7
7ed45dc
 
 
 
9e961a7
 
7ed45dc
 
9e961a7
7ed45dc
 
 
9e961a7
 
7ed45dc
 
 
 
9e961a7
 
fe770f9
9e961a7
7ed45dc
 
 
 
9e961a7
 
 
7ed45dc
 
9e961a7
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
import os
# Hide every CUDA device from this process. This is set before torch and
# paddleocr are imported below, presumably so both frameworks initialise
# CPU-only (the CLIP model is also explicitly placed on "cpu" further down).
os.environ["CUDA_VISIBLE_DEVICES"] = ""


import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from paddleocr import PaddleOCR, TextDetection
from functools import lru_cache


MODEL_HUB_ID = "imperiusrex/printedpaddle"

# Shared CLIP checkpoint, loaded once at import time and kept on the CPU.
_CLIP_CHECKPOINT = "openai/clip-vit-large-patch14"
device = "cpu"

clip_processor = CLIPProcessor.from_pretrained(_CLIP_CHECKPOINT)
clip_model = CLIPModel.from_pretrained(_CLIP_CHECKPOINT)
clip_model.to(device)

# OCR pipeline: text detection, per-region language identification, recognition and line reconstruction
# Languages the pipeline can tell apart, mapped to the PaddleOCR ``lang`` code
# used to recognise them. Insertion order fixes the order of the CLIP prompts.
# Further languages (e.g. "hindi": "hi", "japanese": "japan", "french": "fr")
# can be enabled by adding entries here.
_LANG_CODES = {
    "english": "en",
    "telugu": "te",
    "chinese": "ch",
    "korean": "korean",
    "russian": "ru",
    "latin": "la",
}
_LANG_NAMES = tuple(_LANG_CODES)
# One zero-shot prompt per language, e.g. "This is English text".
_LANG_PROMPTS = [f"This is {name.title()} text" for name in _LANG_NAMES]

# Two text blocks whose vertical centres differ by less than this many pixels
# are treated as belonging to the same line.
_LINE_Y_THRESHOLD = 40


@lru_cache(maxsize=1)
def _get_text_detector():
    """Build the text-detection model once and reuse it across requests."""
    return TextDetection(model_name="PP-OCRv5_server_det")


@lru_cache(maxsize=None)
def _get_ocr_engine(lang_code):
    """Build (once per language code) the PaddleOCR pipeline for *lang_code*."""
    return PaddleOCR(
        use_doc_orientation_classify=False,
        use_doc_unwarping=False,
        use_textline_orientation=False,
        lang=lang_code,
        device="cpu",
    )


def _warp_text_region(img, box):
    """Rectify the quadrilateral *box* of *img* into an upright crop.

    Returns None when the box is not a 4-point quadrilateral or is too small
    to produce a usable crop.
    """
    quad = np.asarray(box, dtype=np.float32)
    if quad.shape != (4, 2):
        return None

    width = int(max(np.linalg.norm(quad[0] - quad[1]),
                    np.linalg.norm(quad[2] - quad[3])))
    height = int(max(np.linalg.norm(quad[0] - quad[3]),
                     np.linalg.norm(quad[1] - quad[2])))
    # A crop narrower/shorter than 2 px would give a degenerate destination
    # rectangle and carries no readable text anyway.
    if width < 2 or height < 2:
        return None

    dst_rect = np.array(
        [[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]],
        dtype=np.float32,
    )
    transform = cv2.getPerspectiveTransform(quad, dst_rect)
    return cv2.warpPerspective(img, transform, (width, height))


def _detect_language_code(crop_bgr):
    """Pick the PaddleOCR language code whose CLIP prompt best matches the crop."""
    # cv2 images are BGR; the CLIP image processor expects RGB arrays.
    crop_rgb = cv2.cvtColor(crop_bgr, cv2.COLOR_BGR2RGB)
    inputs = clip_processor(
        text=_LANG_PROMPTS, images=crop_rgb, return_tensors="pt", padding=True
    )
    with torch.no_grad():
        logits_per_image = clip_model(**inputs).logits_per_image
    # Softmax is monotonic, so the arg-max of the logits is the most
    # probable prompt.
    best = logits_per_image.argmax().item()
    return _LANG_CODES[_LANG_NAMES[best]]


def _recognise_text(crop_bgr, lang_code):
    """Run OCR on a single crop and return its text ("" if nothing was read)."""
    result = _get_ocr_engine(lang_code).predict(crop_bgr)
    if result and result[0] and 'rec_texts' in result[0]:
        return " ".join(result[0]['rec_texts'])
    return ""


def _group_into_lines(blocks, y_threshold=_LINE_Y_THRESHOLD):
    """Join text blocks into reading order: top-to-bottom lines, left-to-right.

    Blocks are sorted by (center_y, center_x); a block joins the current line
    when its vertical centre is within *y_threshold* pixels of the previously
    added block, otherwise it starts a new line.
    """
    if not blocks:
        return ""

    ordered = sorted(blocks, key=lambda b: (b["center_y"], b["center_x"]))
    lines = [[ordered[0]]]
    for block in ordered[1:]:
        if abs(block["center_y"] - lines[-1][-1]["center_y"]) < y_threshold:
            lines[-1].append(block)
        else:
            lines.append([block])

    return "\n".join(
        " ".join(b["text"] for b in sorted(line, key=lambda b: b["center_x"]))
        for line in lines
    )


def process_image(img_path):
    """
    Processes an image to detect, crop, and OCR text, returning it in reading order.

    Models are created once and reused: the module-level CLIP model/processor
    for language identification, a cached text detector, and one cached
    PaddleOCR pipeline per language code.

    Args:
        img_path: The path to the image file. None or an empty path yields
            an empty result.
    Returns:
        A string containing the reconstructed text, one detected line per
        output line ("" when no text is found).
    Raises:
        ValueError: If the file at img_path cannot be decoded as an image.
    """
    if not img_path:
        # Nothing to process (no image supplied).
        return ""

    img = cv2.imread(img_path)
    if img is None:
        raise ValueError(f"Could not read image file: {img_path!r}")

    # Text detection: collect every detected polygon as a list of points.
    boxes = []
    for res in _get_text_detector().predict(img_path, batch_size=1):
        polys = res['dt_polys']
        if polys is not None:
            boxes.extend(polys.tolist())
    boxes.sort(key=lambda box: (box[0][1], box[0][0]))

    # Crop each region, identify its language, then OCR it with the matching
    # model. Text and box centre are recorded together so skipped regions
    # cannot misalign the results.
    blocks = []
    for box in boxes:
        crop = _warp_text_region(img, box)
        if crop is None:
            continue
        text = _recognise_text(crop, _detect_language_code(crop))
        if not text:
            continue
        center_x, center_y = np.mean(box, axis=0)
        blocks.append({
            "text": text,
            "center_x": float(center_x),
            "center_y": float(center_y),
        })

    return _group_into_lines(blocks)

# Gradio front end: one image upload, handed to process_image as a file path,
# and a text output showing the reconstructed text.
iface = gr.Interface(
    process_image,
    gr.Image(type="filepath"),
    gr.Text(),
    title="Image OCR and Text Reconstruction",
    description="Upload an image to perform text detection, cropping, language detection, OCR, and reconstruct the text in reading order.",
)

# Start the web app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch(debug=True)