Spaces:
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,175 +1,120 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
-
from transformers import CLIPProcessor, CLIPModel
|
| 4 |
-
from paddleocr import PaddleOCR, TextDetection
|
| 5 |
-
from PIL import Image
|
| 6 |
import numpy as np
|
| 7 |
import cv2
|
| 8 |
-
import
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
#
|
| 14 |
-
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 15 |
-
print(f"Device being used: {device}")
|
| 16 |
|
| 17 |
-
#
|
| 18 |
-
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
|
| 19 |
-
|
| 20 |
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
|
|
|
|
|
|
|
|
|
| 24 |
|
| 25 |
-
# Candidate language phrases for detection
|
| 26 |
candidates = [
|
| 27 |
"This is English text",
|
| 28 |
"This is Telugu text",
|
| 29 |
"This is Chinese text",
|
| 30 |
-
"This is Korean text"
|
| 31 |
]
|
| 32 |
|
| 33 |
-
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
|
|
|
|
|
|
|
| 40 |
|
| 41 |
-
print("β
Models loaded successfully.")
|
| 42 |
-
|
| 43 |
-
# --- Utility Functions ---
|
| 44 |
-
def get_box_center(box):
|
| 45 |
-
"""Calculates the center of a bounding box."""
|
| 46 |
-
x_coords = [p[0] for p in box]
|
| 47 |
-
y_coords = [p[1] for p in box]
|
| 48 |
-
center_x = sum(x_coords) / len(x_coords)
|
| 49 |
-
center_y = sum(y_coords) / len(y_coords)
|
| 50 |
-
return center_x, center_y
|
| 51 |
-
|
| 52 |
-
@spaces.GPU
|
| 53 |
-
def ocr_pipeline(image_pil: Image.Image) -> str:
|
| 54 |
-
"""
|
| 55 |
-
Performs OCR on an input image using a multi-step pipeline.
|
| 56 |
-
|
| 57 |
-
Args:
|
| 58 |
-
image_pil: A PIL Image object from the Gradio interface.
|
| 59 |
-
|
| 60 |
-
Returns:
|
| 61 |
-
A string containing the reconstructed text.
|
| 62 |
-
"""
|
| 63 |
-
if image_pil is None:
|
| 64 |
-
return "No image provided."
|
| 65 |
-
|
| 66 |
-
print("Starting OCR pipeline...")
|
| 67 |
-
|
| 68 |
-
# Convert PIL image to a NumPy array for OpenCV and Paddle
|
| 69 |
-
img_np = np.array(image_pil.convert("RGB"))
|
| 70 |
-
|
| 71 |
-
# Step 1: Text Detection with PaddleOCR's model
|
| 72 |
-
output = det_model.predict(img_np, batch_size=1)
|
| 73 |
-
|
| 74 |
arr = []
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
| 77 |
|
| 78 |
-
|
| 79 |
-
sorted_polys = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
|
| 80 |
|
| 81 |
-
if not sorted_polys:
|
| 82 |
-
print("No text regions detected.")
|
| 83 |
-
return "No text regions detected."
|
| 84 |
-
|
| 85 |
cropped_images = []
|
| 86 |
-
|
|
|
|
|
|
|
| 87 |
box = np.array(box, dtype=np.float32)
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
height_b = np.linalg.norm(box[1] - box[2])
|
| 92 |
-
width = int(max(width_a, width_b))
|
| 93 |
-
height = int(max(height_a, height_b))
|
| 94 |
-
dst_rect = np.array([
|
| 95 |
-
[0, 0],
|
| 96 |
-
[width - 1, 0],
|
| 97 |
-
[width - 1, height - 1],
|
| 98 |
-
[0, height - 1]
|
| 99 |
-
], dtype=np.float32)
|
| 100 |
M = cv2.getPerspectiveTransform(box, dst_rect)
|
| 101 |
-
warped = cv2.warpPerspective(
|
| 102 |
cropped_images.append(warped)
|
|
|
|
| 103 |
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
#
|
| 111 |
-
|
| 112 |
with torch.no_grad():
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
lang_code = lang_map.get(detected_lang, "en")
|
| 121 |
-
|
| 122 |
-
# Initialize PaddleOCR with the detected language.
|
| 123 |
-
ocr = PaddleOCR(lang=lang_code, use_angle_cls=False, use_doc_unwarping=False, use_gpu=True)
|
| 124 |
-
result = ocr.predict(img)
|
| 125 |
-
|
| 126 |
-
# Extract text from OCR result
|
| 127 |
-
text_for_this_image = ""
|
| 128 |
-
if result and result[0] and 'rec_texts' in result[0]:
|
| 129 |
-
text_for_this_image = " ".join(result[0]['rec_texts'])
|
| 130 |
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
"text": text_for_this_image,
|
| 135 |
-
"center_x": center_x,
|
| 136 |
-
"center_y": center_y
|
| 137 |
-
})
|
| 138 |
-
|
| 139 |
-
# Step 3: Reconstruct the text in reading order
|
| 140 |
-
if not all_text_blocks:
|
| 141 |
-
print("No text could be extracted.")
|
| 142 |
-
return "No text could be extracted."
|
| 143 |
|
| 144 |
-
|
| 145 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
lines = []
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
outputs=gr.Textbox(label="Recognized Text"),
|
| 170 |
-
title="Printed Text OCR with PaddleOCR and CLIP",
|
| 171 |
-
description="Upload a printed text image. The app will detect text regions, identify the language with CLIP, and perform OCR to return the text in reading order. This space uses an H200 GPU for high-speed processing."
|
| 172 |
-
)
|
| 173 |
|
| 174 |
if __name__ == "__main__":
|
| 175 |
-
iface
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
|
|
|
|
|
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import cv2
|
| 5 |
+
import os
|
| 6 |
+
import json
|
| 7 |
+
from PIL import Image
|
| 8 |
+
from transformers import CLIPProcessor, CLIPModel
|
| 9 |
+
from paddleocr import PaddleOCR, TextDetection
|
| 10 |
+
from spaces import GPU # Required for ZeroGPU on Hugging Face
|
|
|
|
|
|
|
| 11 |
|
| 12 |
+
# Setup
|
| 13 |
+
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
|
| 14 |
+
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
|
| 15 |
|
| 16 |
+
lang_map = {
|
| 17 |
+
"english": "en",
|
| 18 |
+
"telugu": "te",
|
| 19 |
+
"chinese": "ch",
|
| 20 |
+
"korean": "korean",
|
| 21 |
+
}
|
| 22 |
|
|
|
|
| 23 |
candidates = [
|
| 24 |
"This is English text",
|
| 25 |
"This is Telugu text",
|
| 26 |
"This is Chinese text",
|
| 27 |
+
"This is Korean text"
|
| 28 |
]
|
| 29 |
|
| 30 |
+
text_detector = TextDetection(model_name="PP-OCRv5_server_det")
|
| 31 |
+
|
| 32 |
+
@GPU
|
| 33 |
+
def ocr_pipeline(image_np):
|
| 34 |
+
image_pil = Image.fromarray(image_np).convert("RGB")
|
| 35 |
+
width, height = image_pil.size
|
| 36 |
+
img_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
|
| 37 |
+
|
| 38 |
+
output = text_detector.predict(image_np, batch_size=1)
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
arr = []
|
| 41 |
+
for res in output:
|
| 42 |
+
polys = res.get("dt_polys", [])
|
| 43 |
+
if polys is not None:
|
| 44 |
+
arr.extend(polys.tolist())
|
| 45 |
|
| 46 |
+
arr = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
|
|
|
|
| 47 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
cropped_images = []
|
| 49 |
+
warped_boxes = []
|
| 50 |
+
|
| 51 |
+
for box in arr:
|
| 52 |
box = np.array(box, dtype=np.float32)
|
| 53 |
+
width = int(max(np.linalg.norm(box[0] - box[1]), np.linalg.norm(box[2] - box[3])))
|
| 54 |
+
height = int(max(np.linalg.norm(box[0] - box[3]), np.linalg.norm(box[1] - box[2])))
|
| 55 |
+
dst_rect = np.array([[0, 0], [width - 1, 0], [width - 1, height - 1], [0, height - 1]], dtype=np.float32)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
M = cv2.getPerspectiveTransform(box, dst_rect)
|
| 57 |
+
warped = cv2.warpPerspective(img_cv, M, (width, height))
|
| 58 |
cropped_images.append(warped)
|
| 59 |
+
warped_boxes.append(box)
|
| 60 |
|
| 61 |
+
final_output_lines = []
|
| 62 |
+
|
| 63 |
+
for i, crop in enumerate(cropped_images):
|
| 64 |
+
if crop.shape[0] < 10 or crop.shape[1] < 10:
|
| 65 |
+
continue
|
| 66 |
+
|
| 67 |
+
# Language detection
|
| 68 |
+
clip_inputs = clip_processor(text=candidates, images=crop, return_tensors="pt", padding=True)
|
| 69 |
with torch.no_grad():
|
| 70 |
+
probs = clip_model(**clip_inputs).logits_per_image.softmax(dim=1)
|
| 71 |
+
lang_index = probs.argmax().item()
|
| 72 |
+
lang_detected = candidates[lang_index].split()[-2].lower()
|
| 73 |
+
lang_code = lang_map.get(lang_detected, "en")
|
| 74 |
+
|
| 75 |
+
ocr = PaddleOCR(lang=lang_code, use_doc_orientation_classify=False,
|
| 76 |
+
use_doc_unwarping=False, use_textline_orientation=False, device='cpu')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 77 |
|
| 78 |
+
result = ocr.ocr(crop)
|
| 79 |
+
if not result or not result[0]:
|
| 80 |
+
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
+
for line in result[0]:
|
| 83 |
+
text = line[1][0]
|
| 84 |
+
box = line[0]
|
| 85 |
+
center_x = sum([p[0] for p in box]) / 4
|
| 86 |
+
center_y = sum([p[1] for p in box]) / 4
|
| 87 |
+
final_output_lines.append({"text": text, "cx": center_x, "cy": center_y})
|
| 88 |
+
|
| 89 |
+
if not final_output_lines:
|
| 90 |
+
return "β No text detected."
|
| 91 |
+
|
| 92 |
+
# Grouping by line
|
| 93 |
+
sorted_blocks = sorted(final_output_lines, key=lambda b: (b["cy"], b["cx"]))
|
| 94 |
lines = []
|
| 95 |
+
current_line = [sorted_blocks[0]]
|
| 96 |
+
for block in sorted_blocks[1:]:
|
| 97 |
+
if abs(block["cy"] - current_line[-1]["cy"]) < 40:
|
| 98 |
+
current_line.append(block)
|
| 99 |
+
else:
|
| 100 |
+
lines.append(" ".join([x["text"] for x in sorted(current_line, key=lambda b: b["cx"])]))
|
| 101 |
+
current_line = [block]
|
| 102 |
+
if current_line:
|
| 103 |
+
lines.append(" ".join([x["text"] for x in sorted(current_line, key=lambda b: b["cx"])]))
|
| 104 |
+
|
| 105 |
+
return "\n".join(lines)
|
| 106 |
+
|
| 107 |
+
|
| 108 |
+
# Gradio Interface
|
| 109 |
+
def build_interface():
|
| 110 |
+
return gr.Interface(
|
| 111 |
+
fn=ocr_pipeline,
|
| 112 |
+
inputs=gr.Image(type="numpy", label="Upload Handwritten Image"),
|
| 113 |
+
outputs="text",
|
| 114 |
+
title="π Multilingual Handwritten OCR with CLIP + PaddleOCR",
|
| 115 |
+
description="π Upload a handwritten document image. Detects language using CLIP and performs text detection + recognition with PaddleOCR."
|
| 116 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
if __name__ == "__main__":
|
| 119 |
+
iface = build_interface()
|
| 120 |
+
iface.launch()
|