Spaces:

imperiusrex
/

PrintedTextOCR

Sleeping

App Files Community

imperiusrex committed on Jul 31, 2025

Commit

e0e0134

verified ·

1 Parent(s): b504595

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -21

app.py CHANGED Viewed

@@ -3,10 +3,9 @@ import torch
 import numpy as np
 import cv2
 import os
-import json
 from PIL import Image
 from transformers import CLIPProcessor, CLIPModel
-from paddleocr import PaddleOCR, TextDetection
 from spaces import GPU  # Required for ZeroGPU on Hugging Face
 # Setup
@@ -27,22 +26,19 @@ candidates = [
     "This is Korean text"
 ]
-text_detector = TextDetection(model_name="PP-OCRv5_server_det")
 @GPU
 def ocr_pipeline(image_np):
     image_pil = Image.fromarray(image_np).convert("RGB")
-    width, height = image_pil.size
     img_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
-    output = text_detector.predict(image_np, batch_size=1)
-    arr = []
-    for res in output:
-        polys = res.get("dt_polys", [])
-        if polys is not None:
-            arr.extend(polys.tolist())
     arr = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
     cropped_images = []
@@ -64,21 +60,19 @@ def ocr_pipeline(image_np):
         if crop.shape[0] < 10 or crop.shape[1] < 10:
             continue
-        # Language detection
-        clip_inputs = clip_processor(text=candidates, images=crop, return_tensors="pt", padding=True)
         with torch.no_grad():
             probs = clip_model(**clip_inputs).logits_per_image.softmax(dim=1)
         lang_index = probs.argmax().item()
         lang_detected = candidates[lang_index].split()[-2].lower()
         lang_code = lang_map.get(lang_detected, "en")
-        ocr = PaddleOCR(lang=lang_code, use_doc_orientation_classify=False,
-                        use_doc_unwarping=False, use_textline_orientation=False, device='cpu')
         result = ocr.ocr(crop)
         if not result or not result[0]:
             continue
         for line in result[0]:
             text = line[1][0]
             box = line[0]
@@ -89,7 +83,6 @@ def ocr_pipeline(image_np):
     if not final_output_lines:
         return "❌ No text detected."
-    # Grouping by line
     sorted_blocks = sorted(final_output_lines, key=lambda b: (b["cy"], b["cx"]))
     lines = []
     current_line = [sorted_blocks[0]]
@@ -105,16 +98,15 @@ def ocr_pipeline(image_np):
     return "\n".join(lines)
-# Gradio Interface
 def build_interface():
     return gr.Interface(
         fn=ocr_pipeline,
         inputs=gr.Image(type="numpy", label="Upload Handwritten Image"),
         outputs="text",
-        title="🌐 Multilingual Handwritten OCR with CLIP + PaddleOCR",
-        description="📄 Upload a handwritten document image. Detects language using CLIP and performs text detection + recognition with PaddleOCR."
     )
 if __name__ == "__main__":
     iface = build_interface()
-    iface.launch()

 import numpy as np
 import cv2
 import os
 from PIL import Image
 from transformers import CLIPProcessor, CLIPModel
+from paddleocr import PaddleOCR
 from spaces import GPU  # Required for ZeroGPU on Hugging Face
 # Setup
     "This is Korean text"
 ]
+ocr_detector = PaddleOCR(use_angle_cls=False, lang='en', det=True, rec=False, use_gpu=True)
 @GPU
 def ocr_pipeline(image_np):
     image_pil = Image.fromarray(image_np).convert("RGB")
     img_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
+    detection_result = ocr_detector.ocr(image_np, det=True, rec=False)
+    if not detection_result or not detection_result[0]:
+        return "❌ No text detected."
+    arr = [line[0] for line in detection_result[0]]
     arr = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
     cropped_images = []
         if crop.shape[0] < 10 or crop.shape[1] < 10:
             continue
+        clip_inputs = clip_processor(text=candidates, images=Image.fromarray(crop), return_tensors="pt", padding=True)
         with torch.no_grad():
             probs = clip_model(**clip_inputs).logits_per_image.softmax(dim=1)
         lang_index = probs.argmax().item()
         lang_detected = candidates[lang_index].split()[-2].lower()
         lang_code = lang_map.get(lang_detected, "en")
+        ocr = PaddleOCR(lang=lang_code, use_angle_cls=False, det=False, rec=True, use_gpu=False)
         result = ocr.ocr(crop)
         if not result or not result[0]:
             continue
         for line in result[0]:
             text = line[1][0]
             box = line[0]
     if not final_output_lines:
         return "❌ No text detected."
     sorted_blocks = sorted(final_output_lines, key=lambda b: (b["cy"], b["cx"]))
     lines = []
     current_line = [sorted_blocks[0]]
     return "\n".join(lines)
 def build_interface():
     return gr.Interface(
         fn=ocr_pipeline,
         inputs=gr.Image(type="numpy", label="Upload Handwritten Image"),
         outputs="text",
+        title="\U0001F310 Multilingual Handwritten OCR with CLIP + PaddleOCR",
+        description="\U0001F4C4 Upload a handwritten document image. Detects language using CLIP and performs text detection + recognition with PaddleOCR."
     )
 if __name__ == "__main__":
     iface = build_interface()
+    iface.launch()