Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -3,10 +3,9 @@ import torch
|
|
| 3 |
import numpy as np
|
| 4 |
import cv2
|
| 5 |
import os
|
| 6 |
-
import json
|
| 7 |
from PIL import Image
|
| 8 |
from transformers import CLIPProcessor, CLIPModel
|
| 9 |
-
from paddleocr import PaddleOCR
|
| 10 |
from spaces import GPU # Required for ZeroGPU on Hugging Face
|
| 11 |
|
| 12 |
# Setup
|
|
@@ -27,22 +26,19 @@ candidates = [
|
|
| 27 |
"This is Korean text"
|
| 28 |
]
|
| 29 |
|
| 30 |
-
|
| 31 |
|
| 32 |
@GPU
|
| 33 |
def ocr_pipeline(image_np):
|
| 34 |
image_pil = Image.fromarray(image_np).convert("RGB")
|
| 35 |
-
width, height = image_pil.size
|
| 36 |
img_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
|
| 37 |
|
| 38 |
-
|
| 39 |
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
polys = res.get("dt_polys", [])
|
| 43 |
-
if polys is not None:
|
| 44 |
-
arr.extend(polys.tolist())
|
| 45 |
|
|
|
|
| 46 |
arr = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
|
| 47 |
|
| 48 |
cropped_images = []
|
|
@@ -64,21 +60,19 @@ def ocr_pipeline(image_np):
|
|
| 64 |
if crop.shape[0] < 10 or crop.shape[1] < 10:
|
| 65 |
continue
|
| 66 |
|
| 67 |
-
|
| 68 |
-
clip_inputs = clip_processor(text=candidates, images=crop, return_tensors="pt", padding=True)
|
| 69 |
with torch.no_grad():
|
| 70 |
probs = clip_model(**clip_inputs).logits_per_image.softmax(dim=1)
|
| 71 |
lang_index = probs.argmax().item()
|
| 72 |
lang_detected = candidates[lang_index].split()[-2].lower()
|
| 73 |
lang_code = lang_map.get(lang_detected, "en")
|
| 74 |
|
| 75 |
-
ocr = PaddleOCR(lang=lang_code,
|
| 76 |
-
|
| 77 |
-
|
| 78 |
result = ocr.ocr(crop)
|
| 79 |
if not result or not result[0]:
|
| 80 |
continue
|
| 81 |
-
|
| 82 |
for line in result[0]:
|
| 83 |
text = line[1][0]
|
| 84 |
box = line[0]
|
|
@@ -89,7 +83,6 @@ def ocr_pipeline(image_np):
|
|
| 89 |
if not final_output_lines:
|
| 90 |
return "β No text detected."
|
| 91 |
|
| 92 |
-
# Grouping by line
|
| 93 |
sorted_blocks = sorted(final_output_lines, key=lambda b: (b["cy"], b["cx"]))
|
| 94 |
lines = []
|
| 95 |
current_line = [sorted_blocks[0]]
|
|
@@ -105,16 +98,15 @@ def ocr_pipeline(image_np):
|
|
| 105 |
return "\n".join(lines)
|
| 106 |
|
| 107 |
|
| 108 |
-
# Gradio Interface
|
| 109 |
def build_interface():
|
| 110 |
return gr.Interface(
|
| 111 |
fn=ocr_pipeline,
|
| 112 |
inputs=gr.Image(type="numpy", label="Upload Handwritten Image"),
|
| 113 |
outputs="text",
|
| 114 |
-
title="
|
| 115 |
-
description="
|
| 116 |
)
|
| 117 |
|
| 118 |
if __name__ == "__main__":
    # Script entry point: build the Gradio interface and start serving it.
    demo = build_interface()
    demo.launch()
|
|
|
|
| 3 |
import numpy as np
|
| 4 |
import cv2
|
| 5 |
import os
|
|
|
|
| 6 |
from PIL import Image
|
| 7 |
from transformers import CLIPProcessor, CLIPModel
|
| 8 |
+
from paddleocr import PaddleOCR
|
| 9 |
from spaces import GPU # Required for ZeroGPU on Hugging Face
|
| 10 |
|
| 11 |
# Setup
|
|
|
|
| 26 |
"This is Korean text"
|
| 27 |
]
|
| 28 |
|
| 29 |
+
# Detection-only PaddleOCR instance (det=True, rec=False), created once at
# module load; ocr_pipeline calls it to locate text boxes in the uploaded image.
# NOTE(review): use_gpu=True is requested here at import time, outside the
# @GPU-decorated function — confirm a GPU is actually available at this point.
ocr_detector = PaddleOCR(use_angle_cls=False, lang='en', det=True, rec=False, use_gpu=True)
|
| 30 |
|
| 31 |
@GPU
|
| 32 |
def ocr_pipeline(image_np):
|
| 33 |
image_pil = Image.fromarray(image_np).convert("RGB")
|
|
|
|
| 34 |
img_cv = cv2.cvtColor(np.array(image_pil), cv2.COLOR_RGB2BGR)
|
| 35 |
|
| 36 |
+
detection_result = ocr_detector.ocr(image_np, det=True, rec=False)
|
| 37 |
|
| 38 |
+
if not detection_result or not detection_result[0]:
|
| 39 |
+
return "β No text detected."
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
+
arr = [line[0] for line in detection_result[0]]
|
| 42 |
arr = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
|
| 43 |
|
| 44 |
cropped_images = []
|
|
|
|
| 60 |
if crop.shape[0] < 10 or crop.shape[1] < 10:
|
| 61 |
continue
|
| 62 |
|
| 63 |
+
clip_inputs = clip_processor(text=candidates, images=Image.fromarray(crop), return_tensors="pt", padding=True)
|
|
|
|
| 64 |
with torch.no_grad():
|
| 65 |
probs = clip_model(**clip_inputs).logits_per_image.softmax(dim=1)
|
| 66 |
lang_index = probs.argmax().item()
|
| 67 |
lang_detected = candidates[lang_index].split()[-2].lower()
|
| 68 |
lang_code = lang_map.get(lang_detected, "en")
|
| 69 |
|
| 70 |
+
ocr = PaddleOCR(lang=lang_code, use_angle_cls=False, det=False, rec=True, use_gpu=False)
|
| 71 |
+
|
|
|
|
| 72 |
result = ocr.ocr(crop)
|
| 73 |
if not result or not result[0]:
|
| 74 |
continue
|
| 75 |
+
|
| 76 |
for line in result[0]:
|
| 77 |
text = line[1][0]
|
| 78 |
box = line[0]
|
|
|
|
| 83 |
if not final_output_lines:
|
| 84 |
return "β No text detected."
|
| 85 |
|
|
|
|
| 86 |
sorted_blocks = sorted(final_output_lines, key=lambda b: (b["cy"], b["cx"]))
|
| 87 |
lines = []
|
| 88 |
current_line = [sorted_blocks[0]]
|
|
|
|
| 98 |
return "\n".join(lines)
|
| 99 |
|
| 100 |
|
|
|
|
| 101 |
def build_interface():
    """Assemble the Gradio UI that exposes ``ocr_pipeline``.

    Returns:
        A ``gr.Interface`` that feeds an uploaded image (delivered as a
        numpy array) to ``ocr_pipeline`` and shows the returned text.
    """
    # The upload widget hands the image to the pipeline as a numpy array.
    image_input = gr.Image(type="numpy", label="Upload Handwritten Image")

    interface_options = {
        "fn": ocr_pipeline,
        "inputs": image_input,
        "outputs": "text",
        "title": "\U0001F310 Multilingual Handwritten OCR with CLIP + PaddleOCR",
        "description": "\U0001F4C4 Upload a handwritten document image. Detects language using CLIP and performs text detection + recognition with PaddleOCR.",
    }
    return gr.Interface(**interface_options)
|
| 109 |
|
| 110 |
if __name__ == "__main__":
    # Script entry point: build the Gradio interface and start serving it.
    demo = build_interface()
    demo.launch()
|