iammraat committed on
Commit
e408019
·
verified ·
1 Parent(s): adb25fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -32
app.py CHANGED
@@ -314,7 +314,6 @@
314
 
315
 
316
 
317
-
318
  import gradio as gr
319
  import torch
320
  import numpy as np
@@ -323,35 +322,51 @@ from PIL import Image
323
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
324
  from craft_text_detector import Craft
325
 
326
- # --- DEFENSIVE MONKEY PATCH ---
327
  import craft_text_detector.craft_utils as craft_utils_module
328
 
329
  def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h):
330
- # This ensures coordinates are scaled correctly and remain as float32
331
- if not polys: return []
332
  adjusted = []
333
  for poly in polys:
334
- p = np.array(poly).reshape(-1, 2).astype(np.float32)
335
- p[:, 0] *= ratio_w
336
- p[:, 1] *= ratio_h
337
- adjusted.append(p)
 
 
 
 
 
 
 
 
 
 
 
 
338
  return adjusted
339
 
340
  craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
341
- # ------------------------------
342
 
343
  device = "cuda" if torch.cuda.is_available() else "cpu"
344
 
345
  # Load Models
 
346
  processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
347
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-handwritten").to(device).eval()
 
 
348
  craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
349
 
350
  def process_page(image):
351
- if image is None: return None, "No image."
 
352
 
353
- # 1. FORCE RESIZE for coordinate 1:1 mapping
354
- # We resize to 1280 width to match CRAFT's default canvas
355
  base_width = 1280
356
  w_percent = (base_width / float(image.size[0]))
357
  h_size = int((float(image.size[1]) * float(w_percent)))
@@ -360,18 +375,21 @@ def process_page(image):
360
  img_np = np.array(image.convert("RGB"))
361
 
362
  # 2. DETECT
363
- # Because we resized the image to 1280, ratio_w/h will be ~1.0
364
  prediction = craft.detect_text(img_np)
365
  boxes = prediction.get("boxes", [])
366
 
367
- if not boxes: return image, "No text found."
 
368
 
369
- # 3. SORT (Improved line grouping)
370
- # We group items within 30 pixels of each other vertically as a single 'line'
371
  items = []
372
  for box in boxes:
373
  items.append({'cy': np.mean(box[:, 1]), 'cx': np.mean(box[:, 0]), 'box': box})
374
- items.sort(key=lambda x: (int(x['cy'] // 30), x['cx']))
 
 
 
375
 
376
  annotated = img_np.copy()
377
  full_text = []
@@ -380,37 +398,46 @@ def process_page(image):
380
  for item in items:
381
  box = item['box'].astype(np.int32)
382
 
383
- # Draw on image
384
  cv2.polylines(annotated, [box], True, (255, 0, 0), 2)
385
 
386
- # Crop
387
  x_min, y_min = np.min(box, axis=0)
388
  x_max, y_max = np.max(box, axis=0)
389
 
390
- # Clamp to image boundaries
391
  x_min, y_min = max(0, x_min), max(0, y_min)
392
  x_max, y_max = min(img_np.shape[1], x_max), min(img_np.shape[0], y_max)
393
 
394
- if (x_max - x_min) < 5 or (y_max - y_min) < 5: continue
 
395
 
396
- crop = Image.fromarray(img_np[y_min:y_max, x_min:x_max])
397
 
398
  with torch.no_grad():
399
- pixel_values = processor(images=crop, return_tensors="pt").pixel_values.to(device)
400
  out_ids = model.generate(pixel_values)
401
  txt = processor.batch_decode(out_ids, skip_special_tokens=True)[0]
402
- if txt.strip(): full_text.append(txt)
 
403
 
404
  return Image.fromarray(annotated), " ".join(full_text)
405
 
406
- # UI
407
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
408
- gr.Markdown("# 🚀 Final Fix: Full-Page OCR")
 
 
409
  with gr.Row():
410
- input_i = gr.Image(type="pil", label="Input")
411
- output_i = gr.Image(label="Detections (Scale Fixed)")
412
- output_t = gr.Textbox(label="Result", lines=10)
413
- btn = gr.Button("Transcribe", variant="primary")
414
- btn.click(process_page, input_i, [output_i, output_t])
 
 
 
 
415
 
416
- demo.launch()
 
 
314
 
315
 
316
 
 
317
  import gradio as gr
318
  import torch
319
  import numpy as np
 
322
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
323
  from craft_text_detector import Craft
324
 
325
+ # --- THE FORTIFIED MONKEY PATCH ---
326
  import craft_text_detector.craft_utils as craft_utils_module
327
 
328
  def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h):
329
+ if not polys:
330
+ return []
331
  adjusted = []
332
  for poly in polys:
333
+ try:
334
+ # Convert to numpy array first
335
+ p = np.array(poly)
336
+
337
+ # SANITY CHECK: A coordinate pair needs 2 numbers per point.
338
+ # If the total size is less than 2 or not even, it's noise.
339
+ if p.size < 2 or p.size % 2 != 0:
340
+ continue
341
+
342
+ p = p.reshape(-1, 2).astype(np.float32)
343
+ p[:, 0] *= ratio_w
344
+ p[:, 1] *= ratio_h
345
+ adjusted.append(p)
346
+ except Exception:
347
+ # If any mathematical error occurs for a specific noisy box, skip it
348
+ continue
349
  return adjusted
350
 
351
  craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
352
+ # ----------------------------------
353
 
354
  device = "cuda" if torch.cuda.is_available() else "cpu"
355
 
356
  # Load Models
357
+ print("Loading TrOCR...")
358
  processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
359
  model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-handwritten").to(device).eval()
360
+
361
+ print("Loading CRAFT...")
362
  craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
363
 
364
  def process_page(image):
365
+ if image is None:
366
+ return None, "Please upload an image."
367
 
368
+ # 1. FORCE RESIZE to align coordinate systems (Crucial for the 'tiny box' fix)
369
+ # We use 1280px as the standard width
370
  base_width = 1280
371
  w_percent = (base_width / float(image.size[0]))
372
  h_size = int((float(image.size[1]) * float(w_percent)))
 
375
  img_np = np.array(image.convert("RGB"))
376
 
377
  # 2. DETECT
 
378
  prediction = craft.detect_text(img_np)
379
  boxes = prediction.get("boxes", [])
380
 
381
+ if not boxes:
382
+ return image, "No text detected."
383
 
384
+ # 3. SORT (Line-by-line grouping)
385
+ # This logic groups boxes into lines if they overlap vertically
386
  items = []
387
  for box in boxes:
388
  items.append({'cy': np.mean(box[:, 1]), 'cx': np.mean(box[:, 0]), 'box': box})
389
+
390
+ # Sort by Y (approximate lines) then X (left to right)
391
+ # We increase the grouping factor to 40 to handle handwriting slant
392
+ items.sort(key=lambda x: (int(x['cy'] // 40), x['cx']))
393
 
394
  annotated = img_np.copy()
395
  full_text = []
 
398
  for item in items:
399
  box = item['box'].astype(np.int32)
400
 
401
+ # Draw on display image
402
  cv2.polylines(annotated, [box], True, (255, 0, 0), 2)
403
 
404
+ # Crop coordinates
405
  x_min, y_min = np.min(box, axis=0)
406
  x_max, y_max = np.max(box, axis=0)
407
 
408
+ # Clip to image boundaries
409
  x_min, y_min = max(0, x_min), max(0, y_min)
410
  x_max, y_max = min(img_np.shape[1], x_max), min(img_np.shape[0], y_max)
411
 
412
+ if (x_max - x_min) < 10 or (y_max - y_min) < 10:
413
+ continue
414
 
415
+ crop_pil = Image.fromarray(img_np[y_min:y_max, x_min:x_max])
416
 
417
  with torch.no_grad():
418
+ pixel_values = processor(images=crop_pil, return_tensors="pt").pixel_values.to(device)
419
  out_ids = model.generate(pixel_values)
420
  txt = processor.batch_decode(out_ids, skip_special_tokens=True)[0]
421
+ if txt.strip():
422
+ full_text.append(txt)
423
 
424
  return Image.fromarray(annotated), " ".join(full_text)
425
 
426
+ # --- Gradio UI ---
427
  with gr.Blocks(theme=gr.themes.Soft()) as demo:
428
+ gr.Markdown("# ✍️ Full-Page Handwritten Recognition")
429
+ gr.Markdown("Pipeline: **CRAFT** (Detection) ➡️ **TrOCR** (Recognition)")
430
+
431
  with gr.Row():
432
+ with gr.Column():
433
+ input_img = gr.Image(type="pil", label="Step 1: Upload Image")
434
+ btn = gr.Button("Transcribe Page", variant="primary")
435
+
436
+ with gr.Column():
437
+ output_img = gr.Image(label="Step 2: Review Detections")
438
+ output_txt = gr.Textbox(label="Step 3: Extracted Text", lines=12)
439
+
440
+ btn.click(fn=process_page, inputs=input_img, outputs=[output_img, output_txt])
441
 
442
+ if __name__ == "__main__":
443
+ demo.launch()