Update app.py
app.py CHANGED
@@ -311,7 +311,6 @@
 
 
 
-
 import gradio as gr
 import torch
 import numpy as np
@@ -320,142 +319,115 @@ from PIL import Image
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 from craft_text_detector import Craft
 
-# ---
-
-# This prevents the "inhomogeneous shape" crash AND ensures scaling is applied.
-import craft_text_detector.craft_utils as craft_utils_module
-
-def safe_adjustResultCoordinates(polys, ratio_w, ratio_h):
-    if not polys:
-        return []
-
-    adjusted_polys = []
-    for poly in polys:
-        # Check 1: Must be a list or array
-        if poly is None or len(poly) == 0:
-            continue
-
-        # Check 2: Convert to numpy safely
-        try:
-            p = np.array(poly)
-            # Must have shape (N, 2) where N >= 3 (a polygon)
-            # If it's a 1D line or a dot, it's noise.
-            if p.ndim != 2 or p.shape[1] != 2 or p.shape[0] < 3:
-                continue
-        except Exception:
-            continue
-
-        # Check 3: Apply scaling (The Fix for Tiny Boxes)
-        # We multiply the coordinates by the ratio provided by the library
-        p = p.astype(np.float32)
-        p[:, 0] *= ratio_w
-        p[:, 1] *= ratio_h
-
-        adjusted_polys.append(p)
-
-    return adjusted_polys
-
-# Apply the patch
-craft_utils_module.adjustResultCoordinates = safe_adjustResultCoordinates
-# -------------------------
+# --- SETUP ---
+device = "cuda" if torch.cuda.is_available() else "cpu"
 
-# 2. LOAD MODELS
 print("Loading TrOCR...")
-device = "cuda" if torch.cuda.is_available() else "cpu"
 processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-handwritten')
 model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-small-handwritten').to(device).eval()
 
 print("Loading CRAFT...")
-# crop_type="box"
+# We use crop_type="box" to get standard rectangles
 craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
 
-# 3. HELPER: Sort Boxes
 def get_sorted_boxes(boxes):
-
+    """Sorts boxes top-to-bottom, then left-to-right."""
     if not boxes: return []
     items = []
     for box in boxes:
-        # Calculate center y and
+        # Calculate center y and x
         cy = np.mean(box[:, 1])
         cx = np.mean(box[:, 0])
         items.append((cy, cx, box))
 
-    # Sort by
-    items.sort(key=lambda x: (int(x[0] //
+    # Sort by Y (lines) then X
+    items.sort(key=lambda x: (int(x[0] // 40), x[1]))
     return [x[2] for x in items]
 
-# 4. MAIN PIPELINE
 def process_image(image):
     if image is None:
         return None, "Please upload an image."
+
+    # 1. UNIFIED RESIZING (The Fix)
+    # We resize the input image to 1280px width immediately.
+    # We will use this SINGLE image for detection, cropping, and display.
+    target_width = 1280
+    w_percent = (target_width / float(image.size[0]))
+    h_size = int((float(image.size[1]) * float(w_percent)))
 
-    #
-
-    # so the coordinates match the display image 1:1.
-    image_np = np.array(image.convert("RGB"))
+    # High-quality resize
+    working_image = image.resize((target_width, h_size), Image.Resampling.LANCZOS)
 
-    #
-    #
-
-
+    # Convert to Numpy for OpenCV/CRAFT
+    # This is the ONLY image variable we will use from now on.
+    img_np = np.array(working_image.convert("RGB"))
+
+    # 2. DETECT
+    # Since our image is 1280px, and CRAFT defaults to 1280px canvas,
+    # the internal scaling ratio will be 1.0. Coordinates will match exactly.
+    prediction = craft.detect_text(img_np)
     boxes = prediction.get("boxes", [])
 
     if not boxes:
-        return
-
+        return working_image, "No text detected."
+
+    # 3. PROCESS & RECOGNIZE
    sorted_boxes = get_sorted_boxes(boxes)
-    annotated_img =
+    annotated_img = img_np.copy()
     results = []
 
     for box in sorted_boxes:
-        #
-
+        # box is a list of points, convert to numpy int
+        box_np = np.array(box).astype(np.int32)
 
-        # Draw
-        cv2.polylines(annotated_img, [
+        # Draw on the WORKING image
+        cv2.polylines(annotated_img, [box_np], True, (255, 0, 0), 3)
 
-        # Get
-        x_min = max(0, np.min(
-        x_max = min(
-        y_min = max(0, np.min(
-        y_max = min(
+        # Get Crop Coordinates
+        x_min = max(0, np.min(box_np[:, 0]))
+        x_max = min(img_np.shape[1], np.max(box_np[:, 0]))
+        y_min = max(0, np.min(box_np[:, 1]))
+        y_max = min(img_np.shape[0], np.max(box_np[:, 1]))
 
-        #
-
-        if (x_max - x_min) < 20 or (y_max - y_min) < 10:
+        # Filter noise (tiny specks)
+        if (x_max - x_min) < 15 or (y_max - y_min) < 10:
             continue
 
-        # Crop
-        crop =
+        # Crop from the WORKING image
+        crop = img_np[y_min:y_max, x_min:x_max]
         if crop.size == 0: continue
 
         pil_crop = Image.fromarray(crop)
 
+        # TrOCR Inference
         with torch.no_grad():
             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
             generated_ids = model.generate(pixel_values)
             text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
 
-        if text.strip()
+        if text.strip():
             results.append(text)
 
     full_text = "\n".join(results)
+
+    # Return the annotated WORKING image
     return Image.fromarray(annotated_img), full_text
 
-#
-with gr.Blocks(
-    gr.Markdown("
+# --- UI ---
+with gr.Blocks(title="Handwritten OCR") as demo:
+    gr.Markdown("## 📝 Robust Handwritten OCR")
 
     with gr.Row():
-        with gr.Column(
-            input_img = gr.Image(type="pil", label="Upload
-
+        with gr.Column():
+            input_img = gr.Image(type="pil", label="Upload Image")
+            btn = gr.Button("Extract Text", variant="primary")
 
-        with gr.Column(
+        with gr.Column():
+            # This output image will be the 1280px version we used for processing
            output_img = gr.Image(label="Detected Regions")
-            output_txt = gr.Textbox(label="
+            output_txt = gr.Textbox(label="Result", lines=20)
 
-
+    btn.click(process_image, input_img, [output_img, output_txt])
 
 if __name__ == "__main__":
     demo.launch()
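A note for reviewers on reading order: get_sorted_boxes buckets each box by its vertical center into 40px rows (int(cy // 40)) and only then sorts by horizontal center, which is what puts words on the same handwritten line in left-to-right order. A minimal standalone check of that behavior; the helper mirrors the diff, while the three sample boxes are made up for illustration:

import numpy as np

def get_sorted_boxes(boxes):
    """Sorts boxes top-to-bottom (40px row buckets), then left-to-right."""
    if not boxes: return []
    items = []
    for box in boxes:
        cy = np.mean(box[:, 1])  # vertical center picks the row bucket
        cx = np.mean(box[:, 0])  # horizontal center orders within the row
        items.append((cy, cx, box))
    items.sort(key=lambda x: (int(x[0] // 40), x[1]))
    return [x[2] for x in items]

# Hypothetical boxes: two words on one line (cy 105 and 110), one word below (cy 160).
word_b = np.array([[300, 95], [400, 95], [400, 115], [300, 115]])   # right word, line 1
word_a = np.array([[50, 100], [150, 100], [150, 120], [50, 120]])   # left word, line 1
word_c = np.array([[60, 150], [160, 150], [160, 170], [60, 170]])   # only word, line 2

ordered = get_sorted_boxes([word_b, word_a, word_c])
print([int(np.mean(b[:, 0])) for b in ordered])  # [100, 350, 110]: a, b, then c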
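And on the unified resize: per the diff's own comment, CRAFT works on a 1280px canvas by default, so a 1280px-wide input gives an internal scaling ratio of 1.0 and box coordinates that line up with the displayed image directly. The aspect-ratio arithmetic in process_image, traced with an assumed 4000x3000 upload:

target_width = 1280
orig_w, orig_h = 4000, 3000                  # hypothetical upload size
w_percent = target_width / float(orig_w)     # 0.32
h_size = int(float(orig_h) * w_percent)      # 960
print(target_width, h_size)                  # 1280 960 -- the 4:3 ratio is preserved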