Update app.py
Browse files
app.py
CHANGED
|
@@ -244,7 +244,6 @@
|
|
| 244 |
|
| 245 |
|
| 246 |
|
| 247 |
-
|
| 248 |
import gradio as gr
|
| 249 |
import torch
|
| 250 |
import numpy as np
|
|
@@ -257,23 +256,22 @@ from paddleocr import PaddleOCR
|
|
| 257 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 258 |
print(f"Loading TrOCR on {device}...")
|
| 259 |
|
| 260 |
-
# Using the 'base' model for better accuracy on the crops
|
| 261 |
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
|
| 262 |
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
|
| 263 |
|
| 264 |
# --- 2. SETUP PADDLEOCR (Detection Only) ---
|
| 265 |
print("Loading PaddleOCR (DBNet)...")
|
| 266 |
-
#
|
| 267 |
-
# lang='en' loads the English detection model
|
| 268 |
detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
|
| 269 |
|
| 270 |
def get_sorted_boxes(boxes):
|
| 271 |
"""Sorts boxes top-to-bottom (lines), then left-to-right."""
|
| 272 |
-
if
|
|
|
|
|
|
|
| 273 |
items = []
|
| 274 |
for box in boxes:
|
| 275 |
-
# Paddle returns boxes as
|
| 276 |
-
# We convert to numpy for easier calc
|
| 277 |
box = np.array(box).astype(np.float32)
|
| 278 |
cy = np.mean(box[:, 1])
|
| 279 |
cx = np.mean(box[:, 0])
|
|
@@ -287,43 +285,45 @@ def process_image(image):
|
|
| 287 |
if image is None:
|
| 288 |
return None, [], "Please upload an image."
|
| 289 |
|
| 290 |
-
# Convert to standard RGB Numpy array
|
| 291 |
image_np = np.array(image.convert("RGB"))
|
| 292 |
|
| 293 |
-
#
|
| 294 |
-
#
|
| 295 |
-
#
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
#
|
| 299 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 300 |
return image, [], "No text detected."
|
| 301 |
|
| 302 |
-
#
|
| 303 |
-
|
| 304 |
|
| 305 |
-
sorted_boxes = get_sorted_boxes(boxes)
|
| 306 |
annotated_img = image_np.copy()
|
| 307 |
results = []
|
| 308 |
debug_crops = []
|
| 309 |
|
| 310 |
-
#
|
| 311 |
for box in sorted_boxes:
|
| 312 |
box_int = box.astype(np.int32)
|
| 313 |
|
| 314 |
-
# Draw
|
| 315 |
cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 2)
|
| 316 |
|
| 317 |
-
#
|
| 318 |
-
|
| 319 |
-
PADDING = 8
|
| 320 |
-
|
| 321 |
x_min = max(0, np.min(box_int[:, 0]) - PADDING)
|
| 322 |
x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING)
|
| 323 |
y_min = max(0, np.min(box_int[:, 1]) - PADDING)
|
| 324 |
y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING)
|
| 325 |
|
| 326 |
-
# Skip
|
| 327 |
if (x_max - x_min) < 15 or (y_max - y_min) < 10:
|
| 328 |
continue
|
| 329 |
|
|
@@ -331,7 +331,7 @@ def process_image(image):
|
|
| 331 |
pil_crop = Image.fromarray(crop)
|
| 332 |
debug_crops.append(pil_crop)
|
| 333 |
|
| 334 |
-
#
|
| 335 |
with torch.no_grad():
|
| 336 |
pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
|
| 337 |
generated_ids = model.generate(pixel_values)
|
|
@@ -344,10 +344,10 @@ def process_image(image):
|
|
| 344 |
|
| 345 |
return Image.fromarray(annotated_img), debug_crops, full_text
|
| 346 |
|
| 347 |
-
# ---
|
| 348 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 349 |
-
gr.Markdown("# ⚡ PaddleOCR + TrOCR")
|
| 350 |
-
gr.Markdown("Using
|
| 351 |
|
| 352 |
with gr.Row():
|
| 353 |
with gr.Column(scale=1):
|
|
@@ -359,7 +359,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
|
| 359 |
output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
|
| 360 |
|
| 361 |
with gr.Row():
|
| 362 |
-
gallery = gr.Gallery(label="Line Crops", columns=6, height=200)
|
| 363 |
|
| 364 |
btn.click(process_image, input_img, [output_img, gallery, output_txt])
|
| 365 |
|
|
|
|
| 244 |
|
| 245 |
|
| 246 |
|
|
|
|
| 247 |
import gradio as gr
|
| 248 |
import torch
|
| 249 |
import numpy as np
|
|
|
|
# ----------------------------------------------------------------------
# Model setup (runs once at import time).
# 1) TrOCR handles recognition: the 'base' handwritten checkpoint gives
#    better accuracy on line crops than the 'small' variant.
# 2) PaddleOCR supplies detection only (DBNet); we keep the detector but
#    bypass its high-level .ocr() entry point elsewhere to avoid bugs.
# ----------------------------------------------------------------------
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading TrOCR on {device}...")

# Recognition model: moved to the chosen device and frozen in eval mode.
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()

# --- 2. SETUP PADDLEOCR (Detection Only) ---
print("Loading PaddleOCR (DBNet)...")
# We load the detector but we will bypass the main .ocr() method to avoid bugs
detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
|
| 266 |
|
| 267 |
def get_sorted_boxes(boxes):
|
| 268 |
"""Sorts boxes top-to-bottom (lines), then left-to-right."""
|
| 269 |
+
if boxes is None or len(boxes) == 0:
|
| 270 |
+
return []
|
| 271 |
+
|
| 272 |
items = []
|
| 273 |
for box in boxes:
|
| 274 |
+
# Paddle returns boxes as numpy arrays or lists
|
|
|
|
| 275 |
box = np.array(box).astype(np.float32)
|
| 276 |
cy = np.mean(box[:, 1])
|
| 277 |
cx = np.mean(box[:, 0])
|
|
|
|
| 285 |
if image is None:
|
| 286 |
return None, [], "Please upload an image."
|
| 287 |
|
| 288 |
+
# Convert to standard RGB Numpy array
|
| 289 |
image_np = np.array(image.convert("RGB"))
|
| 290 |
|
| 291 |
+
# ============================================================
|
| 292 |
+
# 🔴 FIX: Direct Detection Bypass
|
| 293 |
+
# ============================================================
|
| 294 |
+
# The standard 'detector.ocr()' method has a bug in the current
|
| 295 |
+
# version that crashes when checking "if not boxes".
|
| 296 |
+
# We call the internal 'text_detector' directly to skip that check.
|
| 297 |
+
try:
|
| 298 |
+
dt_boxes, _ = detector.text_detector(image_np)
|
| 299 |
+
except Exception as e:
|
| 300 |
+
return image, [], f"Detection Error: {str(e)}"
|
| 301 |
+
|
| 302 |
+
if dt_boxes is None or len(dt_boxes) == 0:
|
| 303 |
return image, [], "No text detected."
|
| 304 |
|
| 305 |
+
# dt_boxes is already a numpy array of coordinates
|
| 306 |
+
sorted_boxes = get_sorted_boxes(dt_boxes)
|
| 307 |
|
|
|
|
| 308 |
annotated_img = image_np.copy()
|
| 309 |
results = []
|
| 310 |
debug_crops = []
|
| 311 |
|
| 312 |
+
# Process Boxes
|
| 313 |
for box in sorted_boxes:
|
| 314 |
box_int = box.astype(np.int32)
|
| 315 |
|
| 316 |
+
# Draw Box (Red, thickness 2)
|
| 317 |
cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 2)
|
| 318 |
|
| 319 |
+
# Crop with Padding (Prevents TrOCR Hallucinations)
|
| 320 |
+
PADDING = 10
|
|
|
|
|
|
|
| 321 |
x_min = max(0, np.min(box_int[:, 0]) - PADDING)
|
| 322 |
x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING)
|
| 323 |
y_min = max(0, np.min(box_int[:, 1]) - PADDING)
|
| 324 |
y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING)
|
| 325 |
|
| 326 |
+
# Skip noise
|
| 327 |
if (x_max - x_min) < 15 or (y_max - y_min) < 10:
|
| 328 |
continue
|
| 329 |
|
|
|
|
| 331 |
pil_crop = Image.fromarray(crop)
|
| 332 |
debug_crops.append(pil_crop)
|
| 333 |
|
| 334 |
+
# Recognition (TrOCR)
|
| 335 |
with torch.no_grad():
|
| 336 |
pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
|
| 337 |
generated_ids = model.generate(pixel_values)
|
|
|
|
| 344 |
|
| 345 |
return Image.fromarray(annotated_img), debug_crops, full_text
|
| 346 |
|
| 347 |
+
# --- UI ---
|
| 348 |
with gr.Blocks(theme=gr.themes.Soft()) as demo:
|
| 349 |
+
gr.Markdown("# ⚡ PaddleOCR + TrOCR (Robust)")
|
| 350 |
+
gr.Markdown("Using direct DBNet inference to avoid library bugs.")
|
| 351 |
|
| 352 |
with gr.Row():
|
| 353 |
with gr.Column(scale=1):
|
|
|
|
| 359 |
output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
|
| 360 |
|
| 361 |
with gr.Row():
|
| 362 |
+
gallery = gr.Gallery(label="Line Crops (Debug)", columns=6, height=200)
|
| 363 |
|
| 364 |
btn.click(process_image, input_img, [output_img, gallery, output_txt])
|
| 365 |
|