Update app.py
app.py CHANGED
@@ -368,6 +368,229 @@
@@ -376,24 +599,18 @@ from PIL import Image
@@ -401,112 +618,139 @@ def calculate_overlap_area(box1, box2):
@@ -516,14 +760,11 @@ def process_image(image):
@@ -535,20 +776,18 @@ def process_image(image):
@@ -556,12 +795,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:

(The removed lines in the hunks above are the previous implementation: the setup banner comments, calculate_overlap_area, filter_nested_boxes, the y-center-based merge_boxes_into_lines, the old process_image, and the old UI labels. The commit does not delete that code: the first hunk re-inserts all of it as a commented-out block, and the later hunks replace the live code. Both appear below, as the file now stands.)

# import gradio as gr
# import torch
# import numpy as np
# import cv2
# from PIL import Image
# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
# from paddleocr import PaddleOCR

# # --- 1. SETUP TR-OCR ---
# device = "cuda" if torch.cuda.is_available() else "cpu"
# print(f"Loading TrOCR on {device}...")
# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()

# # --- 2. SETUP PADDLEOCR ---
# print("Loading PaddleOCR...")
# # High resolution to catch faint text
# detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
#                      det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)

# # ==========================================
# # 🧠 LOGIC FIX 1: REMOVE NESTED BOXES
# # ==========================================
# def calculate_overlap_area(box1, box2):
#     """Calculates the intersection area between two boxes."""
#     x1 = max(box1[0], box2[0])
#     y1 = max(box1[1], box2[1])
#     x2 = min(box1[2], box2[2])
#     y2 = min(box1[3], box2[3])

#     if x2 < x1 or y2 < y1:
#         return 0.0
#     return (x2 - x1) * (y2 - y1)

# def filter_nested_boxes(boxes, containment_thresh=0.80):
#     """
#     Removes boxes that are mostly contained within other larger boxes.
#     """
#     if not boxes: return []

#     # Convert all to [x1, y1, x2, y2, area]
#     active = []
#     for b in boxes:
#         area = (b[2] - b[0]) * (b[3] - b[1])
#         active.append(list(b) + [area])

#     # Sort by area (Largest to Smallest) - Crucial!
#     # We want to keep the big 'parent' box and delete the small 'child' box.
#     active.sort(key=lambda x: x[4], reverse=True)

#     final_boxes = []

#     for i, current in enumerate(active):
#         is_nested = False
#         curr_area = current[4]

#         # Check against all boxes we've already accepted (which are bigger/same size)
#         for kept in final_boxes:
#             overlap = calculate_overlap_area(current, kept)

#             # Check if 'current' is inside 'kept'
#             # If >80% of current box is covered by kept box, it's a duplicate/nested box
#             if (overlap / curr_area) > containment_thresh:
#                 is_nested = True
#                 break

#         if not is_nested:
#             final_boxes.append(current[:4])  # Store only coord, drop area

#     return final_boxes

# # ==========================================
# # 🧠 LOGIC FIX 2: MERGE WORDS INTO LINES
# # ==========================================
# def merge_boxes_into_lines(raw_boxes, y_thresh=30):
#     if raw_boxes is None or len(raw_boxes) == 0:
#         return []

#     # 1. Convert raw polygons to Axis-Aligned Rectangles
#     rects = []
#     for box in raw_boxes:
#         box = np.array(box).astype(np.float32)
#         x1 = np.min(box[:, 0])
#         y1 = np.min(box[:, 1])
#         x2 = np.max(box[:, 0])
#         y2 = np.max(box[:, 1])
#         rects.append([x1, y1, x2, y2])

#     # 🔴 STEP 2: Filter Nested Boxes (Remove the 'child' boxes)
#     rects = filter_nested_boxes(rects)

#     # 3. Sort by Y center
#     rects.sort(key=lambda r: (r[1] + r[3]) / 2)

#     merged_lines = []
#     while rects:
#         current_line = [rects.pop(0)]
#         line_y_center = (current_line[0][1] + current_line[0][3]) / 2

#         remaining = []
#         for r in rects:
#             r_y_center = (r[1] + r[3]) / 2
#             # If Y-center is close (same horizontal line)
#             if abs(r_y_center - line_y_center) < y_thresh:
#                 current_line.append(r)
#             else:
#                 remaining.append(r)

#         rects = remaining

#         # 4. Create Line Box
#         lx1 = min(r[0] for r in current_line)
#         ly1 = min(r[1] for r in current_line)
#         lx2 = max(r[2] for r in current_line)
#         ly2 = max(r[3] for r in current_line)

#         merged_lines.append([lx1, ly1, lx2, ly2])

#     # Final Sort by Y
#     merged_lines.sort(key=lambda r: r[1])
#     return merged_lines

# def process_image(image):
#     if image is None: return None, [], "Please upload an image."
#     image_np = np.array(image.convert("RGB"))

#     # DETECT
#     try:
#         dt_boxes, _ = detector.text_detector(image_np)
#     except Exception as e:
#         return image, [], f"Detection Error: {str(e)}"

#     if dt_boxes is None or len(dt_boxes) == 0:
#         return image, [], "No text detected."

#     # PROCESS (Filter Nested -> Merge Lines)
#     line_boxes = merge_boxes_into_lines(dt_boxes)

#     annotated_img = image_np.copy()
#     results = []
#     debug_crops = []

#     for box in line_boxes:
#         x1, y1, x2, y2 = map(int, box)

#         # Filter Noise
#         if (x2 - x1) < 20 or (y2 - y1) < 15:
#             continue

#         # Draw (Green)
#         cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)

#         # PADDING
#         PAD = 10
#         h, w, _ = image_np.shape
#         x1 = max(0, x1 - PAD)
#         y1 = max(0, y1 - PAD)
#         x2 = min(w, x2 + PAD)
#         y2 = min(h, y2 + PAD)

#         crop = image_np[y1:y2, x1:x2]
#         pil_crop = Image.fromarray(crop)
#         debug_crops.append(pil_crop)

#         # RECOGNIZE
#         with torch.no_grad():
#             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
#             generated_ids = model.generate(pixel_values)
#         text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
#         if text.strip():
#             results.append(text)

#     full_text = "\n".join(results)
#     return Image.fromarray(annotated_img), debug_crops, full_text

# # --- UI ---
# with gr.Blocks(theme=gr.themes.Soft()) as demo:
#     gr.Markdown("# ⚡ Smart Line-Level OCR (Cleaned)")

#     with gr.Row():
#         with gr.Column(scale=1):
#             input_img = gr.Image(type="pil", label="Upload Image")
#             btn = gr.Button("Transcribe", variant="primary")

#         with gr.Column(scale=1):
#             output_img = gr.Image(label="Cleaned Lines (Green Boxes)")
#             output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)

#     with gr.Row():
#         gallery = gr.Gallery(label="Final Line Crops", columns=4, height=200)

#     btn.click(process_image, input_img, [output_img, gallery, output_txt])

# if __name__ == "__main__":
#     demo.launch()

import gradio as gr
import torch
import numpy as np
import cv2
from PIL import Image
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from paddleocr import PaddleOCR

# Setup
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading TrOCR on {device}...")
processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()

print("Loading PaddleOCR...")
# High detection resolution and low DB thresholds to catch faint text
detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False,
                     det_limit_side_len=2500, det_db_thresh=0.1, det_db_box_thresh=0.3)
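
For orientation, detector.text_detector (the same call process_image makes below) returns raw quadrilateral word boxes, one 4-corner polygon per detection; that is the input merge_boxes_into_lines expects. A minimal standalone sketch, with a hypothetical image path:

# Hypothetical smoke test; 'sample.png' is a placeholder path, not part of the commit.
img = np.array(Image.open('sample.png').convert('RGB'))
dt_boxes, _ = detector.text_detector(img)  # N quadrilaterals, 4 corner points each
print(f"{len(dt_boxes)} raw boxes detected")
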
def calculate_iou(box1, box2):
    """Intersection over the smaller box's area (a containment ratio, not true IoU)."""
    x1 = max(box1[0], box2[0])
    y1 = max(box1[1], box2[1])
    x2 = min(box1[2], box2[2])
    y2 = min(box1[3], box2[3])

    if x2 < x1 or y2 < y1:
        return 0.0

    intersection = (x2 - x1) * (y2 - y1)
    area1 = (box1[2] - box1[0]) * (box1[3] - box1[1])
    area2 = (box2[2] - box2[0]) * (box2[3] - box2[1])

    return intersection / min(area1, area2)
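
Despite its name, calculate_iou divides by the smaller area, so it behaves as a containment score; that is exactly what lets nested detections be caught. A quick sanity check with hypothetical boxes:

# A box fully inside a larger one scores 1.0 here (true IoU would be 0.1).
inner = [10, 10, 50, 30]   # hypothetical nested word box
outer = [0, 0, 200, 40]    # hypothetical surrounding line box
assert calculate_iou(inner, outer) == 1.0
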
def remove_nested_boxes(boxes, iou_thresh=0.7):
    """Remove boxes that are nested inside others"""
    if not boxes:
        return []

    # Add area to each box
    boxes_with_area = []
    for b in boxes:
        area = (b[2] - b[0]) * (b[3] - b[1])
        boxes_with_area.append((*b, area))

    # Sort by area descending (keep larger boxes)
    boxes_with_area.sort(key=lambda x: x[4], reverse=True)

    keep = []
    for current in boxes_with_area:
        should_keep = True
        curr_box = current[:4]

        for kept in keep:
            iou = calculate_iou(curr_box, kept)
            if iou > iou_thresh:
                should_keep = False
                break

        if should_keep:
            keep.append(curr_box)

    return keep
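
Usage on flat [x1, y1, x2, y2] rectangles (hypothetical values): the box nested inside the first one is dropped, the disjoint one survives:

boxes = [[0, 0, 200, 40], [10, 10, 50, 30], [300, 0, 400, 35]]
print(remove_nested_boxes(boxes))
# -> [(0, 0, 200, 40), (300, 0, 400, 35)]; the nested 40x20 box is gone
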
def merge_boxes_into_lines(raw_boxes, y_overlap_thresh=0.5, x_gap_thresh=100):
    """Merge boxes into lines with better horizontal merging"""
    # 'is None or len() == 0' rather than 'not raw_boxes': truthiness of a
    # multi-element numpy array raises ValueError.
    if raw_boxes is None or len(raw_boxes) == 0:
        return []

    # Convert polygons to rectangles
    rects = []
    for box in raw_boxes:
        box = np.array(box).astype(np.float32)
        x1, y1 = np.min(box[:, 0]), np.min(box[:, 1])
        x2, y2 = np.max(box[:, 0]), np.max(box[:, 1])
        rects.append([x1, y1, x2, y2])

    # Remove nested boxes
    rects = remove_nested_boxes(rects)

    if not rects:
        return []

    # Sort by Y position
    rects.sort(key=lambda r: r[1])

    # Group into lines based on Y overlap
    lines = []
    current_line = [rects[0]]

    for rect in rects[1:]:
        # Check if rect belongs to current line
        line_y1 = min(r[1] for r in current_line)
        line_y2 = max(r[3] for r in current_line)
        line_height = line_y2 - line_y1

        rect_y1, rect_y2 = rect[1], rect[3]
        rect_height = rect_y2 - rect_y1

        # Calculate vertical overlap
        overlap_y1 = max(line_y1, rect_y1)
        overlap_y2 = min(line_y2, rect_y2)
        overlap = max(0, overlap_y2 - overlap_y1)

        # If significant vertical overlap, it's the same line
        if overlap > y_overlap_thresh * min(line_height, rect_height):
            current_line.append(rect)
        else:
            # Save current line and start new one
            lines.append(current_line)
            current_line = [rect]

    lines.append(current_line)

    # Merge boxes in each line
    merged = []
    for line in lines:
        # Sort line boxes left to right
        line.sort(key=lambda r: r[0])

        # Merge horizontally close boxes
        merged_line = [line[0]]
        for rect in line[1:]:
            last = merged_line[-1]
            # If close horizontally, merge
            if rect[0] - last[2] < x_gap_thresh:
                merged_line[-1] = [
                    min(last[0], rect[0]),
                    min(last[1], rect[1]),
                    max(last[2], rect[2]),
                    max(last[3], rect[3])
                ]
            else:
                merged_line.append(rect)

        # Final merge: combine all boxes in line into one
        x1 = min(r[0] for r in merged_line)
        y1 = min(r[1] for r in merged_line)
        x2 = max(r[2] for r in merged_line)
        y2 = max(r[3] for r in merged_line)
        merged.append([x1, y1, x2, y2])

    # Sort by Y
    merged.sort(key=lambda r: r[1])
    return merged
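
A worked example with three hypothetical word polygons, two sharing a visual line and one below; the first two collapse into a single line box:

polys = [
    [[0, 0], [60, 0], [60, 30], [0, 30]],       # hypothetical word 1
    [[70, 5], [150, 5], [150, 32], [70, 32]],   # word 2, same line
    [[0, 50], [80, 50], [80, 78], [0, 78]],     # word on the next line
]
print(merge_boxes_into_lines(polys))
# -> roughly [[0.0, 0.0, 150.0, 32.0], [0.0, 50.0, 80.0, 78.0]]
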
def process_image(image):
    if image is None:
        return None, [], "Please upload an image."

    image_np = np.array(image.convert("RGB"))

    # Detect
    try:
        dt_boxes, _ = detector.text_detector(image_np)
    except Exception as e:
        return image, [], f"Detection Error: {str(e)}"

    if dt_boxes is None or len(dt_boxes) == 0:
        return image, [], "No text detected."

    # Filter nested boxes and merge into lines
    line_boxes = merge_boxes_into_lines(dt_boxes)

    annotated_img = image_np.copy()
    results = []
    debug_crops = []

    for box in line_boxes:
        x1, y1, x2, y2 = map(int, box)

        # Filter noise
        if (x2 - x1) < 20 or (y2 - y1) < 15:
            continue

        # Draw the detected line (green)
        cv2.rectangle(annotated_img, (x1, y1), (x2, y2), (0, 255, 0), 2)

        # Pad the crop so TrOCR sees some context around the line
        PAD = 10
        h, w, _ = image_np.shape
        x1 = max(0, x1 - PAD)
        y1 = max(0, y1 - PAD)
        x2 = min(w, x2 + PAD)
        y2 = min(h, y2 + PAD)

        crop = image_np[y1:y2, x1:x2]
        pil_crop = Image.fromarray(crop)
        debug_crops.append(pil_crop)

        # Recognize the line with TrOCR
        with torch.no_grad():
            pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
            generated_ids = model.generate(pixel_values)
        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        if text.strip():
            results.append(text)

    full_text = "\n".join(results)
    return Image.fromarray(annotated_img), debug_crops, full_text
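
Each crop currently triggers its own model.generate call. If throughput matters, the crops could be batched through the same processor; a minimal sketch (recognize_lines_batched is a hypothetical helper, not part of this commit):

def recognize_lines_batched(crops, batch_size=8):
    # Batch PIL crops through TrOCR instead of one generate() call per line.
    texts = []
    for i in range(0, len(crops), batch_size):
        batch = crops[i:i + batch_size]
        with torch.no_grad():
            pixel_values = processor(images=batch, return_tensors="pt").pixel_values.to(device)
            generated_ids = model.generate(pixel_values)
        texts.extend(processor.batch_decode(generated_ids, skip_special_tokens=True))
    return texts
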
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# ⚡ Smart Line-Level OCR (Fixed)")

    with gr.Row():
        with gr.Column(scale=1):
            input_img = gr.Image(type="pil", label="Upload Image")
            btn = gr.Button("Transcribe", variant="primary")

        with gr.Column(scale=1):
            output_img = gr.Image(label="Detected Lines")
            output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)

    with gr.Row():
        gallery = gr.Gallery(label="Line Crops", columns=4, height=200)

    btn.click(process_image, input_img, [output_img, gallery, output_txt])

if __name__ == "__main__":
    demo.launch()