Spaces:

iammraat
/

ocr

Sleeping

App Files Files Community

iammraat committed on Jan 24

Commit

f2f27bb

verified ·

1 Parent(s): b02cd5a

Update app.py

Browse files

Files changed (1) hide show

app.py +96 -356

app.py CHANGED Viewed

@@ -66,248 +66,9 @@
-# import gradio as gr
-# import torch
-# import numpy as np
-# import cv2
-# from PIL import Image
-# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-# from craft_text_detector import Craft
-# # ----------------------------
-# # Device
-# # ----------------------------
-# device = "cuda" if torch.cuda.is_available() else "cpu"
-# # ----------------------------
-# # Load TrOCR
-# # ----------------------------
-# print("Loading TrOCR model...")
-# processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
-# model = VisionEncoderDecoderModel.from_pretrained(
-#     "microsoft/trocr-small-handwritten"
-# )
-# model.to(device)
-# model.eval()
-# # ----------------------------
-# # Load CRAFT
-# # ----------------------------
-# print("Loading CRAFT text detector...")
-# craft = Craft(
-#     output_dir=None,
-#     crop_type="poly",
-#     cuda=(device == "cuda"),
-# )
-# # ----------------------------
-# # Sort boxes (reading order)
-# # ----------------------------
-# def get_sorted_boxes(boxes):
-#     items = []
-#     for box in boxes:
-#         cx = np.mean(box[:, 0])
-#         cy = np.mean(box[:, 1])
-#         items.append((cy, cx, box))
-#     # group by line (roughly)
-#     items.sort(key=lambda x: (int(x[0] // 20), x[1]))
-#     return [b for _, _, b in items]
-# # ----------------------------
-# # OCR Pipeline
-# # ----------------------------
-# def process_full_page(image: Image.Image):
-#     # ALWAYS return (image_or_None, text)
-#     if image is None:
-#         return None, "Please upload an image."
-#     image_np = np.array(image)
-#     prediction = craft.detect_text(image_np)
-#     boxes = prediction.get("boxes", [])
-#     if not boxes:
-#         return image, "No text detected."
-#     sorted_boxes = get_sorted_boxes(boxes)
-#     annotated = image_np.copy()
-#     texts = []
-#     for box in sorted_boxes:
-#         box = box.astype(int)
-#         cv2.polylines(annotated, [box], True, (255, 0, 0), 2)
-#         x_min = max(0, box[:, 0].min())
-#         x_max = min(image_np.shape[1], box[:, 0].max())
-#         y_min = max(0, box[:, 1].min())
-#         y_max = min(image_np.shape[0], box[:, 1].max())
-#         if x_max - x_min < 5 or y_max - y_min < 5:
-#             continue
-#         crop = image_np[y_min:y_max, x_min:x_max]
-#         pil_crop = Image.fromarray(crop).convert("RGB")
-#         with torch.no_grad():
-#             pixels = processor(
-#                 images=pil_crop,
-#                 return_tensors="pt"
-#             ).pixel_values.to(device)
-#             ids = model.generate(pixels)
-#             text = processor.batch_decode(
-#                 ids, skip_special_tokens=True
-#             )[0]
-#         if text.strip():
-#             texts.append(text)
-#     final_text = " ".join(texts)
-#     return Image.fromarray(annotated), final_text
-# # ----------------------------
-# # Gradio UI
-# # ----------------------------
-# with gr.Blocks(theme=gr.themes.Soft()) as demo:
-#     gr.Markdown("# 🕵️‍♀️ Full-Page Handwritten OCR")
-#     gr.Markdown("**CRAFT ➜ TrOCR** (Detection + Recognition)")
-#     with gr.Row():
-#         input_img = gr.Image(type="pil", label="Upload Full Page")
-#     with gr.Row():
-#         vis_output = gr.Image(label="Detections")
-#         text_output = gr.Textbox(label="Extracted Text", lines=10)
-#     btn = gr.Button("Process Page", variant="primary")
-#     btn.click(
-#         fn=process_full_page,
-#         inputs=input_img,
-#         outputs=[vis_output, text_output],
-#     )
-# if __name__ == "__main__":
-#     demo.launch(
-#     server_name="0.0.0.0",
-#     server_port=7860,
-#     show_api=False,
-# )
-# import gradio as gr
-# import torch
-# import numpy as np
-# import cv2
-# from PIL import Image
-# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-# from craft_text_detector import Craft
-# # PATCH: Fix NumPy inhomogeneous array crash
-# import craft_text_detector.craft_utils as craft_utils_module
-# _original_adjust = craft_utils_module.adjustResultCoordinates
-# def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h):
-#     if not polys:
-#         return []
-#     adjusted = []
-#     for poly in polys:
-#         if poly is None or len(poly) == 0:
-#             continue
-#         poly = np.array(poly).reshape(-1, 2)
-#         poly[:, 0] *= ratio_w
-#         poly[:, 1] *= ratio_h
-#         adjusted.append(poly)
-#     return adjusted
-# craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
-# # Device
-# device = "cuda" if torch.cuda.is_available() else "cpu"
-# # Load TrOCR
-# print("Loading TrOCR model...")
-# processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
-# model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-handwritten")
-# model.to(device)
-# model.eval()
-# # Load CRAFT
-# print("Loading CRAFT text detector...")
-# craft = Craft(output_dir=None, crop_type="poly", cuda=(device == "cuda"))
-# # Sort boxes (reading order)
-# def get_sorted_boxes(boxes):
-#     items = []
-#     for box in boxes:
-#         cx = np.mean(box[:, 0])
-#         cy = np.mean(box[:, 1])
-#         items.append((cy, cx, box))
-#     items.sort(key=lambda x: (int(x[0] // 20), x[1]))
-#     return [b for _, _, b in items]
-# # OCR Pipeline
-# def process_full_page(image):
-#     if image is None:
-#         return None, "Please upload an image."
-#     image_np = np.array(image)
-#     prediction = craft.detect_text(image_np)
-#     boxes = prediction.get("boxes", [])
-#     if not boxes:
-#         return image, "No text detected."
-#     sorted_boxes = get_sorted_boxes(boxes)
-#     annotated = image_np.copy()
-#     texts = []
-#     for box in sorted_boxes:
-#         box = box.astype(int)
-#         cv2.polylines(annotated, [box], True, (255, 0, 0), 2)
-#         x_min = max(0, box[:, 0].min())
-#         x_max = min(image_np.shape[1], box[:, 0].max())
-#         y_min = max(0, box[:, 1].min())
-#         y_max = min(image_np.shape[0], box[:, 1].max())
-#         if x_max - x_min < 5 or y_max - y_min < 5:
-#             continue
-#         crop = image_np[y_min:y_max, x_min:x_max]
-#         pil_crop = Image.fromarray(crop).convert("RGB")
-#         with torch.no_grad():
-#             pixels = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
-#             ids = model.generate(pixels)
-#             text = processor.batch_decode(ids, skip_special_tokens=True)[0]
-#             if text.strip():
-#                 texts.append(text)
-#     final_text = " ".join(texts)
-#     return Image.fromarray(annotated), final_text
-# # Gradio UI
-# demo = gr.Interface(
-#     fn=process_full_page,
-#     inputs=gr.Image(type="pil", label="Upload Full Page"),
-#     outputs=[
-#         gr.Image(label="Detections"),
-#         gr.Textbox(label="Extracted Text", lines=10)
-#     ],
-#     title="🕵️‍♀️ Full-Page Handwritten OCR",
-#     description="CRAFT ➜ TrOCR (Detection + Recognition)"
-# )
-# if __name__ == "__main__":
-#     demo.launch(server_name="0.0.0.0", server_port=7860)
@@ -326,7 +87,7 @@
 # from craft_text_detector import Craft
 # # ==========================================
-# # 🔧 PATCH 1: Fix Torchvision (From your code)
 # # ==========================================
 # import torchvision.models.vgg
 # if not hasattr(torchvision.models.vgg, 'model_urls'):
@@ -335,7 +96,7 @@
 #     }
 # # ==========================================
-# # 🔧 PATCH 2: The Logic Fix (Ratio Net)
 # # ==========================================
 # import craft_text_detector.craft_utils as craft_utils_module
@@ -348,11 +109,10 @@
 #         if poly is None or len(poly) == 0:
 #             continue
-#         # Safe numpy conversion
 #         p = np.array(poly).reshape(-1, 2)
-#         # CRITICAL FIX: Multiply by ratio_net (defaults to 2)
-#         # This scales the 1/2 size heatmap output back to full image size
 #         p[:, 0] *= (ratio_w * ratio_net)
 #         p[:, 1] *= (ratio_h * ratio_net)
@@ -360,21 +120,24 @@
 #     return adjusted
-# # Apply the patch
 # craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
 # # ==========================================
-# # --- Load TrOCR (Recognition) ---
 # device = "cuda" if torch.cuda.is_available() else "cpu"
-# print(f"Loading TrOCR on {device}...")
-# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-handwritten')
-# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-small-handwritten').to(device).eval()
-# # --- Load CRAFT (Detection) ---
 # print("Loading CRAFT...")
-# # crop_type="box" ensures we get clean rectangles
 # craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
 # def get_sorted_boxes(boxes):
 #     """Sorts boxes top-to-bottom (lines), then left-to-right."""
 #     if not boxes: return []
@@ -384,54 +147,60 @@
 #         cx = np.mean(box[:, 0])
 #         items.append((cy, cx, box))
-#     # Sort by Y (grouping by 40px lines) then X
-#     items.sort(key=lambda x: (int(x[0] // 40), x[1]))
 #     return [x[2] for x in items]
 # def process_image(image):
 #     if image is None:
-#         return None, "Please upload an image."
-#     # Convert to numpy
 #     image_np = np.array(image.convert("RGB"))
 #     # 1. DETECT
-#     # The patch we added above will now auto-multiply coordinates by 2 * ratio
-#     # fixing the "tiny box" issue.
 #     prediction = craft.detect_text(image_np)
 #     boxes = prediction.get("boxes", [])
 #     if not boxes:
-#         return image, "No text detected."
-#     # 2. VISUALIZE & CROP
 #     sorted_boxes = get_sorted_boxes(boxes)
 #     annotated_img = image_np.copy()
 #     results = []
 #     for box in sorted_boxes:
-#         # Cast to int for drawing
 #         box_int = box.astype(np.int32)
-#         # Draw on image (Blue, thickness 3)
 #         cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 3)
-#         # Get Crop Coordinates
-#         x_min = max(0, np.min(box_int[:, 0]))
-#         x_max = min(image_np.shape[1], np.max(box_int[:, 0]))
-#         y_min = max(0, np.min(box_int[:, 1]))
-#         y_max = min(image_np.shape[0], np.max(box_int[:, 1]))
-#         # Filter noise
-#         if (x_max - x_min) < 10 or (y_max - y_min) < 10:
 #             continue
 #         crop = image_np[y_min:y_max, x_min:x_max]
-#         if crop.size == 0: continue
 #         pil_crop = Image.fromarray(crop)
-#         # 3. RECOGNIZE (TrOCR)
 #         with torch.no_grad():
 #             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
 #             generated_ids = model.generate(pixel_values)
@@ -441,22 +210,28 @@
 #                 results.append(text)
 #     full_text = "\n".join(results)
-#     return Image.fromarray(annotated_img), full_text
-# # --- Gradio UI ---
-# with gr.Blocks(title="Handwritten OCR Fixed") as demo:
-#     gr.Markdown("# 📝 Handwritten OCR (Fixed Pipeline)")
 #     with gr.Row():
-#         with gr.Column():
 #             input_img = gr.Image(type="pil", label="Upload Image")
 #             btn = gr.Button("Transcribe", variant="primary")
-#         with gr.Column():
 #             output_img = gr.Image(label="Detections")
-#             output_txt = gr.Textbox(label="Result", lines=20)
-#     btn.click(process_image, input_img, [output_img, output_txt])
 # if __name__ == "__main__":
 #     demo.launch()
@@ -468,76 +243,43 @@
 import gradio as gr
 import torch
 import numpy as np
 import cv2
 from PIL import Image
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-from craft_text_detector import Craft
-# ==========================================
-# 🔧 PATCH 1: Fix Torchvision Compatibility
-# ==========================================
-import torchvision.models.vgg
-if not hasattr(torchvision.models.vgg, 'model_urls'):
-    torchvision.models.vgg.model_urls = {
-        'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth'
-    }
-# ==========================================
-# 🔧 PATCH 2: The "Ratio Net" Logic Fix
-# ==========================================
-import craft_text_detector.craft_utils as craft_utils_module
def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=2):
    """Scale detected polygons by the width/height ratios times ``ratio_net``.

    Assigned to ``craft_utils_module.adjustResultCoordinates`` right after
    this definition, replacing the library's own implementation.

    Args:
        polys: Sequence (list or ndarray) of polygons. ``None`` or empty
            entries are dropped.
        ratio_w: Horizontal scale factor.
        ratio_h: Vertical scale factor.
        ratio_net: Extra multiplier applied to both axes (default 2).
            NOTE(review): presumably undoes the detector's half-size
            output map -- confirm against craft_text_detector.

    Returns:
        A list of new ``(N, 2)`` floating-point arrays, one per non-empty
        polygon. The input polygons are never modified.
    """
    # ``not polys`` is ambiguous (ValueError) for a multi-element ndarray,
    # so emptiness is tested through the length instead.
    if polys is None or len(polys) == 0:
        return []
    scale_x = ratio_w * ratio_net
    scale_y = ratio_h * ratio_net
    adjusted = []
    for poly in polys:
        if poly is None or len(poly) == 0:
            continue
        # np.array copies, so the caller's polygon is left untouched.
        p = np.array(poly).reshape(-1, 2)
        # In-place multiplication of an integer array by a float ratio
        # raises a casting error; promote non-float input first. Float
        # input keeps its original dtype.
        if not np.issubdtype(p.dtype, np.floating):
            p = p.astype(np.float64)
        p[:, 0] *= scale_x
        p[:, 1] *= scale_y
        adjusted.append(p)
    return adjusted
-craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
-# ==========================================
-# --- 1. SETUP MODEL (Switched to BASE for stability) ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Loading TrOCR-Base on {device}...")
-# We use the 'base' model because 'small' hallucinates Wikipedia text on tight crops
-MODEL_ID = "microsoft/trocr-base-handwritten"
-processor = TrOCRProcessor.from_pretrained(MODEL_ID)
-model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID).to(device).eval()
-print("Loading CRAFT...")
-craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
-# --- 2. HELPER FUNCTIONS ---
 def get_sorted_boxes(boxes, line_tolerance=20):
     """Return the boxes in reading order: top-to-bottom, then left-to-right.
     Each box is reduced to the mean of its x and y coordinates. Boxes are
     grouped into horizontal bands ``line_tolerance`` pixels tall and, inside
     a band, ordered by mean x. Bands are fixed, so two boxes of one text
     line that straddle a band boundary land in different bands.
     Args:
         boxes: Sequence (list or ndarray) of boxes, each indexable as
             ``box[:, 0]`` (x values) and ``box[:, 1]`` (y values) once
             converted with ``np.asarray``. ``None`` or empty gives ``[]``.
         line_tolerance: Band height in pixels (default 20); must be > 0.
     Returns:
         A list of the boxes in reading order. ndarray boxes are returned
         as the same objects that were passed in.
     Raises:
         ValueError: If ``line_tolerance`` is not positive.
     """
     if line_tolerance <= 0:
         raise ValueError("line_tolerance must be positive")
     # ``not boxes`` raises ValueError for a multi-element ndarray, so
     # emptiness is tested through the length instead.
     if boxes is None or len(boxes) == 0:
         return []
     items = []
     for box in boxes:
         pts = np.asarray(box)
         cy = float(np.mean(pts[:, 1]))
         cx = float(np.mean(pts[:, 0]))
         items.append((cy, cx, pts))
     # The key never compares the arrays themselves, only (band, mean x).
     items.sort(key=lambda item: (int(item[0] // line_tolerance), item[1]))
     return [item[2] for item in items]
@@ -545,18 +287,21 @@ def process_image(image):
     if image is None:
         return None, [], "Please upload an image."
-    # Convert to standard RGB Numpy array
-    # We use the FULL resolution image (no resizing) to keep text sharp
     image_np = np.array(image.convert("RGB"))
-    # 1. DETECT
-    # The patch ensures coordinates map perfectly to this full-res image
-    prediction = craft.detect_text(image_np)
-    boxes = prediction.get("boxes", [])
-    if not boxes:
         return image, [], "No text detected."
     sorted_boxes = get_sorted_boxes(boxes)
     annotated_img = image_np.copy()
     results = []
@@ -566,31 +311,27 @@ def process_image(image):
     for box in sorted_boxes:
         box_int = box.astype(np.int32)
-        # Draw the box (Visual verification)
-        cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 3)
-        # --- CROP WITH PADDING (Crucial Fix) ---
-        # TrOCR needs 'breathing room' or it hallucinates.
-        PADDING = 10
         x_min = max(0, np.min(box_int[:, 0]) - PADDING)
         x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING)
         y_min = max(0, np.min(box_int[:, 1]) - PADDING)
         y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING)
-        # Skip noise
-        if (x_max - x_min) < 20 or (y_max - y_min) < 10:
             continue
         crop = image_np[y_min:y_max, x_min:x_max]
-        # Convert to PIL for Model
         pil_crop = Image.fromarray(crop)
-        # Add to debug gallery so user can see what the model sees
         debug_crops.append(pil_crop)
-        # 3. RECOGNIZE
         with torch.no_grad():
             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
             generated_ids = model.generate(pixel_values)
@@ -603,10 +344,10 @@ def process_image(image):
     return Image.fromarray(annotated_img), debug_crops, full_text
-# --- 3. GRADIO UI ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 📝 Robust Handwritten OCR (Base Model)")
-    gr.Markdown("Includes padding and a stronger model to prevent hallucinations.")
     with gr.Row():
         with gr.Column(scale=1):
@@ -614,14 +355,13 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
             btn = gr.Button("Transcribe", variant="primary")
         with gr.Column(scale=1):
-            output_img = gr.Image(label="Detections")
             output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
     with gr.Row():
-        # Gallery to check if crops are valid or empty
-        crop_gallery = gr.Gallery(label="Debug: See what the model sees (Crops)", columns=6, height=200)
-    btn.click(process_image, input_img, [output_img, crop_gallery, output_txt])
 if __name__ == "__main__":
     demo.launch()

 # from craft_text_detector import Craft
 # # ==========================================
+# # 🔧 PATCH 1: Fix Torchvision Compatibility
 # # ==========================================
 # import torchvision.models.vgg
 # if not hasattr(torchvision.models.vgg, 'model_urls'):
 #     }
 # # ==========================================
+# # 🔧 PATCH 2: The "Ratio Net" Logic Fix
 # # ==========================================
 # import craft_text_detector.craft_utils as craft_utils_module
 #         if poly is None or len(poly) == 0:
 #             continue
+#         # Convert to numpy and reshape
 #         p = np.array(poly).reshape(-1, 2)
+#         # Scale correctly using ratio_net
 #         p[:, 0] *= (ratio_w * ratio_net)
 #         p[:, 1] *= (ratio_h * ratio_net)
 #     return adjusted
 # craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
 # # ==========================================
+# # --- 1. SETUP MODEL (Switched to BASE for stability) ---
 # device = "cuda" if torch.cuda.is_available() else "cpu"
+# print(f"Loading TrOCR-Base on {device}...")
+# # We use the 'base' model because 'small' hallucinates Wikipedia text on tight crops
+# MODEL_ID = "microsoft/trocr-base-handwritten"
+# processor = TrOCRProcessor.from_pretrained(MODEL_ID)
+# model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID).to(device).eval()
 # print("Loading CRAFT...")
 # craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
+# # --- 2. HELPER FUNCTIONS ---
 # def get_sorted_boxes(boxes):
 #     """Sorts boxes top-to-bottom (lines), then left-to-right."""
 #     if not boxes: return []
 #         cx = np.mean(box[:, 0])
 #         items.append((cy, cx, box))
+#     # Sort by line (approx 20px tolerance) then by column
+#     items.sort(key=lambda x: (int(x[0] // 20), x[1]))
 #     return [x[2] for x in items]
 # def process_image(image):
 #     if image is None:
+#         return None, [], "Please upload an image."
+#     # Convert to standard RGB Numpy array
+#     # We use the FULL resolution image (no resizing) to keep text sharp
 #     image_np = np.array(image.convert("RGB"))
 #     # 1. DETECT
+#     # The patch ensures coordinates map perfectly to this full-res image
 #     prediction = craft.detect_text(image_np)
 #     boxes = prediction.get("boxes", [])
 #     if not boxes:
+#         return image, [], "No text detected."
 #     sorted_boxes = get_sorted_boxes(boxes)
 #     annotated_img = image_np.copy()
 #     results = []
+#     debug_crops = []
+#     # 2. PROCESS BOXES
 #     for box in sorted_boxes:
 #         box_int = box.astype(np.int32)
+#         # Draw the box (Visual verification)
 #         cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 3)
+#         # --- CROP WITH PADDING (Crucial Fix) ---
+#         # TrOCR needs 'breathing room' or it hallucinates.
+#         PADDING = 10
+#         x_min = max(0, np.min(box_int[:, 0]) - PADDING)
+#         x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING)
+#         y_min = max(0, np.min(box_int[:, 1]) - PADDING)
+#         y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING)
+#         # Skip noise
+#         if (x_max - x_min) < 20 or (y_max - y_min) < 10:
 #             continue
 #         crop = image_np[y_min:y_max, x_min:x_max]
+#         # Convert to PIL for Model
 #         pil_crop = Image.fromarray(crop)
+#         # Add to debug gallery so user can see what the model sees
+#         debug_crops.append(pil_crop)
+#         # 3. RECOGNIZE
 #         with torch.no_grad():
 #             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
 #             generated_ids = model.generate(pixel_values)
 #                 results.append(text)
 #     full_text = "\n".join(results)
+#     return Image.fromarray(annotated_img), debug_crops, full_text
+# # --- 3. GRADIO UI ---
+# with gr.Blocks(theme=gr.themes.Soft()) as demo:
+#     gr.Markdown("# 📝 Robust Handwritten OCR (Base Model)")
+#     gr.Markdown("Includes padding and a stronger model to prevent hallucinations.")
 #     with gr.Row():
+#         with gr.Column(scale=1):
 #             input_img = gr.Image(type="pil", label="Upload Image")
 #             btn = gr.Button("Transcribe", variant="primary")
+#         with gr.Column(scale=1):
 #             output_img = gr.Image(label="Detections")
+#             output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
+#     with gr.Row():
+#         # Gallery to check if crops are valid or empty
+#         crop_gallery = gr.Gallery(label="Debug: See what the model sees (Crops)", columns=6, height=200)
+#     btn.click(process_image, input_img, [output_img, crop_gallery, output_txt])
 # if __name__ == "__main__":
 #     demo.launch()
 import gradio as gr
 import torch
 import numpy as np
 import cv2
 from PIL import Image
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+from paddleocr import PaddleOCR
+# --- 1. SETUP TR-OCR (Recognition) ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Loading TrOCR on {device}...")
+# Using the 'base' model for better accuracy on the crops
+processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
+model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten').to(device).eval()
+# --- 2. SETUP PADDLEOCR (Detection Only) ---
+print("Loading PaddleOCR (DBNet)...")
+# use_angle_cls=True loads the angle classifier, but process_image calls ocr() with cls=False, so it is not applied to the page
+# lang='en' loads the English detection model
+detector = PaddleOCR(use_angle_cls=True, lang='en', show_log=False)
 def get_sorted_boxes(boxes, line_tolerance=20):
     """Return the boxes in reading order: top-to-bottom, then left-to-right.
     Each box is converted to a float32 array and reduced to the mean of its
     x and y coordinates. Boxes are grouped into horizontal bands
     ``line_tolerance`` pixels tall and, inside a band, ordered by mean x.
     Bands are fixed, so two boxes of one text line that straddle a band
     boundary land in different bands.
     Args:
         boxes: Sequence (list or ndarray) of boxes, each a list of points
             ``[[x1, y1], [x2, y2], ...]``. ``None`` or empty gives ``[]``.
         line_tolerance: Band height in pixels (default 20); must be > 0.
     Returns:
         A list of float32 arrays, one per input box, in reading order.
     Raises:
         ValueError: If ``line_tolerance`` is not positive.
     """
     if line_tolerance <= 0:
         raise ValueError("line_tolerance must be positive")
     # ``not boxes`` raises ValueError for a multi-element ndarray, so
     # emptiness is tested through the length instead.
     if boxes is None or len(boxes) == 0:
         return []
     items = []
     for box in boxes:
         # Boxes arrive as plain point lists; convert for column slicing.
         pts = np.array(box).astype(np.float32)
         cy = float(np.mean(pts[:, 1]))
         cx = float(np.mean(pts[:, 0]))
         items.append((cy, cx, pts))
     # The key never compares the arrays themselves, only (band, mean x).
     items.sort(key=lambda item: (int(item[0] // line_tolerance), item[1]))
     return [item[2] for item in items]
     if image is None:
         return None, [], "Please upload an image."
+    # Convert to standard RGB Numpy array (Full Resolution)
     image_np = np.array(image.convert("RGB"))
+    # 1. DETECT with PaddleOCR
+    # cls=False because we don't need orientation classification for just boxes
+    # rec=False because we ONLY want boxes (we will use TrOCR to read)
+    result = detector.ocr(image_np, cls=False, rec=False)
+    # Paddle returns a list of results (one per page). We just have 1 page.
+    if not result or result[0] is None:
         return image, [], "No text detected."
+    # Extract boxes from result
+    boxes = result[0] # [[x1, y1], [x2, y2], ...]
     sorted_boxes = get_sorted_boxes(boxes)
     annotated_img = image_np.copy()
     results = []
     for box in sorted_boxes:
         box_int = box.astype(np.int32)
+        # Draw the box (Red, thickness 2)
+        cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 2)
+        # --- CROP WITH PADDING ---
+        # Padding helps TrOCR see the start/end of letters
+        PADDING = 8
         x_min = max(0, np.min(box_int[:, 0]) - PADDING)
         x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING)
         y_min = max(0, np.min(box_int[:, 1]) - PADDING)
         y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING)
+        # Skip tiny noise
+        if (x_max - x_min) < 15 or (y_max - y_min) < 10:
             continue
         crop = image_np[y_min:y_max, x_min:x_max]
         pil_crop = Image.fromarray(crop)
         debug_crops.append(pil_crop)
+        # 3. RECOGNIZE (TrOCR)
         with torch.no_grad():
             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
             generated_ids = model.generate(pixel_values)
     return Image.fromarray(annotated_img), debug_crops, full_text
+# --- GRADIO UI ---
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# ⚡ PaddleOCR + TrOCR")
+    gr.Markdown("Using **PaddleOCR (DBNet)** for sharp detection on cramped text, and **TrOCR** for reading.")
     with gr.Row():
         with gr.Column(scale=1):
             btn = gr.Button("Transcribe", variant="primary")
         with gr.Column(scale=1):
+            output_img = gr.Image(label="Detections (Paddle)")
             output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
     with gr.Row():
+        gallery = gr.Gallery(label="Line Crops", columns=6, height=200)
+    btn.click(process_image, input_img, [output_img, gallery, output_txt])
 if __name__ == "__main__":
     demo.launch()