Spaces:

iammraat
/

ocr

Sleeping

App Files Community

iammraat committed on Jan 24

Commit

b02cd5a

verified ·

1 Parent(s): 80cf5fd

Update app.py

Browse files

Files changed (1) hide show

app.py +211 -39

app.py CHANGED Viewed

@@ -310,6 +310,164 @@
 #     demo.launch(server_name="0.0.0.0", server_port=7860)
 import gradio as gr
 import torch
 import numpy as np
@@ -319,7 +477,7 @@ from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 from craft_text_detector import Craft
 # ==========================================
-# 🔧 PATCH 1: Fix Torchvision (From your code)
 # ==========================================
 import torchvision.models.vgg
 if not hasattr(torchvision.models.vgg, 'model_urls'):
@@ -328,7 +486,7 @@ if not hasattr(torchvision.models.vgg, 'model_urls'):
     }
 # ==========================================
-# 🔧 PATCH 2: The Logic Fix (Ratio Net)
 # ==========================================
 import craft_text_detector.craft_utils as craft_utils_module
@@ -341,11 +499,10 @@ def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=2):
         if poly is None or len(poly) == 0:
             continue
-        # Safe numpy conversion
         p = np.array(poly).reshape(-1, 2)
-        # CRITICAL FIX: Multiply by ratio_net (defaults to 2)
-        # This scales the 1/2 size heatmap output back to full image size
         p[:, 0] *= (ratio_w * ratio_net)
         p[:, 1] *= (ratio_h * ratio_net)
@@ -353,21 +510,24 @@ def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=2):
     return adjusted
-# Apply the patch
 craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
 # ==========================================
-# --- Load TrOCR (Recognition) ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
-print(f"Loading TrOCR on {device}...")
-processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-handwritten')
-model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-small-handwritten').to(device).eval()
-# --- Load CRAFT (Detection) ---
 print("Loading CRAFT...")
-# crop_type="box" ensures we get clean rectangles
 craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
 def get_sorted_boxes(boxes):
     """Sorts boxes top-to-bottom (lines), then left-to-right."""
     if not boxes: return []
@@ -377,54 +537,60 @@ def get_sorted_boxes(boxes):
         cx = np.mean(box[:, 0])
         items.append((cy, cx, box))
-    # Sort by Y (grouping by 40px lines) then X
-    items.sort(key=lambda x: (int(x[0] // 40), x[1]))
     return [x[2] for x in items]
 def process_image(image):
     if image is None:
-        return None, "Please upload an image."
-    # Convert to numpy
     image_np = np.array(image.convert("RGB"))
     # 1. DETECT
-    # The patch we added above will now auto-multiply coordinates by 2 * ratio
-    # fixing the "tiny box" issue.
     prediction = craft.detect_text(image_np)
     boxes = prediction.get("boxes", [])
     if not boxes:
-        return image, "No text detected."
-    # 2. VISUALIZE & CROP
     sorted_boxes = get_sorted_boxes(boxes)
     annotated_img = image_np.copy()
     results = []
     for box in sorted_boxes:
-        # Cast to int for drawing
         box_int = box.astype(np.int32)
-        # Draw on image (Blue, thickness 3)
         cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 3)
-        # Get Crop Coordinates
-        x_min = max(0, np.min(box_int[:, 0]))
-        x_max = min(image_np.shape[1], np.max(box_int[:, 0]))
-        y_min = max(0, np.min(box_int[:, 1]))
-        y_max = min(image_np.shape[0], np.max(box_int[:, 1]))
-        # Filter noise
-        if (x_max - x_min) < 10 or (y_max - y_min) < 10:
             continue
         crop = image_np[y_min:y_max, x_min:x_max]
-        if crop.size == 0: continue
         pil_crop = Image.fromarray(crop)
-        # 3. RECOGNIZE (TrOCR)
         with torch.no_grad():
             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
             generated_ids = model.generate(pixel_values)
@@ -434,22 +600,28 @@ def process_image(image):
                 results.append(text)
     full_text = "\n".join(results)
-    return Image.fromarray(annotated_img), full_text
-# --- Gradio UI ---
-with gr.Blocks(title="Handwritten OCR Fixed") as demo:
-    gr.Markdown("# 📝 Handwritten OCR (Fixed Pipeline)")
     with gr.Row():
-        with gr.Column():
             input_img = gr.Image(type="pil", label="Upload Image")
             btn = gr.Button("Transcribe", variant="primary")
-        with gr.Column():
             output_img = gr.Image(label="Detections")
-            output_txt = gr.Textbox(label="Result", lines=20)
-    btn.click(process_image, input_img, [output_img, output_txt])
 if __name__ == "__main__":
     demo.launch()

 #     demo.launch(server_name="0.0.0.0", server_port=7860)
+# import gradio as gr
+# import torch
+# import numpy as np
+# import cv2
+# from PIL import Image
+# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+# from craft_text_detector import Craft
+# # ==========================================
+# # 🔧 PATCH 1: Fix Torchvision (From your code)
+# # ==========================================
+# import torchvision.models.vgg
+# if not hasattr(torchvision.models.vgg, 'model_urls'):
+#     torchvision.models.vgg.model_urls = {
+#         'vgg16_bn': 'https://download.pytorch.org/models/vgg16_bn-6c64b313.pth'
+#     }
+# # ==========================================
+# # 🔧 PATCH 2: The Logic Fix (Ratio Net)
+# # ==========================================
+# import craft_text_detector.craft_utils as craft_utils_module
+# def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h, ratio_net=2):
+#     if not polys:
+#         return []
+#     adjusted = []
+#     for poly in polys:
+#         if poly is None or len(poly) == 0:
+#             continue
+#         # Safe numpy conversion
+#         p = np.array(poly).reshape(-1, 2)
+#         # CRITICAL FIX: Multiply by ratio_net (defaults to 2)
+#         # This scales the 1/2 size heatmap output back to full image size
+#         p[:, 0] *= (ratio_w * ratio_net)
+#         p[:, 1] *= (ratio_h * ratio_net)
+#         adjusted.append(p)
+#     return adjusted
+# # Apply the patch
+# craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
+# # ==========================================
+# # --- Load TrOCR (Recognition) ---
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# print(f"Loading TrOCR on {device}...")
+# processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-handwritten')
+# model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-small-handwritten').to(device).eval()
+# # --- Load CRAFT (Detection) ---
+# print("Loading CRAFT...")
+# # crop_type="box" ensures we get clean rectangles
+# craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
+# def get_sorted_boxes(boxes):
+#     """Sorts boxes top-to-bottom (lines), then left-to-right."""
+#     if not boxes: return []
+#     items = []
+#     for box in boxes:
+#         cy = np.mean(box[:, 1])
+#         cx = np.mean(box[:, 0])
+#         items.append((cy, cx, box))
+#     # Sort by Y (grouping by 40px lines) then X
+#     items.sort(key=lambda x: (int(x[0] // 40), x[1]))
+#     return [x[2] for x in items]
+# def process_image(image):
+#     if image is None:
+#         return None, "Please upload an image."
+#     # Convert to numpy
+#     image_np = np.array(image.convert("RGB"))
+#     # 1. DETECT
+#     # The patch we added above will now auto-multiply coordinates by 2 * ratio
+#     # fixing the "tiny box" issue.
+#     prediction = craft.detect_text(image_np)
+#     boxes = prediction.get("boxes", [])
+#     if not boxes:
+#         return image, "No text detected."
+#     # 2. VISUALIZE & CROP
+#     sorted_boxes = get_sorted_boxes(boxes)
+#     annotated_img = image_np.copy()
+#     results = []
+#     for box in sorted_boxes:
+#         # Cast to int for drawing
+#         box_int = box.astype(np.int32)
+#         # Draw on image (Blue, thickness 3)
+#         cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 3)
+#         # Get Crop Coordinates
+#         x_min = max(0, np.min(box_int[:, 0]))
+#         x_max = min(image_np.shape[1], np.max(box_int[:, 0]))
+#         y_min = max(0, np.min(box_int[:, 1]))
+#         y_max = min(image_np.shape[0], np.max(box_int[:, 1]))
+#         # Filter noise
+#         if (x_max - x_min) < 10 or (y_max - y_min) < 10:
+#             continue
+#         crop = image_np[y_min:y_max, x_min:x_max]
+#         if crop.size == 0: continue
+#         pil_crop = Image.fromarray(crop)
+#         # 3. RECOGNIZE (TrOCR)
+#         with torch.no_grad():
+#             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
+#             generated_ids = model.generate(pixel_values)
+#             text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+#             if text.strip():
+#                 results.append(text)
+#     full_text = "\n".join(results)
+#     return Image.fromarray(annotated_img), full_text
+# # --- Gradio UI ---
+# with gr.Blocks(title="Handwritten OCR Fixed") as demo:
+#     gr.Markdown("# 📝 Handwritten OCR (Fixed Pipeline)")
+#     with gr.Row():
+#         with gr.Column():
+#             input_img = gr.Image(type="pil", label="Upload Image")
+#             btn = gr.Button("Transcribe", variant="primary")
+#         with gr.Column():
+#             output_img = gr.Image(label="Detections")
+#             output_txt = gr.Textbox(label="Result", lines=20)
+#     btn.click(process_image, input_img, [output_img, output_txt])
+# if __name__ == "__main__":
+#     demo.launch()
 import gradio as gr
 import torch
 import numpy as np
 from craft_text_detector import Craft
 # ==========================================
+# 🔧 PATCH 1: Fix Torchvision Compatibility
 # ==========================================
 import torchvision.models.vgg
 if not hasattr(torchvision.models.vgg, 'model_urls'):
     }
 # ==========================================
+# 🔧 PATCH 2: The "Ratio Net" Logic Fix
 # ==========================================
 import craft_text_detector.craft_utils as craft_utils_module
         if poly is None or len(poly) == 0:
             continue
+        # Convert to numpy and reshape
         p = np.array(poly).reshape(-1, 2)
+        # Scale correctly using ratio_net
         p[:, 0] *= (ratio_w * ratio_net)
         p[:, 1] *= (ratio_h * ratio_net)
     return adjusted
 craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
 # ==========================================
+# --- 1. SETUP MODEL (Switched to BASE for stability) ---
 device = "cuda" if torch.cuda.is_available() else "cpu"
+print(f"Loading TrOCR-Base on {device}...")
+# We use the 'base' model because 'small' hallucinates Wikipedia text on tight crops
+MODEL_ID = "microsoft/trocr-base-handwritten"
+processor = TrOCRProcessor.from_pretrained(MODEL_ID)
+model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID).to(device).eval()
 print("Loading CRAFT...")
 craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
+# --- 2. HELPER FUNCTIONS ---
 def get_sorted_boxes(boxes):
     """Sorts boxes top-to-bottom (lines), then left-to-right."""
     if not boxes: return []
         cx = np.mean(box[:, 0])
         items.append((cy, cx, box))
+    # Sort by line (approx 20px tolerance) then by column
+    items.sort(key=lambda x: (int(x[0] // 20), x[1]))
     return [x[2] for x in items]
 def process_image(image):
     if image is None:
+        return None, [], "Please upload an image."
+    # Convert to standard RGB Numpy array
+    # We use the FULL resolution image (no resizing) to keep text sharp
     image_np = np.array(image.convert("RGB"))
     # 1. DETECT
+    # The patch ensures coordinates map perfectly to this full-res image
     prediction = craft.detect_text(image_np)
     boxes = prediction.get("boxes", [])
     if not boxes:
+        return image, [], "No text detected."
     sorted_boxes = get_sorted_boxes(boxes)
     annotated_img = image_np.copy()
     results = []
+    debug_crops = []
+    # 2. PROCESS BOXES
     for box in sorted_boxes:
         box_int = box.astype(np.int32)
+        # Draw the box (Visual verification)
         cv2.polylines(annotated_img, [box_int], True, (255, 0, 0), 3)
+        # --- CROP WITH PADDING (Crucial Fix) ---
+        # TrOCR needs 'breathing room' or it hallucinates.
+        PADDING = 10
+        x_min = max(0, np.min(box_int[:, 0]) - PADDING)
+        x_max = min(image_np.shape[1], np.max(box_int[:, 0]) + PADDING)
+        y_min = max(0, np.min(box_int[:, 1]) - PADDING)
+        y_max = min(image_np.shape[0], np.max(box_int[:, 1]) + PADDING)
+        # Skip noise
+        if (x_max - x_min) < 20 or (y_max - y_min) < 10:
             continue
         crop = image_np[y_min:y_max, x_min:x_max]
+        # Convert to PIL for Model
         pil_crop = Image.fromarray(crop)
+        # Add to debug gallery so user can see what the model sees
+        debug_crops.append(pil_crop)
+        # 3. RECOGNIZE
         with torch.no_grad():
             pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
             generated_ids = model.generate(pixel_values)
                 results.append(text)
     full_text = "\n".join(results)
+    return Image.fromarray(annotated_img), debug_crops, full_text
+# --- 3. GRADIO UI ---
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 📝 Robust Handwritten OCR (Base Model)")
+    gr.Markdown("Includes padding and a stronger model to prevent hallucinations.")
     with gr.Row():
+        with gr.Column(scale=1):
             input_img = gr.Image(type="pil", label="Upload Image")
             btn = gr.Button("Transcribe", variant="primary")
+        with gr.Column(scale=1):
             output_img = gr.Image(label="Detections")
+            output_txt = gr.Textbox(label="Extracted Text", lines=15, show_copy_button=True)
+    with gr.Row():
+        # Gallery to check if crops are valid or empty
+        crop_gallery = gr.Gallery(label="Debug: See what the model sees (Crops)", columns=6, height=200)
+    btn.click(process_image, input_img, [output_img, crop_gallery, output_txt])
 if __name__ == "__main__":
     demo.launch()