Spaces:

iammraat
/

ocr

Sleeping

App Files Community

iammraat committed on Jan 24

Commit

1ddb5bb

verified ·

1 Parent(s): f8bcded

Update app.py

Browse files

Files changed (1) hide show

app.py +163 -73

app.py CHANGED Viewed

@@ -66,6 +66,141 @@
 import gradio as gr
 import torch
 import numpy as np
@@ -74,122 +209,77 @@ from PIL import Image
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 from craft_text_detector import Craft
-# ----------------------------
-# Device
-# ----------------------------
 device = "cuda" if torch.cuda.is_available() else "cpu"
-# ----------------------------
-# Load TrOCR
-# ----------------------------
 print("Loading TrOCR model...")
 processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
-model = VisionEncoderDecoderModel.from_pretrained(
-    "microsoft/trocr-small-handwritten"
-)
 model.to(device)
 model.eval()
-# ----------------------------
-# Load CRAFT
-# ----------------------------
 print("Loading CRAFT text detector...")
-craft = Craft(
-    output_dir=None,
-    crop_type="poly",
-    cuda=(device == "cuda"),
-)
-# ----------------------------
-# Sort boxes (reading order)
-# ----------------------------
 def get_sorted_boxes(boxes):
     items = []
     for box in boxes:
         cx = np.mean(box[:, 0])
         cy = np.mean(box[:, 1])
         items.append((cy, cx, box))
-    # group by line (roughly)
     items.sort(key=lambda x: (int(x[0] // 20), x[1]))
     return [b for _, _, b in items]
-# ----------------------------
-# OCR Pipeline
-# ----------------------------
-def process_full_page(image: Image.Image):
-    # ALWAYS return (image_or_None, text)
     if image is None:
         return None, "Please upload an image."
     image_np = np.array(image)
     prediction = craft.detect_text(image_np)
     boxes = prediction.get("boxes", [])
     if not boxes:
         return image, "No text detected."
     sorted_boxes = get_sorted_boxes(boxes)
     annotated = image_np.copy()
     texts = []
     for box in sorted_boxes:
         box = box.astype(int)
         cv2.polylines(annotated, [box], True, (255, 0, 0), 2)
         x_min = max(0, box[:, 0].min())
         x_max = min(image_np.shape[1], box[:, 0].max())
         y_min = max(0, box[:, 1].min())
         y_max = min(image_np.shape[0], box[:, 1].max())
         if x_max - x_min < 5 or y_max - y_min < 5:
             continue
         crop = image_np[y_min:y_max, x_min:x_max]
         pil_crop = Image.fromarray(crop).convert("RGB")
         with torch.no_grad():
-            pixels = processor(
-                images=pil_crop,
-                return_tensors="pt"
-            ).pixel_values.to(device)
             ids = model.generate(pixels)
-            text = processor.batch_decode(
-                ids, skip_special_tokens=True
-            )[0]
-        if text.strip():
-            texts.append(text)
     final_text = " ".join(texts)
     return Image.fromarray(annotated), final_text
-# ----------------------------
-# Gradio UI
-# ----------------------------
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown("# 🕵️‍♀️ Full-Page Handwritten OCR")
-    gr.Markdown("**CRAFT ➜ TrOCR** (Detection + Recognition)")
-    with gr.Row():
-        input_img = gr.Image(type="pil", label="Upload Full Page")
-    with gr.Row():
-        vis_output = gr.Image(label="Detections")
-        text_output = gr.Textbox(label="Extracted Text", lines=10)
-    btn = gr.Button("Process Page", variant="primary")
-    btn.click(
-        fn=process_full_page,
-        inputs=input_img,
-        outputs=[vis_output, text_output],
-    )
 if __name__ == "__main__":
-    demo.launch(
-    server_name="0.0.0.0",
-    server_port=7860,
-    show_api=False,
-)

+# import gradio as gr
+# import torch
+# import numpy as np
+# import cv2
+# from PIL import Image
+# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+# from craft_text_detector import Craft
+# # ----------------------------
+# # Device
+# # ----------------------------
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# # ----------------------------
+# # Load TrOCR
+# # ----------------------------
+# print("Loading TrOCR model...")
+# processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
+# model = VisionEncoderDecoderModel.from_pretrained(
+#     "microsoft/trocr-small-handwritten"
+# )
+# model.to(device)
+# model.eval()
+# # ----------------------------
+# # Load CRAFT
+# # ----------------------------
+# print("Loading CRAFT text detector...")
+# craft = Craft(
+#     output_dir=None,
+#     crop_type="poly",
+#     cuda=(device == "cuda"),
+# )
+# # ----------------------------
+# # Sort boxes (reading order)
+# # ----------------------------
+# def get_sorted_boxes(boxes):
+#     items = []
+#     for box in boxes:
+#         cx = np.mean(box[:, 0])
+#         cy = np.mean(box[:, 1])
+#         items.append((cy, cx, box))
+#     # group by line (roughly)
+#     items.sort(key=lambda x: (int(x[0] // 20), x[1]))
+#     return [b for _, _, b in items]
+# # ----------------------------
+# # OCR Pipeline
+# # ----------------------------
+# def process_full_page(image: Image.Image):
+#     # ALWAYS return (image_or_None, text)
+#     if image is None:
+#         return None, "Please upload an image."
+#     image_np = np.array(image)
+#     prediction = craft.detect_text(image_np)
+#     boxes = prediction.get("boxes", [])
+#     if not boxes:
+#         return image, "No text detected."
+#     sorted_boxes = get_sorted_boxes(boxes)
+#     annotated = image_np.copy()
+#     texts = []
+#     for box in sorted_boxes:
+#         box = box.astype(int)
+#         cv2.polylines(annotated, [box], True, (255, 0, 0), 2)
+#         x_min = max(0, box[:, 0].min())
+#         x_max = min(image_np.shape[1], box[:, 0].max())
+#         y_min = max(0, box[:, 1].min())
+#         y_max = min(image_np.shape[0], box[:, 1].max())
+#         if x_max - x_min < 5 or y_max - y_min < 5:
+#             continue
+#         crop = image_np[y_min:y_max, x_min:x_max]
+#         pil_crop = Image.fromarray(crop).convert("RGB")
+#         with torch.no_grad():
+#             pixels = processor(
+#                 images=pil_crop,
+#                 return_tensors="pt"
+#             ).pixel_values.to(device)
+#             ids = model.generate(pixels)
+#             text = processor.batch_decode(
+#                 ids, skip_special_tokens=True
+#             )[0]
+#         if text.strip():
+#             texts.append(text)
+#     final_text = " ".join(texts)
+#     return Image.fromarray(annotated), final_text
+# # ----------------------------
+# # Gradio UI
+# # ----------------------------
+# with gr.Blocks(theme=gr.themes.Soft()) as demo:
+#     gr.Markdown("# 🕵️‍♀️ Full-Page Handwritten OCR")
+#     gr.Markdown("**CRAFT ➜ TrOCR** (Detection + Recognition)")
+#     with gr.Row():
+#         input_img = gr.Image(type="pil", label="Upload Full Page")
+#     with gr.Row():
+#         vis_output = gr.Image(label="Detections")
+#         text_output = gr.Textbox(label="Extracted Text", lines=10)
+#     btn = gr.Button("Process Page", variant="primary")
+#     btn.click(
+#         fn=process_full_page,
+#         inputs=input_img,
+#         outputs=[vis_output, text_output],
+#     )
+# if __name__ == "__main__":
+#     demo.launch(
+#     server_name="0.0.0.0",
+#     server_port=7860,
+#     show_api=False,
+# )
 import gradio as gr
 import torch
 import numpy as np
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 from craft_text_detector import Craft
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Loading TrOCR model...")
 processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
+model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-handwritten")
 model.to(device)
 model.eval()
 print("Loading CRAFT text detector...")
+craft = Craft(output_dir=None, crop_type="poly", cuda=(device == "cuda"))
 def get_sorted_boxes(boxes):
     items = []
     for box in boxes:
         cx = np.mean(box[:, 0])
         cy = np.mean(box[:, 1])
         items.append((cy, cx, box))
     items.sort(key=lambda x: (int(x[0] // 20), x[1]))
     return [b for _, _, b in items]
+def process_full_page(image):
     if image is None:
         return None, "Please upload an image."
     image_np = np.array(image)
     prediction = craft.detect_text(image_np)
     boxes = prediction.get("boxes", [])
     if not boxes:
         return image, "No text detected."
     sorted_boxes = get_sorted_boxes(boxes)
     annotated = image_np.copy()
     texts = []
     for box in sorted_boxes:
         box = box.astype(int)
         cv2.polylines(annotated, [box], True, (255, 0, 0), 2)
         x_min = max(0, box[:, 0].min())
         x_max = min(image_np.shape[1], box[:, 0].max())
         y_min = max(0, box[:, 1].min())
         y_max = min(image_np.shape[0], box[:, 1].max())
         if x_max - x_min < 5 or y_max - y_min < 5:
             continue
         crop = image_np[y_min:y_max, x_min:x_max]
         pil_crop = Image.fromarray(crop).convert("RGB")
         with torch.no_grad():
+            pixels = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
             ids = model.generate(pixels)
+            text = processor.batch_decode(ids, skip_special_tokens=True)[0]
+            if text.strip():
+                texts.append(text)
     final_text = " ".join(texts)
     return Image.fromarray(annotated), final_text
+demo = gr.Interface(
+    fn=process_full_page,
+    inputs=gr.Image(type="pil", label="Upload Full Page"),
+    outputs=[
+        gr.Image(label="Detections"),
+        gr.Textbox(label="Extracted Text", lines=10)
+    ],
+    title="🕵️‍♀️ Full-Page Handwritten OCR",
+    description="CRAFT ➜ TrOCR (Detection + Recognition)"
+)
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)