Spaces:

iammraat
/

ocr

Sleeping

App Files Files Community

iammraat committed on Jan 23

Commit

a29782b

verified ·

1 Parent(s): b00ad18

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -43

app.py CHANGED Viewed

@@ -1,65 +1,189 @@
 import gradio as gr
-from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 import torch
 from PIL import Image
-# --- Model Setup ---
-# We load the model outside the inference function to cache it on startup
-MODEL_ID = "microsoft/trocr-base-handwritten"
-print(f"Loading {MODEL_ID}...")
-processor = TrOCRProcessor.from_pretrained(MODEL_ID)
-model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
-# Check for GPU (Free Spaces are usually CPU-only, but this handles upgrades)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
-print(f"Model loaded on device: {device}")
-# --- Inference Function ---
-def process_image(image):
     if image is None:
         return "Please upload an image."
-    try:
-        # 1. Convert to RGB (standardizes input)
-        image = image.convert("RGB")
-        # 2. Preprocess
-        pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
-        # 3. Generate text
         generated_ids = model.generate(pixel_values)
-        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        return generated_text
-    except Exception as e:
-        return f"Error: {str(e)}"
-# --- Gradio Interface ---
-# Using the Blocks API for a clean layout
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
-    gr.Markdown(
-        """
-        # ✍️ Handwritten Text Recognition
-        Using Microsoft's **TrOCR Small** model. Upload a handwritten note to transcribe it.
-        """
-    )
     with gr.Row():
-        with gr.Column():
-            input_img = gr.Image(type="pil", label="Upload Image")
-            submit_btn = gr.Button("Transcribe", variant="primary")
-        with gr.Column():
-            output_text = gr.Textbox(label="Result", interactive=False)
-    # Examples help users test it immediately without uploading their own file
-    # (Uncomment the list below if you upload example images to your repo)
-    # gr.Examples(["sample1.jpg"], inputs=input_img)
-    submit_btn.click(fn=process_image, inputs=input_img, outputs=output_text)
-# Launch for Spaces
 if __name__ == "__main__":
     demo.launch()

+# import gradio as gr
+# from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+# import torch
+# from PIL import Image
+# # --- Model Setup ---
+# # We load the model outside the inference function to cache it on startup
+# MODEL_ID = "microsoft/trocr-base-handwritten"
+# print(f"Loading {MODEL_ID}...")
+# processor = TrOCRProcessor.from_pretrained(MODEL_ID)
+# model = VisionEncoderDecoderModel.from_pretrained(MODEL_ID)
+# # Check for GPU (Free Spaces are usually CPU-only, but this handles upgrades)
+# device = "cuda" if torch.cuda.is_available() else "cpu"
+# model.to(device)
+# print(f"Model loaded on device: {device}")
+# # --- Inference Function ---
+# def process_image(image):
+#     if image is None:
+#         return "Please upload an image."
+#     try:
+#         # 1. Convert to RGB (standardizes input)
+#         image = image.convert("RGB")
+#         # 2. Preprocess
+#         pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
+#         # 3. Generate text
+#         generated_ids = model.generate(pixel_values)
+#         generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+#         return generated_text
+#     except Exception as e:
+#         return f"Error: {str(e)}"
+# # --- Gradio Interface ---
+# # Using the Blocks API for a clean layout
+# with gr.Blocks(theme=gr.themes.Soft()) as demo:
+#     gr.Markdown(
+#         """
+#         # ✍️ Handwritten Text Recognition
+#         Using Microsoft's **TrOCR Small** model. Upload a handwritten note to transcribe it.
+#         """
+#     )
+#     with gr.Row():
+#         with gr.Column():
+#             input_img = gr.Image(type="pil", label="Upload Image")
+#             submit_btn = gr.Button("Transcribe", variant="primary")
+#         with gr.Column():
+#             output_text = gr.Textbox(label="Result", interactive=False)
+#     # Examples help users test it immediately without uploading their own file
+#     # (Uncomment the list below if you upload example images to your repo)
+#     # gr.Examples(["sample1.jpg"], inputs=input_img)
+#     submit_btn.click(fn=process_image, inputs=input_img, outputs=output_text)
+# # Launch for Spaces
+# if __name__ == "__main__":
+#     demo.launch()
 import gradio as gr
 import torch
+import numpy as np
+import cv2
 from PIL import Image
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+from craft_text_detector import Craft
+# --- 1. Load TrOCR (Recognition) ---
+# The processor and model are created at module import, outside the request
+# handler, so they are loaded once rather than on every call.
+print("Loading TrOCR model...")
+processor = TrOCRProcessor.from_pretrained('microsoft/trocr-small-handwritten')
+model = VisionEncoderDecoderModel.from_pretrained('microsoft/trocr-small-handwritten')
+# Use the GPU when torch can see one; otherwise fall back to the CPU.
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
+# --- 2. Load CRAFT (Detection) ---
+print("Loading CRAFT text detector...")
+# CRAFT's link refiner helps connect individual characters into words/lines.
+# No refiner option is passed below, so the library's default setting applies.
+# NOTE(review): output_dir=None presumably disables writing detection
+# artefacts to disk - confirm against the craft_text_detector documentation.
+# CRAFT is placed on the same device that was chosen for TrOCR above.
+craft = Craft(output_dir=None, crop_type="poly", cuda=(device == "cuda"))
+# --- Helper: Sort Boxes (Reading Order) ---
+def get_sorted_boxes(boxes, row_height=20):
+    """
+    Sort boxes from top-to-bottom, then left-to-right.
+
+    Each box is reduced to its centroid. Centroids are bucketed into
+    horizontal bands of `row_height` pixels; boxes are ordered by band and,
+    within a band, by centroid x. Boxes sharing band and x keep their
+    input order (the sort is stable).
+
+    This is a naive heuristic: it assumes roughly horizontal lines, and two
+    boxes on the same visual line can still land in different bands when
+    their centroids straddle a band boundary.
+
+    Args:
+        boxes: iterable of boxes, each an (N, 2) sequence of [x, y] points
+            (NumPy arrays or nested lists).
+        row_height: band height in pixels used to group boxes into lines.
+            Defaults to 20, the value previously hard-coded.
+
+    Returns:
+        A list of the boxes as NumPy arrays, in reading order.
+
+    Raises:
+        ValueError: if `row_height` is not positive.
+    """
+    if row_height <= 0:
+        raise ValueError("row_height must be positive")
+    keyed = []
+    for box in boxes:
+        # Accept plain nested lists as well as arrays; asarray is a no-op
+        # (same object) when the box is already an ndarray.
+        points = np.asarray(box)
+        x_center = float(np.mean(points[:, 0]))
+        y_center = float(np.mean(points[:, 1]))
+        keyed.append((int(y_center // row_height), x_center, points))
+    # Sort on (band, x) only so the arrays themselves are never compared.
+    keyed.sort(key=lambda item: (item[0], item[1]))
+    return [points for _, _, points in keyed]
+# --- Main Inference Pipeline ---
+def process_full_page(image):
+    """
+    Run the full-page OCR pipeline: CRAFT detection, then TrOCR recognition.
+
+    Args:
+        image: PIL image from the Gradio input, or None when nothing was
+            uploaded.
+
+    Returns:
+        A 2-tuple `(annotated_image, text)`. The click handler binds this
+        function to two output components, so every return path yields two
+        values. `annotated_image` is a PIL image with the detected boxes
+        drawn on it (None when no image was supplied); `text` is the
+        space-joined transcription or a status message.
+    """
     if image is None:
+        return None, "Please upload an image."
+    # Normalise to 3-channel RGB so grayscale, RGBA and palette uploads all
+    # yield an (H, W, 3) array for detection, drawing and cropping.
+    image_np = np.array(image.convert("RGB"))
+    # 1. DETECT TEXT REGIONS
+    # prediction_result returns: {"boxes": [...], "polys": [...], "heatmaps": ...}
+    prediction_result = craft.detect_text(image_np)
+    boxes = prediction_result["boxes"]
+    if len(boxes) == 0:
+        # Still return an image so the visualisation output is populated.
+        return Image.fromarray(image_np), "No text detected."
+    # 2. SORT BOXES (Reading Order)
+    sorted_boxes = get_sorted_boxes(boxes)
+    # 3. RECOGNIZE TEXT (Iterate through crops)
+    full_text = []
+    # Draw on a copy so crops are taken from the unannotated pixels.
+    annotated_img = image_np.copy()
+    page_height, page_width = image_np.shape[:2]
+    for box in sorted_boxes:
+        # OpenCV drawing functions need 32-bit integer points; astype(int)
+        # would give int64 on most 64-bit platforms.
+        box = np.asarray(box).astype(np.int32)
+        # Draw box on visualization
+        cv2.polylines(annotated_img, [box], True, (255, 0, 0), 2)
+        # Axis-aligned bounding rectangle of the box, clamped to the page.
+        x_min = max(0, int(box[:, 0].min()))
+        x_max = min(page_width, int(box[:, 0].max()))
+        y_min = max(0, int(box[:, 1].min()))
+        y_max = min(page_height, int(box[:, 1].max()))
+        # Skip crops that are empty or too small to hold legible text.
+        if x_max - x_min < 5 or y_max - y_min < 5:
+            continue
+        cropped_region = image_np[y_min:y_max, x_min:x_max]
+        # Convert crop back to PIL for TrOCR (already RGB, see above).
+        pil_crop = Image.fromarray(cropped_region)
+        # Run TrOCR
+        pixel_values = processor(images=pil_crop, return_tensors="pt").pixel_values.to(device)
         generated_ids = model.generate(pixel_values)
+        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        full_text.append(text)
+    # Join detected pieces
+    final_output = " ".join(full_text)
+    return Image.fromarray(annotated_img), final_output
+# --- Gradio UI ---
+# Layout: one row holding the upload widget, a second row holding the
+# detection preview next to the transcription, and a button that runs the
+# pipeline.
 with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown("# 🕵️‍♀️ Full-Page Handwritten OCR")
+    gr.Markdown("Pipeline: **CRAFT** (Detection) ➡️ **TrOCR** (Recognition)")
     with gr.Row():
+        # type="pil": process_full_page works on a PIL image (or None).
+        input_img = gr.Image(type="pil", label="Upload Full Page")
+    with gr.Row():
+        vis_output = gr.Image(label="Detections", type="pil")
+        text_output = gr.Textbox(label="Extracted Text", lines=10)
+    submit_btn = gr.Button("Process Page", variant="primary")
+    # Two outputs are bound here, in the order (image, text).
+    submit_btn.click(fn=process_full_page, inputs=input_img, outputs=[vis_output, text_output])
+# Start the app only when this file is executed as a script.
 if __name__ == "__main__":
     demo.launch()