Spaces:

Sathiyapramod
/

image_processing_ocr

Sleeping

App Files Files Community

Sathiyapramod committed on Apr 12

Commit

0cc296e

verified ·

1 Parent(s): a0cc850

Update app.py

Browse files

Files changed (1) hide show

app.py +86 -23

app.py CHANGED Viewed

@@ -2,17 +2,39 @@ import gradio as gr
 from PIL import Image
 import numpy as np
 import cv2
-from transformers import AutoModel
-model = AutoModel.from_pretrained("deepseek-ai/DeepSeek-OCR-2", trust_remote_code=True, dtype="auto")
-def segment_lines(image):
-    # Convert to OpenCV format
-    img = np.array(image.convert("L"))
-    # Threshold
-    _, thresh = cv2.threshold(img, 150, 255, cv2.THRESH_BINARY_INV)
     # Horizontal projection
     horizontal_sum = np.sum(thresh, axis=1)
@@ -28,35 +50,76 @@ def segment_lines(image):
             lines.append((start, end))
             start = None
-    # Extract line images
     line_images = []
     for (s, e) in lines:
-        cropped = image.crop((0, s, image.width, e))
-        line_images.append(cropped)
     return line_images
 def predict(image):
     if image is None:
-        return "Upload an image"
-    lines = segment_lines(image)
-    results = []
-    for line_img in lines:
-        pixel_values = processor(images=line_img, return_tensors="pt").pixel_values
-        generated_ids = model.generate(pixel_values)
-        text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        results.append(text)
-    return "\n".join(results)
-gr.Interface(
     fn=predict,
-    inputs=gr.Image(type="pil"),
     outputs=gr.Textbox(label="Extracted Text"),
-    title="📝 Multi-line Handwritten OCR",
-).launch()

 from PIL import Image
 import numpy as np
 import cv2
+import torch
+from transformers import TrOCRProcessor, VisionEncoderDecoderModel
+# =========================
+# Model Loader (cached)
+# =========================
+processor = None
+model = None
+device = "cuda" if torch.cuda.is_available() else "cpu"
+def load_model():
+    global processor, model
+    if processor is None or model is None:
+        processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
+        model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
+        model.to(device)
+# =========================
+# Line Segmentation Logic
+# =========================
+def segment_lines(image: Image.Image):
+    """
+    Splits image into individual text lines using horizontal projection
+    """
+    # Convert to grayscale
+    gray = np.array(image.convert("L"))
+    # Apply thresholding
+    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
     # Horizontal projection
     horizontal_sum = np.sum(thresh, axis=1)
             lines.append((start, end))
             start = None
+    # Edge case: last line
+    if start is not None:
+        lines.append((start, len(horizontal_sum)))
+    # Crop line images
     line_images = []
     for (s, e) in lines:
+        # Add small padding
+        top = max(0, s - 5)
+        bottom = min(image.height, e + 5)
+        cropped = image.crop((0, top, image.width, bottom))
+        # Skip very small/noise regions
+        if bottom - top > 10:
+            line_images.append(cropped)
     return line_images
+# =========================
+# OCR Prediction
+# =========================
 def predict(image):
+    load_model()
     if image is None:
+        return "⚠️ Please upload an image."
+    try:
+        # Segment into lines
+        lines = segment_lines(image)
+        if not lines:
+            return "⚠️ No text detected. Try a clearer image."
+        results = []
+        for line_img in lines:
+            pixel_values = processor(
+                images=line_img,
+                return_tensors="pt"
+            ).pixel_values.to(device)
+            generated_ids = model.generate(pixel_values)
+            text = processor.batch_decode(
+                generated_ids,
+                skip_special_tokens=True
+            )[0]
+            results.append(text)
+        final_text = "\n".join(results)
+        return final_text if final_text.strip() else "⚠️ Could not extract text."
+    except Exception as e:
+        return f"❌ Error occurred: {str(e)}"
+# =========================
+# Gradio UI
+# =========================
+demo = gr.Interface(
     fn=predict,
+    inputs=gr.Image(type="pil", label="Upload Handwritten Image"),
     outputs=gr.Textbox(label="Extracted Text"),
+    title="📝 Handwritten OCR (Multi-line)",
+    description="Upload a handwritten note image. The model will extract text line by line.",
+)
+if __name__ == "__main__":
+    demo.launch()