Spaces:

iammraat
/

test

Sleeping

App Files Community

iammraat committed 14 days ago

Commit

4efabae

verified ·

1 Parent(s): 0b695b3

Update app.py

Browse files

Files changed (1) hide show

app.py +42 -144

app.py CHANGED Viewed

@@ -441,8 +441,7 @@
-# app.py
 import gradio as gr
 from ultralytics import YOLO
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
@@ -450,28 +449,20 @@ from PIL import Image
 import torch
 import numpy as np
-# Load local models (your uploaded .pt files)
-region_model = YOLO("regions.pt")          # ← fixed: local file
-line_model = YOLO("lines.pt")              # ← fixed: local file
-# TrOCR (you can change to large if you have GPU and want better accuracy)
 processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
 model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
-# Move TrOCR to GPU if available (much faster on paid Spaces)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 def get_crop(image: Image.Image, result, idx: int, padding: int = 15):
-    """
-    Crop using segmentation mask if available (much more accurate than boxes),
-    otherwise fall back to bounding box with padding.
-    Background outside the mask is forced to white → better for OCR.
-    """
     img_np = np.array(image)
     if result.masks is not None:
-        # Segmentation model → use mask (this is what the original Riksarkivet demo does)
         mask = result.masks.data[idx].cpu().numpy()
         mask_bool = mask > 0.5
@@ -482,147 +473,40 @@ def get_crop(image: Image.Image, result, idx: int, padding: int = 15):
         y_min, y_max = ys.min(), ys.max()
         x_min, x_max = xs.min(), xs.max()
-        # Add padding
         y_min = max(0, y_min - padding)
         y_max = min(img_np.shape[0], y_max + padding + 1)
         x_min = max(0, x_min - padding)
         x_max = min(img_np.shape[1], x_max + padding + 1)
         crop = img_np[y_min:y_max, x_min:x_max]
         mask_crop = mask_bool[y_min:y_max, x_min:x_max]
-        # Force background to white
         crop[~mask_crop] = 255
         return Image.fromarray(crop)
     else:
-        # Detection only → use bounding box with padding
         xyxy = result.boxes.xyxy[idx].cpu().numpy().astype(int)
         x1, y1, x2, y2 = xyxy
         x1 = max(0, x1 - padding)
         y1 = max(0, y1 - padding)
         x2 = min(image.width, x2 + padding)
         y2 = min(image.height, y2 + padding)
-        return image.crop((x1, y1, x2, y2))
-def process_image(image: Image.Image):
-    results = region_model(image)
-    region_result = results[0]
-    if region_result.boxes is None or len(region_result.boxes) == 0:
-        return "No text regions detected."
-    # Collect regions with their vertical position for sorting
-    regions_with_pos = []
-    for i in range(len(region_result.boxes)):
-        y1 = region_result.boxes.xyxy[i][1].item()  # top y-coordinate
-        crop = get_crop(image, region_result, i, padding=20)
-        if crop:
-            regions_with_pos.append((y1, crop))
-    # Sort regions top → bottom
-    regions_with_pos.sort(key=lambda x: x[0])
-    full_text_parts = []
-    for _, region_crop in regions_with_pos:
-        line_results = line_model(region_crop)
-        line_result = line_results[0]
-        if line_result.boxes is None or len(line_result.boxes) == 0:
-            continue
-        lines_with_pos = []
-        for j in range(len(line_result.boxes)):
-            rel_y1 = line_result.boxes.xyxy[j][1].item()   # relative to region crop
-            rel_x1 = line_result.boxes.xyxy[j][0].item()
-            line_crop = get_crop(region_crop, line_result, j, padding=15)
-            if line_crop is None:
-                continue
-            # TrOCR recognition
-            pixel_values = processor(line_crop, return_tensors="pt").pixel_values.to(device)
-            generated_ids = model.generate(pixel_values)
-            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            lines_with_pos.append((rel_y1, rel_x1, text))
-        # Sort lines: top→bottom, then left→right (handles multi-column reasonably)
-        lines_with_pos.sort(key=lambda x: (x[0], x[1]))
-        region_text = "\n".join([item[2] for item in lines_with_pos])
-        full_text_parts.append(region_text)
-    return "\n\n".join(full_text_parts) if full_text_parts else "No text recognized."
-# Gradio interface
-demo = gr.Interface(
-    fn=process_image,
-    inputs=gr.Image(type="pil", label="Upload handwritten document"),
-    outputs=gr.Textbox(label="Recognized Text"),
-    title="Handwritten Text Recognition (YOLO regions/lines + TrOCR)",
-    description="Uses your local regions.pt and lines.pt (same as Riksarkivet demo) with precise mask-based cropping.",
-    flagging_mode="never"
-)
-if __name__ == "__main__":
-    demo.launch()# app.py (fixed version)
-import gradio as gr
-from ultralytics import YOLO
-from transformers import TrOCRProcessor, VisionEncoderDecoderModel
-from PIL import Image
-import torch
-import numpy as np
-# Load local models
-region_model = YOLO("regions.pt")
-line_model = YOLO("lines.pt")
-# TrOCR
-processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
-model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
-# Move to GPU if available
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-def get_crop(image: Image.Image, result, idx: int, padding: int = 15):
-    img_np = np.array(image)
-    if result.masks is not None:
-        mask = result.masks.data[idx].cpu().numpy()
-        mask_bool = mask > 0.5
-        ys, xs = np.where(mask_bool)
-        if len(ys) == 0:
             return None
-        y_min, y_max = ys.min(), ys.max()
-        x_min, x_max = xs.min(), xs.max()
-        y_min = max(0, y_min - padding)
-        y_max = min(img_np.shape[0], y_max + padding + 1)
-        x_min = max(0, x_min - padding)
-        x_max = min(img_np.shape[1], x_max + padding + 1)
-        crop = img_np[y_min:y_max, x_min:x_max]
-        mask_crop = mask_bool[y_min:y_max, x_min:x_max]
-        crop[~mask_crop] = 255
-        return Image.fromarray(crop)
-    else:
-        xyxy = result.boxes.xyxy[idx].cpu().numpy().astype(int)
-        x1, y1, x2, y2 = xyxy
-        x1 = max(0, x1 - padding)
-        y1 = max(0, y1 - padding)
-        x2 = min(image.width, x2 + padding)
-        y2 = min(image.height, y2 + padding)
         return image.crop((x1, y1, x2, y2))
 def process_image(image: Image.Image):
     results = region_model(image)
     region_result = results[0]
@@ -633,14 +517,17 @@ def process_image(image: Image.Image):
     for i in range(len(region_result.boxes)):
         y1 = region_result.boxes.xyxy[i][1].item()
         crop = get_crop(image, region_result, i, padding=20)
-        if crop:
             regions_with_pos.append((y1, crop))
     regions_with_pos.sort(key=lambda x: x[0])
     full_text_parts = []
-    for _, region_crop in regions_with_pos:
         line_results = line_model(region_crop)
         line_result = line_results[0]
@@ -653,29 +540,40 @@ def process_image(image: Image.Image):
             rel_x1 = line_result.boxes.xyxy[j][0].item()
             line_crop = get_crop(region_crop, line_result, j, padding=15)
-            if line_crop is None:
                 continue
-            pixel_values = processor(line_crop, return_tensors="pt").pixel_values.to(device)
-            generated_ids = model.generate(pixel_values)
-            text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-            lines_with_pos.append((rel_y1, rel_x1, text))
         lines_with_pos.sort(key=lambda x: (x[0], x[1]))
-        region_text = "\n".join([item[2] for item in lines_with_pos])
-        full_text_parts.append(region_text)
-    return "\n\n".join(full_text_parts) if full_text_parts else "No text recognized."
-# Gradio interface (fixed: use flagging_mode instead of allow_flagging)
 demo = gr.Interface(
     fn=process_image,
     inputs=gr.Image(type="pil", label="Upload handwritten document"),
     outputs=gr.Textbox(label="Recognized Text"),
-    title="Handwritten Text Recognition (YOLO regions/lines + TrOCR)",
-    description="Uses your local regions.pt and lines.pt (same as Riksarkivet demo) with precise mask-based cropping.",
-    flagging_mode="never"  # ← fixed: changed from allow_flagging to flagging_mode
 )
 if __name__ == "__main__":

+# app.py - FIXED VERSION with empty crop protection
 import gradio as gr
 from ultralytics import YOLO
 from transformers import TrOCRProcessor, VisionEncoderDecoderModel
 import torch
 import numpy as np
+# Load models
+region_model = YOLO("regions.pt")
+line_model = YOLO("lines.pt")
 processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
 model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 def get_crop(image: Image.Image, result, idx: int, padding: int = 15):
     img_np = np.array(image)
     if result.masks is not None:
         mask = result.masks.data[idx].cpu().numpy()
         mask_bool = mask > 0.5
         y_min, y_max = ys.min(), ys.max()
         x_min, x_max = xs.min(), xs.max()
         y_min = max(0, y_min - padding)
         y_max = min(img_np.shape[0], y_max + padding + 1)
         x_min = max(0, x_min - padding)
         x_max = min(img_np.shape[1], x_max + padding + 1)
+        # Safety: if after padding still degenerate
+        if y_max <= y_min or x_max <= x_min:
+            return None
         crop = img_np[y_min:y_max, x_min:x_max]
         mask_crop = mask_bool[y_min:y_max, x_min:x_max]
         crop[~mask_crop] = 255
         return Image.fromarray(crop)
     else:
+        # Bounding box fallback
         xyxy = result.boxes.xyxy[idx].cpu().numpy().astype(int)
         x1, y1, x2, y2 = xyxy
         x1 = max(0, x1 - padding)
         y1 = max(0, y1 - padding)
         x2 = min(image.width, x2 + padding)
         y2 = min(image.height, y2 + padding)
+        if x2 <= x1 or y2 <= y1:
             return None
         return image.crop((x1, y1, x2, y2))
 def process_image(image: Image.Image):
+    if image is None:
+        return "Please upload an image."
     results = region_model(image)
     region_result = results[0]
     for i in range(len(region_result.boxes)):
         y1 = region_result.boxes.xyxy[i][1].item()
         crop = get_crop(image, region_result, i, padding=20)
+        if crop and crop.size[0] > 0 and crop.size[1] > 0:
             regions_with_pos.append((y1, crop))
+    if not regions_with_pos:
+        return "No valid text regions after cropping."
     regions_with_pos.sort(key=lambda x: x[0])
     full_text_parts = []
+    for region_idx, (_, region_crop) in enumerate(regions_with_pos):
         line_results = line_model(region_crop)
         line_result = line_results[0]
             rel_x1 = line_result.boxes.xyxy[j][0].item()
             line_crop = get_crop(region_crop, line_result, j, padding=15)
+            if line_crop is None or line_crop.size[0] < 10 or line_crop.size[1] < 8:
+                # Skip tiny/invalid crops to prevent TrOCR crash
+                # print(f"Skipped tiny line {j} in region {region_idx}")
                 continue
+            try:
+                pixel_values = processor(line_crop, return_tensors="pt").pixel_values.to(device)
+                generated_ids = model.generate(pixel_values)
+                text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
+                if text:  # only add non-empty
+                    lines_with_pos.append((rel_y1, rel_x1, text))
+            except Exception as e:
+                # Catch any remaining processing errors
+                # print(f"TrOCR error on line {j}: {e}")
+                continue
         lines_with_pos.sort(key=lambda x: (x[0], x[1]))
+        region_text = "\n".join([item[2] for item in lines_with_pos if item[2]])
+        if region_text:
+            full_text_parts.append(region_text)
+    if not full_text_parts:
+        return "No readable text recognized (possibly due to small/tiny lines or model limitations). Try a clearer document or larger padding."
+    return "\n\n".join(full_text_parts)
+# Gradio interface
 demo = gr.Interface(
     fn=process_image,
     inputs=gr.Image(type="pil", label="Upload handwritten document"),
     outputs=gr.Textbox(label="Recognized Text"),
+    title="Handwritten Text Recognition (YOLO + TrOCR)",
+    description="Local models: regions.pt / lines.pt + microsoft/trocr-base-handwritten. Mask-based cropping + safeguards against empty crops.",
+    flagging_mode="never"
 )
 if __name__ == "__main__":