iammraat committed on
Commit
adb25fe
·
verified ·
1 Parent(s): cd5090e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -104
app.py CHANGED
@@ -323,133 +323,94 @@ from PIL import Image
323
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
324
  from craft_text_detector import Craft
325
 
326
- # --- THE ULTIMATE MONKEY PATCH ---
327
  import craft_text_detector.craft_utils as craft_utils_module
328
 
329
  def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h):
330
- if not polys or len(polys) == 0:
331
- return []
332
-
333
- adjusted_polys = []
334
  for poly in polys:
335
- try:
336
- # Convert to numpy and check if it's actually a coordinate list
337
- p = np.array(poly).astype(np.float32)
338
-
339
- # If p is empty or just a single point/scalar, skip it
340
- if p.ndim != 2 or p.shape[0] == 0:
341
- continue
342
-
343
- # Scale coordinates
344
- p[:, 0] *= ratio_w
345
- p[:, 1] *= ratio_h
346
- adjusted_polys.append(p)
347
- except (IndexError, TypeError, ValueError):
348
- # If anything goes wrong with a specific noise-box, just skip it
349
- continue
350
-
351
- return adjusted_polys
352
 
353
- # Apply the patch to the library in memory
354
  craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
355
- # ----------------------------
356
 
357
- # Device Setup
358
  device = "cuda" if torch.cuda.is_available() else "cpu"
359
 
360
  # Load Models
361
- print("Loading TrOCR...")
362
  processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
363
- model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-handwritten")
364
- model.to(device).eval()
365
-
366
- print("Loading CRAFT...")
367
- # We use crop_type="box" for clean rectangles
368
  craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
369
 
370
- def get_sorted_boxes(boxes):
371
- """Sorts detected boxes into reading order: top-to-bottom, left-to-right."""
372
- items = []
373
- for box in boxes:
374
- # Avoid empty boxes
375
- if box is None or len(box) == 0:
376
- continue
377
- cx = np.mean(box[:, 0])
378
- cy = np.mean(box[:, 1])
379
- items.append((cy, cx, box))
380
-
381
- # Sort by Y (line grouping) then X
382
- items.sort(key=lambda x: (int(x[0] // 30), x[1]))
383
- return [b for _, _, b in items]
384
-
385
  def process_page(image):
386
- if image is None:
387
- return None, "Please upload an image."
388
-
389
- # Convert PIL to standard RGB format
390
- image_rgb = image.convert("RGB")
391
- image_np = np.array(image_rgb)
 
 
 
 
392
 
393
- # 1. Run Detection (Patched function handles coordinate mapping)
394
- prediction = craft.detect_text(image_np)
 
395
  boxes = prediction.get("boxes", [])
396
 
397
- if not boxes or len(boxes) == 0:
398
- return image_rgb, "No text detected."
399
 
400
- # 2. Sort and Draw
401
- sorted_boxes = get_sorted_boxes(boxes)
402
- annotated = image_np.copy()
403
- transcriptions = []
 
 
 
 
 
404
 
405
- for box in sorted_boxes:
406
- # Cast to integer for CV2 and slicing
407
- box_int = box.astype(np.int32)
408
 
409
- # 3. Draw on the visualization image
410
- cv2.polylines(annotated, [box_int], True, (255, 0, 0), 2)
411
 
412
- # 4. Extract Crop for OCR
413
- # Get axis-aligned bounding box from points
414
- x_min, y_min = np.min(box_int, axis=0)
415
- x_max, y_max = np.max(box_int, axis=0)
416
 
417
- # Keep within image dimensions
418
  x_min, y_min = max(0, x_min), max(0, y_min)
419
- x_max, y_max = min(image_np.shape[1], x_max), min(image_np.shape[0], y_max)
420
 
421
- # Skip boxes that are too small to contain a character
422
- if (x_max - x_min) < 10 or (y_max - y_min) < 10:
423
- continue
424
-
425
- crop_region = image_np[y_min:y_max, x_min:x_max]
426
- crop_pil = Image.fromarray(crop_region)
427
 
428
- # 5. Inference with TrOCR
 
429
  with torch.no_grad():
430
- pixel_values = processor(images=crop_pil, return_tensors="pt").pixel_values.to(device)
431
- generated_ids = model.generate(pixel_values)
432
- text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
433
-
434
- if text.strip():
435
- transcriptions.append(text)
436
-
437
- # Output the result
438
- final_text = " ".join(transcriptions)
439
- return Image.fromarray(annotated), final_text
440
-
441
- # Gradio Interface
442
- demo = gr.Interface(
443
- fn=process_page,
444
- inputs=gr.Image(type="pil", label="Upload Page Image"),
445
- outputs=[
446
- gr.Image(label="Detection Visualization"),
447
- gr.Textbox(label="Transcribed Text", lines=15)
448
- ],
449
- title="✍️ Full-Page Handwritten Recognition",
450
- description="Combines CRAFT detection with TrOCR recognition. Use high-contrast images for best results.",
451
- theme="soft"
452
- )
453
-
454
- if __name__ == "__main__":
455
- demo.launch()
 
323
  from transformers import TrOCRProcessor, VisionEncoderDecoderModel
324
  from craft_text_detector import Craft
325
 
326
+ # --- DEFENSIVE MONKEY PATCH ---
327
  import craft_text_detector.craft_utils as craft_utils_module
328
 
329
def fixed_adjustResultCoordinates(polys, ratio_w, ratio_h):
    """Monkey-patch replacement for craft_utils.adjustResultCoordinates.

    Scales each detected polygon from the network's coordinate space back
    to the (resized) image space and returns a plain list of float32
    (N, 2) arrays.

    Defensive on purpose: the stock implementation crashes on the noise
    the detector sometimes emits, so this version
    - accepts None / empty input, including numpy arrays (whose truth
      value is ambiguous, so ``not polys`` would raise ValueError), and
    - skips malformed entries (odd element counts, non-numeric junk)
      instead of raising.

    Args:
        polys: iterable of polygons (each an iterable of x, y pairs),
            or a numpy array of them, or None.
        ratio_w: horizontal scale factor back to image space.
        ratio_h: vertical scale factor back to image space.

    Returns:
        list of np.float32 arrays of shape (N, 2); possibly empty.
    """
    # `polys` may arrive as a list OR a numpy ndarray; test length
    # explicitly instead of truthiness.
    if polys is None or len(polys) == 0:
        return []

    adjusted = []
    for poly in polys:
        try:
            p = np.array(poly, dtype=np.float32).reshape(-1, 2)
        except (TypeError, ValueError):
            # Malformed noise box — skip it rather than kill detection.
            continue
        if p.shape[0] == 0:
            continue
        p[:, 0] *= ratio_w
        p[:, 1] *= ratio_h
        adjusted.append(p)
    return adjusted
 
 
 
 
 
 
 
 
 
 
 
 
339
 
 
340
# Install the defensive replacement into the library's own module namespace
# so every internal call inside craft_text_detector uses it.
craft_utils_module.adjustResultCoordinates = fixed_adjustResultCoordinates
# ------------------------------

# Prefer GPU when available; the recognizer and detector both follow this.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load Models
# TrOCR (small, handwritten) performs text *recognition* on each crop.
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-small-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-small-handwritten").to(device).eval()

# CRAFT performs text *detection*; crop_type="box" yields rectangular boxes.
# NOTE(review): output_dir=None presumably keeps results in memory only —
# confirm against the craft_text_detector docs.
craft = Craft(output_dir=None, crop_type="box", cuda=(device == "cuda"))
349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
def process_page(image):
    """Full-page OCR pipeline: resize -> detect -> sort -> recognize.

    Args:
        image: PIL.Image uploaded by the user, or None (Gradio passes
            None when nothing was uploaded).

    Returns:
        Tuple of (annotated PIL.Image or None, transcription string).
    """
    if image is None:
        return None, "No image."

    # 1. FORCE RESIZE for coordinate 1:1 mapping.
    # We resize to 1280 width to match CRAFT's default canvas, so the
    # detector's ratio_w / ratio_h stay ~1.0 and coordinates cannot drift.
    base_width = 1280
    w_percent = base_width / float(image.size[0])
    h_size = int(float(image.size[1]) * w_percent)
    image = image.resize((base_width, h_size), Image.Resampling.LANCZOS)

    img_np = np.array(image.convert("RGB"))

    # 2. DETECT
    prediction = craft.detect_text(img_np)
    boxes = prediction.get("boxes", [])

    # `boxes` may be a list or a numpy array depending on the library
    # version; `not boxes` raises ValueError on a multi-element array,
    # so check emptiness via length instead.
    if boxes is None or len(boxes) == 0:
        return image, "No text found."

    # 3. SORT into reading order: boxes whose vertical centers fall in
    # the same 30-pixel band count as one 'line', ordered left-to-right
    # within the band.
    items = [
        {'cy': np.mean(box[:, 1]), 'cx': np.mean(box[:, 0]), 'box': box}
        for box in boxes
    ]
    items.sort(key=lambda x: (int(x['cy'] // 30), x['cx']))

    annotated = img_np.copy()
    full_text = []

    # 4. RECOGNIZE each detected region with TrOCR
    for item in items:
        box = item['box'].astype(np.int32)

        # Draw the detection polygon (red) on the visualization image.
        cv2.polylines(annotated, [box], True, (255, 0, 0), 2)

        # Axis-aligned crop spanning the polygon's extremes.
        x_min, y_min = np.min(box, axis=0)
        x_max, y_max = np.max(box, axis=0)

        # Clamp to image boundaries.
        x_min, y_min = max(0, x_min), max(0, y_min)
        x_max, y_max = min(img_np.shape[1], x_max), min(img_np.shape[0], y_max)

        # Skip slivers too small to contain a glyph.
        if (x_max - x_min) < 5 or (y_max - y_min) < 5:
            continue

        crop = Image.fromarray(img_np[y_min:y_max, x_min:x_max])

        # Inference only — no gradients needed.
        with torch.no_grad():
            pixel_values = processor(images=crop, return_tensors="pt").pixel_values.to(device)
            out_ids = model.generate(pixel_values)
            txt = processor.batch_decode(out_ids, skip_special_tokens=True)[0]

        if txt.strip():
            full_text.append(txt)

    return Image.fromarray(annotated), " ".join(full_text)
405
+
406
# UI — two-column layout: input image next to the annotated detection
# image, transcription textbox and trigger button below.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🚀 Final Fix: Full-Page OCR")
    with gr.Row():
        input_i = gr.Image(type="pil", label="Input")
        output_i = gr.Image(label="Detections (Scale Fixed)")
    output_t = gr.Textbox(label="Result", lines=10)
    btn = gr.Button("Transcribe", variant="primary")
    btn.click(process_page, input_i, [output_i, output_t])

# Guard the launch so importing this module (e.g. from a test) does not
# start the web server; running `python app.py` still launches as before.
if __name__ == "__main__":
    demo.launch()