Spaces:

imperiusrex
/

PrintedTextOCR

Sleeping

App Files Files Community

imperiusrex committed on Jul 31, 2025

Commit

3e0219f

verified ·

1 Parent(s): eddfdb3

Update app.py

Browse files

Files changed (1) hide show

app.py +25 -30

app.py CHANGED Viewed

@@ -1,24 +1,25 @@
 import gradio as gr
 from transformers import CLIPProcessor, CLIPModel
 from paddleocr import PaddleOCR, TextDetection
 from PIL import Image
-import torch
 import numpy as np
 import cv2
-import os
 import spaces
 # --- Global setup for models and data ---
-# This section runs once when the app starts.
-print("Initializing models...")
-# Load CLIP model once.
-# By default, Hugging Face transformers will load models to the GPU if available.
-clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to("cuda" if torch.cuda.is_available() else "cpu")
 processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
 # Initialize Paddle's text detection model.
-# REMOVED the 'use_gpu=True' argument to fix the ValueError.
 det_model = TextDetection(model_name="PP-OCRv5_server_det")
 # Candidate language phrases for detection
@@ -27,7 +28,6 @@ candidates = [
     "This is Telugu text",
     "This is Chinese text",
     "This is Korean text",
-    # Add other languages as needed
 ]
 # Map detected languages to PaddleOCR language codes
@@ -38,6 +38,8 @@ lang_map = {
     "korean": "korean",
 }
 # --- Utility Functions ---
 def get_box_center(box):
     """Calculates the center of a bounding box."""
@@ -47,45 +49,41 @@ def get_box_center(box):
     center_y = sum(y_coords) / len(y_coords)
     return center_x, center_y
-# --- Main OCR Pipeline Function ---
-@spaces.GPU  # This decorator ensures the function is executed on the assigned GPU.
-def ocr_pipeline(image: Image.Image) -> str:
     """
     Performs OCR on an input image using a multi-step pipeline.
     Args:
-        image: A PIL Image object from the Gradio interface.
     Returns:
         A string containing the reconstructed text.
     """
-    if image is None:
         return "No image provided."
     print("Starting OCR pipeline...")
     # Convert PIL image to a NumPy array for OpenCV and Paddle
-    img_np = np.array(image.convert("RGB"))
     # Step 1: Text Detection with PaddleOCR's model
-    # This will be fast on the H200 GPU.
     output = det_model.predict(img_np, batch_size=1)
     arr = []
-    for res in output:
-        polys = res['dt_polys']
-        if polys is not None:
-            arr.extend(polys.tolist())
     # Sort the bounding boxes in reading order
-    arr = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
-    if not arr:
         print("No text regions detected.")
         return "No text regions detected."
     cropped_images = []
-    for box in arr:
         box = np.array(box, dtype=np.float32)
         width_a = np.linalg.norm(box[0] - box[1])
         width_b = np.linalg.norm(box[2] - box[3])
@@ -110,9 +108,7 @@ def ocr_pipeline(image: Image.Image) -> str:
         pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
         # Use CLIP to detect language. The model is already on the GPU.
-        inputs = processor(text=candidates, images=pil_img, return_tensors="pt", padding=True)
-        # Move inputs to the GPU
-        inputs = {k: v.to(clip_model.device) for k, v in inputs.items()}
         with torch.no_grad():
             outputs = clip_model(**inputs)
             logits_per_image = outputs.logits_per_image
@@ -124,17 +120,16 @@ def ocr_pipeline(image: Image.Image) -> str:
         lang_code = lang_map.get(detected_lang, "en")
         # Initialize PaddleOCR with the detected language.
-        # REMOVED the 'use_gpu=True' argument here as well.
-        ocr = PaddleOCR(lang=lang_code, use_angle_cls=False, use_doc_unwarping=False)
         result = ocr.predict(img)
         # Extract text from OCR result
         text_for_this_image = ""
-        if result and result[0] and result[0].get('rec_texts'):
             text_for_this_image = " ".join(result[0]['rec_texts'])
         # Store text and bounding box information
-        center_x, center_y = get_box_center(arr[i])
         all_text_blocks.append({
             "text": text_for_this_image,
             "center_x": center_x,

 import gradio as gr
+import torch
 from transformers import CLIPProcessor, CLIPModel
 from paddleocr import PaddleOCR, TextDetection
 from PIL import Image
 import numpy as np
 import cv2
 import spaces
 # --- Global setup for models and data ---
+print("🔄 Initializing models...")
+# Check for GPU and set device
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Device being used: {device}")
+# Load CLIP model once. This is memory-intensive, so we do it once.
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(device)
 processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
 # Initialize Paddle's text detection model.
+# The latest versions of PaddlePaddle/PaddleOCR automatically use the GPU.
 det_model = TextDetection(model_name="PP-OCRv5_server_det")
 # Candidate language phrases for detection
     "This is Telugu text",
     "This is Chinese text",
     "This is Korean text",
 ]
 # Map detected languages to PaddleOCR language codes
     "korean": "korean",
 }
+print("✅ Models loaded successfully.")
 # --- Utility Functions ---
 def get_box_center(box):
     """Calculates the center of a bounding box."""
     center_y = sum(y_coords) / len(y_coords)
     return center_x, center_y
+@spaces.GPU
+def ocr_pipeline(image_pil: Image.Image) -> str:
     """
     Performs OCR on an input image using a multi-step pipeline.
     Args:
+        image_pil: A PIL Image object from the Gradio interface.
     Returns:
         A string containing the reconstructed text.
     """
+    if image_pil is None:
         return "No image provided."
     print("Starting OCR pipeline...")
     # Convert PIL image to a NumPy array for OpenCV and Paddle
+    img_np = np.array(image_pil.convert("RGB"))
     # Step 1: Text Detection with PaddleOCR's model
     output = det_model.predict(img_np, batch_size=1)
     arr = []
+    if output and output[0] and 'dt_polys' in output[0] and output[0]['dt_polys'] is not None:
+        arr.extend(output[0]['dt_polys'].tolist())
     # Sort the bounding boxes in reading order
+    sorted_polys = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
+    if not sorted_polys:
         print("No text regions detected.")
         return "No text regions detected."
     cropped_images = []
+    for box in sorted_polys:
         box = np.array(box, dtype=np.float32)
         width_a = np.linalg.norm(box[0] - box[1])
         width_b = np.linalg.norm(box[2] - box[3])
         pil_img = Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
         # Use CLIP to detect language. The model is already on the GPU.
+        inputs = processor(text=candidates, images=pil_img, return_tensors="pt", padding=True).to(device)
         with torch.no_grad():
             outputs = clip_model(**inputs)
             logits_per_image = outputs.logits_per_image
         lang_code = lang_map.get(detected_lang, "en")
         # Initialize PaddleOCR with the detected language.
+        ocr = PaddleOCR(lang=lang_code, use_angle_cls=False, use_doc_unwarping=False, use_gpu=True)
         result = ocr.predict(img)
         # Extract text from OCR result
         text_for_this_image = ""
+        if result and result[0] and 'rec_texts' in result[0]:
             text_for_this_image = " ".join(result[0]['rec_texts'])
         # Store text and bounding box information
+        center_x, center_y = get_box_center(sorted_polys[i])
         all_text_blocks.append({
             "text": text_for_this_image,
             "center_x": center_x,