imperiusrex committed on
Commit
5828679
·
verified ·
1 Parent(s): a0ee49a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +133 -368
app.py CHANGED
@@ -1,362 +1,45 @@
1
-
2
- from paddleocr import PaddleOCR
3
-
4
  from transformers import CLIPProcessor, CLIPModel
 
5
  from PIL import Image
6
  import torch
7
- from google.colab import files
8
- import json
9
- import os
10
- import glob
11
- from IPython.display import Image as ColabImage, display
12
- from paddleocr import PaddleOCR
13
- from paddleocr import TextDetection
14
-
15
-
16
- uploaded = files.upload()
17
- img_path = next(iter(uploaded.keys()))
18
- image = Image.open(img_path).convert("RGB")
19
-
20
- image = Image.open(img_path)
21
- width, height = image.size
22
- total_pixels = width * height
23
-
24
- print(f"Width: {width}, Height: {height}, Total pixels: {total_pixels}")
25
-
26
-
27
-
28
-
29
- # Initialize array for bounding boxes
30
- arr = []
31
-
32
- # Load and run the text detection model
33
- model = TextDetection(model_name="PP-OCRv5_server_det")
34
- output = model.predict(img_path, batch_size=1)
35
-
36
- # Extract bounding boxes
37
- for res in output:
38
- polys = res['dt_polys'] # NumPy array of shape (N, 4, 2)
39
- if polys is not None:
40
- arr.extend(polys.tolist())
41
-
42
- # Sort the bounding boxes in reading order
43
- arr = sorted(arr, key=lambda box: (box[0][1], box[0][0]))
44
-
45
- # Print the sorted bounding box coordinates
46
- print("Extracted bounding box coordinates (in reading order):")
47
- for box in arr:
48
- print(box)
49
-
50
- print(f"Number of detected text regions: {len(arr)}")
51
-
52
- import cv2
53
  import numpy as np
 
54
  import os
55
- import json
56
-
57
- # Load original image
58
-
59
- img = cv2.imread(img_path)
60
 
61
- # Output setup
62
- output_dir = "./output/crops_warped"
63
- os.makedirs(output_dir, exist_ok=True)
64
 
65
- cropped_images = []
66
-
67
- for i, box in enumerate(arr):
68
- box = np.array(box, dtype=np.float32) # shape: (4, 2)
69
-
70
- # Compute width and height of the new image
71
- width_a = np.linalg.norm(box[0] - box[1])
72
- width_b = np.linalg.norm(box[2] - box[3])
73
- height_a = np.linalg.norm(box[0] - box[3])
74
- height_b = np.linalg.norm(box[1] - box[2])
75
-
76
- width = int(max(width_a, width_b))
77
- height = int(max(height_a, height_b))
78
-
79
- # Destination rectangle
80
- dst_rect = np.array([
81
- [0, 0],
82
- [width - 1, 0],
83
- [width - 1, height - 1],
84
- [0, height - 1]
85
- ], dtype=np.float32)
86
-
87
- # Perspective transform
88
- M = cv2.getPerspectiveTransform(box, dst_rect)
89
- warped = cv2.warpPerspective(img, M, (width, height))
90
-
91
- cropped_images.append(warped)
92
-
93
- # Save warped image
94
- cv2.imwrite(os.path.join(output_dir, f"crop_{i}.png"), warped)
95
-
96
- print(f"Cropped {len(cropped_images)} perspective-warped regions.")
97
-
98
- # cropped_images.reverse()
99
-
100
-
101
- import matplotlib.pyplot as plt
102
-
103
- # Display all cropped images in a grid
104
- for i, crop in enumerate(cropped_images):
105
- plt.figure(figsize=(4, 4))
106
- plt.imshow(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)) # Convert BGR to RGB
107
- plt.title(f'Cropped Image {i}')
108
- plt.axis('off')
109
- plt.show()
110
-
111
- # Load CLIP model
112
- clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
113
  processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
114
 
 
 
 
 
 
115
  # Candidate language phrases for detection
116
  candidates = [
117
  "This is English text",
118
- # "This is Hindi text",
119
- # "This is Tamil text",
120
  "This is Telugu text",
121
- # "This is Bengali text",
122
- # "This is Arabic text",
123
  "This is Chinese text",
124
- # "This is Japanese text",
125
  "This is Korean text",
126
- # "This is Russian text",
127
- # "This is Kannada text",
128
- # "This is Malayalam text",
129
- # "This is Marathi text",
130
- # "This is Urdu text",
131
- # "This is French text",
132
- # "This is Spanish text",
133
- # "This is Italian text",
134
- # "This is Portuguese text",
135
- # "This is Romanian text",
136
- # "This is Hungarian text",
137
- # "This is Indonesian text",
138
- # "This is Lithuanian text",
139
- # "This is Chinese Traditional text",
140
- # "This is Malay text",
141
- # "This is Dutch text",
142
- # "This is Norwegian text",
143
- # "This is Bosnian text",
144
- # "This is Polish text",
145
- # "This is Czech text",
146
- # "This is Slovak text",
147
- # "This is Welsh text",
148
- # "This is Slovenian text",
149
- # "This is Danish text",
150
- # "This is Albanian text",
151
- # "This is Estonian text",
152
- # "This is Swedish text",
153
- # "This is Irish text",
154
- # "This is Swahili text",
155
- # "This is Croatian text",
156
- # "This is Uzbek text",
157
- # "This is Turkish text",
158
- # "This is Latin text",
159
- # "This is Belarusian text",
160
- # "This is Ukrainian text"
161
  ]
162
 
163
  # Map detected languages to PaddleOCR language codes
164
  lang_map = {
165
  "english": "en",
166
- # "hindi": "hi",
167
- # "tamil": "ta",
168
  "telugu": "te",
169
- # "bengali": "bn",
170
- # "arabic": "ar",
171
  "chinese": "ch",
172
- # "japanese": "japan",
173
  "korean": "korean",
174
- # "russian": "ru",
175
- # "kannada": "kn",
176
- # "malayalam": "ml",
177
- # "marathi": "mr",
178
- # "urdu": "ur",
179
- # "french": "fr",
180
- # "spanish": "es",
181
- # "italian": "it",
182
- # "portuguese": "pt",
183
- # "romanian": "ro",
184
- # "hungarian": "hu",
185
- # "indonesian": "id",
186
- # "lithuanian": "lt",
187
- # "chinese traditional": "chinese_cht",
188
- # "malay": "ms",
189
- # "dutch": "nl",
190
- # "norwegian": "no",
191
- # "bosnian": "bs",
192
- # "polish": "pl",
193
- # "czech": "cs",
194
- # "slovak": "sk",
195
- # "welsh": "cy",
196
- # "slovenian": "sl",
197
- # "danish": "da",
198
- # "albanian": "sq",
199
- # "estonian": "et",
200
- # "swedish": "sv",
201
- # "irish": "ga",
202
- # "swahili": "sw",
203
- # "croatian": "hr",
204
- # "uzbek": "uz",
205
- # "turkish": "tr",
206
- # "latin": "la",
207
- # "belarusian": "be",
208
- # "ukrainian": "uk"
209
  }
210
- for img in cropped_images:
211
- # Get probabilities
212
- inputs = processor(text=candidates, images=img, return_tensors="pt", padding=True)
213
- with torch.no_grad():
214
- logits_per_image = clip_model(**inputs).logits_per_image
215
- probs = logits_per_image.softmax(dim=1)
216
-
217
- # Get best language match
218
- best = probs.argmax().item()
219
- detected_lang_phrase = candidates[best]
220
- detected_lang = detected_lang_phrase.split()[-2].lower()
221
- lang_code = lang_map.get(detected_lang, "en")
222
-
223
- print(f"\n✅ Detected script/language: {detected_lang_phrase} → PaddleOCR lang='{lang_code}'")
224
-
225
-
226
- import numpy as np # Ensure numpy is imported
227
- import os # Ensure os is imported
228
-
229
- ocr = PaddleOCR(
230
- use_doc_orientation_classify=False, # Enable orientation classification for auto lang detection
231
- use_doc_unwarping=False,
232
- use_textline_orientation=False, # Enable textline orientation for auto lang detection
233
- lang=lang_code, # Use paddleOCR's auto language detection
234
- device="cpu"
235
- )
236
-
237
- for i, img in enumerate(cropped_images):
238
- # Define output diarectory and make sure it exists
239
-
240
- # Get base name of uploaded image (without extension)
241
- # Use a unique name for each cropped image
242
- base_name = f"cropped_image_{i}"
243
- bounding_boxes_image_path=os.path.join("/content/", f"{base_name}_bounding_boxes.jpg")
244
- json_file_path=os.path.join("/content/", f"{base_name}.json")
245
-
246
-
247
- # Convert the PIL Image to a NumPy array
248
- # image_np = np.array(img) # img is already a numpy array from cv2
249
-
250
- # Skip small images that might cause errors
251
- if img.shape[0] < 10 or img.shape[1] < 10:
252
- print(f"Skipping small image {i} with shape {img.shape}")
253
- continue
254
-
255
- # Perform OCR and save results
256
- result = ocr.predict(img) # Pass the NumPy array
257
- # print(f"\n====json_output for cropped image {i}====\n")
258
-
259
-
260
- # Assuming the first element of the result contains the overall detection
261
- if result and result[0]: # Check if result is not empty and has at least one element
262
- # Print results for each detected element
263
- # for res in result:
264
- # res.print()
265
-
266
- # Save the combined result to image and json
267
- result[0].save_to_img(bounding_boxes_image_path)
268
- result[0].save_to_json(json_file_path)
269
-
270
- else:
271
- print(f"No OCR results found for cropped image {i}.")
272
-
273
- # Construct the expected saved image path
274
- saved_image_path = bounding_boxes_image_path
275
-
276
- if os.path.exists(saved_image_path):
277
- display(ColabImage(filename=saved_image_path))
278
- else:
279
- print(f"No OCR image found at: {saved_image_path}")
280
-
281
- # print("\n===== Markdown Layout Preview =====\n")
282
-
283
- # Construct the expected saved JSON path
284
- saved_json_path = json_file_path
285
-
286
- if os.path.exists(saved_json_path):
287
- with open(saved_json_path, "r", encoding="utf-8") as f:
288
- data = json.load(f)
289
-
290
- texts = data["rec_texts"]
291
- boxes = data["rec_boxes"]
292
-
293
- elements = []
294
- for text, box in zip(texts, boxes):
295
- x1, y1, x2, y2 = box
296
- elements.append({"text": text, "x": x1, "y": y1, "line_y": (y1 + y2) / 2})
297
-
298
- elements.sort(key=lambda e: (round(e["line_y"] / 10), e["x"]))
299
-
300
- lines = []
301
- current_line_y = None
302
- line = []
303
-
304
- for elem in elements:
305
- if current_line_y is None or abs(elem["line_y"] - current_line_y) <= 10:
306
- line.append(elem["text"])
307
- current_line_y = elem["line_y"]
308
- else:
309
- lines.append(line)
310
- line = [elem["text"]]
311
- current_line_y = elem["line_y"]
312
-
313
- if line:
314
- lines.append(line)
315
-
316
- markdown_output = ""
317
-
318
- for line in lines:
319
- markdown_output += " ".join(line) #+ "\n\n"
320
-
321
-
322
- print(markdown_output)
323
-
324
- else:
325
- print(f"No JSON file found at: {saved_json_path}")
326
-
327
- import json
328
- import os
329
-
330
- predicted_texts = []
331
-
332
- # Iterate through the number of cropped images we have
333
- for i in range(len(cropped_images)):
334
- json_file_path = f"/content/cropped_image_{i}.json"
335
- text_for_this_image = ""
336
-
337
- if os.path.exists(json_file_path):
338
- with open(json_file_path, 'r', encoding='utf-8') as f:
339
- try:
340
- # Load the JSON data
341
- data = json.load(f)
342
- # The result from predict() is a dictionary, not a list
343
- if data and 'rec_texts' in data:
344
- text_for_this_image = " ".join(data['rec_texts'])
345
- except json.JSONDecodeError:
346
- print(f"Warning: Could not decode JSON from {json_file_path}")
347
- except KeyError as e:
348
- print(f"Warning: Unexpected JSON structure in {json_file_path}: {e}")
349
-
350
-
351
- predicted_texts.append(text_for_this_image)
352
-
353
- # Display the final list of predicted texts
354
- print("Predicted Texts Array (from JSON files):")
355
- print(predicted_texts)
356
-
357
- import json
358
- import os
359
 
 
360
  def get_box_center(box):
361
  """Calculates the center of a bounding box."""
362
  x_coords = [p[0] for p in box]
@@ -365,52 +48,134 @@ def get_box_center(box):
365
  center_y = sum(y_coords) / len(y_coords)
366
  return center_x, center_y
367
 
368
- # --- Step 1: Read all text and their centroid coordinates ---
369
- all_text_blocks = []
370
- for i, box in enumerate(arr):
371
- json_file_path = f"/content/cropped_image_{i}.json"
372
- if os.path.exists(json_file_path):
373
- with open(json_file_path, 'r', encoding='utf-8') as f:
374
- result = json.load(f)
375
-
376
- if result and 'rec_texts' in result and result['rec_texts']:
377
- text = " ".join(result['rec_texts'])
378
- center_x, center_y = get_box_center(box)
379
- all_text_blocks.append({
380
- "text": text,
381
- "center_x": center_x,
382
- "center_y": center_y
383
- })
384
-
385
- # --- Step 2: Sort by y-coordinate, then by x-coordinate, and group into lines ---
386
- if all_text_blocks:
387
- # Sort by center_y, then by center_x
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
388
  sorted_blocks = sorted(all_text_blocks, key=lambda item: (item["center_y"], item["center_x"]))
389
-
390
  lines = []
391
  if sorted_blocks:
392
  current_line = [sorted_blocks[0]]
393
  for block in sorted_blocks[1:]:
394
- # Check if the vertical centers are close enough to be on the same line
395
- if abs(block["center_y"] - current_line[-1]["center_y"]) < 40: # Y-threshold
396
  current_line.append(block)
397
  else:
398
- # Sort the current line by x-coordinate and add it to the lines list
399
  current_line.sort(key=lambda item: item["center_x"])
400
  lines.append(" ".join([item["text"] for item in current_line]))
401
  current_line = [block]
402
 
403
- # Add the last line
404
  if current_line:
405
  current_line.sort(key=lambda item: item["center_x"])
406
  lines.append(" ".join([item["text"] for item in current_line]))
 
 
 
 
 
 
 
 
 
 
 
 
 
407
 
408
- # --- Step 3: Print the final reconstructed text ---
409
- if lines:
410
- for line in lines:
411
- print(line)
412
- else:
413
- print("No text was reconstructed.")
414
- else:
415
- print("No text blocks were found.")
416
-
 
1
+ import gradio as gr
 
 
2
  from transformers import CLIPProcessor, CLIPModel
3
+ from paddleocr import PaddleOCR, TextDetection
4
  from PIL import Image
5
  import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import numpy as np
7
+ import cv2
8
  import os
9
+ import spaces
 
 
 
 
10
 
11
# --- Global setup for models and data ---
# This section runs once when the app starts.
print("Initializing models...")

# Load the CLIP model once and move it to the GPU when one is available.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14").to(
    "cuda" if torch.cuda.is_available() else "cpu"
)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# PaddleOCR text-detection model, downloaded and loaded into memory once.
# NOTE(review): paddleocr 3.x predictors select hardware via `device=`
# (the earlier notebook version of this code used `device="cpu"`); the
# 2.x-era `use_gpu=` keyword is not accepted there — confirm against the
# installed paddleocr version. torch.cuda availability is used as a proxy
# for a GPU-enabled environment.
det_model = TextDetection(
    model_name="PP-OCRv5_server_det",
    device="gpu" if torch.cuda.is_available() else "cpu",
)

# Candidate language phrases for detection
candidates = [
    "This is English text",
    "This is Telugu text",
    "This is Chinese text",
    "This is Korean text",
    # Add other languages as needed
]

# Map detected languages to PaddleOCR language codes
lang_map = {
    "english": "en",
    "telugu": "te",
    "chinese": "ch",
    "korean": "korean",
}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
42
+ # --- Utility Functions ---
43
def get_box_center(box):
    """Return the (x, y) centroid of a polygon given as a sequence of points."""
    xs = [point[0] for point in box]
    ys = [point[1] for point in box]
    return sum(xs) / len(xs), sum(ys) / len(ys)
50
 
51
# --- Main OCR Pipeline Function ---
@spaces.GPU  # Execute on the Space's assigned GPU.
def ocr_pipeline(image: Image.Image) -> str:
    """
    Perform OCR on an input image using a multi-step pipeline.

    Steps:
      1. Detect text regions with the PaddleOCR detection model.
      2. Perspective-warp each region into an axis-aligned crop.
      3. Identify each crop's language with CLIP and run a
         language-specific PaddleOCR instance (cached per language).
      4. Reassemble the recognized text in reading order.

    Args:
        image: A PIL Image object from the Gradio interface (may be None).

    Returns:
        The reconstructed text, or a short status message when nothing
        could be detected or extracted.
    """
    if image is None:
        return "No image provided."

    print("Starting OCR pipeline...")

    # PIL -> NumPy. Note the resulting array is in RGB channel order.
    img_np = np.array(image.convert("RGB"))

    # Step 1: Text detection with PaddleOCR's model.
    output = det_model.predict(img_np, batch_size=1)

    arr = []
    for res in output:
        polys = res['dt_polys']  # quadrilaterals, shape (N, 4, 2), or None
        if polys is not None:
            arr.extend(polys.tolist())

    # Sort the bounding boxes into rough reading order
    # (top-to-bottom, then left-to-right by the first corner).
    arr = sorted(arr, key=lambda box: (box[0][1], box[0][0]))

    if not arr:
        print("No text regions detected.")
        return "No text regions detected."

    # Step 2: perspective-warp each quadrilateral into a rectangular crop.
    cropped_images = []
    for box in arr:
        quad = np.array(box, dtype=np.float32)  # shape (4, 2)
        width_a = np.linalg.norm(quad[0] - quad[1])
        width_b = np.linalg.norm(quad[2] - quad[3])
        height_a = np.linalg.norm(quad[0] - quad[3])
        height_b = np.linalg.norm(quad[1] - quad[2])
        width = int(max(width_a, width_b))
        height = int(max(height_a, height_b))
        if width < 1 or height < 1:
            # Degenerate box: keep a placeholder so indices stay aligned with `arr`.
            cropped_images.append(None)
            continue
        dst_rect = np.array([
            [0, 0],
            [width - 1, 0],
            [width - 1, height - 1],
            [0, height - 1]
        ], dtype=np.float32)
        M = cv2.getPerspectiveTransform(quad, dst_rect)
        cropped_images.append(cv2.warpPerspective(img_np, M, (width, height)))

    # Step 3: language detection with CLIP and OCR on each crop.
    all_text_blocks = []
    ocr_cache = {}  # lang_code -> PaddleOCR instance; avoids re-initializing per crop

    for i, crop in enumerate(cropped_images):
        if crop is None or crop.shape[0] < 10 or crop.shape[1] < 10:
            continue  # skip degenerate/tiny crops that would break the models

        # The crop is already RGB (img_np came from a PIL RGB image), so no
        # BGR->RGB conversion is needed before handing it to CLIP.
        pil_img = Image.fromarray(crop)

        inputs = processor(text=candidates, images=pil_img, return_tensors="pt", padding=True)
        # Move tensors to wherever the CLIP model lives (GPU or CPU).
        inputs = {k: v.to(clip_model.device) for k, v in inputs.items()}
        with torch.no_grad():
            logits_per_image = clip_model(**inputs).logits_per_image
            probs = logits_per_image.softmax(dim=1)

        best = probs.argmax().item()
        detected_lang_phrase = candidates[best]
        # Phrases look like "This is English text" -> second-to-last word.
        detected_lang = detected_lang_phrase.split()[-2].lower()
        lang_code = lang_map.get(detected_lang, "en")

        ocr = ocr_cache.get(lang_code)
        if ocr is None:
            # NOTE(review): paddleocr 3.x takes `device=` and
            # `use_textline_orientation=` (as the earlier notebook version
            # of this code did), not the legacy `use_gpu=`/`use_angle_cls=`
            # keywords — confirm against the installed version.
            ocr = PaddleOCR(
                lang=lang_code,
                device="gpu" if torch.cuda.is_available() else "cpu",
                use_doc_orientation_classify=False,
                use_doc_unwarping=False,
                use_textline_orientation=False,
            )
            ocr_cache[lang_code] = ocr

        result = ocr.predict(crop)

        # Extract the recognized text from the OCR result.
        text_for_this_image = ""
        if result and result[0] and result[0].get('rec_texts'):
            text_for_this_image = " ".join(result[0]['rec_texts'])

        # Store text together with the detection box's centroid.
        center_x, center_y = get_box_center(arr[i])
        all_text_blocks.append({
            "text": text_for_this_image,
            "center_x": center_x,
            "center_y": center_y
        })

    # Step 4: reconstruct the text in reading order.
    if not all_text_blocks:
        print("No text could be extracted.")
        return "No text could be extracted."

    sorted_blocks = sorted(all_text_blocks, key=lambda item: (item["center_y"], item["center_x"]))

    lines = []
    current_line = [sorted_blocks[0]]
    for block in sorted_blocks[1:]:
        # Blocks whose vertical centers are within 40px form one line.
        if abs(block["center_y"] - current_line[-1]["center_y"]) < 40:
            current_line.append(block)
        else:
            current_line.sort(key=lambda item: item["center_x"])
            lines.append(" ".join(item["text"] for item in current_line))
            current_line = [block]

    if current_line:
        current_line.sort(key=lambda item: item["center_x"])
        lines.append(" ".join(item["text"] for item in current_line))

    final_text = "\n".join(lines)
    print("OCR pipeline finished successfully.")
    return final_text
170
+
171
# --- Gradio Interface ---
# Wire the OCR pipeline to a simple image-in / text-out UI.
image_input = gr.Image(type="pil", label="Upload Image")
text_output = gr.Textbox(label="Recognized Text")

iface = gr.Interface(
    fn=ocr_pipeline,
    inputs=image_input,
    outputs=text_output,
    title="Printed Text OCR with PaddleOCR and CLIP",
    description="Upload a printed text image. The app will detect text regions, identify the language with CLIP, and perform OCR to return the text in reading order. This space uses an H200 GPU for high-speed processing."
)

if __name__ == "__main__":
    iface.launch()