imperiusrex commited on
Commit
051ce33
·
verified ·
1 Parent(s): 1bbbe26

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +401 -60
app.py CHANGED
@@ -1,76 +1,417 @@
1
- import spaces
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
 
3
- # No spaces.GPU.require() here, remove it
4
 
5
- import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import cv2
7
  import numpy as np
8
- import torch
9
- from PIL import Image
10
- from transformers import CLIPProcessor, CLIPModel
11
- from paddleocr import PaddleOCR
12
- import tempfile
13
 
14
- # Your utility functions here (run_text_detection, crop_and_warp_regions, etc.)
15
 
16
- clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
17
- clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
18
 
19
- language_map = {
20
- "english": "en",
21
- "telugu": "te",
22
- "chinese": "ch",
23
- "korean": "korean"
24
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  candidates = [
27
  "This is English text",
 
 
28
  "This is Telugu text",
 
 
29
  "This is Chinese text",
30
- "This is Korean text"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  ]
32
 
33
- @spaces.GPU # Decorate the function you want to run on GPU
34
- def process_image(image):
35
- # Your processing logic here
36
- image_pil = Image.fromarray(image).convert("RGB")
37
- img_path = "uploaded.jpg"
38
- image_pil.save(img_path)
39
-
40
- boxes = run_text_detection(img_path)
41
- crops = crop_and_warp_regions(img_path, boxes)
42
-
43
- all_results = []
44
- for crop in crops:
45
- lang = detect_language_clip(crop, clip_model, clip_processor, candidates)
46
- lang_code = language_map.get(lang, "en")
47
- ocr_model = PaddleOCR(
48
- use_doc_orientation_classify=False,
49
- use_doc_unwarping=False,
50
- use_textline_orientation=False,
51
- lang=lang_code,
52
- det=False,
53
- rec=True,
54
- cls=False,
55
- show_log=False
56
- )
57
- texts = run_paddle_ocr(crop, ocr_model)
58
- all_results.append({
59
- "lang": lang,
60
- "texts": texts,
61
- "image": crop
62
- })
63
-
64
- final_lines = group_text_by_position(all_results, boxes)
65
- return "\n".join(final_lines)
66
-
67
- interface = gr.Interface(
68
- fn=process_image,
69
- inputs=gr.Image(type="numpy", label="Upload an Image"),
70
- outputs=gr.Textbox(label="Reconstructed Text"),
71
- title="Printed Text OCR",
72
- description="Upload a printed or scanned document image. The system detects text regions, recognizes language, runs OCR, and reconstructs the output."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  )
74
 
75
- if __name__ == "__main__":
76
- interface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# NOTE(review): the original first line was "!pip install transformers ftfy
# paddleocr paddlepaddle" — an IPython shell magic that is a SyntaxError in a
# plain .py file. Dependencies belong in requirements.txt
# (transformers, ftfy, paddleocr, paddlepaddle).

import glob
import json
import os

import torch
from google.colab import files  # Colab-only: replace for non-Colab deployments
from IPython.display import Image as ColabImage, display
from paddleocr import PaddleOCR, TextDetection  # deduplicated PaddleOCR import
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
# --- Input: upload an image and report its dimensions -----------------------
uploaded = files.upload()
img_path = next(iter(uploaded.keys()))

# Open once and normalise to RGB. (The previous revision opened the file a
# second time, silently discarding the RGB conversion.)
image = Image.open(img_path).convert("RGB")
width, height = image.size
total_pixels = width * height

print(f"Width: {width}, Height: {height}, Total pixels: {total_pixels}")
# --- Stage 1: text detection ------------------------------------------------
# Collect the quadrilateral bounding box of every detected text region.
arr = []

detector = TextDetection(model_name="PP-OCRv5_server_det")
for res in detector.predict(img_path, batch_size=1):
    polys = res['dt_polys']  # NumPy array of shape (N, 4, 2)
    if polys is not None:
        arr.extend(polys.tolist())

# Approximate reading order: sort by the top-left corner, y first then x.
arr.sort(key=lambda box: (box[0][1], box[0][0]))

print("Extracted bounding box coordinates (in reading order):")
for box in arr:
    print(box)

print(f"Number of detected text regions: {len(arr)}")
import cv2
import numpy as np
import os
import json

# --- Stage 2: crop each detected region with a perspective warp -------------

# Load the original image (BGR). cv2.imread returns None on failure, which
# would otherwise surface later as a cryptic error inside warpPerspective.
img = cv2.imread(img_path)
if img is None:
    raise FileNotFoundError(f"Could not read image: {img_path}")

# Output setup
output_dir = "./output/crops_warped"
os.makedirs(output_dir, exist_ok=True)

cropped_images = []

for i, box in enumerate(arr):
    box = np.array(box, dtype=np.float32)  # shape: (4, 2)

    # Target size: the longer of each pair of opposite edges, so skewed quads
    # are not squashed. Named crop_w/crop_h so we no longer clobber the
    # module-level width/height of the full image computed earlier.
    width_a = np.linalg.norm(box[0] - box[1])
    width_b = np.linalg.norm(box[2] - box[3])
    height_a = np.linalg.norm(box[0] - box[3])
    height_b = np.linalg.norm(box[1] - box[2])

    crop_w = int(max(width_a, width_b))
    crop_h = int(max(height_a, height_b))

    # Destination rectangle in the warped image.
    dst_rect = np.array([
        [0, 0],
        [crop_w - 1, 0],
        [crop_w - 1, crop_h - 1],
        [0, crop_h - 1],
    ], dtype=np.float32)

    # Perspective transform: map the source quad onto the upright rectangle.
    M = cv2.getPerspectiveTransform(box, dst_rect)
    warped = cv2.warpPerspective(img, M, (crop_w, crop_h))

    cropped_images.append(warped)

    # Save the warped crop for inspection.
    cv2.imwrite(os.path.join(output_dir, f"crop_{i}.png"), warped)

print(f"Cropped {len(cropped_images)} perspective-warped regions.")
97
+
98
+ # cropped_images.reverse()
99
+
100
+
101
+ import matplotlib.pyplot as plt
102
+
103
+ # Display all cropped images in a grid
104
+ for i, crop in enumerate(cropped_images):
105
+ plt.figure(figsize=(4, 4))
106
+ plt.imshow(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)) # Convert BGR to RGB
107
+ plt.title(f'Cropped Image {i}')
108
+ plt.axis('off')
109
+ plt.show()
110
+
# --- Stage 3: per-crop language identification with CLIP --------------------
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Candidate prompts for zero-shot language/script classification. The
# second-to-last word of the winning prompt is taken as the language name
# below, so every entry must follow the "This is <Language> text" pattern.
# Add more languages here and mirror each addition in lang_map.
candidates = [
    "This is English text",
    "This is Telugu text",
    "This is Chinese text",
    "This is Korean text",
]
# Map the language name extracted from the winning CLIP prompt (lowercased)
# to its PaddleOCR language code. Keep this in sync with `candidates`
# (e.g. a "This is Japanese text" prompt would need "japanese": "japan").
lang_map = {
    "english": "en",
    "telugu": "te",
    "chinese": "ch",
    "korean": "korean",
}
# Classify the language of every crop. The per-crop codes are kept in
# detected_langs so downstream stages can use them; NOTE(review): the OCR
# stage below still uses only the LAST crop's `lang_code` for all regions —
# per-crop OCR would be the fuller fix.
detected_langs = []

for crop in cropped_images:  # renamed from `img`: it shadowed the full image
    # CLIP similarity between the crop and each candidate prompt.
    inputs = processor(text=candidates, images=crop, return_tensors="pt", padding=True)
    with torch.no_grad():
        logits_per_image = clip_model(**inputs).logits_per_image
        probs = logits_per_image.softmax(dim=1)

    # Winning prompt -> language word ("This is English text" -> "english").
    best = probs.argmax().item()
    detected_lang_phrase = candidates[best]
    detected_lang = detected_lang_phrase.split()[-2].lower()
    lang_code = lang_map.get(detected_lang, "en")
    detected_langs.append(lang_code)

    print(f"\n✅ Detected script/language: {detected_lang_phrase} → PaddleOCR lang='{lang_code}'")
import numpy as np  # Ensure numpy is imported
import os  # Ensure os is imported

# --- Stage 4: recognition ---------------------------------------------------
# One PaddleOCR instance configured with the language detected above.
# All document-preprocessing classifiers are deliberately DISABLED — the
# crops are already perspective-warped upright (the original inline comments
# claimed the opposite of what the flags do).
ocr = PaddleOCR(
    use_doc_orientation_classify=False,  # crops are already deskewed
    use_doc_unwarping=False,
    use_textline_orientation=False,
    lang=lang_code,  # code from the CLIP stage (last crop wins)
    device="cpu"
)
for i, crop in enumerate(cropped_images):
    # Per-crop output paths (Colab-style /content/ scratch space).
    base_name = f"cropped_image_{i}"
    bounding_boxes_image_path = os.path.join("/content/", f"{base_name}_bounding_boxes.jpg")
    json_file_path = os.path.join("/content/", f"{base_name}.json")

    # Tiny crops make PaddleOCR error out — skip them.
    if crop.shape[0] < 10 or crop.shape[1] < 10:
        print(f"Skipping small image {i} with shape {crop.shape}")
        continue

    # Run recognition on the NumPy array and persist the results.
    result = ocr.predict(crop)
    if result and result[0]:
        result[0].save_to_img(bounding_boxes_image_path)
        result[0].save_to_json(json_file_path)
    else:
        print(f"No OCR results found for cropped image {i}.")

    # Show the annotated crop if it was written.
    saved_image_path = bounding_boxes_image_path
    if os.path.exists(saved_image_path):
        display(ColabImage(filename=saved_image_path))
    else:
        print(f"No OCR image found at: {saved_image_path}")

    # Rebuild a rough line layout from the recognised boxes.
    saved_json_path = json_file_path
    if os.path.exists(saved_json_path):
        with open(saved_json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        elements = []
        for text, (x1, y1, x2, y2) in zip(data["rec_texts"], data["rec_boxes"]):
            elements.append({"text": text, "x": x1, "y": y1, "line_y": (y1 + y2) / 2})

        # Coarse bucket sort by vertical center, then greedy line grouping
        # with a 10 px vertical tolerance.
        elements.sort(key=lambda e: (round(e["line_y"] / 10), e["x"]))

        lines = []
        line = []
        current_line_y = None
        for elem in elements:
            if current_line_y is None or abs(elem["line_y"] - current_line_y) <= 10:
                line.append(elem["text"])
            else:
                lines.append(line)
                line = [elem["text"]]
            current_line_y = elem["line_y"]
        if line:
            lines.append(line)

        # Concatenate joined lines with no separator — identical to the
        # original += loop (the "\n\n" separator was commented out).
        markdown_output = "".join(" ".join(words) for words in lines)
        print(markdown_output)
    else:
        print(f"No JSON file found at: {saved_json_path}")
import json
import os

# --- Collect the recognised text for every crop, in crop order --------------

def _read_crop_text(path):
    """Return the space-joined 'rec_texts' from one crop's JSON, or ''."""
    if not os.path.exists(path):
        return ""
    with open(path, 'r', encoding='utf-8') as f:
        try:
            data = json.load(f)
            # predict() serialises a dict; 'rec_texts' holds the strings.
            if data and 'rec_texts' in data:
                return " ".join(data['rec_texts'])
        except json.JSONDecodeError:
            print(f"Warning: Could not decode JSON from {path}")
        except KeyError as e:
            print(f"Warning: Unexpected JSON structure in {path}: {e}")
    return ""

predicted_texts = [
    _read_crop_text(f"/content/cropped_image_{i}.json")
    for i in range(len(cropped_images))
]

# Display the final list of predicted texts
print("Predicted Texts Array (from JSON files):")
print(predicted_texts)
import json
import os

def get_box_center(box):
    """Return the (x, y) centroid of a polygon given as [[x, y], ...] pairs."""
    xs = [p[0] for p in box]
    ys = [p[1] for p in box]
    return sum(xs) / len(xs), sum(ys) / len(ys)
# --- Step 1: read each crop's text and pair it with its box centroid --------
all_text_blocks = []
for i, box in enumerate(arr):
    json_file_path = f"/content/cropped_image_{i}.json"
    if not os.path.exists(json_file_path):
        continue
    with open(json_file_path, 'r', encoding='utf-8') as f:
        result = json.load(f)
    if result and 'rec_texts' in result and result['rec_texts']:
        center_x, center_y = get_box_center(box)
        all_text_blocks.append({
            "text": " ".join(result['rec_texts']),
            "center_x": center_x,
            "center_y": center_y,
        })

# --- Step 2: order top-to-bottom, cluster into lines (40 px vertical
# tolerance), then sort each line left-to-right ------------------------------
if all_text_blocks:
    sorted_blocks = sorted(
        all_text_blocks,
        key=lambda item: (item["center_y"], item["center_x"]),
    )

    lines = []
    if sorted_blocks:

        def _flush(group):
            # Order a finished line by horizontal position and emit it.
            group.sort(key=lambda item: item["center_x"])
            lines.append(" ".join(item["text"] for item in group))

        current_line = [sorted_blocks[0]]
        for block in sorted_blocks[1:]:
            # Same line when vertical centers are within the threshold of the
            # previous block in the cluster.
            if abs(block["center_y"] - current_line[-1]["center_y"]) < 40:
                current_line.append(block)
            else:
                _flush(current_line)
                current_line = [block]
        if current_line:
            _flush(current_line)

    # --- Step 3: print the final reconstructed text -------------------------
    if lines:
        for line in lines:
            print(line)
    else:
        print("No text was reconstructed.")
else:
    print("No text blocks were found.")
416
+
417
These are the Colab code cells for printed-text OCR. I now need to deploy this as a Hugging Face Space on an H200 GPU. Make sure to include `import spaces` and the `@spaces.GPU` decorator so the GPU works; also provide a requirements.txt, and use Gradio for the UI.