Create app.py
app.py ADDED
import gradio as gr
import spaces
from PIL import Image
from transformers import CLIPProcessor, CLIPModel
from paddleocr import PaddleOCR

from ocr_utils import (
    run_text_detection,
    crop_and_warp_regions,
    detect_language_clip,
    run_paddle_ocr,
    group_text_by_position
)

# Load the CLIP language classifier once at startup.
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

# Map CLIP labels to PaddleOCR language codes.
language_map = {
    "english": "en",
    "telugu": "te",
    "chinese": "ch",
    "korean": "korean"
}

# Zero-shot prompts for CLIP-based language identification.
candidates = [
    "This is English text",
    "This is Telugu text",
    "This is Chinese text",
    "This is Korean text"
]

# Cache one recognizer per language so models are not rebuilt for every crop.
ocr_models = {}

def get_ocr_model(lang_code):
    if lang_code not in ocr_models:
        # Document-level preprocessing is disabled: regions are already
        # detected, cropped, and deskewed before recognition.
        ocr_models[lang_code] = PaddleOCR(
            use_doc_orientation_classify=False,
            use_doc_unwarping=False,
            use_textline_orientation=False,
            lang=lang_code
        )
    return ocr_models[lang_code]

@spaces.GPU  # Request a ZeroGPU device on Hugging Face Spaces for each call.
def process_image(image):
    image_pil = Image.fromarray(image).convert("RGB")
    img_path = "uploaded.jpg"
    image_pil.save(img_path)

    # Detect text regions, then crop and perspective-warp each one.
    arr = run_text_detection(img_path)
    cropped_images = crop_and_warp_regions(img_path, arr)

    all_results = []
    for crop in cropped_images:
        # Identify the script of this region and pick the matching recognizer.
        lang_detected = detect_language_clip(crop, clip_model, clip_processor, candidates)
        lang_code = language_map.get(lang_detected.lower(), "en")

        ocr_model = get_ocr_model(lang_code)
        result_texts = run_paddle_ocr(crop, ocr_model)
        all_results.append({
            "lang": lang_detected,
            "texts": result_texts,
            "image": crop
        })

    # Reassemble the recognized fragments into lines using box geometry.
    lines_final = group_text_by_position(all_results, arr)
    return "\n".join(lines_final)

interface = gr.Interface(
    fn=process_image,
    inputs=gr.Image(type="numpy", label="Upload an Image"),
    outputs=gr.Textbox(label="Reconstructed Text"),
    title="Printed Text OCR",
    description="Upload a scanned document or printed image. The app detects bounding boxes, extracts text, detects language, and reconstructs the text."
)

if __name__ == "__main__":
    interface.launch()
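
None of the ocr_utils helpers are included in this commit. The sketches below show one way each could be implemented, under stated assumptions; they are illustrations, not the module's actual code. Detection first: the original imports hint at PaddleOCR 3.x, whose standalone TextDetection module returns one four-point polygon per detected text region (result keys such as "dt_polys" can vary across versions).

from paddleocr import TextDetection

det_model = TextDetection()  # loads the default PP-OCRv5 detection model

def run_text_detection(img_path):
    """Return detected text-region polygons for the image at img_path."""
    results = det_model.predict(img_path)
    # Results are dict-like; "dt_polys" holds one (4, 2) point array per
    # region, ordered top-left, top-right, bottom-right, bottom-left.
    return results[0]["dt_polys"]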
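
crop_and_warp_regions presumably rectifies each detected quadrilateral into an axis-aligned crop. A sketch with OpenCV, assuming PaddleOCR's point order (top-left, top-right, bottom-right, bottom-left):

import cv2
import numpy as np

def crop_and_warp_regions(img_path, polys):
    """Perspective-warp each 4-point polygon into an axis-aligned crop."""
    image = cv2.imread(img_path)
    crops = []
    for poly in polys:
        pts = np.asarray(poly, dtype=np.float32)
        # Output size taken from the longer of each opposite edge pair.
        w = int(max(np.linalg.norm(pts[0] - pts[1]), np.linalg.norm(pts[3] - pts[2])))
        h = int(max(np.linalg.norm(pts[0] - pts[3]), np.linalg.norm(pts[1] - pts[2])))
        dst = np.array([[0, 0], [w, 0], [w, h], [0, h]], dtype=np.float32)
        M = cv2.getPerspectiveTransform(pts, dst)
        crops.append(cv2.warpPerspective(image, M, (w, h)))
    return crops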
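
Language identification is zero-shot CLIP classification: score the crop against the candidate prompts and return the language word from the best-scoring prompt, which is what language_map.get(lang_detected.lower(), "en") in app.py expects. A sketch assuming crops are OpenCV BGR arrays:

import torch
from PIL import Image

def detect_language_clip(crop, clip_model, clip_processor, candidates):
    """Zero-shot language ID for one cropped text region."""
    image = Image.fromarray(crop[:, :, ::-1])  # BGR (OpenCV) -> RGB (PIL)
    inputs = clip_processor(text=candidates, images=image,
                            return_tensors="pt", padding=True)
    with torch.no_grad():
        logits = clip_model(**inputs).logits_per_image  # (1, len(candidates))
    best = logits.argmax(dim=-1).item()
    return candidates[best].split()[2]  # "This is Telugu text" -> "Telugu"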
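
Recognition on a single pre-cropped region. Assuming the PaddleOCR 3.x pipeline object built in app.py, predict() accepts a NumPy image and yields dict-like results whose "rec_texts" field lists the recognized strings (again, the key name is version-dependent):

def run_paddle_ocr(crop, ocr_model):
    """Run OCR on one cropped region and return its text strings."""
    texts = []
    for res in ocr_model.predict(crop):
        texts.extend(res["rec_texts"])
    return texts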
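
Finally, reconstruction. One plausible reading of group_text_by_position: pair each region's texts with its polygon, sort by top edge, merge regions whose tops fall within a vertical tolerance into one line, and order each line left to right. A sketch; the y_tol parameter is hypothetical:

import numpy as np

def group_text_by_position(all_results, polys, y_tol=10):
    """Order recognized fragments into reading order using box geometry."""
    items = []
    for result, poly in zip(all_results, polys):
        pts = np.asarray(poly)
        top, left = float(pts[:, 1].min()), float(pts[:, 0].min())
        items.append((top, left, " ".join(result["texts"])))
    items.sort(key=lambda t: t[0])  # top-to-bottom

    lines, current = [], []
    for top, left, text in items:
        if current and top - current[-1][0] > y_tol:
            current.sort(key=lambda t: t[1])  # left-to-right within a line
            lines.append(" ".join(t[2] for t in current))
            current = []
        current.append((top, left, text))
    if current:
        current.sort(key=lambda t: t[1])
        lines.append(" ".join(t[2] for t in current))
    return lines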