Spaces:

imperiusrex
/

PrintedTextOCR

Sleeping

App Files Files Community

imperiusrex committed on Jul 31, 2025

Commit

9c5dae2

verified ·

1 Parent(s): 4cc1450

Update app.py

Browse files

Files changed (1) hide show

app.py +8 -92

app.py CHANGED Viewed

@@ -1,3 +1,6 @@
 import gradio as gr
 import os
 import cv2
@@ -7,92 +10,10 @@ from PIL import Image
 from transformers import CLIPProcessor, CLIPModel
 from paddleocr import PaddleOCR
 import tempfile
-import spaces  # Import for @spaces.GPU decorator
-# ------------------------ Utility Functions ------------------------
-def run_text_detection(img_path):
-    ocr_detector = PaddleOCR(
-        use_angle_cls=False,
-        lang='en',
-        det=True,
-        rec=False,
-        use_gpu=torch.cuda.is_available(),
-        show_log=False
-    )
-    result = ocr_detector.ocr(img_path, cls=False)
-    boxes = [line[0] for line in result[0]]
-    return boxes
-def crop_and_warp_regions(img_path, regions):
-    image = cv2.imread(img_path)
-    cropped_images = []
-    for region in regions:
-        pts = np.array(region).astype(np.float32)
-        width = int(
-            max(np.linalg.norm(pts[0] - pts[1]), np.linalg.norm(pts[2] - pts[3]))
-        )
-        height = int(
-            max(np.linalg.norm(pts[0] - pts[3]), np.linalg.norm(pts[1] - pts[2]))
-        )
-        dst = np.array([
-            [0, 0],
-            [width - 1, 0],
-            [width - 1, height - 1],
-            [0, height - 1]
-        ], dtype=np.float32)
-        M = cv2.getPerspectiveTransform(pts, dst)
-        warped = cv2.warpPerspective(image, M, (width, height))
-        cropped_images.append(warped)
-    return cropped_images
-def detect_language_clip(image_np, model, processor, candidates):
-    image_pil = Image.fromarray(image_np).convert("RGB")
-    inputs = processor(text=candidates, images=image_pil, return_tensors="pt", padding=True)
-    with torch.no_grad():
-        outputs = model(**inputs)
-        logits_per_image = outputs.logits_per_image
-        probs = logits_per_image.softmax(dim=1).squeeze().cpu().numpy()
-        detected_lang = candidates[int(np.argmax(probs))].split()[-2].lower()
-    return detected_lang
-def run_paddle_ocr(image_np, ocr_model):
-    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
-        tmp_path = tmp.name
-        cv2.imwrite(tmp_path, image_np)
-    result = ocr_model.ocr(tmp_path, cls=False)
-    texts = [line[1][0] for line in result[0]] if result else []
-    os.remove(tmp_path)
-    return texts
-def group_text_by_position(all_results, positions):
-    lines = []
-    for result, box in zip(all_results, positions):
-        min_y = min([pt[1] for pt in box])
-        lines.append((min_y, result["texts"]))
-    # Sort by vertical position
-    lines.sort(key=lambda x: x[0])
-    # Flatten
-    reconstructed = []
-    for _, texts in lines:
-        reconstructed.append(" ".join(texts))
-    return reconstructed
-# ------------------------ Load Models Once ------------------------
 clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
 clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
@@ -111,10 +32,9 @@ candidates = [
     "This is Korean text"
 ]
-# ------------------------ Main Processing Function ------------------------
-@spaces.GPU()
 def process_image(image):
     image_pil = Image.fromarray(image).convert("RGB")
     img_path = "uploaded.jpg"
     image_pil.save(img_path)
@@ -134,8 +54,7 @@ def process_image(image):
             det=False,
             rec=True,
             cls=False,
-            show_log=False,
-            use_angle_cls=False
         )
         texts = run_paddle_ocr(crop, ocr_model)
         all_results.append({
@@ -147,9 +66,6 @@ def process_image(image):
     final_lines = group_text_by_position(all_results, boxes)
     return "\n".join(final_lines)
-# ------------------------ Gradio Interface ------------------------
 interface = gr.Interface(
     fn=process_image,
     inputs=gr.Image(type="numpy", label="Upload an Image"),

+import spaces  # import before anything else to request GPU
+spaces.GPU.require("H200")  # request H200 GPU
 import gradio as gr
 import os
 import cv2
 from transformers import CLIPProcessor, CLIPModel
 from paddleocr import PaddleOCR
 import tempfile
+# (Keep all your utility functions here, unchanged; process_image below still calls run_paddle_ocr and group_text_by_position)
+# Load your models outside your GPU function to avoid reloading
 clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
 clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
     "This is Korean text"
 ]
+@spaces.GPU  # This decorator tells Spaces to run this function on GPU
 def process_image(image):
+    # your processing code here, exactly as before
     image_pil = Image.fromarray(image).convert("RGB")
     img_path = "uploaded.jpg"
     image_pil.save(img_path)
             det=False,
             rec=True,
             cls=False,
+            show_log=False
         )
         texts = run_paddle_ocr(crop, ocr_model)
         all_results.append({
     final_lines = group_text_by_position(all_results, boxes)
     return "\n".join(final_lines)
 interface = gr.Interface(
     fn=process_image,
     inputs=gr.Image(type="numpy", label="Upload an Image"),