Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,3 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import os
|
| 3 |
import cv2
|
|
@@ -7,92 +10,10 @@ from PIL import Image
|
|
| 7 |
from transformers import CLIPProcessor, CLIPModel
|
| 8 |
from paddleocr import PaddleOCR
|
| 9 |
import tempfile
|
| 10 |
-
import spaces # Import for @spaces.GPU decorator
|
| 11 |
-
|
| 12 |
-
# ------------------------ Utility Functions ------------------------
|
| 13 |
-
|
| 14 |
-
def run_text_detection(img_path):
    """Run PaddleOCR on an image file and return the detected text boxes.

    Args:
        img_path: Path of the image file handed to ``PaddleOCR.ocr``.

    Returns:
        A list holding the first element of every entry in the first page of
        the OCR result (presumably the four-point quadrilateral of each text
        region -- confirm against the installed PaddleOCR version). The list
        is empty when the engine reports nothing for the image.
    """
    # NOTE(review): a new PaddleOCR instance is built on every call; if this
    # becomes a hot path, consider constructing it once at module level.
    ocr_detector = PaddleOCR(
        use_angle_cls=False,
        lang='en',
        det=True,
        rec=False,
        use_gpu=torch.cuda.is_available(),
        show_log=False,
    )
    result = ocr_detector.ocr(img_path, cls=False)

    # The first page can be empty or None when no text is found; iterating it
    # unguarded would raise TypeError.
    if not result or not result[0]:
        return []
    return [line[0] for line in result[0]]
|
| 26 |
-
|
| 27 |
-
|
| 28 |
-
def crop_and_warp_regions(img_path, regions):
    """Cut each quadrilateral region out of an image and rectify it.

    Args:
        img_path: Path of the image file, read with ``cv2.imread``.
        regions: Iterable of four-point quadrilaterals. The points are
            mapped, in order, onto the top-left, top-right, bottom-right and
            bottom-left corners of the output rectangle.

    Returns:
        A list with one perspective-warped crop per region, in input order.

    Raises:
        ValueError: If the image cannot be read from ``img_path``.
    """
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise ValueError(f"Could not read image: {img_path!r}")

    cropped_images = []
    for region in regions:
        pts = np.array(region).astype(np.float32)

        # Target size is the longer of each pair of opposite edges. Clamp to
        # one pixel so a degenerate box still yields a crop and the output
        # stays aligned index-for-index with ``regions``.
        width = max(1, int(max(
            np.linalg.norm(pts[0] - pts[1]),
            np.linalg.norm(pts[2] - pts[3]),
        )))
        height = max(1, int(max(
            np.linalg.norm(pts[0] - pts[3]),
            np.linalg.norm(pts[1] - pts[2]),
        )))

        dst = np.array(
            [
                [0, 0],
                [width - 1, 0],
                [width - 1, height - 1],
                [0, height - 1],
            ],
            dtype=np.float32,
        )

        M = cv2.getPerspectiveTransform(pts, dst)
        cropped_images.append(cv2.warpPerspective(image, M, (width, height)))

    return cropped_images
|
| 54 |
-
|
| 55 |
|
| 56 |
-
|
| 57 |
-
image_pil = Image.fromarray(image_np).convert("RGB")
|
| 58 |
-
inputs = processor(text=candidates, images=image_pil, return_tensors="pt", padding=True)
|
| 59 |
-
with torch.no_grad():
|
| 60 |
-
outputs = model(**inputs)
|
| 61 |
-
logits_per_image = outputs.logits_per_image
|
| 62 |
-
probs = logits_per_image.softmax(dim=1).squeeze().cpu().numpy()
|
| 63 |
-
detected_lang = candidates[int(np.argmax(probs))].split()[-2].lower()
|
| 64 |
-
return detected_lang
|
| 65 |
|
| 66 |
-
|
| 67 |
-
def run_paddle_ocr(image_np, ocr_model):
    """Recognise text in an in-memory image using a PaddleOCR model.

    The image is written to a temporary PNG file because the model is called
    with a file path; the file is removed again even if OCR fails.

    Args:
        image_np: Image array accepted by ``cv2.imwrite``.
        ocr_model: Object exposing ``ocr(path, cls=False)``.

    Returns:
        A list of recognised strings (``line[1][0]`` of every entry in the
        first page of the result), or an empty list when nothing is
        recognised.
    """
    # Reserve a unique path, then close the handle before writing to it.
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        tmp_path = tmp.name

    try:
        cv2.imwrite(tmp_path, image_np)
        result = ocr_model.ocr(tmp_path, cls=False)
    finally:
        # Always clean up, otherwise a failing OCR call leaks the temp file.
        os.remove(tmp_path)

    # ``[None]`` is truthy, so the first page has to be checked as well.
    if not result or not result[0]:
        return []
    return [line[1][0] for line in result[0]]
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
def group_text_by_position(all_results, positions):
    """Order recognised text top-to-bottom and join each region's strings.

    Args:
        all_results: Sequence of dicts, each with a ``"texts"`` list of
            strings for one region.
        positions: Sequence of boxes parallel to ``all_results``; each box is
            an iterable of points whose index 1 is the y coordinate.

    Returns:
        A list with one space-joined string per region, sorted by the
        smallest y coordinate of the region's box. Regions with equal
        smallest y keep their input order.
    """
    keyed = [
        (min(pt[1] for pt in box), result["texts"])
        for result, box in zip(all_results, positions)
    ]
    # Sort on the y coordinate only so ties stay in input order (stable sort)
    # and the text lists themselves are never compared.
    keyed.sort(key=lambda entry: entry[0])
    return [" ".join(texts) for _, texts in keyed]
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
# ------------------------ Load Models Once ------------------------
|
| 96 |
|
| 97 |
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
|
| 98 |
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
|
|
@@ -111,10 +32,9 @@ candidates = [
|
|
| 111 |
"This is Korean text"
|
| 112 |
]
|
| 113 |
|
| 114 |
-
#
|
| 115 |
-
|
| 116 |
-
@spaces.GPU()
|
| 117 |
def process_image(image):
|
|
|
|
| 118 |
image_pil = Image.fromarray(image).convert("RGB")
|
| 119 |
img_path = "uploaded.jpg"
|
| 120 |
image_pil.save(img_path)
|
|
@@ -134,8 +54,7 @@ def process_image(image):
|
|
| 134 |
det=False,
|
| 135 |
rec=True,
|
| 136 |
cls=False,
|
| 137 |
-
show_log=False
|
| 138 |
-
use_angle_cls=False
|
| 139 |
)
|
| 140 |
texts = run_paddle_ocr(crop, ocr_model)
|
| 141 |
all_results.append({
|
|
@@ -147,9 +66,6 @@ def process_image(image):
|
|
| 147 |
final_lines = group_text_by_position(all_results, boxes)
|
| 148 |
return "\n".join(final_lines)
|
| 149 |
|
| 150 |
-
|
| 151 |
-
# ------------------------ Gradio Interface ------------------------
|
| 152 |
-
|
| 153 |
interface = gr.Interface(
|
| 154 |
fn=process_image,
|
| 155 |
inputs=gr.Image(type="numpy", label="Upload an Image"),
|
|
|
|
| 1 |
+
import spaces # import before anything else to request GPU
|
| 2 |
+
# NOTE(review): the former `spaces.GPU.require("H200")` call was removed here.
# `spaces.GPU` is the decorator used on the GPU function below, and the
# `spaces` package does not appear to expose a `require` attribute on it, so
# the call would fail at import time. Hardware is selected in the Space
# settings; the `@spaces.GPU` decorator is what requests the GPU per call.
|
| 3 |
+
|
| 4 |
import gradio as gr
|
| 5 |
import os
|
| 6 |
import cv2
|
|
|
|
| 10 |
from transformers import CLIPProcessor, CLIPModel
|
| 11 |
from paddleocr import PaddleOCR
|
| 12 |
import tempfile
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
+
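# TODO: restore the utility function definitions here (run_text_detection,
# crop_and_warp_regions, run_paddle_ocr, group_text_by_position, ...).
# This placeholder replaced them, but process_image still calls
# run_paddle_ocr and group_text_by_position, which will raise NameError.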
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
+
# Load your models outside your GPU function to avoid reloading
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
|
| 19 |
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
|
|
|
|
| 32 |
"This is Korean text"
|
| 33 |
]
|
| 34 |
|
| 35 |
+
@spaces.GPU # This decorator tells Spaces to run this function on GPU
|
|
|
|
|
|
|
| 36 |
def process_image(image):
|
| 37 |
+
# your processing code here, exactly as before
|
| 38 |
image_pil = Image.fromarray(image).convert("RGB")
|
| 39 |
img_path = "uploaded.jpg"
|
| 40 |
image_pil.save(img_path)
|
|
|
|
| 54 |
det=False,
|
| 55 |
rec=True,
|
| 56 |
cls=False,
|
| 57 |
+
show_log=False
|
|
|
|
| 58 |
)
|
| 59 |
texts = run_paddle_ocr(crop, ocr_model)
|
| 60 |
all_results.append({
|
|
|
|
| 66 |
final_lines = group_text_by_position(all_results, boxes)
|
| 67 |
return "\n".join(final_lines)
|
| 68 |
|
|
|
|
|
|
|
|
|
|
| 69 |
interface = gr.Interface(
|
| 70 |
fn=process_image,
|
| 71 |
inputs=gr.Image(type="numpy", label="Upload an Image"),
|