imperiusrex committed on
Commit
9c5dae2
·
verified ·
1 Parent(s): 4cc1450

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -92
app.py CHANGED
@@ -1,3 +1,6 @@
 
 
 
1
  import gradio as gr
2
  import os
3
  import cv2
@@ -7,92 +10,10 @@ from PIL import Image
7
  from transformers import CLIPProcessor, CLIPModel
8
  from paddleocr import PaddleOCR
9
  import tempfile
10
- import spaces # Import for @spaces.GPU decorator
11
-
12
- # ------------------------ Utility Functions ------------------------
13
-
14
def run_text_detection(img_path):
    """Detect text regions in an image with PaddleOCR (detection only, no recognition).

    Args:
        img_path: Path to the image file on disk.

    Returns:
        A list of quadrilateral boxes, each a list of four (x, y) corner points.
        Empty list when no text is detected.
    """
    ocr_detector = PaddleOCR(
        use_angle_cls=False,
        lang='en',
        det=True,
        rec=False,
        use_gpu=torch.cuda.is_available(),
        show_log=False
    )
    result = ocr_detector.ocr(img_path, cls=False)
    # PaddleOCR returns [None] (or an empty list) when nothing is detected;
    # guard before indexing, otherwise iterating result[0] raises TypeError.
    if not result or result[0] is None:
        return []
    return [line[0] for line in result[0]]
26
-
27
-
28
def crop_and_warp_regions(img_path, regions):
    """Perspective-rectify each detected quadrilateral into an axis-aligned crop.

    Args:
        img_path: Path to the source image on disk.
        regions: Iterable of quadrilaterals, each four (x, y) corner points
            ordered top-left, top-right, bottom-right, bottom-left
            (PaddleOCR's detection order — TODO confirm against detector output).

    Returns:
        A list of numpy image arrays (BGR, as read by OpenCV), one warped
        crop per usable region.

    Raises:
        FileNotFoundError: If the image cannot be read from img_path.
    """
    image = cv2.imread(img_path)
    if image is None:
        # cv2.imread silently returns None on a missing/corrupt file; fail
        # loudly here instead of crashing later inside warpPerspective.
        raise FileNotFoundError(f"Could not read image: {img_path}")

    cropped_images = []
    for region in regions:
        pts = np.array(region).astype(np.float32)

        # Target size = longest of each pair of opposing edges.
        width = int(max(np.linalg.norm(pts[0] - pts[1]),
                        np.linalg.norm(pts[2] - pts[3])))
        height = int(max(np.linalg.norm(pts[0] - pts[3]),
                         np.linalg.norm(pts[1] - pts[2])))
        if width <= 0 or height <= 0:
            # Degenerate box: warping to a zero-sized image would fail.
            continue

        dst = np.array([
            [0, 0],
            [width - 1, 0],
            [width - 1, height - 1],
            [0, height - 1]
        ], dtype=np.float32)

        M = cv2.getPerspectiveTransform(pts, dst)
        warped = cv2.warpPerspective(image, M, (width, height))
        cropped_images.append(warped)

    return cropped_images
54
-
55
 
56
def detect_language_clip(image_np, model, processor, candidates):
    """Classify the language of a text crop by zero-shot CLIP matching.

    Scores the crop against every prompt in ``candidates`` (e.g.
    "This is Korean text") and returns the language word of the winning
    prompt, lowercased (its second-to-last whitespace token).

    Args:
        image_np: Image as a numpy array.
        model: A loaded CLIP model.
        processor: The matching CLIP processor.
        candidates: List of prompt strings, one per candidate language.

    Returns:
        The detected language name, lowercased.
    """
    pil_img = Image.fromarray(image_np).convert("RGB")
    batch = processor(
        text=candidates, images=pil_img, return_tensors="pt", padding=True
    )
    with torch.no_grad():
        clip_out = model(**batch)
    scores = clip_out.logits_per_image.softmax(dim=1).squeeze().cpu().numpy()
    best_prompt = candidates[int(np.argmax(scores))]
    return best_prompt.split()[-2].lower()
65
 
66
-
67
def run_paddle_ocr(image_np, ocr_model):
    """Run text recognition on an in-memory image via a temporary PNG file.

    Args:
        image_np: Image as a numpy array (BGR, as produced by OpenCV).
        ocr_model: A pre-configured PaddleOCR instance.

    Returns:
        A list of recognized text strings; empty when nothing is read.
    """
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as tmp:
        tmp_path = tmp.name
    try:
        cv2.imwrite(tmp_path, image_np)
        result = ocr_model.ocr(tmp_path, cls=False)
    finally:
        # Always clean up: the original leaked the temp file whenever
        # imwrite or the OCR call raised.
        os.remove(tmp_path)
    # PaddleOCR yields [None] for "no text found"; the plain `if result`
    # check misses that case, so guard result[0] explicitly.
    if not result or result[0] is None:
        return []
    return [line[1][0] for line in result[0]]
76
-
77
-
78
def group_text_by_position(all_results, positions):
    """Order OCR results top-to-bottom and join each region's texts into lines.

    Args:
        all_results: List of dicts, each carrying a "texts" list of strings.
        positions: Parallel list of quadrilaterals (four (x, y) points)
            giving each result's location in the source image.

    Returns:
        List of strings, one per region, sorted by the region's topmost
        y coordinate, with each region's texts joined by single spaces.
    """
    keyed = sorted(
        ((min(pt[1] for pt in box), entry["texts"])
         for entry, box in zip(all_results, positions)),
        key=lambda item: item[0],  # sort on y only; stable, like the original
    )
    return [" ".join(texts) for _, texts in keyed]
93
-
94
-
95
- # ------------------------ Load Models Once ------------------------
96
 
97
  clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
98
  clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
@@ -111,10 +32,9 @@ candidates = [
111
  "This is Korean text"
112
  ]
113
 
114
- # ------------------------ Main Processing Function ------------------------
115
-
116
- @spaces.GPU()
117
  def process_image(image):
 
118
  image_pil = Image.fromarray(image).convert("RGB")
119
  img_path = "uploaded.jpg"
120
  image_pil.save(img_path)
@@ -134,8 +54,7 @@ def process_image(image):
134
  det=False,
135
  rec=True,
136
  cls=False,
137
- show_log=False,
138
- use_angle_cls=False
139
  )
140
  texts = run_paddle_ocr(crop, ocr_model)
141
  all_results.append({
@@ -147,9 +66,6 @@ def process_image(image):
147
  final_lines = group_text_by_position(all_results, boxes)
148
  return "\n".join(final_lines)
149
 
150
-
151
- # ------------------------ Gradio Interface ------------------------
152
-
153
  interface = gr.Interface(
154
  fn=process_image,
155
  inputs=gr.Image(type="numpy", label="Upload an Image"),
 
1
+ import spaces # import before anything else to request GPU
2
+ spaces.GPU.require("H200") # request H200 GPU
3
+
4
  import gradio as gr
5
  import os
6
  import cv2
 
10
  from transformers import CLIPProcessor, CLIPModel
11
  from paddleocr import PaddleOCR
12
  import tempfile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
+ # (Keep all your utility functions here, unchanged)
 
 
 
 
 
 
 
 
15
 
16
+ # Load your models outside your GPU function to avoid reloading
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
 
18
  clip_model = CLIPModel.from_pretrained("openai/clip-vit-large-patch14")
19
  clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")
 
32
  "This is Korean text"
33
  ]
34
 
35
+ @spaces.GPU # This decorator tells Spaces to run this function on GPU
 
 
36
  def process_image(image):
37
+ # your processing code here, exactly as before
38
  image_pil = Image.fromarray(image).convert("RGB")
39
  img_path = "uploaded.jpg"
40
  image_pil.save(img_path)
 
54
  det=False,
55
  rec=True,
56
  cls=False,
57
+ show_log=False
 
58
  )
59
  texts = run_paddle_ocr(crop, ocr_model)
60
  all_results.append({
 
66
  final_lines = group_text_by_position(all_results, boxes)
67
  return "\n".join(final_lines)
68
 
 
 
 
69
  interface = gr.Interface(
70
  fn=process_image,
71
  inputs=gr.Image(type="numpy", label="Upload an Image"),