prithivMLmods committed on
Commit
a3f3e60
·
verified ·
1 Parent(s): e305e72

update app

Browse files
Files changed (1) hide show
  1. app.py +5 -17
app.py CHANGED
@@ -1,5 +1,6 @@
1
  import gradio as gr
2
  import torch
 
3
  from transformers import AutoModel, AutoTokenizer
4
  import spaces
5
  from typing import Iterable
@@ -13,7 +14,6 @@ from docling_core.types.doc import DoclingDocument, DocTagsDocument
13
 
14
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
15
 
16
- # --- # Device and CUDA Setup Check ---
17
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
18
  print("torch.__version__ =", torch.__version__)
19
  print("torch.version.cuda =", torch.version.cuda)
@@ -33,7 +33,7 @@ colors.steel_blue = colors.Color(
33
  c200="#A8CCE1",
34
  c300="#7DB3D2",
35
  c400="#529AC3",
36
- c500="#4682B4", # SteelBlue base color
37
  c600="#3E72A0",
38
  c700="#36638C",
39
  c800="#2E5378",
@@ -97,8 +97,6 @@ css = """
97
  }
98
  """
99
 
100
-
101
- # --- 1. Load Model and Tokenizer directly to the correct device ---
102
  print("Determining device...")
103
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
104
  print(f"✅ Using device: {device}")
@@ -107,7 +105,6 @@ print("Loading model and tokenizer...")
107
  model_name = "strangervisionhf/deepseek-ocr-latest-transformers"
108
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
109
 
110
- # Load the model directly to the specified device and set to evaluation mode
111
  model = AutoModel.from_pretrained(
112
  model_name,
113
  _attn_implementation="flash_attention_2",
@@ -115,14 +112,11 @@ model = AutoModel.from_pretrained(
115
  use_safetensors=True,
116
  ).to(device).eval() # Move to device and set to eval mode
117
 
118
- # Also apply the desired dtype if using a GPU
119
  if device.type == 'cuda':
120
  model = model.to(torch.bfloat16)
121
 
122
  print("✅ Model loaded successfully to device and in eval mode.")
123
 
124
-
125
- # --- Helper function to find pre-generated result images ---
126
  def find_result_image(path):
127
  for filename in os.listdir(path):
128
  if "grounding" in filename or "result" in filename:
@@ -133,7 +127,6 @@ def find_result_image(path):
133
  print(f"Error opening result image {filename}: {e}")
134
  return None
135
 
136
- # --- 2. Main Processing Function (Simplified) ---
137
  @spaces.GPU
138
  def process_ocr_task(image, model_size, task_type, ref_text):
139
  """
@@ -142,7 +135,6 @@ def process_ocr_task(image, model_size, task_type, ref_text):
142
  if image is None:
143
  return "Please upload an image first.", None
144
 
145
- # No need to move the model to GPU here; it's already done at startup.
146
  print("✅ Model is already on the designated device.")
147
 
148
  with tempfile.TemporaryDirectory() as output_path:
@@ -163,7 +155,6 @@ def process_ocr_task(image, model_size, task_type, ref_text):
163
  temp_image_path = os.path.join(output_path, "temp_image.png")
164
  image.save(temp_image_path)
165
 
166
- # Configure model size
167
  size_configs = {
168
  "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
169
  "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
@@ -174,7 +165,6 @@ def process_ocr_task(image, model_size, task_type, ref_text):
174
  config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
175
 
176
  print(f"🏃 Running inference with prompt: {prompt}")
177
- # Use the globally defined 'model' which is already on the GPU
178
  text_result = model.infer(
179
  tokenizer,
180
  prompt=prompt,
@@ -190,7 +180,6 @@ def process_ocr_task(image, model_size, task_type, ref_text):
190
 
191
  print(f"====\n📄 Text Result: {text_result}\n====")
192
 
193
- # --- Logic to draw bounding boxes ---
194
  result_image_pil = None
195
  pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
196
  matches = list(pattern.finditer(text_result))
@@ -224,9 +213,11 @@ example_image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
224
 
225
  with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
226
  gr.Markdown("# **DeepSeek OCR [exp]**", elem_id="main-title")
 
 
227
  with gr.Row():
228
  with gr.Column(scale=1):
229
- image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"],value=example_image)
230
  model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Large", label="Resolution Size")
231
  task_type = gr.Dropdown(choices=["Free OCR", "Convert to Markdown", "Parse Figure", "Locate Object by Reference"], value="Convert to Markdown", label="Task Type")
232
  ref_text_input = gr.Textbox(label="Reference Text (for Locate task)", placeholder="e.g., the teacher, 20-10, a red car...", visible=False)
@@ -236,14 +227,11 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
236
  output_text = gr.Textbox(label="Output(OCR)", lines=15, show_copy_button=True)
237
  output_image = gr.Image(label="Layout Detection(If Any)", type="pil")
238
 
239
- # --- UI Interaction Logic ---
240
  def toggle_ref_text_visibility(task):
241
  return gr.Textbox(visible=True) if task == "Locate Object by Reference" else gr.Textbox(visible=False)
242
 
243
  task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
244
  submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type, ref_text_input], outputs=[output_text, output_image])
245
 
246
-
247
- # --- 4. Launch the App ---
248
  if __name__ == "__main__":
249
  demo.queue(max_size=20).launch(share=True)
 
1
  import gradio as gr
2
  import torch
3
+ import requests
4
  from transformers import AutoModel, AutoTokenizer
5
  import spaces
6
  from typing import Iterable
 
14
 
15
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
 
 
17
  print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
18
  print("torch.__version__ =", torch.__version__)
19
  print("torch.version.cuda =", torch.version.cuda)
 
33
  c200="#A8CCE1",
34
  c300="#7DB3D2",
35
  c400="#529AC3",
36
+ c500="#4682B4",
37
  c600="#3E72A0",
38
  c700="#36638C",
39
  c800="#2E5378",
 
97
  }
98
  """
99
 
 
 
100
  print("Determining device...")
101
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
102
  print(f"✅ Using device: {device}")
 
105
  model_name = "strangervisionhf/deepseek-ocr-latest-transformers"
106
  tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
107
 
 
108
  model = AutoModel.from_pretrained(
109
  model_name,
110
  _attn_implementation="flash_attention_2",
 
112
  use_safetensors=True,
113
  ).to(device).eval() # Move to device and set to eval mode
114
 
 
115
  if device.type == 'cuda':
116
  model = model.to(torch.bfloat16)
117
 
118
  print("✅ Model loaded successfully to device and in eval mode.")
119
 
 
 
120
  def find_result_image(path):
121
  for filename in os.listdir(path):
122
  if "grounding" in filename or "result" in filename:
 
127
  print(f"Error opening result image {filename}: {e}")
128
  return None
129
 
 
130
  @spaces.GPU
131
  def process_ocr_task(image, model_size, task_type, ref_text):
132
  """
 
135
  if image is None:
136
  return "Please upload an image first.", None
137
 
 
138
  print("✅ Model is already on the designated device.")
139
 
140
  with tempfile.TemporaryDirectory() as output_path:
 
155
  temp_image_path = os.path.join(output_path, "temp_image.png")
156
  image.save(temp_image_path)
157
 
 
158
  size_configs = {
159
  "Tiny": {"base_size": 512, "image_size": 512, "crop_mode": False},
160
  "Small": {"base_size": 640, "image_size": 640, "crop_mode": False},
 
165
  config = size_configs.get(model_size, size_configs["Gundam (Recommended)"])
166
 
167
  print(f"🏃 Running inference with prompt: {prompt}")
 
168
  text_result = model.infer(
169
  tokenizer,
170
  prompt=prompt,
 
180
 
181
  print(f"====\n📄 Text Result: {text_result}\n====")
182
 
 
183
  result_image_pil = None
184
  pattern = re.compile(r"<\|det\|>\[\[(\d+),\s*(\d+),\s*(\d+),\s*(\d+)\]\]<\|/det\|>")
185
  matches = list(pattern.finditer(text_result))
 
213
 
214
  with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
215
  gr.Markdown("# **DeepSeek OCR [exp]**", elem_id="main-title")
216
+ gr.Markdown("> This app is running with transformers v.4.57.1 and torch v.2.6.0.")
217
+
218
  with gr.Row():
219
  with gr.Column(scale=1):
220
+ image_input = gr.Image(type="pil", label="Upload Image", sources=["upload", "clipboard"], value=example_image)
221
  model_size = gr.Dropdown(choices=["Tiny", "Small", "Base", "Large", "Gundam (Recommended)"], value="Large", label="Resolution Size")
222
  task_type = gr.Dropdown(choices=["Free OCR", "Convert to Markdown", "Parse Figure", "Locate Object by Reference"], value="Convert to Markdown", label="Task Type")
223
  ref_text_input = gr.Textbox(label="Reference Text (for Locate task)", placeholder="e.g., the teacher, 20-10, a red car...", visible=False)
 
227
  output_text = gr.Textbox(label="Output(OCR)", lines=15, show_copy_button=True)
228
  output_image = gr.Image(label="Layout Detection(If Any)", type="pil")
229
 
 
230
  def toggle_ref_text_visibility(task):
231
  return gr.Textbox(visible=True) if task == "Locate Object by Reference" else gr.Textbox(visible=False)
232
 
233
  task_type.change(fn=toggle_ref_text_visibility, inputs=task_type, outputs=ref_text_input)
234
  submit_btn.click(fn=process_ocr_task, inputs=[image_input, model_size, task_type, ref_text_input], outputs=[output_text, output_image])
235
 
 
 
236
  if __name__ == "__main__":
237
  demo.queue(max_size=20).launch(share=True)