[Admin maintenance] Migrate to ZeroGPU

#1
by multimodalart HF Staff - opened
Files changed (1) hide show
  1. app.py +76 -72
app.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import gradio as gr
2
  import random
3
  import numpy as np
@@ -208,92 +209,95 @@ def draw_entity_boxes_on_image(image, entities, show=False, save_path=None, enti
208
  return pil_image
209
 
210
 
211
- def main():
212
- ckpt = "microsoft/kosmos-2-patch14-224"
213
- model = AutoModelForVision2Seq.from_pretrained(ckpt).to("cuda")
214
- processor = AutoProcessor.from_pretrained(ckpt)
215
-
216
- def generate_predictions(image_input, text_input):
217
- """
218
- Generate a grounded image description and annotated entity boxes with Kosmos-2.
219
-
220
- Use this tool when you need to describe an image and identify grounded visual entities.
221
-
222
- Args:
223
- image_input (PIL.Image.Image): Input image to describe and ground.
224
- text_input (str): Description mode, either "Brief" or "Detailed".
225
-
226
- Returns:
227
- tuple: Annotated image, highlighted generated description, and serialized entity data.
228
- """
229
- # Save the image and load it again to match the original Kosmos-2 demo.
230
- # (https://github.com/microsoft/unilm/blob/f4695ed0244a275201fff00bee495f76670fbe70/kosmos-2/demo/gradio_app.py#L345-L346)
231
- user_image_path = "/tmp/user_input_test_image.jpg"
232
- image_input.save(user_image_path)
233
-
234
- # This might give different results from the original argument `image_input`
235
- image_input = Image.open(user_image_path)
236
-
237
- if text_input == "Brief":
238
- text_input = "An image of"
239
- elif text_input == "Detailed":
240
- text_input = "Describe this image in detail:"
241
- else:
242
- text_input = f"{text_input}"
243
 
244
- inputs = processor(text=text_input, images=image_input, return_tensors="pt").to(
245
- "cuda"
246
- )
247
 
248
- generated_ids = model.generate(
249
- pixel_values=inputs["pixel_values"],
250
- input_ids=inputs["input_ids"],
251
- attention_mask=inputs["attention_mask"],
252
- image_embeds=None,
253
- image_embeds_position_mask=inputs["image_embeds_position_mask"],
254
- use_cache=True,
255
- max_new_tokens=128,
256
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
257
 
258
- generated_text = processor.batch_decode(
259
- generated_ids, skip_special_tokens=True
260
- )[0]
261
 
262
- # By default, the generated text is cleaned up and the entities are extracted.
263
- processed_text, entities = processor.post_process_generation(generated_text)
264
 
265
- annotated_image = draw_entity_boxes_on_image(image_input, entities, show=False)
266
 
267
- color_id = -1
268
- entity_info = []
269
- filtered_entities = []
270
 
271
- for entity in entities:
272
- entity_name, (start, end), bboxes = entity
273
 
274
- if start == end:
275
- # skip bounding boxes without an associated `phrase`
276
- continue
277
 
278
- color_id += 1
279
- entity_info.append(((start, end), color_id))
280
- filtered_entities.append(entity)
281
 
282
- colored_text = []
283
- prev_start = 0
284
- end = 0
285
 
286
- for idx, ((start, end), color_id) in enumerate(entity_info):
287
- if start > prev_start:
288
- colored_text.append((processed_text[prev_start:start], None))
289
- colored_text.append((processed_text[start:end], f"{color_id}"))
290
- prev_start = end
291
 
292
- if end < len(processed_text):
293
- colored_text.append((processed_text[end : len(processed_text)], None))
294
 
295
- return annotated_image, colored_text, str(filtered_entities)
296
 
 
 
297
  term_of_use = """
298
  ### Terms of use
299
 
 
1
+ import spaces
2
  import gradio as gr
3
  import random
4
  import numpy as np
 
209
  return pil_image
210
 
211
 
212
+ ckpt = "microsoft/kosmos-2-patch14-224"
213
+ model = AutoModelForVision2Seq.from_pretrained(ckpt).to("cuda")
214
+ processor = AutoProcessor.from_pretrained(ckpt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
 
 
 
 
216
 
217
+ @spaces.GPU
218
+ def generate_predictions(image_input, text_input):
219
+ """
220
+ Generate a grounded image description and annotated entity boxes with Kosmos-2.
221
+
222
+ Use this tool when you need to describe an image and identify grounded visual entities.
223
+
224
+ Args:
225
+ image_input (PIL.Image.Image): Input image to describe and ground.
226
+ text_input (str): Description mode, either "Brief" or "Detailed".
227
+
228
+ Returns:
229
+ tuple: Annotated image, highlighted generated description, and serialized entity data.
230
+ """
231
+ # Save the image and load it again to match the original Kosmos-2 demo.
232
+ # (https://github.com/microsoft/unilm/blob/f4695ed0244a275201fff00bee495f76670fbe70/kosmos-2/demo/gradio_app.py#L345-L346)
233
+ user_image_path = "/tmp/user_input_test_image.jpg"
234
+ image_input.save(user_image_path)
235
+
236
+ # This might give different results from the original argument `image_input`
237
+ image_input = Image.open(user_image_path)
238
+
239
+ if text_input == "Brief":
240
+ text_input = "An image of"
241
+ elif text_input == "Detailed":
242
+ text_input = "Describe this image in detail:"
243
+ else:
244
+ text_input = f"{text_input}"
245
+
246
+ inputs = processor(text=text_input, images=image_input, return_tensors="pt").to(
247
+ "cuda"
248
+ )
249
+
250
+ generated_ids = model.generate(
251
+ pixel_values=inputs["pixel_values"],
252
+ input_ids=inputs["input_ids"],
253
+ attention_mask=inputs["attention_mask"],
254
+ image_embeds=None,
255
+ image_embeds_position_mask=inputs["image_embeds_position_mask"],
256
+ use_cache=True,
257
+ max_new_tokens=128,
258
+ )
259
 
260
+ generated_text = processor.batch_decode(
261
+ generated_ids, skip_special_tokens=True
262
+ )[0]
263
 
264
+ # By default, the generated text is cleaned up and the entities are extracted.
265
+ processed_text, entities = processor.post_process_generation(generated_text)
266
 
267
+ annotated_image = draw_entity_boxes_on_image(image_input, entities, show=False)
268
 
269
+ color_id = -1
270
+ entity_info = []
271
+ filtered_entities = []
272
 
273
+ for entity in entities:
274
+ entity_name, (start, end), bboxes = entity
275
 
276
+ if start == end:
277
+ # skip bounding boxes without an associated `phrase`
278
+ continue
279
 
280
+ color_id += 1
281
+ entity_info.append(((start, end), color_id))
282
+ filtered_entities.append(entity)
283
 
284
+ colored_text = []
285
+ prev_start = 0
286
+ end = 0
287
 
288
+ for idx, ((start, end), color_id) in enumerate(entity_info):
289
+ if start > prev_start:
290
+ colored_text.append((processed_text[prev_start:start], None))
291
+ colored_text.append((processed_text[start:end], f"{color_id}"))
292
+ prev_start = end
293
 
294
+ if end < len(processed_text):
295
+ colored_text.append((processed_text[end : len(processed_text)], None))
296
 
297
+ return annotated_image, colored_text, str(filtered_entities)
298
 
299
+
300
+ def main():
301
  term_of_use = """
302
  ### Terms of use
303