Spaces:
Build error
Build error
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import
|
| 2 |
import torch
|
| 3 |
import onnxruntime as ort
|
| 4 |
from PIL import Image
|
|
@@ -7,13 +7,9 @@ import numpy as np
|
|
| 7 |
from transformers import AutoTokenizer, AutoProcessor
|
| 8 |
import os
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
|
| 14 |
-
if not os.path.exists("embed_tokens_q4f16.onnx"):
|
| 15 |
-
os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')
|
| 16 |
-
|
| 17 |
# Load the tokenizer and processor
|
| 18 |
tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
|
| 19 |
processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
|
|
@@ -22,48 +18,51 @@ vision_encoder_session = ort.InferenceSession("vision_encoder_q4f16.onnx")
|
|
| 22 |
decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
|
| 23 |
embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")
|
| 24 |
|
| 25 |
-
def merge_input_ids_with_image_features(image_features, inputs_embeds, input_ids, attention_mask,
|
| 26 |
num_images, num_image_patches, embed_dim = image_features.shape
|
| 27 |
batch_size, sequence_length = input_ids.shape
|
| 28 |
-
|
|
|
|
| 29 |
special_image_token_mask = input_ids == special_image_token_id
|
| 30 |
num_special_image_tokens = np.sum(special_image_token_mask, axis=-1)
|
| 31 |
-
|
| 32 |
max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
|
| 33 |
batch_indices, non_image_indices = np.where(input_ids != special_image_token_id)
|
| 34 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 35 |
new_token_positions = np.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
|
| 36 |
nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
|
| 37 |
-
|
| 38 |
-
|
| 39 |
text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
|
| 40 |
-
|
|
|
|
| 41 |
final_embedding = np.zeros((batch_size, max_embed_dim, embed_dim), dtype=np.float32)
|
| 42 |
final_attention_mask = np.zeros((batch_size, max_embed_dim), dtype=np.int64)
|
| 43 |
-
|
|
|
|
|
|
|
| 44 |
final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
|
| 45 |
final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
|
| 46 |
-
|
| 47 |
-
image_to_overwrite = np.full((batch_size, max_embed_dim),
|
| 48 |
image_to_overwrite[batch_indices, text_to_overwrite] = False
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
num_image_positions = len(image_positions[0])
|
| 53 |
-
assert num_image_positions <= image_features.size, "Mismatch in image feature positions and available features."
|
| 54 |
-
|
| 55 |
-
# Assign reshaped image features
|
| 56 |
-
reshaped_image_features = image_features.reshape(-1, embed_dim)[:num_image_positions]
|
| 57 |
-
final_embedding[image_positions] = reshaped_image_features
|
| 58 |
final_attention_mask = np.logical_or(final_attention_mask, image_to_overwrite).astype(final_attention_mask.dtype)
|
| 59 |
-
|
| 60 |
position_ids = final_attention_mask.cumsum(axis=-1) - 1
|
| 61 |
position_ids = np.where(final_attention_mask == 0, 1, position_ids)
|
| 62 |
-
|
|
|
|
| 63 |
batch_indices, pad_indices = np.where(input_ids == pad_token_id)
|
| 64 |
indices_to_mask = new_token_positions[batch_indices, pad_indices]
|
| 65 |
final_embedding[batch_indices, indices_to_mask] = 0
|
| 66 |
-
|
| 67 |
return final_embedding, final_attention_mask, position_ids
|
| 68 |
|
| 69 |
# Load model and processor
|
|
@@ -199,10 +198,14 @@ def describe_image(image):
|
|
| 199 |
decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
|
| 200 |
return decoded_output
|
| 201 |
|
| 202 |
-
#
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
import torch
|
| 3 |
import onnxruntime as ort
|
| 4 |
from PIL import Image
|
|
|
|
| 7 |
from transformers import AutoTokenizer, AutoProcessor
|
| 8 |
import os
|
| 9 |
|
| 10 |
+
# Fetch the quantized ONNX graphs that the InferenceSession calls in this file
# load from the working directory. Files already on disk are skipped so that a
# restart does not re-download them (wget would otherwise save numbered
# duplicates next to the originals instead of reusing them).
ONNX_BASE_URL = "https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx"
for onnx_name in (
    "decoder_model_merged_q4f16.onnx",
    "embed_tokens_q4f16.onnx",
    "vision_encoder_q4f16.onnx",
):
    if os.path.exists(onnx_name):
        continue
    # Download to a temporary name first so an interrupted transfer can never
    # be mistaken for a complete model file on the next start.
    partial_name = onnx_name + ".part"
    if os.system(f"wget -O {partial_name} {ONNX_BASE_URL}/{onnx_name}") != 0:
        if os.path.exists(partial_name):
            os.remove(partial_name)
        raise RuntimeError(f"Failed to download {onnx_name} from {ONNX_BASE_URL}")
    os.replace(partial_name, onnx_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 13 |
# Load the tokenizer and processor
|
| 14 |
tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
|
| 15 |
processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
|
|
|
|
| 18 |
decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
|
| 19 |
embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")
|
| 20 |
|
| 21 |
+
def merge_input_ids_with_image_features(image_features, inputs_embeds, input_ids, attention_mask, pad_token_id, special_image_token_id):
    """Splice image patch embeddings into a sequence of text embeddings.

    Every position of ``input_ids`` equal to ``special_image_token_id`` is
    expanded into ``num_image_patches`` consecutive slots that are filled with
    rows of ``image_features``; every other token keeps its embedding from
    ``inputs_embeds``.

    Args:
        image_features: Array of shape (num_images, num_image_patches, embed_dim).
        inputs_embeds: Per-token embeddings, indexed as [batch, token].
        input_ids: Integer array of shape (batch_size, sequence_length).
        attention_mask: Mask indexed like ``input_ids``.
        pad_token_id: Token id that marks padding in ``input_ids``.
        special_image_token_id: Token id that marks an image placeholder.

    Returns:
        A tuple ``(final_embedding, final_attention_mask, position_ids)``:
        a float32 array of shape (batch_size, max_embed_dim, embed_dim), an
        int64 array of shape (batch_size, max_embed_dim), and the position ids
        derived from that mask (same shape as the mask).

    Raises:
        ValueError: If the number of image slots opened by the placeholders
            differs from the number of rows available in ``image_features``.
    """
    num_images, num_image_patches, embed_dim = image_features.shape
    batch_size, sequence_length = input_ids.shape
    # Padding is treated as left-sided unless some row ends with a pad token.
    left_padding = not np.any(input_ids[:, -1] == pad_token_id)

    # 1. Locate the image placeholders and size the merged sequence: each
    #    placeholder grows from one position to `num_image_patches` positions.
    special_image_token_mask = input_ids == special_image_token_id
    num_special_image_tokens = np.sum(special_image_token_mask, axis=-1)
    max_embed_dim = int(num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
    batch_indices, non_image_indices = np.where(input_ids != special_image_token_id)

    # 2. Compute where each original token lands in the merged sequence.
    #    A placeholder advances the running position by `num_image_patches`,
    #    any other token by 1; the trailing `- 1` makes the positions zero-based.
    new_token_positions = np.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
    nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
    if left_padding:
        new_token_positions += nb_image_pad[:, None]  # offset for left padding
    text_to_overwrite = new_token_positions[batch_indices, non_image_indices]

    # 3. Allocate the merged outputs, already padded to the maximum length.
    final_embedding = np.zeros((batch_size, max_embed_dim, embed_dim), dtype=np.float32)
    final_attention_mask = np.zeros((batch_size, max_embed_dim), dtype=np.int64)

    # 4. Copy the text embeddings and their attention mask to the new positions.
    final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
    final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]

    # 5. Every slot that is not a text position and not part of the leading
    #    `nb_image_pad` filler slots of its row receives an image feature row.
    image_to_overwrite = np.full((batch_size, max_embed_dim), True)
    image_to_overwrite[batch_indices, text_to_overwrite] = False
    image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None]

    flat_image_features = image_features.reshape(-1, embed_dim)
    num_image_slots = int(image_to_overwrite.sum())
    # Without this check a single feature row would be silently broadcast over
    # several slots, yielding wrong embeddings instead of an error.
    if num_image_slots != flat_image_features.shape[0]:
        raise ValueError(
            f"The image placeholders in input_ids open {num_image_slots} embedding slots, "
            f"but image_features provides {flat_image_features.shape[0]} rows "
            f"({num_images} image(s) x {num_image_patches} patch(es))."
        )
    final_embedding[image_to_overwrite] = flat_image_features
    final_attention_mask = np.logical_or(final_attention_mask, image_to_overwrite).astype(final_attention_mask.dtype)

    # Position ids count the attended slots; unattended slots are set to 1.
    position_ids = final_attention_mask.cumsum(axis=-1) - 1
    position_ids = np.where(final_attention_mask == 0, 1, position_ids)

    # 6. Zero the embedding rows that correspond to pad tokens.
    batch_indices, pad_indices = np.where(input_ids == pad_token_id)
    indices_to_mask = new_token_positions[batch_indices, pad_indices]
    final_embedding[batch_indices, indices_to_mask] = 0

    return final_embedding, final_attention_mask, position_ids
|
| 67 |
|
| 68 |
# Load model and processor
|
|
|
|
| 198 |
decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
|
| 199 |
return decoded_output
|
| 200 |
|
| 201 |
+
# Build the Gradio UI: one PIL image in, one multi-line text box out,
# with `describe_image` as the handler.
image_input = gr.Image(type="pil")
description_output = gr.Textbox(lines=5, placeholder="Description will appear here")
interface = gr.Interface(
    fn=describe_image,
    inputs=image_input,
    outputs=description_output,
    title="Image Description Generator",
    description="Upload an image to get a detailed description.",
)

# Start the app. The flags request a share link, error display, and debug
# mode; see the Gradio `launch` documentation for their exact effect.
interface.launch(share=True, show_error=True, debug=True)
|