Spaces:

Narayana02
/

image_description

Build error

App Files Files Community

Narayana02 committed on Dec 20, 2024

Commit

81dbae6

verified ·

1 Parent(s): 8de60e6

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -37

app.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import streamlit as st
 import torch
 import onnxruntime as ort
 from PIL import Image
@@ -7,13 +7,9 @@ import numpy as np
 from transformers import AutoTokenizer, AutoProcessor
 import os
-if not os.path.exists("vision_encoder_q4f16.onnx"):
-    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/vision_encoder_q4f16.onnx')
-if not os.path.exists("decoder_model_merged_q4f16.onnx"):
-    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/decoder_model_merged_q4f16.onnx')
-if not os.path.exists("embed_tokens_q4f16.onnx"):
-    os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/embed_tokens_q4f16.onnx')
 # Load the tokenizer and processor
 tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
 processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
@@ -22,48 +18,51 @@ vision_encoder_session = ort.InferenceSession("vision_encoder_q4f16.onnx")
 decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
 embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")
-def merge_input_ids_with_image_features(image_features, inputs_embeds, input_ids, attention_mask, pad_token_id, special_image_token_id):
     num_images, num_image_patches, embed_dim = image_features.shape
     batch_size, sequence_length = input_ids.shape
     special_image_token_mask = input_ids == special_image_token_id
     num_special_image_tokens = np.sum(special_image_token_mask, axis=-1)
     max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
     batch_indices, non_image_indices = np.where(input_ids != special_image_token_id)
     new_token_positions = np.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
     nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
-    new_token_positions += nb_image_pad[:, None]
     text_to_overwrite = new_token_positions[batch_indices, non_image_indices]
     final_embedding = np.zeros((batch_size, max_embed_dim, embed_dim), dtype=np.float32)
     final_attention_mask = np.zeros((batch_size, max_embed_dim), dtype=np.int64)
     final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
     final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]
-    image_to_overwrite = np.full((batch_size, max_embed_dim), False)
     image_to_overwrite[batch_indices, text_to_overwrite] = False
-    image_positions = np.where(np.logical_not(image_to_overwrite))
-    # Ensure proper reshaping
-    num_image_positions = len(image_positions[0])
-    assert num_image_positions <= image_features.size, "Mismatch in image feature positions and available features."
-    # Assign reshaped image features
-    reshaped_image_features = image_features.reshape(-1, embed_dim)[:num_image_positions]
-    final_embedding[image_positions] = reshaped_image_features
     final_attention_mask = np.logical_or(final_attention_mask, image_to_overwrite).astype(final_attention_mask.dtype)
     position_ids = final_attention_mask.cumsum(axis=-1) - 1
     position_ids = np.where(final_attention_mask == 0, 1, position_ids)
     batch_indices, pad_indices = np.where(input_ids == pad_token_id)
     indices_to_mask = new_token_positions[batch_indices, pad_indices]
     final_embedding[batch_indices, indices_to_mask] = 0
     return final_embedding, final_attention_mask, position_ids
 # Load model and processor
@@ -199,10 +198,14 @@ def describe_image(image):
     decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
     return decoded_output
-# Streamlit app
-st.title("Image Description Generator")
-uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
-if uploaded_image is not None:
-    image = Image.open(uploaded_image)
-    description = describe_image(image)
-    st.text_area("Description", description, height=300)

+import gradio as gr
 import torch
 import onnxruntime as ort
 from PIL import Image
 from transformers import AutoTokenizer, AutoProcessor
 import os
# Fetch the quantized ONNX weights only when they are not already on disk.
# An unconditional wget re-downloads all three files on every start and,
# because the target already exists, stores the new copy under a different
# name ("<file>.1") instead of reusing the existing one.
for onnx_file in (
    "decoder_model_merged_q4f16.onnx",
    "embed_tokens_q4f16.onnx",
    "vision_encoder_q4f16.onnx",
):
    if not os.path.exists(onnx_file):
        os.system('wget https://huggingface.co/llava-hf/llava-interleave-qwen-0.5b-hf/resolve/main/onnx/' + onnx_file)
# Load the tokenizer and processor
tokenizer = AutoTokenizer.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
processor = AutoProcessor.from_pretrained("llava-hf/llava-interleave-qwen-0.5b-hf")
# ONNX Runtime sessions for the decoder and the token-embedding models
decoder_session = ort.InferenceSession("decoder_model_merged_q4f16.onnx")
embed_tokens_session = ort.InferenceSession("embed_tokens_q4f16.onnx")
def merge_input_ids_with_image_features(image_features, inputs_embeds, input_ids, attention_mask, pad_token_id, special_image_token_id):
    """Splice image patch embeddings into a text embedding sequence.

    Every occurrence of ``special_image_token_id`` in ``input_ids`` is expanded
    into ``num_image_patches`` consecutive positions that are filled with rows
    of ``image_features``; all other tokens keep their text embedding.

    Args:
        image_features: array of shape (num_images, num_image_patches, embed_dim).
        inputs_embeds: text embeddings, indexed as [batch, position] -> vector
            of length embed_dim.
        input_ids: integer array of shape (batch_size, sequence_length).
        attention_mask: array indexed like ``input_ids``.
        pad_token_id: id of the padding token.
        special_image_token_id: id of the image placeholder token.

    Returns:
        Tuple ``(final_embedding, final_attention_mask, position_ids)``:
        a float32 array of shape (batch_size, max_embed_dim, embed_dim), an
        int64 mask of shape (batch_size, max_embed_dim), and the position ids
        derived from that mask (masked-out positions get position 1).

    Raises:
        ValueError: if the number of image slots in the merged sequence does
            not equal the number of rows available in ``image_features``.
    """
    num_images, num_image_patches, embed_dim = image_features.shape
    batch_size, sequence_length = input_ids.shape
    # The batch is treated as left-padded when no row ends with a pad token.
    left_padding = not np.sum(input_ids[:, -1] == pad_token_id)

    # 1. Locate the image placeholder tokens.
    special_image_token_mask = input_ids == special_image_token_id
    num_special_image_tokens = np.sum(special_image_token_mask, axis=-1)
    # Length of the merged sequence: each placeholder grows by
    # (num_image_patches - 1) extra positions.
    max_embed_dim = (num_special_image_tokens.max() * (num_image_patches - 1)) + sequence_length
    batch_indices, non_image_indices = np.where(input_ids != special_image_token_id)

    # 2. Compute where each original token lands in the merged sequence.
    # A placeholder occupies num_image_patches positions, every other token
    # occupies one; the cumulative sum gives the index of the last position a
    # token occupies (the "- 1" converts the running count to a 0-based index).
    new_token_positions = np.cumsum((special_image_token_mask * (num_image_patches - 1) + 1), -1) - 1
    # Unused positions in rows holding fewer images than the batch maximum.
    nb_image_pad = max_embed_dim - 1 - new_token_positions[:, -1]
    if left_padding:
        new_token_positions += nb_image_pad[:, None]  # offset for left padding
    text_to_overwrite = new_token_positions[batch_indices, non_image_indices]

    # 3. Allocate the merged outputs, already padded to the maximum length.
    final_embedding = np.zeros((batch_size, max_embed_dim, embed_dim), dtype=np.float32)
    final_attention_mask = np.zeros((batch_size, max_embed_dim), dtype=np.int64)

    # 4. Copy the text embeddings and their attention values to their new
    # positions. E.g. for ["hey", "<image>", "how", "are"] with 3 patches per
    # image the text lands on [0, 4, 5] and the image on [1, 2, 3].
    final_embedding[batch_indices, text_to_overwrite] = inputs_embeds[batch_indices, non_image_indices]
    final_attention_mask[batch_indices, text_to_overwrite] = attention_mask[batch_indices, non_image_indices]

    # 5. Every position that is not text, and not one of the first
    # nb_image_pad unused positions of its row, receives image features.
    image_to_overwrite = np.full((batch_size, max_embed_dim), True)
    image_to_overwrite[batch_indices, text_to_overwrite] = False
    image_to_overwrite &= image_to_overwrite.cumsum(-1) - 1 >= nb_image_pad[:, None]

    # Fail with a clear message instead of an opaque NumPy assignment error
    # (or a silent broadcast when only a single feature row is supplied).
    num_image_slots = int(image_to_overwrite.sum())
    if num_image_slots != num_images * num_image_patches:
        raise ValueError(
            "The input provided to the model are wrong. The number of image tokens is "
            f"{int(np.sum(special_image_token_mask))} while the number of image given to the model is "
            f"{num_images}. This prevents correct indexing and breaks batch generation."
        )

    final_embedding[image_to_overwrite] = image_features.reshape(-1, embed_dim)
    final_attention_mask = np.logical_or(final_attention_mask, image_to_overwrite).astype(final_attention_mask.dtype)
    position_ids = final_attention_mask.cumsum(axis=-1) - 1
    position_ids = np.where(final_attention_mask == 0, 1, position_ids)

    # 6. Zero the embedding at padding positions.
    batch_indices, pad_indices = np.where(input_ids == pad_token_id)
    indices_to_mask = new_token_positions[batch_indices, pad_indices]
    final_embedding[batch_indices, indices_to_mask] = 0
    return final_embedding, final_attention_mask, position_ids
 # Load model and processor
     decoded_output = tokenizer.decode(token_ids, skip_special_tokens=True)
     return decoded_output
# Build the Gradio UI: a PIL image goes in, the generated description comes out.
image_input = gr.Image(type="pil")
description_output = gr.Textbox(lines=5, placeholder="Description will appear here")
interface = gr.Interface(
    fn=describe_image,
    inputs=image_input,
    outputs=description_output,
    title="Image Description Generator",
    description="Upload an image to get a detailed description.",
)
# Start the app (share link requested, errors shown in the UI, debug mode on).
interface.launch(share=True, show_error=True, debug=True)