Add embedding tag to model card
#1
by
nielsr
HF Staff
- opened
README.md
CHANGED
|
@@ -1,15 +1,16 @@
|
|
| 1 |
---
|
| 2 |
-
license: apache-2.0
|
| 3 |
-
library_name: transformers
|
| 4 |
language:
|
| 5 |
- en
|
|
|
|
|
|
|
|
|
|
| 6 |
tags:
|
| 7 |
- Sentence Similarity
|
| 8 |
- Embedding
|
| 9 |
-
- zero-shot-image-classification
|
| 10 |
-
- video-text-to-text
|
| 11 |
-
pipeline_tag: image-text-to-text
|
| 12 |
---
|
|
|
|
| 13 |
# LLaVE-7B
|
| 14 |
|
| 15 |
## Model Summary
|
|
@@ -81,7 +82,8 @@ conv_template = "qwen_1_5" # Make sure you use correct chat template for differ
|
|
| 81 |
question = DEFAULT_IMAGE_TOKEN + " Represent the given image with the following question: What is in the image"
|
| 82 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 83 |
conv.append_message(conv.roles[0], question)
|
| 84 |
-
conv.append_message(conv.roles[1], "\n")
|
|
|
|
| 85 |
prompt_question = conv.get_prompt()
|
| 86 |
input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
|
| 87 |
attention_mask=input_ids.ne(tokenizer.pad_token_id)
|
|
@@ -91,7 +93,8 @@ query_embed = model.encode_multimodal_embeddings(input_ids, attention_mask=atten
|
|
| 91 |
target_string = "A cat and a dog"
|
| 92 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 93 |
conv.append_message(conv.roles[0], target_string)
|
| 94 |
-
conv.append_message(conv.roles[1], "\n")
|
|
|
|
| 95 |
target_string = conv.get_prompt()
|
| 96 |
target_input_ids = tokenizer(target_string, return_tensors="pt").input_ids.to(device)
|
| 97 |
attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
|
|
@@ -103,7 +106,8 @@ print("A cat and a dog similarity score: ", query_embed @ target_embed.T)
|
|
| 103 |
neg_string = "A cat and a tiger"
|
| 104 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 105 |
conv.append_message(conv.roles[0], neg_string)
|
| 106 |
-
conv.append_message(conv.roles[1], "\n")
|
|
|
|
| 107 |
neg_string = conv.get_prompt()
|
| 108 |
neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
|
| 109 |
attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
|
|
@@ -116,7 +120,8 @@ print("A cat and a tiger similarity score: ", query_embed @ neg_embed.T)
|
|
| 116 |
pos_string = "Find me an everyday image that matches the given caption: A cat and a dog."
|
| 117 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 118 |
conv.append_message(conv.roles[0], pos_string)
|
| 119 |
-
conv.append_message(conv.roles[1], "\n")
|
|
|
|
| 120 |
pos_string = conv.get_prompt()
|
| 121 |
pos_input_ids = tokenizer(pos_string, return_tensors="pt").input_ids.to(device)
|
| 122 |
attention_mask=pos_input_ids.ne(tokenizer.pad_token_id)
|
|
@@ -125,7 +130,8 @@ pos_query_embed = model.encode_multimodal_embeddings(pos_input_ids, attention_ma
|
|
| 125 |
target = DEFAULT_IMAGE_TOKEN + " Represent the given image."
|
| 126 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 127 |
conv.append_message(conv.roles[0], target)
|
| 128 |
-
conv.append_message(conv.roles[1], "\n")
|
|
|
|
| 129 |
prompt_target = conv.get_prompt()
|
| 130 |
target_input_ids = tokenizer_image_token(prompt_target, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
|
| 131 |
attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
|
|
@@ -138,7 +144,8 @@ print("A cat and a dog image similarity score: ", pos_query_embed @ target_embed
|
|
| 138 |
neg_string = "Find me an everyday image that matches the given caption: A cat and a tiger."
|
| 139 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 140 |
conv.append_message(conv.roles[0], neg_string)
|
| 141 |
-
conv.append_message(conv.roles[1], "\n")
|
|
|
|
| 142 |
neg_string = conv.get_prompt()
|
| 143 |
neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
|
| 144 |
attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
|
|
|
|
| 1 |
---
|
|
|
|
|
|
|
| 2 |
language:
|
| 3 |
- en
|
| 4 |
+
library_name: transformers
|
| 5 |
+
license: apache-2.0
|
| 6 |
+
pipeline_tag: image-text-to-text
|
| 7 |
tags:
|
| 8 |
- Sentence Similarity
|
| 9 |
- Embedding
|
| 10 |
+
- zero-shot-image-classification
|
| 11 |
+
- video-text-to-text
|
|
|
|
| 12 |
---
|
| 13 |
+
|
| 14 |
# LLaVE-7B
|
| 15 |
|
| 16 |
## Model Summary
|
|
|
|
| 82 |
question = DEFAULT_IMAGE_TOKEN + " Represent the given image with the following question: What is in the image"
|
| 83 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 84 |
conv.append_message(conv.roles[0], question)
|
| 85 |
+
conv.append_message(conv.roles[1], "
|
| 86 |
+
")
|
| 87 |
prompt_question = conv.get_prompt()
|
| 88 |
input_ids = tokenizer_image_token(prompt_question, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
|
| 89 |
attention_mask=input_ids.ne(tokenizer.pad_token_id)
|
|
|
|
| 93 |
target_string = "A cat and a dog"
|
| 94 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 95 |
conv.append_message(conv.roles[0], target_string)
|
| 96 |
+
conv.append_message(conv.roles[1], "
|
| 97 |
+
")
|
| 98 |
target_string = conv.get_prompt()
|
| 99 |
target_input_ids = tokenizer(target_string, return_tensors="pt").input_ids.to(device)
|
| 100 |
attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
|
|
|
|
| 106 |
neg_string = "A cat and a tiger"
|
| 107 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 108 |
conv.append_message(conv.roles[0], neg_string)
|
| 109 |
+
conv.append_message(conv.roles[1], "
|
| 110 |
+
")
|
| 111 |
neg_string = conv.get_prompt()
|
| 112 |
neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
|
| 113 |
attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
|
|
|
|
| 120 |
pos_string = "Find me an everyday image that matches the given caption: A cat and a dog."
|
| 121 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 122 |
conv.append_message(conv.roles[0], pos_string)
|
| 123 |
+
conv.append_message(conv.roles[1], "
|
| 124 |
+
")
|
| 125 |
pos_string = conv.get_prompt()
|
| 126 |
pos_input_ids = tokenizer(pos_string, return_tensors="pt").input_ids.to(device)
|
| 127 |
attention_mask=pos_input_ids.ne(tokenizer.pad_token_id)
|
|
|
|
| 130 |
target = DEFAULT_IMAGE_TOKEN + " Represent the given image."
|
| 131 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 132 |
conv.append_message(conv.roles[0], target)
|
| 133 |
+
conv.append_message(conv.roles[1], "
|
| 134 |
+
")
|
| 135 |
prompt_target = conv.get_prompt()
|
| 136 |
target_input_ids = tokenizer_image_token(prompt_target, tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt").unsqueeze(0).to(device)
|
| 137 |
attention_mask=target_input_ids.ne(tokenizer.pad_token_id)
|
|
|
|
| 144 |
neg_string = "Find me an everyday image that matches the given caption: A cat and a tiger."
|
| 145 |
conv = copy.deepcopy(conv_templates[conv_template])
|
| 146 |
conv.append_message(conv.roles[0], neg_string)
|
| 147 |
+
conv.append_message(conv.roles[1], "
|
| 148 |
+
")
|
| 149 |
neg_string = conv.get_prompt()
|
| 150 |
neg_input_ids = tokenizer(neg_string, return_tensors="pt").input_ids.to(device)
|
| 151 |
attention_mask=neg_input_ids.ne(tokenizer.pad_token_id)
|