JUNJIE99 committed on
Commit
2b0f1cd
·
1 Parent(s): 8f2764b

Integrate with Sentence Transformers v5.4 (#3)

Browse files

- Integrate with Sentence Transformers v5.4 (c1b7bef7440a5b0b5a101263274c4683fe7db712)
- Use the correct ST version (b2543ad42881007d9ade76455e943c6ce6bca9c1)

1_Pooling/config.json ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ {
2
+ "embedding_dimension": 2048,
3
+ "pooling_mode": "lasttoken",
4
+ "include_prompt": true
5
+ }
README.md CHANGED
@@ -10,7 +10,10 @@ metrics:
10
  - recall
11
  base_model:
12
  - Qwen/Qwen2.5-VL-3B-Instruct
13
- library_name: transformers == 4.51.3
 
 
 
14
  ---
15
 
16
  <h1 align="center">Vis-IR: Unifying Search With Visualized Information Retrieval</h1>
@@ -71,9 +74,49 @@ In this work, we formally define an emerging IR paradigm called Visualized Infor
71
 
72
  ## Model Usage
73
 
74
- > Our code works well on transformers==4.51.3, and we recommend using this version.
 
 
 
 
 
75
 
76
- ### 1. UniSE-MLLM Models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
  ```python
79
  import torch
 
10
  - recall
11
  base_model:
12
  - Qwen/Qwen2.5-VL-3B-Instruct
13
+ library_name: sentence-transformers
14
+ tags:
15
+ - sentence-transformers
16
+ pipeline_tag: sentence-similarity
17
  ---
18
 
19
  <h1 align="center">Vis-IR: Unifying Search With Visualized Information Retrieval</h1>
 
74
 
75
  ## Model Usage
76
 
77
+ ### Using Sentence Transformers
78
+
79
+ Install Sentence Transformers:
80
+ ```bash
81
+ pip install "sentence_transformers[image]"
82
+ ```
83
 
84
+ ```python
85
+ from sentence_transformers import SentenceTransformer
86
+
87
+ model = SentenceTransformer("BAAI/BGE-VL-Screenshot")
88
+
89
+ # Queries: composed image + text inputs (prefix text with "Query:")
90
+ query_inputs = [
91
+ {"text": "Query:After a 17% drop, what is Nvidia's closing stock price?", "image": "https://huggingface.co/BAAI/BGE-VL-Screenshot/resolve/main/assets/query_1.png"},
92
+ {"text": "Query:I would like to see a detailed and intuitive performance comparison between the two models.", "image": "https://huggingface.co/BAAI/BGE-VL-Screenshot/resolve/main/assets/query_2.png"},
93
+ ]
94
+ query_embeddings = model.encode_query(query_inputs)
95
+ print(query_embeddings.shape)
96
+ # (2, 2048)
97
+
98
+ # Candidates: screenshot images
99
+ candidate_inputs = [
100
+ "https://huggingface.co/BAAI/BGE-VL-Screenshot/resolve/main/assets/positive_1.jpeg",
101
+ "https://huggingface.co/BAAI/BGE-VL-Screenshot/resolve/main/assets/neg_1.jpeg",
102
+ "https://huggingface.co/BAAI/BGE-VL-Screenshot/resolve/main/assets/positive_2.jpeg",
103
+ "https://huggingface.co/BAAI/BGE-VL-Screenshot/resolve/main/assets/neg_2.jpeg",
104
+ ]
105
+ candidate_embeddings = model.encode_document(candidate_inputs)
106
+ print(candidate_embeddings.shape)
107
+ # (4, 2048)
108
+
109
+ similarities = model.similarity(query_embeddings, candidate_embeddings)
110
+ print(similarities)
111
+ # tensor([[0.5725, 0.3449, 0.1913, 0.1497],
112
+ # [0.1457, 0.0795, 0.4243, 0.4177]])
113
+ ```
114
+
115
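+ The model provides two prompts: `"query"` for composed image+text queries and `"document"` (default) for screenshot candidates. `model.encode_query(...)` applies the `"query"` prompt and `model.encode_document(...)` applies the `"document"` prompt automatically; a plain `model.encode(...)` call falls back to the default `"document"` prompt.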
116
+
117
+ ### Using transformers
118
+
119
+ > Our code works well on transformers==4.51.3, and we recommend using this version.
120
 
121
  ```python
122
  import torch
chat_template.jinja ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {%- if messages[0].role == 'system' -%}
2
+ {{- '<|im_start|>system\n' }}
3
+ {%- if messages[0].content is string -%}
4
+ {{- messages[0].content }}
5
+ {%- else -%}
6
+ {%- for content in messages[0].content -%}
7
+ {%- if 'text' in content -%}
8
+ {{- content.text }}
9
+ {%- endif -%}
10
+ {%- endfor -%}
11
+ {%- endif -%}
12
+ {{- '<|im_end|>\n' }}
13
+ {%- else -%}
14
+ {{- '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n' }}
15
+ {%- endif -%}
16
+ {%- set image_count = namespace(value=0) -%}
17
+ {%- set video_count = namespace(value=0) -%}
18
+ {%- for message in messages -%}
19
+ {%- if message.role == "user" -%}
20
+ {{- '<|im_start|>' + message.role + '\n' }}
21
+ {%- if message.content is string -%}
22
+ {{- message.content }}
23
+ {%- else -%}
24
+ {%- for content in message.content -%}
25
+ {%- if content.type == 'image' or 'image' in content or 'image_url' in content -%}
26
+ {%- set image_count.value = image_count.value + 1 -%}
27
+ {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
28
+ {%- elif content.type == 'video' or 'video' in content -%}
29
+ {%- set video_count.value = video_count.value + 1 -%}
30
+ {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
31
+ {%- elif 'text' in content -%}
32
+ {{- content.text }}
33
+ {%- endif -%}
34
+ {%- endfor -%}
35
+ {%- endif -%}
36
+ {{- '<|im_end|>\n' }}
37
+ {%- elif message.role != "system" -%}
38
+ {{- '<|im_start|>' + message.role + '\n' }}
39
+ {%- if message.content is string -%}
40
+ {{- message.content }}
41
+ {%- else -%}
42
+ {%- for content in message.content -%}
43
+ {%- if 'text' in content -%}
44
+ {{- content.text }}
45
+ {%- endif -%}
46
+ {%- endfor -%}
47
+ {%- endif -%}
48
+ {{- '<|im_end|>\n' }}
49
+ {%- endif -%}
50
+ {%- endfor -%}
51
+ {%- if add_generation_prompt -%}
52
+ {{- '<|im_start|>assistant\n<|endoftext|>' }}
53
+ {%- endif -%}
config_sentence_transformers.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "__version__": {
3
+ "pytorch": "2.10.0+cu128",
4
+ "sentence_transformers": "5.4.0",
5
+ "transformers": "5.5.0"
6
+ },
7
+ "default_prompt_name": "document",
8
+ "model_type": "SentenceTransformer",
9
+ "prompts": {
10
+ "document": "Represent the given text-rich image, focusing on extracting and interpreting both its rich text content and visual features.",
11
+ "query": "Represent the given image with the given query."
12
+ },
13
+ "similarity_fn_name": "cosine"
14
+ }
modules.json ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ {
3
+ "idx": 0,
4
+ "name": "0",
5
+ "path": "",
6
+ "type": "sentence_transformers.base.modules.transformer.Transformer"
7
+ },
8
+ {
9
+ "idx": 1,
10
+ "name": "1",
11
+ "path": "1_Pooling",
12
+ "type": "sentence_transformers.sentence_transformer.modules.pooling.Pooling"
13
+ },
14
+ {
15
+ "idx": 2,
16
+ "name": "2",
17
+ "path": "2_Normalize",
18
+ "type": "sentence_transformers.sentence_transformer.modules.normalize.Normalize"
19
+ }
20
+ ]
preprocessor_config.json CHANGED
@@ -1,6 +1,6 @@
1
  {
2
- "min_pixels": 3136,
3
- "max_pixels": 12845056,
4
  "patch_size": 14,
5
  "temporal_patch_size": 2,
6
  "merge_size": 2,
 
1
  {
2
+ "min_pixels": 50176,
3
+ "max_pixels": 1960000,
4
  "patch_size": 14,
5
  "temporal_patch_size": 2,
6
  "merge_size": 2,
sentence_bert_config.json ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "transformer_task": "feature-extraction",
3
+ "modality_config": {
4
+ "text": {
5
+ "method": "forward",
6
+ "method_output_name": "last_hidden_state"
7
+ },
8
+ "image": {
9
+ "method": "forward",
10
+ "method_output_name": "last_hidden_state"
11
+ },
12
+ "image+text": {
13
+ "method": "forward",
14
+ "method_output_name": "last_hidden_state"
15
+ },
16
+ "message": {
17
+ "method": "forward",
18
+ "method_output_name": "last_hidden_state",
19
+ "format": "structured"
20
+ }
21
+ },
22
+ "module_output_name": "token_embeddings",
23
+ "processing_kwargs": {
24
+ "chat_template": {
25
+ "add_generation_prompt": true
26
+ }
27
+ }
28
+ }