zhibinlan
committed on
Commit
·
391930a
1
Parent(s):
caab229
update
Browse files
README.md
CHANGED
|
@@ -57,13 +57,13 @@ from qwen_vl_utils import process_vision_info
|
|
| 57 |
import torch
|
| 58 |
|
| 59 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
| 60 |
-
"zhibinlan/UME-R1-
|
| 61 |
torch_dtype=torch.bfloat16,
|
| 62 |
attn_implementation="flash_attention_2",
|
| 63 |
device_map="cuda:0",
|
| 64 |
)
|
| 65 |
|
| 66 |
-
processor = AutoProcessor.from_pretrained("zhibinlan/UME-R1-
|
| 67 |
|
| 68 |
prompt = '''Represent the above input text, images, videos, or any combination of the three as embeddings.
|
| 69 |
First output the thinking process in <think> </think> tags and then summarize the entire input in a word or sentence.
|
|
@@ -148,7 +148,7 @@ from transformers import Qwen2VLForConditionalGeneration,AutoProcessor
|
|
| 148 |
from qwen_vl_utils import process_vision_info
|
| 149 |
import torch
|
| 150 |
|
| 151 |
-
pretrained_path = "
|
| 152 |
|
| 153 |
# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
|
| 154 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
|
|
|
| 57 |
import torch
|
| 58 |
|
| 59 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|
| 60 |
+
"zhibinlan/UME-R1-7B",
|
| 61 |
torch_dtype=torch.bfloat16,
|
| 62 |
attn_implementation="flash_attention_2",
|
| 63 |
device_map="cuda:0",
|
| 64 |
)
|
| 65 |
|
| 66 |
+
processor = AutoProcessor.from_pretrained("zhibinlan/UME-R1-7B")
|
| 67 |
|
| 68 |
prompt = '''Represent the above input text, images, videos, or any combination of the three as embeddings.
|
| 69 |
First output the thinking process in <think> </think> tags and then summarize the entire input in a word or sentence.
|
|
|
|
| 148 |
from qwen_vl_utils import process_vision_info
|
| 149 |
import torch
|
| 150 |
|
| 151 |
+
pretrained_path = "zhibinlan/UME-R1-7B"
|
| 152 |
|
| 153 |
# We recommend enabling flash_attention_2 for better acceleration and memory saving, especially in multi-image and video scenarios.
|
| 154 |
model = Qwen2VLForConditionalGeneration.from_pretrained(
|