TIGER-Lab/MMEB-train
Viewer • Updated • 2.14M • 6.32k • 18
How to use radi-cho/cho-embedding-0.8b with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

pipe = pipeline(
    task="feature-extraction",
    model="radi-cho/cho-embedding-0.8b",
    trust_remote_code=True,
)

# Load model directly
from transformers import AutoProcessor, AutoModel

processor = AutoProcessor.from_pretrained(
    "radi-cho/cho-embedding-0.8b",
    trust_remote_code=True,
)
model = AutoModel.from_pretrained("radi-cho/cho-embedding-0.8b", trust_remote_code=True)

A multimodal embedding model distilled from Qwen3-VL-Embedding-8B into the Qwen3.5-0.8B architecture. It supports text, images, and multimodal inputs.
| Model | Size | CLS | VQA | RET | GND | Overall |
|---|---|---|---|---|---|---|
| cho-embedding-0.8b | 853M | 54.5 | 59.2 | 60.1 | 81.9 | 60.7 |
| CAFe-0.5B | 894M | 59.1 | 49.1 | 61.0 | 83.0 | 59.6 |
| LLaVE-0.5B | 894M | 57.4 | 50.3 | 59.8 | 82.9 | 59.1 |
| VLM2Vec-V2.0-2B | 2.2B | 62.9 | 56.4 | 69.6 | 77.1 | 64.9 |
| VLM2Vec-V1-2B | 2.2B | 58.6 | 49.2 | 65.0 | 73.1 | 59.7 |
| VLM2Vec-Phi3.5V | 4.2B | 54.8 | 54.9 | 62.3 | 79.5 | 60.1 |
from transformers import AutoModel, AutoProcessor
import torch
import torch.nn.functional as F
model_path = "radi-cho/cho-embedding-0.8b"

# Load the weights in bfloat16, then move the model to the GPU and put it in
# evaluation mode.
model = AutoModel.from_pretrained(
    model_path,
    trust_remote_code=True,
    dtype=torch.bfloat16,
)
model = model.to("cuda").eval()

processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)
# Batches are padded on the right-hand side.
processor.tokenizer.padding_side = "right"
def embed(texts, instruction="Represent the user's input."):
    """Embed a batch of texts as L2-normalised vectors.

    Each text becomes a two-message conversation (``instruction`` as the
    system message, the text as the user message). The conversation is
    rendered with the tokenizer's chat template, terminated with
    ``<|endoftext|>`` and passed through the module-level ``model``. The
    hidden state at the last attended position of each sequence is taken as
    that text's embedding.

    Args:
        texts: An iterable of strings. A single bare string is treated as a
            batch of one rather than being iterated character by character.
        instruction: System-message text placed before every input.

    Returns:
        A float32 tensor with one row per text. Only the first 1024 features
        of the hidden state are kept, and every row has unit L2 norm.
    """
    if isinstance(texts, str):
        texts = [texts]

    conversations = [
        [
            {"role": "system", "content": [{"type": "text", "text": instruction}]},
            {"role": "user", "content": [{"type": "text", "text": text}]},
        ]
        for text in texts
    ]
    formatted = processor.tokenizer.apply_chat_template(
        conversations, add_generation_prompt=False, tokenize=False
    )
    # Drop trailing whitespace left by the template so that the end-of-text
    # marker is the final piece of every prompt.
    formatted = [t.rstrip() + "<|endoftext|>" for t in formatted]

    # Follow the model's own device instead of assuming a fixed one.
    inputs = processor.tokenizer(
        formatted, padding=True, return_tensors="pt"
    ).to(model.device)
    with torch.no_grad():
        outputs = model(**inputs)

    last_hidden = outputs.last_hidden_state
    attn = inputs["attention_mask"]
    # argmax over the reversed mask gives the distance from the right edge to
    # the nearest attended token; converting it back yields that token's index.
    last_pos = attn.shape[1] - attn.flip(dims=[1]).argmax(dim=1) - 1
    row_idx = torch.arange(last_hidden.shape[0], device=last_hidden.device)
    embeddings = last_hidden[row_idx, last_pos, :1024]
    # Normalise in float32 so that dot products between rows are cosine scores.
    return F.normalize(embeddings.float(), p=2, dim=-1)
# Example: score one query against one candidate document.
query_texts = ["A dog playing on the beach"]
doc_texts = ["A golden retriever runs along the shoreline at sunset"]

queries = embed(query_texts, instruction="Find a matching image caption.")
docs = embed(doc_texts)

# One query row times one document row gives a 1x1 result; .item() unwraps it.
similarity = torch.matmul(queries, docs.T).item()
print(f"Similarity: {similarity:.4f}")
from qwen_vl_utils.vision_process import process_vision_info
# Embed a single image: the system message carries the instruction and the
# user message carries the image reference.
conversations = [[
    {"role": "system", "content": [{"type": "text", "text": "Represent the image for retrieval."}]},
    {"role": "user", "content": [{"type": "image", "image": "file:///path/to/image.jpg"}]},
]]
texts = processor.tokenizer.apply_chat_template(
    conversations, add_generation_prompt=False, tokenize=False
)
texts = [t.rstrip() + "<|endoftext|>" for t in texts]

# The second return value (videos) is unused here; only images are passed on.
images, _, video_kwargs = process_vision_info(
    conversations, return_video_metadata=True, return_video_kwargs=True
)
# Follow the model's own device instead of assuming a fixed one.
inputs = processor(
    text=texts, images=images, padding=True, return_tensors="pt", **video_kwargs
).to(model.device)
with torch.no_grad():
    outputs = model(**inputs)

# Last-token pooling: take the hidden state at the last attended position of
# each sequence, keep the first 1024 features and L2-normalise.
last_hidden = outputs.last_hidden_state
attn = inputs["attention_mask"]
last_pos = attn.shape[1] - attn.flip(dims=[1]).argmax(dim=1) - 1
row_idx = torch.arange(last_hidden.shape[0], device=last_hidden.device)
image_embeddings = F.normalize(
    last_hidden[row_idx, last_pos, :1024].float(), p=2, dim=-1
)
The model is released under the Apache 2.0 license. Please cite this work if you use it in academic publications, preprints, or similar work.
@misc{choembedding,
title={cho-embedding-0.8b: Vision-Language Embeddings via Contrastive Hard-negatives Objective},
author={Cholakov, Radostin},
year={2026}
}