Need to define tokenizer and process_vision_info helper in demo code
Summary
There are missing definitions for tokenizer and process_vision_info, which are referenced in the example script but were not previously declared. With these additions, the end-to-end vision–language demo now runs without raising NameErrors.
Tokenizer definition
# Expose the tokenizer bundled with the already-loaded multimodal `processor`
# (presumably a Hugging Face AutoProcessor created earlier in the demo — confirm).
tokenizer = processor.tokenizer
Utility: process_vision_info
def process_vision_info(
    msgs,
    num_video_frames: int = 8,
    download_timeout: int = 20,
):
    """Collect PIL images and per-video frame lists referenced by chat messages.

    Walks each message's ``content`` parts and loads:

    * ``type == "image"`` parts from a ``PIL.Image``, a ``data:image`` base64
      URI, an HTTP(S) URL, or a local file path;
    * ``type == "video"`` parts from an HTTP(S) URL (streamed to a temp file)
      or a local path, sampling ``num_frames`` (default ``num_video_frames``)
      evenly spaced frames via decord when available, otherwise OpenCV.

    Args:
        msgs: Iterable of chat messages, each a dict whose ``content`` is a
            list of part dicts with a ``type`` key.
        num_video_frames: Default number of frames sampled per video; a part
            may override it with its own ``num_frames`` key.
        download_timeout: Timeout in seconds for HTTP requests.

    Returns:
        ``(images, videos)`` — a flat list of RGB ``PIL.Image`` objects and a
        list of frame lists (one list of ``PIL.Image`` frames per video part).
    """
    import base64
    import io
    import os
    import tempfile

    images, videos = [], []

    def _download(url: str) -> str:
        # Stream a remote video into a temp file and return its path.
        # NamedTemporaryFile(delete=False) means the caller owns cleanup.
        import requests

        resp = requests.get(url, stream=True, timeout=download_timeout)
        resp.raise_for_status()
        suffix = os.path.splitext(url)[-1] or ".mp4"
        tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
        try:
            for chunk in resp.iter_content(chunk_size=1 << 20):
                tmp.write(chunk)
        finally:
            tmp.close()  # close the handle even if the download aborts mid-stream
        return tmp.name

    for msg in msgs:
        for part in msg.get("content", []):
            typ = part.get("type")
            if typ == "image":
                # ---------- image ----------
                # Imports are deferred so text-only conversations do not
                # require the vision dependencies to be importable.
                from PIL import Image

                data = part.get("image")
                if isinstance(data, Image.Image):
                    img = data.convert("RGB")
                elif isinstance(data, str) and data.startswith("data:image"):
                    # Inline data URI: "data:image/...;base64,<payload>"
                    _, b64data = data.split(",", 1)
                    img = Image.open(io.BytesIO(base64.b64decode(b64data))).convert("RGB")
                elif isinstance(data, str) and data.startswith("http"):
                    import requests

                    img = Image.open(
                        io.BytesIO(requests.get(data, timeout=download_timeout).content)
                    ).convert("RGB")
                else:  # local file path
                    img = Image.open(data).convert("RGB")
                images.append(img)
            elif typ == "video":
                # ---------- video ----------
                import numpy as np
                from PIL import Image

                data = part.get("video")
                n_frames = int(part.get("num_frames", num_video_frames))

                # 1) Ensure a local file: download URLs, pass paths through.
                if isinstance(data, str) and data.startswith("http"):
                    vid_path = _download(data)
                else:
                    vid_path = data  # already a local path

                # 2) Extract n_frames evenly spaced frames.
                frames = []
                try:
                    from decord import VideoReader, cpu as dec_cpu
                except ImportError:
                    VideoReader = None

                if VideoReader is not None:
                    vr = VideoReader(vid_path, dec_cpu())
                    total = len(vr)
                    if total <= 0:
                        continue  # unreadable/empty video: skip this part
                    idxs = np.linspace(0, total - 1, n_frames, dtype=int)
                    for i in idxs:
                        try:
                            frame = vr[i].asnumpy()  # (H, W, 3) RGB
                        except AttributeError:
                            # Some decord builds return tensors exposing
                            # .numpy() instead of .asnumpy().
                            frame = vr[i].numpy()
                        frames.append(Image.fromarray(frame))
                else:
                    import cv2

                    cap = cv2.VideoCapture(vid_path)
                    if not cap.isOpened():
                        continue
                    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
                    if total <= 0:
                        cap.release()
                        continue
                    # Without decord, seek with CAP_PROP_POS_FRAMES at even
                    # intervals; OpenCV decodes frames as BGR, so convert.
                    idxs = np.linspace(0, total - 1, n_frames, dtype=int)
                    for i in idxs:
                        cap.set(cv2.CAP_PROP_POS_FRAMES, int(i))
                        ret, frame = cap.read()
                        if not ret:
                            continue
                        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                        frames.append(Image.fromarray(frame))
                    cap.release()
                videos.append(frames)
    return images, videos
I used the code above. I'm not entirely sure how it works internally, but it runs successfully.
The T2V results on the MSRVTT-1K (JSFusion split) are as follows:
R@1 42.2
R@5 68.5
R@10 77.3
AVG 62.7
Hi! Thanks for your comment.
We've just updated the README to better guide you in using GME-VARCO-VISION-Embedding. Please remember to install qwen_vl_utils beforehand, as we utilize process_vision_info from that package.
It's fixed! I tried again with the updated code. The T2V (Text-to-Video Retrieval) results on the MSRVTT-1K (JSFusion split) are now as follows:
R@1 47.0
R@5 71.8
R@10 79.8
AVG 66.2
Thank you for the update :)