This is the vision encoder component of the Qwen3-VL-4B-Instruct model. For more details, please visit the original model page or refer to the Qwen-VL technical reports published by Qwen.
Qwen3-VL-4B Vision Encoder: The strangeropshf/qwen3-vl-4b-vision_encoder is the isolated vision encoder component extracted from Qwen3-VL-4B-Instruct, featuring a DeepStack multi-level Vision Transformer (ViT) architecture that fuses hierarchical feature maps from multiple layers to capture both fine-grained details and global context simultaneously. It employs Interleaved-MRoPE positional embeddings for full-frequency spatial-temporal encoding across width, height, and time dimensions, enabling robust handling of high-resolution images (up to 896×896, 256 tokens/image) and long videos with precise text-timestamp alignment for second-level event localization. This native-resolution encoder processes dynamic aspect ratios without fixed-size cropping via NaViT-style dynamic tiling, delivering superior spatial reasoning, 2D/3D grounding, and video dynamics comprehension critical for the model's agentic capabilities.
Quick Start with Transformers
Install the required packages
gradio  # tested with gradio 6.3.0
torch==2.8.0
torchvision
transformers==4.57.6
accelerate
Usage
# --- Environment setup: load the image processor and the vision encoder ---
import torch
import gradio as gr
from transformers import AutoProcessor
from transformers.models.qwen3_vl import Qwen3VLVisionModel
from PIL import Image
# Hugging Face repo id of the extracted Qwen3-VL vision encoder.
MODEL_ID = "strangeropshf/qwen3-vl-4b-vision_encoder"
# Prefer GPU; bfloat16 only on CUDA, float32 on CPU (CPU bf16 support varies).
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.bfloat16 if device == "cuda" else torch.float32
print("Loading image processor...")
# Load the full processor, then keep only its image_processor — this repo is
# the vision tower alone, so no tokenizer/text side is needed.
full_processor = AutoProcessor.from_pretrained(
MODEL_ID, trust_remote_code=True
)
image_processor = full_processor.image_processor
print("Image processor loaded.")
print("Loading Qwen3-VL vision encoder...")
# eval() disables dropout etc.; weights are cast to `dtype` and moved to `device`.
vision_model = Qwen3VLVisionModel.from_pretrained(
MODEL_ID,
trust_remote_code=True,
torch_dtype=dtype
).to(device).eval()
print("Vision encoder ready.")
# Debug aid: print the forward() signature so the expected argument names
# (hidden_states, grid_thw) can be confirmed against the installed version.
import inspect
sig = inspect.signature(vision_model.forward)
print(f"forward() signature: {sig}")
@torch.inference_mode()
def run_vision_encoder(image_path: str):
    """Encode one image with the Qwen3-VL vision encoder and report the result.

    Parameters
    ----------
    image_path : str | None
        Filesystem path to the input image (Gradio passes a filepath, or
        ``None`` when no image was uploaded).

    Returns
    -------
    str
        Human-readable summary: output tensor shape, dtype, device, the
        image's ``grid_thw``, and the first 8 values of the first token.
    """
    if image_path is None:
        return "No image provided."
    # Context manager closes the underlying file handle promptly; a bare
    # Image.open() would leak it until garbage collection.
    with Image.open(image_path) as img:
        image = img.convert("RGB")
    inputs = image_processor(images=[image], return_tensors="pt")
    pixel_values = inputs["pixel_values"].to(device=device, dtype=dtype)
    grid_thw = inputs["image_grid_thw"].to(device=device)
    # Qwen3VLVisionModel.forward(hidden_states, grid_thw) -> Tensor
    # "hidden_states" = pixel_values; returns a plain tensor here, but the
    # unwrapping below also tolerates ModelOutput / tuple returns.
    feats = vision_model(
        hidden_states=pixel_values,
        grid_thw=grid_thw
    )
    if hasattr(feats, "last_hidden_state"):
        feats = feats.last_hidden_state
    elif isinstance(feats, (tuple, list)):
        feats = feats[0]
    shape_str = f"Output shape: {tuple(feats.shape)}"
    if feats.dim() == 3:    # (batch, seq, hidden)
        sample = feats[0, 0, :8]
    elif feats.dim() == 2:  # (total_patches, hidden)
        sample = feats[0, :8]
    else:
        sample = feats.flatten()[:8]
    # Cast to float32 before numpy: numpy cannot represent bfloat16.
    sample_np = sample.detach().cpu().float().numpy()
    return (
        f"{shape_str}\n"
        f"dtype: {feats.dtype}\n"
        f"device: {feats.device}\n"
        f"grid_thw: {grid_thw.cpu().tolist()}\n\n"
        f"Sample values (first token, first 8 dims):\n"
        f"{sample_np}"
    )
# --- Minimal Gradio front-end wired to run_vision_encoder ---
with gr.Blocks() as demo:
    gr.Markdown("## Qwen3-VL Vision Encoder")
    # One row: image picker on the left, text report on the right.
    with gr.Row():
        img_in = gr.Image(type="filepath", label="Input Image")
        out_box = gr.Textbox(label="Vision Encoder Output", lines=10)
    btn = gr.Button("Run Vision Encoder")
    btn.click(fn=run_vision_encoder, inputs=img_in, outputs=out_box)
demo.launch(debug=True)
- Downloads last month
- 7
Model tree for strangeropshf/qwen3-vl-4b-vision_encoder
Base model
Qwen/Qwen3-VL-4B-Instruct