Using 🤗 Transformers to Load the Model
import torch
from transformers import AutoProcessor

from modeling_contextvla import ContextVLA_Qwen3VL
# Hub repository id used for both the model and the processor. Kept in one
# place so the two from_pretrained() calls cannot drift apart.
MODEL_ID = "huiwon/roboalign_contextvla_oxe_sft_1epoch"

# Load the ContextVLA checkpoint, requesting bfloat16 weights and the
# "flash_attention_2" attention implementation.
model = ContextVLA_Qwen3VL.from_pretrained(
    MODEL_ID,
    attn_implementation="flash_attention_2",
    torch_dtype=torch.bfloat16,
)

# Load the matching processor from the same repository.
processor = AutoProcessor.from_pretrained(MODEL_ID)

# NOTE(review): `training_args` is not defined anywhere in this snippet; it
# must already exist in the surrounding scope (presumably the training
# configuration object) or this line raises NameError — confirm and, for a
# standalone example, replace it with the concrete max length used in training.
processor.tokenizer.model_max_length = training_args.model_max_length
# Pad batched sequences on the left side.
processor.tokenizer.padding_side = "left"
# Three-turn prompt template: one "system", one "human" and one "gpt" entry,
# each stored as a {"from": ..., "value": ...} dict, in that order.
# NOTE(review): the <video>, <instruction>, <embodiment_tag>, <state> and
# <fast_token> markers look like placeholders filled in elsewhere — confirm.
conversations = [
    {"from": speaker, "value": text}
    for speaker, text in (
        (
            "system",
            "You are an embodied vision-language robotic assistant for multi-object manipulation.",
        ),
        (
            "human",
            "<video>\nCurrent task is <instruction>. <embodiment_tag>. <state>. Output the robot\u2019s actions to perform this task through Fast tokens.",
        ),
        ("gpt", "<fast_token>"),
    )
]