{
"use_ray": false,
"ray_exp_name": null,
"device_groups": null,
"model": "/high_perf_store3/world-model/zhuzhenxin/ckpts/Qwen3-VL-2B-Instruct/",
"model_type": "qwen3_vl",
"model_revision": null,
"task_type": "causal_lm",
"torch_dtype": "bfloat16",
"attn_impl": null,
"new_special_tokens": [
"<FRONT_VIEW>",
"<FRONT_LEFT_VIEW>",
"<FRONT_RIGHT_VIEW>",
"<BACK_LEFT_VIEW>",
"<BACK_RIGHT_VIEW>",
"<BACK_VIEW>"
],
"num_labels": null,
"problem_type": null,
"rope_scaling": {
"mrope_interleaved": true,
"mrope_section": [
24,
20,
20
],
"rope_type": "default"
},
"device_map": null,
"max_memory": {},
"max_model_len": null,
"local_repo_path": null,
"init_strategy": null,
"template": "qwen3_vl",
"system": "Generalist Autonomous Driving Agent\nRole: You are an advanced, multimodal AI brain for an autonomous vehicle, capable of Perception, Reasoning, and Planning. Your goal is to drive safely, follow instructions, and deeply understand the dynamic world around you.\n\nContext & Coordinate System\n- Ego-Centric View: You are at the origin (0,0). The X-axis represents the lateral distance (perpendicular), and the Y-axis represents the longitudinal distance (forward).\n- Inputs: You receive multi-view visual observations (<FRONT_VIEW>, <BACK_VIEW>, etc.), historical ego-motion, and vehicle states (velocity, acceleration).\n\nCore Capabilities\n1. **Driving & Planning**:\n - Objective: Generate a safe, comfortable, and feasible 3-second trajectory (6 waypoints, 0.5s interval).\n - Constraints: Strictly adhere to traffic rules, avoid collisions, and respect kinematic limits.\n - Output Format: A sequence of coordinates [(x1,y1), ..., (x6,y6)].\n\n2. **Reasoning & VQA** (Chain-of-Thought):\n - Tasks: Analyze traffic scenes, explain causal logic (e.g., \"Why stop?\"), identify hazards, and answer queries about the environment (weather, road layout, traffic lights).\n - Reasoning: Break down complex scenarios into step-by-step logic, grounding your answers in visual evidence.\n\n3. **Instruction Following & Grounding**:\n - Tasks: Execute navigation commands (e.g., \"Park behind the red truck\") and ground textual descriptions to specific visual regions or objects.\n\n4. **Perception & World Modeling** (Future & Current State):\n - Tasks: Detect and track objects, predict their future motion, and estimate 3D occupancy or scene geometry (Gaussian Splatting/Occ).\n - Understanding: Map semantic elements (lanes, crossings) and dynamic agents into a coherent world model.\n\nInstructions\n- For **Planning** tasks: Output the \"Trajectory\".\n- For **QA/Reasoning** tasks: Provide a clear, logical, and helpful text response.\n- For **Perception** tasks: Output structured descriptions or specific formats as requested.\n\nAlways prioritize safety and clarity in your responses.\n",
"max_length": 16384,
"truncation_strategy": "delete",
"max_pixels": null,
"agent_template": null,
"norm_bbox": null,
"use_chat_template": true,
"padding_side": "right",
"padding_free": true,
"loss_scale": "default",
"sequence_parallel_size": 1,
"template_backend": "swift",
"response_prefix": null,
"enable_thinking": null,
"add_non_thinking_prefix": true,
"dataset": [
"/high_perf_store3/world-model/yongkangli/ABCDEFG_NISHIDASHABI/A/B/UniDriveVLA/Bench2Drive/data/b2d_planning_qa_train_residual.jsonl",
"/high_perf_store3/world-model/yongkangli/Dataset_vqa/Orion_Data/train_converted_processed.jsonl",
"/high_perf_store3/world-model/yongkangli/B2D/Bench2DriveZoo-tcp-admlp/output_final_modified_finalview_processed.jsonl",
"/high_perf_store3/world-model/yongkangli/finevision_subset_cleaned.jsonl#1141184"
],
"val_dataset": [],
"cached_dataset": [],
"cached_val_dataset": [],
"split_dataset_ratio": 0.01,
"data_seed": 42,
"dataset_num_proc": 32,
"load_from_cache_file": true,
"dataset_shuffle": true,
"val_dataset_shuffle": false,
"streaming": false,
"interleave_prob": null,
"stopping_strategy": "first_exhausted",
"shuffle_buffer_size": 1000,
"download_mode": "reuse_dataset_if_exists",
"columns": {},
"strict": false,
"remove_unused_columns": true,
"model_name": null,
"model_author": null,
"custom_dataset_info": [],
"quant_method": null,
"quant_bits": null,
"hqq_axis": null,
"bnb_4bit_compute_dtype": "bfloat16",
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_use_double_quant": true,
"bnb_4bit_quant_storage": null,
"max_new_tokens": null,
"temperature": 0.9,
"top_k": 50,
"top_p": 0.9,
"repetition_penalty": 1.0,
"num_beams": 1,
"stream": false,
"stop_words": [],
"logprobs": false,
"top_logprobs": null,
"structured_outputs_regex": null,
"ckpt_dir": null,
"lora_modules": [],
"tuner_backend": "peft",
"train_type": "full",
"adapters": [],
"external_plugins": [],
"seed": 42,
"model_kwargs": {},
"load_args": false,
"load_data_args": false,
"packing": true,
"packing_length": 16384,
"packing_num_proc": 1,
"lazy_tokenize": false,
"custom_register_path": [],
"use_hf": false,
"hub_token": null,
"ddp_timeout": 18000000,
"ddp_backend": null,
"ignore_args_error": false,
"use_swift_lora": false,
"freeze_llm": false,
"freeze_vit": false,
"freeze_aligner": false,
"freeze_parameters": [],
"freeze_parameters_regex": null,
"freeze_parameters_ratio": 0.0,
"trainable_parameters": [],
"trainable_parameters_regex": null,
"adapter_load": null,
"target_modules": [
"all-linear"
],
"target_regex": null,
"modules_to_save": [],
"lora_rank": 8,
"lora_alpha": 32,
"lora_dropout": 0.05,
"lora_bias": "none",
"lora_dtype": null,
"use_rslora": false,
"rlhf_type": null,
"ref_load": null,
"ref_adapter_load": null,
"beta": null,
"rpo_alpha": null,
"reference_free": false,
"label_smoothing": 0.0,
"f_divergence_type": "reverse_kl",
"loss_type": null,
"desirable_weight": 1.0,
"undesirable_weight": 1.0,
"calculate_KL": null,
"center_rewards_coefficient": null,
"teacher_model": null,
"teacher_model_type": null,
"teacher_model_revision": null,
"lmbda": 0.5,
"seq_kd": false,
"offload_teacher_model": false,
"sft_alpha": 0.0,
"generation_batch_size": null,
"steps_per_generation": null,
"num_generations": 8,
"num_generations_eval": null,
"max_completion_length": 512,
"importance_sampling_level": "token",
"tau_pos": 1.0,
"tau_neg": 1.05,
"epsilon": 0.2,
"epsilon_high": null,
"delta": null,
"use_vllm": true,
"vllm_mode": null,
"vllm_enable_prefix_caching": true,
"vllm_gpu_memory_utilization": 0.9,
"vllm_tensor_parallel_size": 1,
"vllm_max_model_len": null,
"vllm_enforce_eager": false,
"vllm_limit_mm_per_prompt": null,
"vllm_disable_cascade_attn": false,
"vllm_max_num_seqs": null,
"vllm_mm_processor_cache_gb": null,
"vllm_engine_kwargs": null,
"sleep_level": 0,
"offload_optimizer": false,
"offload_model": false,
"offload_bridge": false,
"vllm_server_base_url": null,
"vllm_server_host": null,
"vllm_server_port": [
8000
],
"vllm_server_timeout": 240.0,
"vllm_server_group_port": null,
"reward_funcs": [],
"reward_weights": null,
"cosine_min_len_value_wrong": -0.5,
"cosine_max_len_value_wrong": 0.0,
"cosine_min_len_value_correct": 1.0,
"cosine_max_len_value_correct": 0.5,
"cosine_max_len": null,
"repetition_n_grams": 3,
"repetition_max_penalty": -1.0,
"soft_max_length": null,
"soft_cache_length": null,
"dynamic_sample": false,
"max_resample_times": 3,
"overlong_filter": false,
"scale_rewards": "group",
"advantage_estimator": "grpo",
"kl_in_reward": false,
"wandb_log_unique_prompts": null,
"log_completions": false,
"rollout_importance_sampling_mode": null,
"rollout_importance_sampling_threshold": 2.0,
"log_rollout_offpolicy_metrics": false,
"off_policy_sequence_mask_delta": null,
"log_entropy": false,
"top_entropy_quantile": 1.0,
"reward_model": null,
"reward_model_plugin": null,
"sync_ref_model": false,
"ref_model_sync_steps": 512,
"ref_model_mixup_alpha": 0.6,
"async_generate": false,
"move_model_batches": null,
"multi_turn_scheduler": null,
"max_turns": null,
"completion_length_limit_scope": "per_round",
"vllm_server_pass_dataset": false,
"num_iterations": 1,
"check_model": true,
"padded_vocab_size": 151936,
"initialize_embedding": false,
"mlp_padding_free": false,
"load_safetensors": true,
"save_safetensors": true,
"ref_model": null,
"ref_adapters": [],
"merge_lora": true,
"max_shard_size": "5GB",
"train_dataloader_shuffle": true,
"dataloader_pin_memory": true,
"dataloader_persistent_workers": true,
"dataloader_prefetch_factor": 2,
"group_by_length": false,
"architectures": "Qwen3VLForConditionalGeneration",
"llm_architectures": "Qwen3VLForConditionalGeneration",
"max_epochs": 3,
"enable_dft_loss": false,
"enable_channel_loss": false,
"save_strategy": "steps",
"original_max_position_embeddings": null,
"partial_rotary_factor": null,
"use_shared_expert_gate": false,
"report_to": null,
"vit_gradient_checkpointing": true,
"vit_lr": null,
"aligner_lr": null,
"gradient_checkpointing_kwargs": null,
"linear_num_value_heads": null,
"linear_num_key_heads": null,
"linear_key_head_dim": null,
"linear_value_head_dim": null,
"linear_conv_kernel_dim": null,
"layer_types": null,
"mrope_interleaved": true,
"micro_batch_size": 1,
"global_batch_size": 128,
"recompute_granularity": "selective",
"recompute_method": null,
"recompute_num_layers": null,
"recompute_modules": [
"core_attn"
],
"use_cpu_initialization": false,
"deterministic_mode": false,
"train_iters": null,
"log_interval": 5,
"tensorboard_dir": "/high_perf_store3/world-model/yongkangli/ms-swift-main/megatron_output/Qwen3-VL-2B-Instruct-3-7-b2d/v1-20260213-201159/runs",
"no_masked_softmax_fusion": false,
"no_bias_dropout_fusion": false,
"no_bias_swiglu_fusion": false,
"no_rope_fusion": false,
"no_gradient_accumulation_fusion": false,
"cross_entropy_loss_fusion": true,
"cross_entropy_fusion_impl": "native",
"calculate_per_token_loss": true,
"use_flash_attn": false,
"attention_backend": "flash",
"optimizer": "adam",
"optimizer_cpu_offload": false,
"optimizer_offload_fraction": 1.0,
"use_precision_aware_optimizer": false,
"main_grads_dtype": "fp32",
"main_params_dtype": "fp32",
"exp_avg_dtype": "fp32",
"exp_avg_sq_dtype": "fp32",
"dataloader_type": "cyclic",
"manual_gc": false,
"manual_gc_interval": 0,
"lr": 4e-05,
"lr_decay_style": "cosine",
"lr_decay_iters": null,
"lr_warmup_iters": 0,
"lr_warmup_fraction": 0.05,
"min_lr": 1e-06,
"weight_decay": 0.1,
"clip_grad": 1.0,
"adam_beta1": 0.9,
"adam_beta2": 0.95,
"adam_eps": 1e-08,
"sgd_momentum": 0.9,
"save": "/high_perf_store3/world-model/yongkangli/ms-swift-main/megatron_output/Qwen3-VL-2B-Instruct-3-7-b2d/v1-20260213-201159",
"save_interval": 500,
"save_retain_interval": null,
"no_save_optim": true,
"no_save_rng": true,
"load": null,
"no_load_optim": false,
"no_load_rng": false,
"finetune": true,
"ckpt_format": "torch_dist",
"no_initialization": true,
"auto_detect_ckpt_format": true,
"exit_on_missing_checkpoint": true,
"async_save": false,
"use_persistent_ckpt_worker": false,
"ckpt_fully_parallel_load": false,
"ckpt_assume_constant_structure": false,
"distributed_backend": "nccl",
"local_rank": 0,
"use_distributed_optimizer": true,
"tensor_model_parallel_size": 1,
"pipeline_model_parallel_size": 1,
"decoder_first_pipeline_num_layers": null,
"decoder_last_pipeline_num_layers": null,
"account_for_embedding_in_pipeline_split": false,
"account_for_loss_in_pipeline_split": false,
"sequence_parallel": true,
"context_parallel_size": 1,
"tp_comm_overlap": false,
"overlap_grad_reduce": false,
"overlap_param_gather": false,
"distributed_timeout_minutes": 300000,
"num_layers_per_virtual_pipeline_stage": null,
"num_virtual_stages_per_pipeline_rank": null,
"microbatch_group_size_per_virtual_pipeline_stage": null,
"pipeline_model_parallel_layout": null,
"num_layers": 28,
"hidden_size": 2048,
"ffn_hidden_size": 6144,
"num_attention_heads": 16,
"group_query_attention": true,
"num_query_groups": 8,
"softmax_type": "vanilla",
"window_size": null,
"window_attn_skip_freq": null,
"max_position_embeddings": 262144,
"position_embedding_type": "mrope",
"mrope_section": [
24,
20,
20
],
"rotary_base": 5000000,
"rotary_percent": 1.0,
"rotary_interleaved": false,
"normalization": "RMSNorm",
"norm_epsilon": 1e-06,
"swiglu": true,
"quick_geglu": false,
"activation_func_clamp_value": null,
"glu_linear_offset": 0.0,
"untie_embeddings_and_output_weights": false,
"disable_bias_linear": true,
"add_qkv_bias": false,
"attention_dropout": 0.0,
"hidden_dropout": 0.0,
"kv_channels": 128,
"qk_layernorm": true,
"qk_l2_norm": null,
"no_rope_freq": null,
"moe_apply_probs_on_input": null,
"transformer_impl": "transformer_engine",
"num_experts": null,
"moe_layer_freq": 1,
"moe_ffn_hidden_size": null,
"moe_shared_expert_intermediate_size": null,
"moe_router_topk": 2,
"moe_router_num_groups": null,
"moe_router_group_topk": null,
"moe_router_pre_softmax": false,
"moe_router_dtype": "fp32",
"moe_router_score_function": "softmax",
"moe_router_bias_update_rate": null,
"moe_router_enable_expert_bias": false,
"moe_router_topk_scaling_factor": null,
"moe_router_load_balancing_type": "aux_loss",
"expert_model_parallel_size": 1,
"expert_tensor_parallel_size": 1,
"moe_token_dispatcher_type": null,
"moe_enable_deepep": false,
"moe_grouped_gemm": true,
"moe_permute_fusion": false,
"moe_aux_loss_coeff": 0.0,
"moe_z_loss_coeff": null,
"moe_shared_expert_overlap": false,
"moe_layer_recompute": false,
"moe_expert_capacity_factor": null,
"moe_pad_expert_input_to_capacity": false,
"moe_token_drop_policy": null,
"multi_latent_attention": false,
"q_lora_rank": null,
"kv_lora_rank": 32,
"qk_head_dim": 128,
"qk_pos_emb_head_dim": 64,
"mtp_num_layers": null,
"mtp_loss_scaling_factor": 0.1,
"fp8_format": null,
"fp8_recipe": "delayed",
"fp8_amax_history_len": 1024,
"fp8_amax_compute_algo": "max",
"fp8_param_gather": false,
"fp16": false,
"bf16": true,
"apply_query_key_layer_scaling": false,
"attention_softmax_in_fp32": true,
"log_params_norm": false,
"log_throughput": false,
"tensorboard_log_interval": 1,
"tensorboard_queue_size": 50,
"log_timers_to_tensorboard": true,
"no_log_learning_rate_to_tensorboard": false,
"log_validation_ppl_to_tensorboard": true,
"log_memory_to_tensorboard": true,
"logging_level": null,
"wandb_project": "megatron-swift",
"wandb_exp_name": "/high_perf_store3/world-model/yongkangli/ms-swift-main/megatron_output/Qwen3-VL-2B-Instruct-3-7-b2d/v1-20260213-201159",
"wandb_save_dir": null,
"eval_iters": -1,
"eval_interval": 500,
"seq_length": 16384,
"num_workers": 32,
"no_data_sharding": false,
"megatron_extra_kwargs": {},
"add_version": true,
"rank": 0,
"global_world_size": 32,
"local_world_size": 8,
"model_suffix": "Qwen3-VL-2B-Instruct",
"model_info": "ModelInfo(model_type='qwen3_vl', model_dir='/high_perf_store3/world-model/zhuzhenxin/ckpts/Qwen3-VL-2B-Instruct', torch_dtype=torch.bfloat16, max_model_len=262144, quant_method=None, quant_bits=None, rope_scaling={'mrope_interleaved': True, 'mrope_section': [24, 20, 20], 'rope_type': 'default'}, is_moe_model=False, is_multimodal=True, config=None, task_type='causal_lm', num_labels=None)",
"model_meta": "ModelMeta(model_type='qwen3_vl', model_groups=[ModelGroup(models=[Model(ms_model_id='Qwen/Qwen3-VL-2B-Instruct', hf_model_id='Qwen/Qwen3-VL-2B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-2B-Thinking', hf_model_id='Qwen/Qwen3-VL-2B-Thinking', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-2B-Instruct-FP8', hf_model_id='Qwen/Qwen3-VL-2B-Instruct-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-2B-Thinking-FP8', hf_model_id='Qwen/Qwen3-VL-2B-Thinking-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-4B-Instruct', hf_model_id='Qwen/Qwen3-VL-4B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-4B-Thinking', hf_model_id='Qwen/Qwen3-VL-4B-Thinking', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-4B-Instruct-FP8', hf_model_id='Qwen/Qwen3-VL-4B-Instruct-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-4B-Thinking-FP8', hf_model_id='Qwen/Qwen3-VL-4B-Thinking-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-8B-Instruct', hf_model_id='Qwen/Qwen3-VL-8B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-8B-Thinking', hf_model_id='Qwen/Qwen3-VL-8B-Thinking', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-8B-Instruct-FP8', hf_model_id='Qwen/Qwen3-VL-8B-Instruct-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-8B-Thinking-FP8', hf_model_id='Qwen/Qwen3-VL-8B-Thinking-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-32B-Instruct', hf_model_id='Qwen/Qwen3-VL-32B-Instruct', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-32B-Thinking', hf_model_id='Qwen/Qwen3-VL-32B-Thinking', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-32B-Instruct-FP8', hf_model_id='Qwen/Qwen3-VL-32B-Instruct-FP8', model_path=None, ms_revision=None, hf_revision=None), Model(ms_model_id='Qwen/Qwen3-VL-32B-Thinking-FP8', hf_model_id='Qwen/Qwen3-VL-32B-Thinking-FP8', model_path=None, ms_revision=None, hf_revision=None)], ignore_patterns=None, requires=None, tags=[])], template='qwen3_vl', get_function=<function get_model_tokenizer_qwen3_vl at 0x7fc8e8576d40>, model_arch=MultiModelKeys(arch_name='qwen3_vl', embedding=None, module_list=None, lm_head=None, q_proj=None, k_proj=None, v_proj=None, o_proj=None, attention=None, mlp=None, down_proj=None, qkv_proj=None, qk_proj=None, qa_proj=None, qb_proj=None, kv_proj=None, kva_proj=None, kvb_proj=None, language_model=['model.language_model', 'lm_head'], aligner=['model.visual.merger', 'model.visual.deepstack_merger_list'], vision_tower=['model.visual'], generator=[]), architectures=['Qwen3VLForConditionalGeneration'], additional_saved_files=[], torch_dtype=None, is_multimodal=True, is_reward=False, is_reranker=False, task_type=None, ignore_patterns=None, requires=['transformers>=4.57', 'qwen_vl_utils>=0.0.14', 'decord'], tags=['vision', 'video'])",
"model_dir": "/high_perf_store3/world-model/zhuzhenxin/ckpts/Qwen3-VL-2B-Instruct",
"_val_dataset_exists": true,
"hub": "<class 'swift.hub.hub.MSHub'>",
"megatron_model_meta": "MegatronModelMeta(megatron_model_type='qwen3_vl', model_types=['qwen3_vl', 'qwen3_moe_vl'], is_multimodal=True, bridge_cls=<class 'swift.megatron.model.gpt_bridge.MultimodalGPTBridge'>, model_cls=<class 'swift.megatron.model.mm_gpt.qwen3_vl.Qwen3VLGPTModel'>, get_transformer_layer_spec=None, model_provider=<function model_provider at 0x7fc86522f240>, visual_cls=<class 'swift.megatron.model.mm_gpt.qwen3_vl.Qwen3VL_Vit'>, get_mtp_block_spec=None, extra_args_provider=None)",
"extra_args": {
"model_dir": "/high_perf_store3/world-model/zhuzhenxin/ckpts/Qwen3-VL-2B-Instruct",
"is_multimodal": true,
"hf_model_type": "qwen3_vl",
"use_ray": false,
"ray_exp_name": null,
"device_groups": null,
"model": "/high_perf_store3/world-model/zhuzhenxin/ckpts/Qwen3-VL-2B-Instruct/",
"model_type": "qwen3_vl",
"model_revision": null,
"task_type": "causal_lm",
"torch_dtype": "bfloat16",
"attn_impl": null,
"new_special_tokens": [
"<FRONT_VIEW>",
"<FRONT_LEFT_VIEW>",
"<FRONT_RIGHT_VIEW>",
"<BACK_LEFT_VIEW>",
"<BACK_RIGHT_VIEW>",
"<BACK_VIEW>"
],
"num_labels": null,
"problem_type": null,
"rope_scaling": {
"mrope_interleaved": true,
"mrope_section": [
24,
20,
20
],
"rope_type": "default"
},
"device_map": null,
"max_memory": {},
"max_model_len": null,
"local_repo_path": null,
"init_strategy": null,
"template": "qwen3_vl",
"system": "Generalist Autonomous Driving Agent\nRole: You are an advanced, multimodal AI brain for an autonomous vehicle, capable of Perception, Reasoning, and Planning. Your goal is to drive safely, follow instructions, and deeply understand the dynamic world around you.\n\nContext & Coordinate System\n- Ego-Centric View: You are at the origin (0,0). The X-axis represents the lateral distance (perpendicular), and the Y-axis represents the longitudinal distance (forward).\n- Inputs: You receive multi-view visual observations (<FRONT_VIEW>, <BACK_VIEW>, etc.), historical ego-motion, and vehicle states (velocity, acceleration).\n\nCore Capabilities\n1. **Driving & Planning**:\n - Objective: Generate a safe, comfortable, and feasible 3-second trajectory (6 waypoints, 0.5s interval).\n - Constraints: Strictly adhere to traffic rules, avoid collisions, and respect kinematic limits.\n - Output Format: A sequence of coordinates [(x1,y1), ..., (x6,y6)].\n\n2. **Reasoning & VQA** (Chain-of-Thought):\n - Tasks: Analyze traffic scenes, explain causal logic (e.g., \"Why stop?\"), identify hazards, and answer queries about the environment (weather, road layout, traffic lights).\n - Reasoning: Break down complex scenarios into step-by-step logic, grounding your answers in visual evidence.\n\n3. **Instruction Following & Grounding**:\n - Tasks: Execute navigation commands (e.g., \"Park behind the red truck\") and ground textual descriptions to specific visual regions or objects.\n\n4. **Perception & World Modeling** (Future & Current State):\n - Tasks: Detect and track objects, predict their future motion, and estimate 3D occupancy or scene geometry (Gaussian Splatting/Occ).\n - Understanding: Map semantic elements (lanes, crossings) and dynamic agents into a coherent world model.\n\nInstructions\n- For **Planning** tasks: Output the \"Trajectory\".\n- For **QA/Reasoning** tasks: Provide a clear, logical, and helpful text response.\n- For **Perception** tasks: Output structured descriptions or specific formats as requested.\n\nAlways prioritize safety and clarity in your responses.\n",
"max_length": 16384,
"truncation_strategy": "delete",
"max_pixels": null,
"agent_template": null,
"norm_bbox": null,
"use_chat_template": true,
"padding_side": "right",
"padding_free": true,
"sequence_parallel_size": 1,
"template_backend": "swift",
"response_prefix": null,
"enable_thinking": null,
"add_non_thinking_prefix": true,
"dataset": [
"/high_perf_store3/world-model/yongkangli/ABCDEFG_NISHIDASHABI/A/B/UniDriveVLA/Bench2Drive/data/b2d_planning_qa_train_residual.jsonl",
"/high_perf_store3/world-model/yongkangli/Dataset_vqa/Orion_Data/train_converted_processed.jsonl",
"/high_perf_store3/world-model/yongkangli/B2D/Bench2DriveZoo-tcp-admlp/output_final_modified_finalview_processed.jsonl",
"/high_perf_store3/world-model/yongkangli/finevision_subset_cleaned.jsonl#1141184"
],
"val_dataset": [],
"cached_dataset": [],
"cached_val_dataset": [],
"split_dataset_ratio": 0.01,
"data_seed": 42,
"dataset_num_proc": 32,
"load_from_cache_file": true,
"dataset_shuffle": true,
"val_dataset_shuffle": false,
"streaming": false,
"interleave_prob": null,
"stopping_strategy": "first_exhausted",
"shuffle_buffer_size": 1000,
"download_mode": "reuse_dataset_if_exists",
"columns": {},
"strict": false,
"remove_unused_columns": true,
"model_name": null,
"model_author": null,
"custom_dataset_info": [],
"quant_method": null,
"quant_bits": null,
"hqq_axis": null,
"bnb_4bit_compute_dtype": "bfloat16",
"bnb_4bit_quant_type": "nf4",
"bnb_4bit_use_double_quant": true,
"bnb_4bit_quant_storage": null,
"max_new_tokens": null,
"temperature": 0.9,
"top_k": 50,
"top_p": 0.9,
"repetition_penalty": 1.0,
"num_beams": 1,
"stream": false,
"stop_words": [],
"logprobs": false,
"top_logprobs": null,
"structured_outputs_regex": null,
"ckpt_dir": null,
"lora_modules": [],
"tuner_backend": "peft",
"train_type": "full",
"adapters": [],
"external_plugins": [],
"model_kwargs": {},
"load_args": false,
"load_data_args": false,
"packing": true,
"packing_length": 16384,
"packing_num_proc": 1,
"lazy_tokenize": false,
"custom_register_path": [],
"use_hf": false,
"hub_token": null,
"ddp_timeout": 18000000,
"ddp_backend": null,
"ignore_args_error": false,
"use_swift_lora": false,
"freeze_llm": false,
"freeze_vit": false,
"freeze_aligner": false,
"freeze_parameters": [],
"freeze_parameters_regex": null,
"freeze_parameters_ratio": 0.0,
"trainable_parameters": [],
"trainable_parameters_regex": null,
"adapter_load": null,
"target_modules": [
"all-linear"
],
"target_regex": null,
"modules_to_save": [],
"lora_rank": 8,
"lora_alpha": 32,
"lora_dropout": 0.05,
"lora_bias": "none",
"lora_dtype": null,
"use_rslora": false,
"rlhf_type": null,
"ref_load": null,
"ref_adapter_load": null,
"beta": null,
"rpo_alpha": null,
"reference_free": false,
"label_smoothing": 0.0,
"f_divergence_type": "reverse_kl",
"loss_type": null,
"desirable_weight": 1.0,
"undesirable_weight": 1.0,
"calculate_KL": null,
"center_rewards_coefficient": null,
"teacher_model": null,
"teacher_model_type": null,
"teacher_model_revision": null,
"lmbda": 0.5,
"seq_kd": false,
"offload_teacher_model": false,
"sft_alpha": 0.0,
"generation_batch_size": null,
"steps_per_generation": null,
"num_generations": 8,
"num_generations_eval": null,
"max_completion_length": 512,
"importance_sampling_level": "token",
"tau_pos": 1.0,
"tau_neg": 1.05,
"epsilon": 0.2,
"epsilon_high": null,
"delta": null,
"use_vllm": true,
"vllm_mode": null,
"vllm_enable_prefix_caching": true,
"vllm_gpu_memory_utilization": 0.9,
"vllm_tensor_parallel_size": 1,
"vllm_max_model_len": null,
"vllm_enforce_eager": false,
"vllm_limit_mm_per_prompt": null,
"vllm_disable_cascade_attn": false,
"vllm_max_num_seqs": null,
"vllm_mm_processor_cache_gb": null,
"vllm_engine_kwargs": null,
"sleep_level": 0,
"offload_optimizer": false,
"offload_model": false,
"offload_bridge": false,
"vllm_server_base_url": null,
"vllm_server_host": null,
"vllm_server_port": [
8000
],
"vllm_server_timeout": 240.0,
"vllm_server_group_port": null,
"reward_funcs": [],
"reward_weights": null,
"cosine_min_len_value_wrong": -0.5,
"cosine_max_len_value_wrong": 0.0,
"cosine_min_len_value_correct": 1.0,
"cosine_max_len_value_correct": 0.5,
"cosine_max_len": null,
"repetition_n_grams": 3,
"repetition_max_penalty": -1.0,
"soft_max_length": null,
"soft_cache_length": null,
"dynamic_sample": false,
"max_resample_times": 3,
"overlong_filter": false,
"scale_rewards": "group",
"advantage_estimator": "grpo",
"kl_in_reward": false,
"wandb_log_unique_prompts": null,
"log_completions": false,
"rollout_importance_sampling_mode": null,
"rollout_importance_sampling_threshold": 2.0,
"log_rollout_offpolicy_metrics": false,
"off_policy_sequence_mask_delta": null,
"log_entropy": false,
"top_entropy_quantile": 1.0,
"reward_model": null,
"reward_model_plugin": null,
"sync_ref_model": false,
"ref_model_sync_steps": 512,
"ref_model_mixup_alpha": 0.6,
"async_generate": false,
"move_model_batches": null,
"multi_turn_scheduler": null,
"max_turns": null,
"completion_length_limit_scope": "per_round",
"vllm_server_pass_dataset": false,
"num_iterations": 1,
"check_model": true,
"padded_vocab_size": 151936,
"initialize_embedding": false,
"mlp_padding_free": false,
"load_safetensors": true,
"save_safetensors": true,
"ref_model": null,
"ref_adapters": [],
"merge_lora": true,
"max_shard_size": "5GB",
"train_dataloader_shuffle": true,
"dataloader_pin_memory": true,
"dataloader_persistent_workers": true,
"dataloader_prefetch_factor": 2,
"group_by_length": false,
"architectures": "Qwen3VLForConditionalGeneration",
"llm_architectures": "Qwen3VLForConditionalGeneration",
"max_epochs": 3,
"enable_dft_loss": false,
"enable_channel_loss": false,
"save_strategy": "steps",
"original_max_position_embeddings": null,
"partial_rotary_factor": null,
"use_shared_expert_gate": false,
"report_to": null,
"vit_gradient_checkpointing": true,
"vit_lr": null,
"aligner_lr": null,
"gradient_checkpointing_kwargs": null,
"linear_num_value_heads": null,
"linear_num_key_heads": null,
"linear_key_head_dim": null,
"linear_value_head_dim": null,
"linear_conv_kernel_dim": null,
"layer_types": null,
"mrope_interleaved": true,
"add_version": true
}
}