Update config.json
Browse files
- config.json +0 -12
config.json
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"Ubit": 100,
|
| 3 |
"_attn_implementation_autoset": true,
|
| 4 |
- "_name_or_path": "/horizon-bucket/robot_lab/users/shuo03.wang/nips2025/sft_2B_model/step3_aux_qwen/tmp-checkpoint-25000",
|
| 5 |
"architectures": [
|
| 6 |
"LlavaLlamaModel"
|
| 7 |
],
|
|
@@ -26,13 +25,9 @@
|
|
| 26 |
"group_size": -1,
|
| 27 |
"hidden_size": 1536,
|
| 28 |
"image_aspect_ratio": "dynamic",
|
| 29 |
- "image_encoder": {
|
| 30 |
- "_target_": "robo_orchard_lab.models.monodream.multimodal_encoder.BasicImageEncoder"
|
| 31 |
- },
|
| 32 |
"interpolate_mode": "linear",
|
| 33 |
"llm_cfg": {
|
| 34 |
"_attn_implementation_autoset": false,
|
| 35 |
- "_name_or_path": "/horizon-bucket/robot_lab/users/shuo03.wang/nips2025/sft_2B_model/step3_aux_qwen/tmp-checkpoint-25000/llm",
|
| 36 |
"add_cross_attention": false,
|
| 37 |
"architectures": [
|
| 38 |
"Qwen2ForCausalLM"
|
|
@@ -125,7 +120,6 @@
|
|
| 125 |
"mm_projector": "mlp_downsample_3x3_fix",
|
| 126 |
"mm_projector_cfg": {
|
| 127 |
"_attn_implementation_autoset": false,
|
| 128 |
- "_name_or_path": "/horizon-bucket/robot_lab/users/shuo03.wang/nips2025/sft_2B_model/step3_aux_qwen/tmp-checkpoint-25000/mm_projector",
|
| 129 |
"add_cross_attention": false,
|
| 130 |
"architectures": [
|
| 131 |
"MultimodalProjector"
|
|
@@ -196,7 +190,6 @@
|
|
| 196 |
"mm_vision_select_feature": "cls_patch",
|
| 197 |
"mm_vision_select_layer": -2,
|
| 198 |
"model_dtype": "torch.bfloat16",
|
| 199 |
- "model_name_or_path": "/bucket/input/robot_lab/users/shuo03.wang/NVILA-Lite-2B",
|
| 200 |
"model_type": "llava_llama",
|
| 201 |
"num_time_tokens": 0,
|
| 202 |
"num_video_frames": 8,
|
|
@@ -213,7 +206,6 @@
|
|
| 213 |
"refine_mlp_blocksize": false,
|
| 214 |
"refine_residual_fp": false,
|
| 215 |
"refine_row_blocksize": 4,
|
| 216 |
- "resume_path": "/bucket/input/robot_lab/users/shuo03.wang/NVILA-Lite-2B",
|
| 217 |
"row_blocksize": -1,
|
| 218 |
"row_blocksize_optimizer": 1,
|
| 219 |
"s2": false,
|
|
@@ -232,14 +224,10 @@
|
|
| 232 |
"tune_vision_tower": true,
|
| 233 |
"use_quantize_optimizer": false,
|
| 234 |
"version": "auto",
|
| 235 |
- "video_encoder": {
|
| 236 |
- "_target_": "robo_orchard_lab.models.monodream.multimodal_encoder.BasicVideoEncoder"
|
| 237 |
- },
|
| 238 |
"vision_resolution": -1,
|
| 239 |
"vision_tower": "Efficient-Large-Model/paligemma-siglip-so400m-patch14-448",
|
| 240 |
"vision_tower_cfg": {
|
| 241 |
"_attn_implementation_autoset": false,
|
| 242 |
- "_name_or_path": "/horizon-bucket/robot_lab/users/shuo03.wang/nips2025/sft_2B_model/step3_aux_qwen/tmp-checkpoint-25000/vision_tower",
|
| 243 |
"add_cross_attention": false,
|
| 244 |
"architectures": [
|
| 245 |
"SiglipVisionModel"
|
|
|
|
| 1 |
{
|
| 2 |
"Ubit": 100,
|
| 3 |
"_attn_implementation_autoset": true,
|
|
|
|
| 4 |
"architectures": [
|
| 5 |
"LlavaLlamaModel"
|
| 6 |
],
|
|
|
|
| 25 |
"group_size": -1,
|
| 26 |
"hidden_size": 1536,
|
| 27 |
"image_aspect_ratio": "dynamic",
|
|
|
|
|
|
|
|
|
|
| 28 |
"interpolate_mode": "linear",
|
| 29 |
"llm_cfg": {
|
| 30 |
"_attn_implementation_autoset": false,
|
|
|
|
| 31 |
"add_cross_attention": false,
|
| 32 |
"architectures": [
|
| 33 |
"Qwen2ForCausalLM"
|
|
|
|
| 120 |
"mm_projector": "mlp_downsample_3x3_fix",
|
| 121 |
"mm_projector_cfg": {
|
| 122 |
"_attn_implementation_autoset": false,
|
|
|
|
| 123 |
"add_cross_attention": false,
|
| 124 |
"architectures": [
|
| 125 |
"MultimodalProjector"
|
|
|
|
| 190 |
"mm_vision_select_feature": "cls_patch",
|
| 191 |
"mm_vision_select_layer": -2,
|
| 192 |
"model_dtype": "torch.bfloat16",
|
|
|
|
| 193 |
"model_type": "llava_llama",
|
| 194 |
"num_time_tokens": 0,
|
| 195 |
"num_video_frames": 8,
|
|
|
|
| 206 |
"refine_mlp_blocksize": false,
|
| 207 |
"refine_residual_fp": false,
|
| 208 |
"refine_row_blocksize": 4,
|
|
|
|
| 209 |
"row_blocksize": -1,
|
| 210 |
"row_blocksize_optimizer": 1,
|
| 211 |
"s2": false,
|
|
|
|
| 224 |
"tune_vision_tower": true,
|
| 225 |
"use_quantize_optimizer": false,
|
| 226 |
"version": "auto",
|
|
|
|
|
|
|
|
|
|
| 227 |
"vision_resolution": -1,
|
| 228 |
"vision_tower": "Efficient-Large-Model/paligemma-siglip-so400m-patch14-448",
|
| 229 |
"vision_tower_cfg": {
|
| 230 |
"_attn_implementation_autoset": false,
|
|
|
|
| 231 |
"add_cross_attention": false,
|
| 232 |
"architectures": [
|
| 233 |
"SiglipVisionModel"
|