Sombit
/

trajectoryvla

@@ -1,37 +1,64 @@
 {
-  "arch_specifier": "no-align+gelu-mlp",
-  "architectures": [
-    "TrajectoryVLA"
-  ],
   "auto_map": {
-    "AutoModelForVision2Seq": "prismatic_model.TrajectoryVLA"
   },
-  "hf_llm_id": "meta-llama/Llama-2-7b-hf",
-  "image_resize_strategy": "letterbox",
-  "image_sizes": [
-    224,
-    224
-  ],
-  "llm_backbone_id": "llama2-7b-pure",
-  "llm_max_length": 2048,
-  "model_type": "prismatic",
-  "output_projector_states": false,
-  "pad_to_multiple_of": 64,
-  "pad_token_id": 32000,
-  "return_dict": false,
-  "text_config": {
-    "model_type": "llama"
   },
-  "timm_model_ids": [
-    "vit_large_patch14_reg4_dinov2.lvd142m",
-    "vit_so400m_patch14_siglip_224"
-  ],
-  "timm_override_act_layers": [
-    null,
-    null
-  ],
-  "torch_dtype": "bfloat16",
-  "transformers_version": "4.44.2",
-  "use_fused_vision_backbone": true,
-  "vision_backbone_id": "dinosiglip-vit-so-224px"
 }

 {
   "auto_map": {
+    "AutoConfig": "prismatic_config.TrajectoryVLAConfig"
   },
+  "cheat": false,
+  "model_type": "trajectoryvla",
+  "num_timesteps": 6,
+  "prismatic_config": {
+    "architectures": [
+      "TrajectoryVLA"
+    ],
+    "auto_map": {
+      "AutoModelForVision2Seq": "prismatic_model.TrajectoryVLA"
+    },
+    "model_type": "prismatic",
+    "return_dict": false,
+    "torch_dtype": "bfloat16"
   },
+  "rotation_components": 9,
+  "seperate_control_proj": true,
+  "timestep_proj_config": {
+    "num_tokens": 3,
+    "pos_embed_scale": 8,
+    "proj_layers": [
+      128,
+      512,
+      1024
+    ],
+    "time_delta_sec": 0.1
+  },
+  "token_proj_config": {
+    "control_tokens_layers": [
+      4096,
+      2048,
+      1024
+    ],
+    "image_tokens_mode": "vit",
+    "llm_image_tokens_layers": [],
+    "vit_tokens_layers": [
+      2176,
+      1024
+    ]
+  },
+  "token_size": 1024,
+  "transformer_config": {
+    "decoder_block_config": {
+      "dropout": 0.0,
+      "feature_size": 1024,
+      "head_dim": 64,
+      "num_heads": 16
+    },
+    "encoder_block_config": {
+      "feature_size": 1024,
+      "head_dim": 64,
+      "num_heads": 16
+    },
+    "num_blocks": 2,
+    "pos_embed_config": {
+      "embedding_dim": 1024,
+      "num_embeddings": 300
+    }
+  },
+  "transformers_version": "4.44.2"
 }

prismatic_config.py CHANGED Viewed

@@ -7,7 +7,7 @@ Default configuration specifies `siglip-224px+7b`.
 from typing import Any, Dict, List, Optional
 import transformers
-from transformers import PretrainedConfig
 from transformers.models.auto import CONFIG_MAPPING
 import numpy as np
@@ -155,7 +155,6 @@ class PrismaticConfig(PretrainedConfig):
 # Here  we need trajectory_vla config, with
 # prismatic_config fields and then the waypointer fields
 class TrajectoryVLAConfig(PretrainedConfig):
     model_type: str = "trajectoryvla"
@@ -217,7 +216,8 @@ if  __name__ == "__main__" :
     # prismatic_config = PrismaticConfig()
     # print(prismatic_config)
     prismatic_config_dict = {
         "vision_backbone_id":"dinosiglip-vit-so-224px",
         "llm_backbone_id":"llama2-7b-pure",
@@ -280,30 +280,3 @@ if  __name__ == "__main__" :
     TrajectoryVLAConfig = TrajectoryVLAConfig( **TrajectoryVlaConfig_config)
     print(TrajectoryVLAConfig)
-# class WaypointTokenizer:
-#     """
-#     Wraps base LLM/VLM tokenizer and overloads least used token as a control token
-#     NOTE: By default, assumes a BPE-style tokenizer akin to the LlamaTokenizer,
-#         where *the least used tokens* appear at the end of the vocabulary!
-#     TODO: Adding new token vs overloading? When I call `tokenizer.add_token()` vocab stays the same
-#     """
-#     def __init__(self, tokenizer: transformers.PreTrainedTokenizerBase, num_tokens: int = 10) -> None:
-#         self.tokenizer = tokenizer
-#         self.num_tokens = num_tokens
-#     def __call__(self, *_) -> str:
-#         """Get the text token for control"""
-#         return self.tokenizer.decode(self.control_token_ids)
-#     @property
-#     def control_token_ids(self) -> np.ndarray:
-#         # Assumes we're overwriting the final tokens of the vocabulary (least used tokens)
-#         return np.arange(self.num_tokens) + int(self.tokenizer.vocab_size - self.num_tokens)
-#     @property
-#     def num_control_tokens(self) -> int:
-#         return self.num_tokens

 from typing import Any, Dict, List, Optional
 import transformers
+from transformers import PretrainedConfig,AutoModel, AutoConfig
 from transformers.models.auto import CONFIG_MAPPING
 import numpy as np
 # Here  we need trajectory_vla config, with
 # prismatic_config fields and then the waypointer fields
 class TrajectoryVLAConfig(PretrainedConfig):
     model_type: str = "trajectoryvla"
     # prismatic_config = PrismaticConfig()
     # print(prismatic_config)
+    AutoConfig.register("prismatic",PrismaticConfig)
+    AutoConfig.register("trajectoryvla",TrajectoryVLAConfig)
     prismatic_config_dict = {
         "vision_backbone_id":"dinosiglip-vit-so-224px",
         "llm_backbone_id":"llama2-7b-pure",
     TrajectoryVLAConfig = TrajectoryVLAConfig( **TrajectoryVlaConfig_config)
     print(TrajectoryVLAConfig)