Robotics
Transformers
Safetensors
qwen3_vl_geometry
text-generation
vision-language-action
vla
manipulation
qwen3-vl
depth
3d-trajectory
Instructions to use DAVIAN-Robotics/3D_HAMSTER with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- Transformers
How to use DAVIAN-Robotics/3D_HAMSTER with Transformers:
```python
# Load model directly
from transformers import AutoModelForSeq2SeqLM

model = AutoModelForSeq2SeqLM.from_pretrained("DAVIAN-Robotics/3D_HAMSTER", dtype="auto")
```
- Notebooks
- Google Colab
- Kaggle
| { | |
| "architectures": [ | |
| "Qwen3VLGeometryForConditionalGeneration" | |
| ], | |
| "depth_drop_prob": 0.0, | |
| "depth_loss_weight": 1.0, | |
| "depth_only_training": true, | |
| "dtype": "bfloat16", | |
| "eos_token_id": 151645, | |
| "geometry_config": { | |
| "dtype": "bfloat16", | |
| "enabled": true, | |
| "encoder_type": "lingbot_depth", | |
| "freeze_encoder": true, | |
| "fusion_layers": null, | |
| "fusion_method": "resize_and_add", | |
| "hidden_size": 1024, | |
| "match_post_merge_resolution": false, | |
| "merger_hidden_dim": null, | |
| "merger_type": "mlp", | |
| "model_name_or_path": null, | |
| "model_type": "geometry_encoder", | |
| "num_heads": 8, | |
| "num_layers": 1, | |
| "output_hidden_size": null, | |
| "reference_frame": "first", | |
| "use_3d_position_encoding": true, | |
| "encoder_model_config": { | |
| "encoder": { | |
| "backbone": "dinov2_vitl14", | |
| "intermediate_layers": 1, | |
| "dim_out": 1024, | |
| "strict": false, | |
| "depth_emb_mode": "conv_1c", | |
| "img_depth_fuse_mode": "cat_token" | |
| }, | |
| "neck": { | |
| "dim_in": [ | |
| 1026, | |
| 2, | |
| 2, | |
| 2, | |
| 2 | |
| ], | |
| "dim_out": null, | |
| "dim_res_blocks": [ | |
| 1024, | |
| 256, | |
| 128, | |
| 64, | |
| 32 | |
| ], | |
| "num_res_blocks": [ | |
| 0, | |
| 2, | |
| 2, | |
| 2, | |
| 0 | |
| ], | |
| "res_block_in_norm": "none", | |
| "res_block_hidden_norm": "none", | |
| "resamplers": [ | |
| "conv_transpose", | |
| "conv_transpose", | |
| "conv_transpose", | |
| "bilinear" | |
| ] | |
| }, | |
| "depth_head": { | |
| "dim_in": [ | |
| 1024, | |
| 256, | |
| 128, | |
| 64, | |
| 32 | |
| ], | |
| "dim_out": [ | |
| null, | |
| null, | |
| null, | |
| null, | |
| 1 | |
| ], | |
| "dim_res_blocks": [ | |
| 1024, | |
| 256, | |
| 128, | |
| 64, | |
| 32 | |
| ], | |
| "num_res_blocks": [ | |
| 0, | |
| 1, | |
| 1, | |
| 1, | |
| 0 | |
| ], | |
| "res_block_in_norm": "none", | |
| "res_block_hidden_norm": "none", | |
| "resamplers": [ | |
| "conv_transpose", | |
| "conv_transpose", | |
| "conv_transpose", | |
| "bilinear" | |
| ] | |
| }, | |
| "mask_head": { | |
| "dim_in": [ | |
| 1024, | |
| 256, | |
| 128, | |
| 64, | |
| 32 | |
| ], | |
| "dim_out": [ | |
| null, | |
| null, | |
| null, | |
| null, | |
| 1 | |
| ], | |
| "dim_res_blocks": [ | |
| 1024, | |
| 256, | |
| 128, | |
| 64, | |
| 32 | |
| ], | |
| "num_res_blocks": [ | |
| 0, | |
| 1, | |
| 1, | |
| 1, | |
| 0 | |
| ], | |
| "res_block_in_norm": "none", | |
| "res_block_hidden_norm": "none", | |
| "resamplers": [ | |
| "conv_transpose", | |
| "conv_transpose", | |
| "conv_transpose", | |
| "bilinear" | |
| ] | |
| }, | |
| "remap_output": "exp", | |
| "remap_depth_in": "log", | |
| "num_tokens_range": [ | |
| 1200, | |
| 3600 | |
| ] | |
| } | |
| }, | |
| "image_token_id": 151655, | |
| "model_type": "qwen3_vl_geometry", | |
| "pad_token_id": 151643, | |
| "save_depth_viz_dir": null, | |
| "save_depth_viz_interval": 1000, | |
| "save_depth_viz_max_per_dataset": 20, | |
| "text_config": { | |
| "attention_bias": false, | |
| "attention_dropout": 0.0, | |
| "bos_token_id": 151643, | |
| "dtype": "bfloat16", | |
| "eos_token_id": 151645, | |
| "head_dim": 128, | |
| "hidden_act": "silu", | |
| "hidden_size": 4096, | |
| "initializer_range": 0.02, | |
| "intermediate_size": 12288, | |
| "max_position_embeddings": 262144, | |
| "model_type": "qwen3_vl_geometry_text", | |
| "num_attention_heads": 32, | |
| "num_hidden_layers": 36, | |
| "num_key_value_heads": 8, | |
| "rms_norm_eps": 1e-06, | |
| "rope_scaling": { | |
| "mrope_interleaved": true, | |
| "mrope_section": [ | |
| 24, | |
| 20, | |
| 20 | |
| ], | |
| "rope_type": "default" | |
| }, | |
| "rope_theta": 5000000, | |
| "use_cache": false, | |
| "vocab_size": 151936 | |
| }, | |
| "tie_word_embeddings": false, | |
| "transformers_version": "4.57.1", | |
| "use_cache": false, | |
| "use_depth_decoder": true, | |
| "use_encoder_output_for_depth_loss": false, | |
| "video_token_id": 151656, | |
| "vision_config": { | |
| "deepstack_visual_indexes": [ | |
| 8, | |
| 16, | |
| 24 | |
| ], | |
| "depth": 27, | |
| "dtype": "bfloat16", | |
| "hidden_act": "gelu_pytorch_tanh", | |
| "hidden_size": 1152, | |
| "in_channels": 3, | |
| "initializer_range": 0.02, | |
| "intermediate_size": 4304, | |
| "model_type": "qwen3_vl_geometry", | |
| "num_heads": 16, | |
| "num_position_embeddings": 2304, | |
| "out_hidden_size": 4096, | |
| "patch_size": 16, | |
| "spatial_merge_size": 2, | |
| "temporal_patch_size": 2 | |
| }, | |
| "vision_end_token_id": 151653, | |
| "vision_start_token_id": 151652 | |
| } |