Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

Molmo-7B-10131629-5000/config.yaml +344 -0
Molmo-7B-10131629-5000/model.pt +3 -0

Molmo-7B-10131629-5000/config.yaml ADDED Viewed

	@@ -0,0 +1,344 @@

+run_name: multitask_train
+seed: 6198
+epoch: null
+dry_run: false
+model:
+  d_model: 3584
+  n_heads: 28
+  n_kv_heads: 4
+  qkv_bias: true
+  clip_qkv: null
+  n_layers: 28
+  mlp_ratio: 4
+  mlp_hidden_size: 37888
+  activation_type: swiglu
+  block_type: sequential
+  block_group_size: 1
+  rope: true
+  rope_full_precision: true
+  rope_theta: 1000000.0
+  vision_backbone:
+    image_model_type: openai
+    image_default_input_size:
+    - 336
+    - 336
+    image_patch_size: 14
+    image_pos_patch_size: 14
+    image_emb_dim: 1024
+    image_num_heads: 16
+    image_num_key_value_heads: 16
+    image_num_layers: 23
+    image_head_dim: 64
+    image_mlp_dim: 4096
+    image_mlp_activations: quick_gelu
+    image_dropout_rate: 0.0
+    image_num_pos: 577
+    image_norm_eps: 1.0e-05
+    attention_dropout: 0.0
+    residual_dropout: 0.0
+    initializer_range: 0.02
+    fsdp_wrap: false
+    resize_mode: default
+  vit_load_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/vit-l-14-336.pt
+  llm_load_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen2-7b.pt
+  low_cpu_fsdp: true
+  attention_type: sdpa
+  float32_attention: true
+  attention_dropout: 0.0
+  attention_layer_norm: false
+  residual_dropout: 0.1
+  response_residual_dropout: 0.0
+  embedding_dropout: 0.0
+  layer_norm_type: rms
+  layer_norm_with_affine: true
+  layer_norm_eps: 1.0e-06
+  attention_layer_norm_with_affine: true
+  max_sequence_length: 4096
+  max_position_embeddings: null
+  include_bias: false
+  bias_for_layer_norm: null
+  scale_logits: false
+  vocab_size: 152064
+  embedding_size: 152064
+  ff_out_size: 152192
+  additional_vocab_size: 128
+  new_embedding_init_range: 0.02
+  weight_tying: false
+  init_device: cpu
+  init_fn: normal
+  init_std: 0.02
+  init_cutoff_factor: null
+  norm_after: false
+  precision: amp_bf16
+  max_crops: 12
+  crop_mode: overlap-and-resize-c2
+  use_col_tokens: true
+  prompt_type: uber_model
+  system_prompt_kind: demo_or_style
+  message_formatting: role
+  always_start_with_space: true
+  multi_annotation_weighting: root_subsegments
+  default_inference_len: 128
+  overlap_margins:
+  - 4
+  - 4
+  pad_value: 0.0
+  image_padding_embed: pad_and_partial_pad
+  fix_image_padding: true
+  vit_layers:
+  - -2
+  - -9
+  image_pooling_h: 2
+  image_pooling_w: 2
+  image_pooling_2d: attention_meanq
+  image_projector: mlp
+  image_feature_dropout: 0.0
+  initializer_range: 0.02
+  normalize_input_embeds: false
+  use_position_ids: true
+  head_dim: null
+  action_tokenizer:
+    identifier: physical-intelligence/fast
+    tokenizer_dir: null
+  action_dim: 7
+  num_actions_chunk: 8
+  tokenizer:
+    identifier: Qwen/Qwen2-7B
+    tokenizer_dir: null
+  pad_tokenizer: true
+  moe_num_experts: 8
+  moe_top_k: 2
+  moe_mlp_impl: sparse
+  moe_log_expert_assignment: false
+  moe_shared_expert: false
+  moe_lbl_in_fp32: false
+  moe_interleave: false
+  moe_loss_weight: 0.1
+  moe_zloss_weight: null
+  moe_dropless: true
+  moe_capacity_factor: 1.25
+  action_head: l1_regression
+  num_diffusion_steps: 1000
+  num_diffusion_inference_steps: 30
+  use_proprio: false
+  action_head_dit_hidden_size: 1024
+  action_head_dit_depth: 14
+  action_head_dit_num_heads: 16
+  llm_causal_attention: false
+  action_use_left_eef: false
+  action_use_mobile_base: false
+allow_resume: true
+ft_llm: true
+ft_vit: true
+ft_connector: true
+ft_embedding: lm_head
+lora: false
+use_lora: false
+lora_rank: 32
+lora_llm: false
+lora_vit: false
+lora_connector: false
+optimizer:
+  name: adamw
+  learning_rate: 0.0001
+  weight_decay: 0.01
+  betas:
+  - 0.9
+  - 0.95
+  eps: 1.0e-05
+  connector_learning_rate: 0.0002
+  vit_learning_rate: 6.0e-06
+  llm_learning_rate: 2.0e-05
+  connector_weight_decay: 0.0
+  vit_weight_decay: 0.0
+  llm_weight_decay: 0.0
+  connector_betas:
+  - 0.9
+  - 0.95
+  vit_betas:
+  - 0.9
+  - 0.95
+  llm_betas:
+  - 0.9
+  - 0.95
+  connector_eps: 1.0e-06
+  vit_eps: 1.0e-06
+  llm_eps: 1.0e-06
+  metrics_log_interval: 20
+scheduler:
+  name: multimodal
+  units: steps
+  t_warmup: 100
+  t_max: null
+  alpha_f: 0.1
+  connector_t_warmup: 200
+  vit_t_warmup: 1000
+  llm_t_warmup: 1000
+  grad_clip_warmup_steps: null
+  grad_clip_warmup_factor: null
+  warmup_min_lr: 0.0
+data:
+  dataset: null
+  mixture: null
+  root_size_mixture:
+  - rate: 0.125
+    mixture:
+      sr_planning: 500000.0
+      robovqa: 300000.0
+  - rate: 0.125
+    mixture:
+      pixmo_ask_model_anything: null
+      pixmo_cap: null
+      pixmo_points: null
+      pixmo_count: null
+      blip_laion_cc: null
+  - rate: 0.125
+    mixture:
+      sr_affordance: null
+  - rate: 0.125
+    mixture:
+      sr_trajectory: null
+  - rate: 0.5
+    mixture:
+      oxe_magic_soup_plus_minus_A1: null
+  split: train
+  seed: 50189
+  shuffle_messages: true
+  pad: to_max
+  sequence_length: 2304
+  shuffle: true
+  for_inference: false
+  multi_modal: torch
+  num_workers: 0
+  drop_last: true
+  pin_memory: true
+  prefetch_factor: null
+  persistent_workers: false
+  timeout: 0
+  rlds_dataset_name: ''
+  rlds_data_root_dir: null
+  use_wrist_image: false
+  use_proprio: false
+restore_dataloader: true
+fast_forward_batches: null
+evaluators: []
+eval_interval: 12000
+inf_eval_interval: 12000
+inf_evaluators:
+- label: robovqa
+  data:
+    dataset: robovqa
+    mixture: null
+    root_size_mixture: null
+    split: validation
+    seed: null
+    shuffle_messages: true
+    pad: to_max
+    sequence_length: 1792
+    shuffle: true
+    for_inference: true
+    multi_modal: torch
+    num_workers: 0
+    drop_last: true
+    pin_memory: true
+    prefetch_factor: null
+    persistent_workers: true
+    timeout: 0
+    rlds_dataset_name: ''
+    rlds_data_root_dir: null
+    use_wrist_image: false
+    use_proprio: false
+  device_eval_batch_size: null
+  subset_num_batches: null
+  max_examples: 2048
+  max_new_tokens: 128
+  mm_evaluator:
+    n_to_log: 0
+    num_wandb_examples: 32
+    save_predictions: null
+    save_tokens: false
+    vqa_eval: robovqa_score
+    multi_threshold_box_eval: false
+    coordinate_eval: false
+    pointing_eval: false
+    count_eval: false
+    point_count_eval: false
+    trajectory_eval: false
+    android_eval: false
+    clock_eval: false
+    clock_bench_eval: false
+    math_vista_eval: false
+    action_eval: false
+  save_dir: null
+  save_to_checkpoint_dir: false
+  eval_name: null
+  skip_if_metrics_cached: true
+save_folder: /vast/users/xiaodan/zhangkaidong/A1/model/checkpoints/10131629
+remote_save_folder: null
+canceled_check_interval: 50
+save_interval: 500
+save_interval_unsharded: 32000
+save_interval_ephemeral: null
+save_interval_action_head: null
+save_num_checkpoints_to_keep: 1
+save_num_unsharded_checkpoints_to_keep: -1
+save_num_action_head_checkpoints_to_keep: -1
+save_overwrite: true
+force_save_unsharded: false
+no_pre_train_checkpoint: true
+initial_model_checkpoint: /vast/users/xiaodan/zhangkaidong/A1/model/MolmoE-7B-10061402-3000
+load_model_config: null
+load_path: null
+load_path_sharded_checkpointer: null
+reset_optimizer_state: false
+reset_trainer_state: false
+save_dataloader_state: false
+reset_dataloader_state: false
+sharded_checkpointer: torch_legacy
+max_duration: 32000
+global_train_batch_size: 256
+device_train_batch_size: 32
+device_train_microbatch_size: 4
+device_eval_batch_size: 32
+eval_subset_num_batches: 8
+eval_on_load: false
+device_inf_eval_batch_size: 4
+inf_eval_subset_num_batches: -1
+device_train_grad_accum: 8
+max_grad_norm: 1.0
+multi_component_grad_norm: true
+batch_divisor: global_batch
+max_grad_norm_ratio: null
+precision: amp_bf16
+wandb:
+  project: molmo_training
+  entity: ''
+  group: null
+  name: MolmoE-7B-A1-pretrain-A1
+  tags:
+  - watching
+  log_artifacts: false
+  rank_zero_only: true
+  log_interval: 20
+speed_monitor:
+  window_size: 20
+  gpu_flops_available: null
+console_log_interval: 1
+gen1_gc_interval: 1
+compile: null
+fsdp:
+  use_orig_params: true
+  sharding_strategy: FULL_SHARD
+  wrapping_strategy: by_block_and_size
+  precision: pure
+  hybrid_sharding_num_model_replicas: null
+softmax_auxiliary_loss: true
+softmax_auxiliary_loss_scale: 0.0001
+time_limit: null
+extra_steps_after_cancel: 10
+python_profiling: false
+torch_profiling: false
+stop_at: 32000
+stop_after: null
+activation_checkpointing: whole_layer
+fused_loss: null

Molmo-7B-10131629-5000/model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1859ce065348151318d9b847981b27dcf6f764188725dc45bf7e0e9e7657f133
+size 32086136618