# Update config.yaml
# 2c73e97 verified 29 days ago
# 17.2 kB

	run_name: Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
	model:
	model_name: molmobot
	data_formatter:
	prompt_templates: uber_model_v2
	message_format: qwen3
	system_prompt: demo_or_style_v2
	always_start_with_space: false
	default_inference_len: 65
	select_answer: best
	debug: false
	image_last: false
	format_message_list: null
	p_one_message: 0.0
	eval_system_prompt_mapping: null
	p_choice_content_in_mc: 1.0
	template_video_mc_questions: true
	pointing_format: html-v2
	points_decimal_places: 1
	use_seperate_non_pointing_qa_style: false
	timestamp_mode: 50-percent-seconds
	output_timestamp_mode: seconds
	seconds_decimal_places: 1
	p_multi_point_all_image: 0.5
	use_seperate_count_without_pointing_style: false
	sample_random_initial_point: true
	llm:
	d_model: 2560
	n_heads: 32
	n_kv_heads: 8
	head_dim: 128
	qkv_bias: false
	clip_qkv: null
	n_layers: 36
	mlp_ratio: 4
	mlp_hidden_size: 19456
	activation_type: swiglu
	block_type: sequential
	rope: true
	rope_full_precision: true
	rope_theta: 5000000.0
	rope_type: default
	rope_factor: null
	rope_high_freq_factor: null
	rope_low_freq_factor: null
	rope_original_max_position_embeddings: null
	rope_attention_factor: null
	rope_beta_fast: null
	rope_beta_slow: null
	rope_mscale: null
	rope_mscale_all_dim: null
	rope_truncate: null
	attention_type: sdpa
	full_attention_layers: null
	sliding_attention_rope_scaling: false
	float32_attention: true
	attention_dropout: 0.0
	attention_layer_norm: true
	attention_layer_norm_type: qwen3
	residual_dropout: 0.1
	response_residual_dropout: 0.0
	layer_norm_type: rms
	layer_norm_with_affine: true
	layer_norm_eps: 1.0e-06
	attention_layer_norm_with_affine: true
	max_sequence_length: 8192
	max_position_embeddings: null
	include_bias: false
	bias_for_layer_norm: null
	norm_after: false
	moe_num_experts: 8
	moe_top_k: 2
	moe_mlp_impl: sparse
	moe_log_expert_assignment: false
	moe_shared_expert: false
	moe_lbl_in_fp32: false
	moe_interleave: false
	moe_loss_weight: 0.1
	moe_zloss_weight: null
	moe_dropless: true
	moe_capacity_factor: 1.25
	embedding_dropout: 0.0
	scale_logits: false
	vocab_size: 151936
	additional_vocab_size: 128
	weight_tying: true
	embedding_size: 151936
	use_position_ids: true
	tokenizer:
	identifier: Qwen/Qwen3-4B-Instruct-2507
	tokenizer_dir: null
	init_path: /weka/oe-training-default/mm-olmo/pretrained_llms/qwen3-4b-instruct.pt
	init_incremental: null
	new_embedding_init_range: 0.02
	initializer_range: 0.02
	normalize_input_embeds: false
	activation_checkpoint: whole_layer
	compile: blocks
	fix_pad_tokenizer: false
	init_std: 0.02
	init_fn: normal
	init_cutoff_factor: null
	vision_backbone:
	vit:
	image_model_type: siglip
	image_default_input_size:
	- 378
	- 378
	image_patch_size: 14
	image_pos_patch_size: 14
	image_emb_dim: 1152
	image_num_heads: 16
	image_num_key_value_heads: 16
	image_num_layers: 27
	image_head_dim: 72
	image_mlp_dim: 4304
	image_mlp_activations: gelu_pytorch_tanh
	image_dropout_rate: 0.0
	image_num_pos: 729
	image_norm_eps: 1.0e-06
	attention_dropout: 0.0
	residual_dropout: 0.0
	initializer_range: 0.02
	float32_attention: true
	attention_type: sdpa
	sdpa_backend: all
	activation_checkpointing: true
	init_path: /weka/oe-training-default/mm-olmo/pretrained_image_encoders/siglip2-so400m-14-384.pt
	resize_mode: siglip
	pad_value: 0.0
	normalize: siglip
	image_pooling_2d: attention_meanq
	pooling_attention_mask: true
	image_projector: mlp
	image_padding_embed: null
	vit_layers:
	- -3
	- -9
	skip_unused_layers: true
	use_deepstack: false
	share_connector: false
	image_feature_dropout: 0.0
	connector_activation_checkpointing: true
	compile_vit: blocks
	pool_size_embeds: null
	compile_connector: null
	normalize_on_gpu: true
	use_image_augmentation: true
	use_resize_bottleneck: false
	mm_preprocessor:
	max_answer_len: null
	last_message_loss_only: false
	max_text_tokens: null
	loss_token_weighting: root_subsegments_root_tokens
	max_frames: 1
	frame_sample_mode: uniform_last_frame
	candidate_sampling_fps:
	- 0.25
	- 0.5
	- 1.0
	- 2.0
	- 4.0
	- 6.0
	- 8.0
	- 16.0
	cache_videos: true
	loading_method: torchcodec_exact
	max_fps:
	- 2.0
	time_sampling: true
	time_mode: per-frame-compact
	subtitle_mode: frame_1
	max_crops: 1
	overlap_margins:
	- 4.0
	- 4.0
	use_col_tokens: false
	periodic_high_res_frame: null
	high_low_train_mode: local_rnd
	high_res_frame_sample_options: null
	periodic_sample_rate_training:
	4:
	- 0.9
	- 0.03
	- 0.03
	- 0.04
	3:
	- 0.6
	- 0.2
	- 0.2
	skip_low_res_in_high_low: false
	pooling_w: 3
	pooling_h: 3
	high_res_pooling_w: null
	high_res_pooling_h: null
	query_based_resolution_selection: false
	max_queries_for_resolution_selection: 8
	use_frame_special_tokens: true
	frame_sel_clip_identifier: google/siglip2-so400m-patch14-384
	image_padding_mask: false
	max_subtitle_tokens: null
	image:
	crop_mode: resize
	use_col_tokens: true
	max_crops: 8
	high_res_max_crops: 24
	p_high_res: 0.0
	pooling_w: 2
	pooling_h: 2
	overlap_margins:
	- 4
	- 4
	max_images: 4
	max_multi_image_crops: 8
	multi_image_pooling_w: 2
	multi_image_pooling_h: 2
	use_single_crop_col_tokens: false
	use_single_crop_start_token: true
	single_frame: false
	topk: null
	prune_from_frame: 0
	bi_directional_attn: image_tokens
	shared_low_high_embedding: true
	debug: null
	cp_enabled: false
	apply_cp_to_vision_backbone: false
	action_dim: 8
	action_horizon: 16
	n_action_steps: 8
	n_obs_steps: 2
	obs_step_delta: 8
	action_expert:
	max_horizon: 32
	action_dim: 8
	hidden_size: 768
	num_layers: 36
	num_heads: 8
	mlp_ratio: 4.0
	timestep_embed_dim: 256
	dropout: 0.0
	attn_dropout: 0.0
	context_layer_norm: true
	action_expert_layer_mode: per_layer
	flow_matching_num_steps: 10
	flow_matching_cutoff: 0.999
	flow_matching_beta_alpha: 1.0
	flow_matching_beta_beta: 1.5
	num_flow_timestamps: 8
	same_noise_per_time: false
	states_mode: cross_attn
	robot_preprocessor:
	stats_by_repo:
	synthmanip:
	observation.state:
	q01:
	- -0.8200882077217102
	- -1.0460078716278076
	- -1.2745805978775024
	- -2.864607334136963
	- -1.0115491151809692
	- 1.2138986587524414
	- -2.057372808456421
	- -0.027562683448195457
	q99:
	- 0.7587710618972778
	- 0.9406100511550903
	- 0.9344996809959412
	- -0.9798629283905029
	- 0.8359407782554626
	- 3.0869405269622803
	- 1.9223058223724365
	- 0.8661524057388306
	action:
	q01:
	- -0.8200882077217102
	- -1.0460078716278076
	- -1.2745805978775024
	- -2.864607334136963
	- -1.0115491151809692
	- 1.2138986587524414
	- -2.057372808456421
	- 0.0
	q99:
	- 0.7587710618972778
	- 0.9406100511550903
	- 0.9344996809959412
	- -0.9798629283905029
	- 0.8359407782554626
	- 3.0869405269622803
	- 1.9223058223724365
	- 255.0
	default_repo_id: synthmanip
	action_key: action
	state_keys:
	- observation.state
	action_norm_mode: quantiles
	state_norm_mode: quantiles
	robot_postprocessor:
	stats_by_repo:
	synthmanip:
	observation.state:
	q01:
	- -0.8200882077217102
	- -1.0460078716278076
	- -1.2745805978775024
	- -2.864607334136963
	- -1.0115491151809692
	- 1.2138986587524414
	- -2.057372808456421
	- -0.027562683448195457
	q99:
	- 0.7587710618972778
	- 0.9406100511550903
	- 0.9344996809959412
	- -0.9798629283905029
	- 0.8359407782554626
	- 3.0869405269622803
	- 1.9223058223724365
	- 0.8661524057388306
	action:
	q01:
	- -0.8200882077217102
	- -1.0460078716278076
	- -1.2745805978775024
	- -2.864607334136963
	- -1.0115491151809692
	- 1.2138986587524414
	- -2.057372808456421
	- 0.0
	q99:
	- 0.7587710618972778
	- 0.9406100511550903
	- 0.9344996809959412
	- -0.9798629283905029
	- 0.8359407782554626
	- 3.0869405269622803
	- 1.9223058223724365
	- 255.0
	default_repo_id: synthmanip
	action_key: action
	state_keys:
	- observation.state
	action_norm_mode: quantiles
	state_norm_mode: quantiles
	parallelism:
	data_parallel_replicate_degree: 1
	enable_compiled_autograd: false
	data_parallel_shard_degree: -1
	fsdp_reshard_after_forward: default
	context_parallel_config:
	degree: 1
	attention_type: ulysses
	load_balancer: ulysses
	head_stride: 1
	tensor_parallel_config:
	degree: 1
	enable_async: false
	data_parallel_config:
	name: fsdp
	param_dtype: null
	reduce_dtype: float32
	num_replicas: null
	shard_degree: null
	wrapping_strategy: full
	prefetch_factor: 0
	context_parallel_rotate_method: allgather
	seed: 6198
	epoch: null
	dry_run: false
	ft_llm: true
	ft_vit: false
	ft_connector: false
	ft_embedding: ae
	optimizer:
	name: adamw
	learning_rate: 0.0001
	weight_decay: 0.01
	betas:
	- 0.9
	- 0.95
	eps: 1.0e-05
	connector_learning_rate: 5.0e-06
	vit_learning_rate: 5.0e-06
	llm_learning_rate: 1.0e-05
	frame_selector_learning_rate: 0.0001
	temporal_token_scorer_learning_rate: 0.0001
	action_expert_learning_rate: 0.0001
	connector_weight_decay: 0.0
	vit_weight_decay: 0.0
	llm_weight_decay: 0.0
	frame_selector_weight_decay: 0.01
	temporal_token_scorer_weight_decay: 0.01
	action_expert_weight_decay: 0.0
	connector_betas:
	- 0.9
	- 0.95
	vit_betas:
	- 0.9
	- 0.95
	llm_betas:
	- 0.9
	- 0.95
	frame_selector_betas:
	- 0.9
	- 0.95
	temporal_token_scorer_betas:
	- 0.9
	- 0.95
	action_expert_betas:
	- 0.9
	- 0.95
	connector_eps: 1.0e-06
	vit_eps: 1.0e-06
	llm_eps: 1.0e-06
	frame_selector_eps: 1.0e-06
	temporal_token_scorer_eps: 1.0e-06
	action_expert_eps: 1.0e-06
	metrics_log_interval: -1
	scheduler:
	name: multimodal
	units: steps
	t_warmup: 100
	t_max: null
	alpha_f: 0.1
	connector_t_warmup: 200
	vit_t_warmup: 200
	llm_t_warmup: 2000
	frame_selector_t_warmup: 200
	temporal_token_scorer_t_warmup: 200
	action_expert_t_warmup: 200
	grad_clip_warmup_steps: null
	grad_clip_warmup_factor: null
	warmup_min_lr: 0.0
	data:
	dataset: null
	mixture:
	synthmanip/task_0: 0.35
	synthmanip/task_1: 0.2
	synthmanip/task_2: 0.2
	synthmanip/task_3: 0.15
	synthmanip/task_4: 0.1
	root_size_mixture: null
	kwargs_mixture: null
	split: train
	seed: 50189
	pad: to_max
	sequence_length: 928
	max_text_seq_len: null
	shuffle: true
	start_index: 0
	packing: null
	enable_variable_sized_token_pooling: true
	num_workers: 4
	drop_last: true
	pin_memory: true
	prefetch_factor: 4
	persistent_workers: false
	timeout: 300
	action_data: null
	action_loader_rate: null
	action_batch_interval: 1
	restore_dataloader: true
	fast_forward_batches: null
	evaluators:
	- label: synthmanip_val
	data:
	dataset: synthmanip/task_0
	mixture: null
	root_size_mixture: null
	kwargs_mixture: null
	split: val
	seed: 691203
	pad: to_max
	sequence_length: 928
	max_text_seq_len: null
	shuffle: false
	start_index: 0
	packing: null
	enable_variable_sized_token_pooling: true
	num_workers: 3
	drop_last: false
	pin_memory: true
	prefetch_factor: 4
	persistent_workers: false
	timeout: 300
	device_batch_size: 16
	subset_num_batches: null
	max_examples: 2000
	console_log_interval: 10
	response_logits_only: true
	reduce_loss_metrics_manually: false
	eval_interval: 1000
	inf_evaluators: []
	inf_eval_interval: 1000
	eval_on_last_step: true
	eval_on_load: false
	eval_on: []
	save_folder: /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
	checkpointer_config:
	save_thread_count: null
	load_thread_count: null
	pre_download: false
	work_dir: null
	throttle_uploads: false
	canceled_check_interval: 50
	save_interval: 2000
	save_at: null
	save_final_optim: false
	save_num_checkpoints_to_keep: 1
	checkpoint_retention_frequency: 10000
	save_final_unsharded_checkpoint: false
	save_interval_ephemeral: null
	save_overwrite: true
	load_path: null
	reset_optimizer_state: true
	reset_trainer_state: true
	initial_model_checkpoint: /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_-03-06-17-32-00_bs1024_dbs16_stp200000-mix_5_feb20_copy/step200000
	allow_resume: true
	max_duration: 50000
	global_train_batch_size: 1024
	device_train_microbatch_size: 16
	max_grad_norm: 1.0
	multi_component_grad_norm: true
	batch_divisor: global_batch
	max_grad_norm_ratio: null
	precision: amp_bf16
	wandb:
	project: molmo_ae_synth
	entity: prior-ai2
	group: null
	name: Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
	tags:
	- watching
	log_artifacts: false
	rank_zero_only: true
	log_interval: 20
	allow_resume: true
	finish_on_sigterm: true
	beaker_log_interval: 50
	speed_monitor:
	window_size: 20
	gpu_flops_available: null
	console_log_interval: 20
	enable_timing_logs: false
	gen1_gc_interval: 1
	compile:
	mode: default
	fullgraph: false
	dynamic: false
	backend: inductor
	activation_checkpointing: true
	fsdp:
	fsdp2: true
	precision: pure
	use_orig_params: true
	wrapping_strategy: null
	sharding_strategy: FULL_SHARD
	hybrid_sharding_num_model_replicas: null
	softmax_auxiliary_loss: false
	softmax_auxiliary_loss_scale: 0.0001
	response_logits_only: true
	saliency_score_loss_wt: null
	frame_score_loss_wt: null
	frame_score_loss_type: mse
	frame_score_loss_target: 0.7
	time_limit: null
	extra_steps_after_cancel: 0
	python_profiling: false
	torch_profiling: false
	stop_at: 50000
	stop_after: null
	fused_loss: false
	compile_loss: true
	runtime_data:
	args: launch_scripts/train_synthmanip.py /weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_-03-06-17-32-00_bs1024_dbs16_stp200000-mix_5_feb20_copy/step200000
	--data_paths mix --stats_path=/weka/oe-training-default/rohunt/robo/stats/franka_mltask_abs_pos.yaml
	--action_preset franka_joint --camera_preset franka_one_random_then_wrist --wandb.name=Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
	--wandb.entity=prior-ai2 --wandb.project=molmo_ae_synth --seq_len=928 --max_duration=50000
	--device_batch_size=16 --global_batch_size=1024 --log_interval=20 --model.mm_preprocessor.use_frame_special_tokens=True
	--model.mm_preprocessor.max_subtitle_tokens=null --prefetch_factor=4 --data.num_workers=4
	--save_interval=2000 --save_num_checkpoints_to_keep=1 --checkpoint_retention_frequency=10000
	--save_folder=/weka/oe-training-default/rohunt/model_runs/Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
	--exp_name=Frnk-8n_abs_vid_2f_8gap_2p-03-08-18-52-01_bs1024_dbs16_stp50000-mix_5_feb20
	--data.packing=null --model.mm_preprocessor.image.crop_mode=resize --model.mm_preprocessor.max_frames=1
	--model.same_noise_per_time=False --weighted_sampling --randomize_prompts --ft_embedding=ae
	--model.mm_preprocessor.image.max_images=4 --model.num_flow_timestamps=8 --ft_llm=True
	--scheduler.llm_t_warmup=2000 --optimizer.llm_learning_rate=1e-05 --img_aug --model.mm_preprocessor.image.multi_image_pooling_w=2
	--model.mm_preprocessor.image.multi_image_pooling_h=2 --n_obs_steps=2 --obs_step_delta=8
	--model.mm_preprocessor.image.single_frame=False --reset_optimizer_state --reset_trainer_state
	--furthest_camera_prob=0.5
	hostname: jupiter-cs-aus-148.reviz.ai2.in
	date: 03/09/2026, 01:55
	world_size: 64
	resuming_from: null
	beaker_experiment_id: 01KK84PM8EQZW1SC6YRT12PYRR
	beaker_experiment_url: null
	wandb_id: 1umcfp2f
	wandb_url: https://wandb.ai/prior-ai2/molmo_ae_synth/runs/1umcfp2f
	distributed_eval_enabled: false
	distributed_eval_benchmark_path: /weka/oe/rohunt/robo-bench/FrankaPickandPlaceDroidBench_5ep_json_benchmark
	distributed_eval_config_cls: launch_scripts.synthvla.configure_mujoco_thor:FrankaState8ClampConfig
	distributed_eval_task_horizon: 300
	distributed_eval_num_worker_jobs: 1
	distributed_eval_wandb_project: mjthor-online-eval
	distributed_eval_workspace: ai2/robo-molmo
	distributed_eval_clusters:
	- ai2/saturn
	- ai2/neptune
	- ai2/rhea
	- ai2/ceres
	distributed_eval_priority: high
	distributed_eval_preemptible: true