"""Build a Starforce-S1 model and save it as a pretrained checkpoint for training.

Steps performed when this script runs:

1. Fill in a ``Starforce_S1_Config`` (backbone and action-head settings).
2. Instantiate ``Starforce_S1`` from that config.
3. Load action-head weights from ``ACTION_HEAD_CKPT_PATH``.
4. Write the assembled model to ``OUTPUT_DIR`` via ``save_pretrained``.
"""

import torch

from starforce.model.starforce_s1 import Starforce_S1, Starforce_S1_Config
from starforce.model.action_head.flow_matching_action_head import FlowmatchingActionHeadConfig

# --- Paths -------------------------------------------------------------------
# NOTE: this is a machine-specific absolute path; adjust it for your environment.
VLLM_BASE_MODEL_PATH = "/pfs/pfs-ahGxdf/data/wujingyi/huggingface/Qwen2.5-VL-7B-Instruct"
# Both paths below are relative to the current working directory.
ACTION_HEAD_CKPT_PATH = "checkpoints/qz-action-expert.pth"
OUTPUT_DIR = "checkpoints/Starforce-S1-7B"

# --- Shared dimensions -------------------------------------------------------
# These values are written both to the top-level config and to the action-head
# config; defining them once keeps the two copies from drifting apart.
ACTION_HORIZON = 16
ACTION_DIM = 32


config = Starforce_S1_Config()
config.backbone_cfg = {
    "tune_llm": False,
    "vllm_base_model_path": VLLM_BASE_MODEL_PATH,
    "select_layer": 12,
    "feature_dim": 3584,
    "project_to_dim": 2048,
}
config.action_horizon = ACTION_HORIZON
config.action_dim = ACTION_DIM
config.action_head_cfg = {
    "action_dim": ACTION_DIM,
    "action_horizon": ACTION_HORIZON,
    "add_pos_embed": True,
    # NOTE(review): presumably has to agree with backbone_cfg["project_to_dim"]
    # (both are 2048 here) -- confirm against the model code before changing.
    "backbone_embedding_dim": 2048,
    "diffusion_model_cfg": {
        "attention_head_dim": 48,
        "cross_attention_dim": 2048,
        "dropout": 0.2,
        "final_dropout": True,
        "interleave_self_attention": True,
        "norm_type": "ada_norm",
        "num_attention_heads": 32,
        "num_layers": 16,
        "output_dim": 1024,
        "positional_embeddings": None,
    },
    "hidden_size": 1024,
    "input_embedding_dim": 1536,
    "max_action_dim": 32,
    "max_state_dim": 64,
    "model_dtype": "float32",
    "noise_beta_alpha": 1.5,
    "noise_beta_beta": 1.0,
    "noise_s": 0.999,
    "num_inference_timesteps": 4,
    "num_target_vision_tokens": 32,
    "num_timestep_buckets": 1000,
    "tune_diffusion_model": True,
    "tune_projector": True,
    "use_vlln": True,
    "vl_self_attention_cfg": {
        "attention_head_dim": 64,
        "dropout": 0.2,
        "final_dropout": True,
        "num_attention_heads": 32,
        "num_layers": 4,
        "positional_embeddings": None,
    },
}

# NOTE(review): local_model_path=None presumably means "build from config rather
# than from an existing local checkpoint" -- verify against Starforce_S1.
model = Starforce_S1(config=config, local_model_path=None)

# map_location="cpu" lets the checkpoint deserialize on hosts that lack the
# device it was saved from; load_state_dict then copies the values into the
# model's existing parameters, so the result does not depend on this choice.
# SECURITY: torch.load unpickles the file -- only load checkpoints you trust
# (consider weights_only=True if the installed torch version supports it).
action_head_state_dict = torch.load(ACTION_HEAD_CKPT_PATH, map_location="cpu")
model.action_head.load_state_dict(action_head_state_dict)

model.save_pretrained(OUTPUT_DIR)

print("done!")