wybertwang committed on
Commit
ea4b752
·
verified ·
1 Parent(s): e9a9b3f

Upload folder using huggingface_hub

Browse files
audiostory_3b/config.yaml ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ seedx_dit_model_cfg:
2
+ _target_: src.models.mllm.modeling_audiostory_unified.AudioStory_unified
3
+ rec_loss_type: mse
4
+ model_dims: 2048
5
+ lm_loss_scale: 5.0
6
+ rec_loss_scale: 10.0
7
+ dit_loss_scale: 1.0
8
+ agent_model:
9
+ _target_: src.models.mllm.modeling_audiostory_llm.AudioStory_llm.from_pretrained
10
+ input_resampler: None
11
+ output_resampler: None
12
+ whisper_resampler_llava: None
13
+ mse: true
14
+ t5_feature_scale: 10
15
+ audio_feature_scale: 10.0
16
+ lm_loss_scale: 1.0
17
+ t5_rec_loss_scale: 5.0
18
+ target_audio_type: T5
19
+ model_dims: 2048
20
+ pretrained_model_path: audioseed_ckpt/seed_omni_t5_multi_audio_duration/seed_omni_qwen_3b_t5_multi_audio_unav_scale10_5e5_loss0105_bz8_genpretrain_withinst_duration_begin0/checkpoint-15000/pytorch_model.bin
21
+ llm_model:
22
+ _target_: src.models.mllm.peft_models.get_peft_model_with_resize_embedding
23
+ model:
24
+ _target_: transformers.AutoModelForCausalLM.from_pretrained
25
+ pretrained_model_name_or_path: ckpt/Qwen2.5-3B-Instruct
26
+ peft_config:
27
+ _target_: peft.LoraConfig
28
+ _convert_: object
29
+ r: 32
30
+ lora_alpha: 32
31
+ modules_to_save:
32
+ - input_layernorm
33
+ - post_attention_layernorm
34
+ - norm
35
+ target_modules:
36
+ - q_proj
37
+ - v_proj
38
+ - k_proj
39
+ - o_proj
40
+ - gate_proj
41
+ - down_proj
42
+ - up_proj
43
+ task_type: CAUSAL_LM
44
+ lora_dropout: 0.05
45
+ vocab_size: 152277
46
+ tokenizer:
47
+ _target_: src.models.tokenizer.init_qwen_tokenizer_special_token.init_tokenizer
48
+ pretrained_model_path: tokenizer
49
+ add_tokens_path: tokenizer/added_tokens.json
50
+ train_dataset:
51
+ _target_: src.data.sft_clm_audio_multi_audio_unav_tomjerry_cotrain.build_multi_datapipes
52
+ _recursive_: false
53
+ datapipes:
54
+ - _target_: src.data.sft_clm_audio_multi_audio_unav_tomjerry_cotrain.build_t2t_Flant5_audiotoken_json_datapipes_qwen_reasoning_captionloss_multi_audio_cotrain_duration
55
+ data_dir: datasets_audio_json/audio_seedomni_UnAV_multi_audio_generation_instruction_duration_chunk
56
+ audio_dir: ''
57
+ max_length: 1300
58
+ batch_size: 1
59
+ add_boi_token: false
60
+ add_gen_prompt: false
61
+ instruction_prompt: '<|im_start|>user
62
+
63
+ {instruction}<|im_end|>
64
+
65
+ '
66
+ assistant_template: '<|im_start|>assistant
67
+
68
+ {gen_prompt_response}'
69
+ system_message: '<|im_start|>system
70
+
71
+ You are a helpful assistant.<|im_end|>
72
+
73
+ '
74
+ reasoning_template: <|think|>{reasoning}<|/think|>
75
+ aud_first_ratio: -1
76
+ num_t5_in_tokens: 64
77
+ num_t5_out_tokens: 64
78
+ num_aud_in_tokens: 8
79
+ num_aud_out_tokens: 8
80
+ audio_max_length: 30.0
81
+ assure_text: true
82
+ cycle_count: 50
83
+ multi_resolution: false
84
+ dataset_name: wavcaps_clotho_audiocaps
85
+ train_args:
86
+ output_dir: audioseed_ckpt/seed_omni_t5_multi_audio_duration/audiostory_qwen_3b_t5_multi_audio_unav_scale10_1e4_loss0105_bz8_genpretrain_withinst_t5_aud_attn_cotrain_with_mhattn_weight_detokenizer_full_open_1opt_coscale_8token_duration_begin0_new
87
+ resume_from_checkpoint: null
88
+ resume_steps: null
89
+ batch_size: 8
90
+ learning_rate: 0.0001
91
+ weight_decay: 0.0001
92
+ adam_beta1: 0.9
93
+ adam_beta2: 0.98
94
+ adam_epsilon: 0.0002
95
+ max_grad_norm: 1.0
96
+ gradient_accumulation_steps: 1
97
+ mixed_precision: bf16
98
+ num_train_epochs: 60
99
+ max_steps: 12000
100
+ save_steps: 4000
101
+ lr_scheduler_type: cosine
102
+ warmup_steps: 300
103
+ min_lr_ratio: 0.05
104
+ dataloader_num_workers: 8
105
+ project_name: ContinuousVLM
106
+ expr_name: audiostory_qwen_3b_t5_multi_audio_unav_scale10_1e4_loss0105_bz8_genpretrain_withinst_t5_aud_attn_cotrain_with_mhattn_weight_detokenizer_full_open_1opt_coscale_8token_duration_begin0_new
107
+ unfreeze_agent_model_part: lora
108
+ freeze_dit: false
109
+ dit_open_type: full_open
110
+ use_whisper: true
111
+ use_detokenizer: true
112
+ load_pretrained_model: audioseed_ckpt/seed_omni_t5_multi_audio_duration/seed_omni_qwen_3b_t5_multi_audio_unav_scale10_5e5_loss0105_bz8_genpretrain_withinst_duration_begin0/checkpoint-15000
113
+ pretrain: true
114
+ zero_attn_last_layer: true
audiostory_3b/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f15dbd66ea12c30f18e9589eb424fddf76599255959155b668bee23ff9426794
3
+ size 9833703234