mrq committed on
Commit
cdbe2c2
·
1 Parent(s): 02d0f7b
models/ckpt/ar+nar-layerskip-llama-8/fp32.sft DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:cad3d44bdb88d694986a886a7799812d19a0bac5dec0a351142ae322c90cf2dc
3
- size 456274490
 
 
 
 
models/ckpt/ar+nar-llama-8/fp32.sft DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7b3db54a17771aacf14bb0ee32b506f6222b898cf2ca16442913367b53991030
3
- size 458923764
 
 
 
 
models/ckpt/nar-len-llama-8/fp32.sft DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c965b7b365e410ef26c84fe1f5c7d818173e7b0544d23f2106aa2623412d447c
3
- size 458923748
 
 
 
 
models/config.llama[layerskip].yaml DELETED
@@ -1,17 +0,0 @@
1
- models:
2
- - name: "ar+nar-layerskip"
3
- size: "full"
4
- resp_levels: 8
5
- tasks: 9
6
- langs: 4
7
- tones: 1
8
- arch_type: llama
9
- attention: auto
10
- version: 5
11
-
12
- capabilities: ["ar", "nar"]
13
- experimental:
14
- split_classifiers: True
15
- audio_embedding_sums: True
16
- unified_position_ids: False
17
- layerskip: True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
models/config.llama[nar-len].yaml DELETED
@@ -1,169 +0,0 @@
1
- sample_rate: 24_000
2
- audio_backend: "vocos"
3
- weights_format: sft
4
- experimental: True
5
-
6
- models:
7
- - name: "nar-len"
8
- size: "full"
9
- resp_levels: 8
10
- tasks: 9
11
- langs: 4
12
- tones: 1
13
- arch_type: llama
14
- training: True
15
- version: 5
16
- attention: sdpa
17
- dropout: 0.1
18
- #loss_factors:
19
- # text: 0.01
20
- # prom: 0.5
21
- # resp: 1.0
22
- capabilities: ["ar", "nar", "len"]
23
- experimental:
24
- audio_embedding_sums: True
25
- split_classifiers: True
26
- unified_position_ids: False
27
- rvq_levels_p: [
28
- 0, 0, 0, 0, 0, 0, 0,
29
- 0, 0, 0, 0, 0, 0, 0,
30
- 1, 2, 3, 4, 5, 6, 7
31
- ]
32
-
33
- masking_train_p: 1.0
34
- masking_ratio_fixed: True
35
- ignore_inputs_for_loss: True
36
-
37
- cfg_cond_dropout_p: 0.1
38
- cfg_prom_dropout_p: 0.05
39
-
40
- #token_dropout_error: 0.001
41
- #token_dropout_rate: 0.001
42
- #layerskip: True
43
- #layerskip_r: 2
44
- #layerskip_e_scale: 0.1
45
-
46
- #loras:
47
- #- name : "lora-shodan"
48
- # rank: 128
49
- # alpha: 128
50
- # training: True
51
- # rvq_levels: []
52
-
53
- hyperparameters:
54
- batch_size: 32
55
- gradient_accumulation_steps: 4 # 8
56
- gradient_clipping: 1.0
57
- warmup_steps: 10
58
-
59
- optimizer: Prodigy
60
- learning_rate: 1.0
61
- torch_optimizer: True
62
-
63
- scheduler: "" # ScheduleFree
64
- torch_scheduler: True
65
-
66
- evaluation:
67
- batch_size: 8
68
- frequency: 500
69
- size: 8
70
-
71
- kwargs:
72
- max_duration: 500
73
- max_steps: 25
74
- ar_temperature: 1.0
75
- repetition_penalty: 1.0
76
- cfg_strength: 1.0
77
- nar_temperature: 0.0
78
-
79
- trainer:
80
- iterations: 1_000_000
81
- save_frequency: 250
82
- keep_last_checkpoints: 4
83
-
84
- resize_modules: True
85
-
86
- check_for_oom: False
87
- gradient_checkpointing: True
88
-
89
- weight_dtype: float16
90
- amp: True
91
-
92
- backend: deepspeed
93
- deepspeed:
94
- inferencing: False
95
- amp: False
96
- loss_scale_window: 250
97
- min_loss_scale: 32768
98
-
99
- load_webui: False
100
-
101
- inference:
102
- backend: local
103
- normalize: False
104
-
105
- weight_dtype: float16
106
- amp: True
107
-
108
- optimizations:
109
- injects: False
110
- replace: True
111
-
112
- linear: False
113
- embedding: False
114
- optimizers: True
115
-
116
- bitsandbytes: False
117
- dadaptation: False
118
- bitnet: False
119
- fp8: False
120
-
121
- dataset:
122
- speaker_name_getter: "lambda p: f'{p.parts[-3]}_{p.parts[-2]}'"
123
- speaker_group_getter: "lambda p: f'{p.parts[-3]}'"
124
- speaker_languages:
125
- ja: [
126
- "housamo",
127
- "JA-"
128
- ]
129
- de: [
130
- "DE-"
131
- ]
132
- fr: [
133
- "FR-"
134
- ]
135
-
136
- use_hdf5: True
137
- hdf5_flag: r
138
-
139
- use_metadata: True
140
- validate: True
141
-
142
- workers: 2
143
- cache: True
144
-
145
- duration_range: [1.0, 16.0]
146
-
147
- prompt_max_samples: 1
148
- prompt_duration_range: [1.0, 6.0]
149
- prompt_similar_p: 0.825
150
- prompt_similar_top_k: 6
151
-
152
- resps_max_samples: 1
153
- resps_append_p: 0.0
154
-
155
- sample_type: path # path # speaker
156
- sample_order: duration
157
- sample_max_duration_batch: 120
158
- sample_shuffle: True
159
- retokenize_text: True
160
-
161
- tasks_list: [
162
- "tts", "tts", "tts", "tts", "tts", "tts", "tts",
163
- "tts", "tts", "tts", "tts", "tts", "tts", "tts",
164
- "len",
165
- ] #, "stt", "tts-c", "ns", "sr" ]
166
-
167
- training: []
168
- validation: []
169
- noise: []