Spaces:
Sleeping
Sleeping
primepake
committed on
Commit
·
9980d76
1
Parent(s):
e422499
update data preprocessing
Browse files
- README.md +15 -0
- speech/config.yaml +4 -4
- speech/files_test.txt +5 -0
README.md
CHANGED
|
@@ -69,6 +69,21 @@ pip install -r requirements.txt
|
|
| 69 |
--model "speech_tokenizer_v2_25hz"
|
| 70 |
```
|
| 71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
2. **Extracting DAC-VAE latent**
|
| 73 |
```bash
|
| 74 |
cd dac-vae
|
|
|
|
| 69 |
--model "speech_tokenizer_v2_25hz"
|
| 70 |
```
|
| 71 |
|
| 72 |
+
# Alternatively, install S3Tokenizer from this repo. It extracts tokens for the files named in a file list (passed via --file_list); each line of the file list contains the path to one audio file.
|
| 73 |
+
# example files_test.txt
|
| 74 |
+
|
| 75 |
+
```
|
| 76 |
+
cd speech/tools/S3Tokenizer
|
| 77 |
+
pip3 install .
|
| 78 |
+
# example cmd to run
|
| 79 |
+
torchrun --nproc_per_node=4 --nnodes=1 --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" `which s3tokenizer` --root_path /data/dataset/ \
|
| 80 |
+
--model speech_tokenizer_v2_25hz \
|
| 81 |
+
--device "cuda" \
|
| 82 |
+
--batch_size 64 \
|
| 83 |
+
--file_list /data/learnable-speech/speech/files_test.txt \
|
| 84 |
+
--skip_existing
|
| 85 |
+
```
|
| 86 |
+
|
| 87 |
2. **Extracting DAC-VAE latent**
|
| 88 |
```bash
|
| 89 |
cd dac-vae
|
speech/config.yaml
CHANGED
|
@@ -13,7 +13,7 @@ qwen_pretrain_path: ''
|
|
| 13 |
token_frame_rate: 25
|
| 14 |
token_mel_ratio: 2
|
| 15 |
use_speaker_encoder: True
|
| 16 |
-
speaker_encoder_path: '
|
| 17 |
# stream related params
|
| 18 |
chunk_size: 25 # streaming inference chunk size, in token
|
| 19 |
num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
|
|
@@ -51,8 +51,8 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
|
|
| 51 |
extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_from_speech
|
| 52 |
feat_extractor: !ref <feat_extractor>
|
| 53 |
min_length: 0.5
|
| 54 |
-
max_length:
|
| 55 |
-
num_crops:
|
| 56 |
training: True
|
| 57 |
sample_rate: !ref <sample_rate>
|
| 58 |
|
|
@@ -221,7 +221,7 @@ train_conf:
|
|
| 221 |
scheduler_conf:
|
| 222 |
warmup_steps: 500
|
| 223 |
max_epoch: 2000
|
| 224 |
-
grad_clip:
|
| 225 |
accum_grad: 1
|
| 226 |
log_interval: 5
|
| 227 |
save_per_step: 2000
|
|
|
|
| 13 |
token_frame_rate: 25
|
| 14 |
token_mel_ratio: 2
|
| 15 |
use_speaker_encoder: True
|
| 16 |
+
speaker_encoder_path: ''
|
| 17 |
# stream related params
|
| 18 |
chunk_size: 25 # streaming inference chunk size, in token
|
| 19 |
num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
|
|
|
|
| 51 |
extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_from_speech
|
| 52 |
feat_extractor: !ref <feat_extractor>
|
| 53 |
min_length: 0.5
|
| 54 |
+
max_length: 4.0
|
| 55 |
+
num_crops: 1 # Number of crops taken from the same utterance
|
| 56 |
training: True
|
| 57 |
sample_rate: !ref <sample_rate>
|
| 58 |
|
|
|
|
| 221 |
scheduler_conf:
|
| 222 |
warmup_steps: 500
|
| 223 |
max_epoch: 2000
|
| 224 |
+
grad_clip: 5
|
| 225 |
accum_grad: 1
|
| 226 |
log_interval: 5
|
| 227 |
save_per_step: 2000
|
speech/files_test.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/data/dataset/emilia/en/EN_B00007/EN_B00007_S01193/EN_B00007_S01193_W000000.wav
|
| 2 |
+
/data/dataset/emilia/en/EN_B00007/EN_B00007_S01193/EN_B00007_S01193_W000001.wav
|
| 3 |
+
/data/dataset/emilia/en/EN_B00007/EN_B00007_S08642/EN_B00007_S08642_W000003.wav
|
| 4 |
+
/data/dataset/emilia/en/EN_B00007/EN_B00007_S08642/EN_B00007_S08642_W000046.wav
|
| 5 |
+
/data/dataset/emilia/en/EN_B00007/EN_B00007_S08642/EN_B00007_S08642_W000000.wav
|