primepake commited on
Commit
9980d76
·
1 Parent(s): e422499

update data preprocessing

Browse files
Files changed (3) hide show
  1. README.md +15 -0
  2. speech/config.yaml +4 -4
  3. speech/files_test.txt +5 -0
README.md CHANGED
@@ -69,6 +69,21 @@ pip install -r requirements.txt
69
  --model "speech_tokenizer_v2_25hz"
70
  ```
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  2. **Extracting DAC-VAE latent**
73
  ```bash
74
  cd dac-vae
 
69
  --model "speech_tokenizer_v2_25hz"
70
  ```
71
 
72
+ # Or you can install via this repo; it will use a file list (e.g. files_test.txt) to extract tokens, where each line contains one audio file path
73
+ # See files_test.txt for an example
74
+
75
+ ```
76
+ cd speech/tools/S3Tokenizer
77
+ pip3 install .
78
+ # example cmd to run
79
+ torchrun --nproc_per_node=4 --nnodes=1 --rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" `which s3tokenizer` --root_path /data/dataset/ \
80
+ --model speech_tokenizer_v2_25hz \
81
+ --device "cuda" \
82
+ --batch_size 64 \
83
+ --file_list /data/learnable-speech/speech/files_test.txt \
84
+ --skip_existing
85
+ ```
86
+
87
  2. **Extracting DAC-VAE latent**
88
  ```bash
89
  cd dac-vae
speech/config.yaml CHANGED
@@ -13,7 +13,7 @@ qwen_pretrain_path: ''
13
  token_frame_rate: 25
14
  token_mel_ratio: 2
15
  use_speaker_encoder: True
16
- speaker_encoder_path: '/data/checkpoint/llm/epoch_29_step_20001.pt'
17
  # stream related params
18
  chunk_size: 25 # streaming inference chunk size, in token
19
  num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
@@ -51,8 +51,8 @@ llm: !new:cosyvoice.llm.llm.Qwen2LM
51
  extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_from_speech
52
  feat_extractor: !ref <feat_extractor>
53
  min_length: 0.5
54
- max_length: 12.0
55
- num_crops: 3 # Multiple crops from same utterance
56
  training: True
57
  sample_rate: !ref <sample_rate>
58
 
@@ -221,7 +221,7 @@ train_conf:
221
  scheduler_conf:
222
  warmup_steps: 500
223
  max_epoch: 2000
224
- grad_clip: 1
225
  accum_grad: 1
226
  log_interval: 5
227
  save_per_step: 2000
 
13
  token_frame_rate: 25
14
  token_mel_ratio: 2
15
  use_speaker_encoder: True
16
+ speaker_encoder_path: ''
17
  # stream related params
18
  chunk_size: 25 # streaming inference chunk size, in token
19
  num_decoding_left_chunks: -1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
 
51
  extract_reference_mel: !name:cosyvoice.dataset.processor.extract_reference_mel_from_speech
52
  feat_extractor: !ref <feat_extractor>
53
  min_length: 0.5
54
+ max_length: 4.0
55
+ num_crops: 1 # Multiple crops from same utterance
56
  training: True
57
  sample_rate: !ref <sample_rate>
58
 
 
221
  scheduler_conf:
222
  warmup_steps: 500
223
  max_epoch: 2000
224
+ grad_clip: 5
225
  accum_grad: 1
226
  log_interval: 5
227
  save_per_step: 2000
speech/files_test.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ /data/dataset/emilia/en/EN_B00007/EN_B00007_S01193/EN_B00007_S01193_W000000.wav
2
+ /data/dataset/emilia/en/EN_B00007/EN_B00007_S01193/EN_B00007_S01193_W000001.wav
3
+ /data/dataset/emilia/en/EN_B00007/EN_B00007_S08642/EN_B00007_S08642_W000003.wav
4
+ /data/dataset/emilia/en/EN_B00007/EN_B00007_S08642/EN_B00007_S08642_W000046.wav
5
+ /data/dataset/emilia/en/EN_B00007/EN_B00007_S08642/EN_B00007_S08642_W000000.wav