primepake committed on
Commit
f7498a7
·
1 Parent(s): 6599f2a

data processing updates

Browse files
This view is limited to 50 files because it contains too many changes. See raw diff
Files changed (50) hide show
  1. speech/.gitignore +0 -0
  2. speech/.gitmodules +0 -0
  3. speech/asset/dingding.png +0 -0
  4. speech/cosyvoice/__init__.py +0 -0
  5. speech/cosyvoice/bin/export_jit.py +0 -0
  6. speech/cosyvoice/bin/export_onnx.py +0 -0
  7. speech/cosyvoice/cli/__init__.py +0 -0
  8. speech/cosyvoice/cli/cosyvoice.py +0 -0
  9. speech/cosyvoice/cli/frontend.py +0 -0
  10. speech/cosyvoice/cli/model.py +0 -0
  11. speech/cosyvoice/dataset/__init__.py +0 -0
  12. speech/cosyvoice/dataset/dataset.py +0 -0
  13. speech/cosyvoice/dataset/processor.py +136 -1
  14. speech/cosyvoice/flow/decoder.py +0 -0
  15. speech/cosyvoice/flow/flow.py +0 -0
  16. speech/cosyvoice/flow/flow_matching.py +0 -0
  17. speech/cosyvoice/flow/length_regulator.py +0 -0
  18. speech/cosyvoice/hifigan/discriminator.py +0 -0
  19. speech/cosyvoice/hifigan/f0_predictor.py +0 -0
  20. speech/cosyvoice/hifigan/generator.py +0 -0
  21. speech/cosyvoice/hifigan/hifigan.py +0 -0
  22. speech/cosyvoice/llm/llm.py +0 -0
  23. speech/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +0 -0
  24. speech/cosyvoice/tokenizer/tokenizer.py +0 -0
  25. speech/cosyvoice/transformer/__init__.py +0 -0
  26. speech/cosyvoice/transformer/activation.py +0 -0
  27. speech/cosyvoice/transformer/attention.py +0 -0
  28. speech/cosyvoice/transformer/convolution.py +0 -0
  29. speech/cosyvoice/transformer/decoder.py +0 -0
  30. speech/cosyvoice/transformer/decoder_layer.py +0 -0
  31. speech/cosyvoice/transformer/embedding.py +0 -0
  32. speech/cosyvoice/transformer/encoder.py +0 -0
  33. speech/cosyvoice/transformer/encoder_layer.py +0 -0
  34. speech/cosyvoice/transformer/label_smoothing_loss.py +0 -0
  35. speech/cosyvoice/transformer/positionwise_feed_forward.py +0 -0
  36. speech/cosyvoice/transformer/subsampling.py +0 -0
  37. speech/cosyvoice/transformer/upsample_encoder.py +0 -0
  38. speech/cosyvoice/utils/__init__.py +0 -0
  39. speech/cosyvoice/utils/class_utils.py +0 -0
  40. speech/cosyvoice/utils/common.py +0 -0
  41. speech/cosyvoice/utils/executor.py +0 -0
  42. speech/cosyvoice/utils/file_utils.py +0 -0
  43. speech/cosyvoice/utils/frontend_utils.py +0 -0
  44. speech/cosyvoice/utils/losses.py +0 -0
  45. speech/cosyvoice/utils/mask.py +0 -0
  46. speech/cosyvoice/utils/scheduler.py +0 -0
  47. speech/cosyvoice/utils/train_utils.py +0 -0
  48. speech/cosyvoice2.yaml +4 -15
  49. speech/examples/magicdata-read/cosyvoice/local/prepare_data.py +0 -0
  50. speech/examples/magicdata-read/cosyvoice/tts_text.json +0 -0
speech/.gitignore CHANGED
File without changes
speech/.gitmodules CHANGED
File without changes
speech/asset/dingding.png CHANGED

Git LFS Details

  • SHA256: ff82909abd313b24ab6c6bf1cf5ce09014068474a35a4d8d3b8084c8cf0e9503
  • Pointer size: 130 Bytes
  • Size of remote file: 96.4 kB

Git LFS Details

  • SHA256: ff82909abd313b24ab6c6bf1cf5ce09014068474a35a4d8d3b8084c8cf0e9503
  • Pointer size: 130 Bytes
  • Size of remote file: 96.4 kB
speech/cosyvoice/__init__.py CHANGED
File without changes
speech/cosyvoice/bin/export_jit.py CHANGED
File without changes
speech/cosyvoice/bin/export_onnx.py CHANGED
File without changes
speech/cosyvoice/cli/__init__.py CHANGED
File without changes
speech/cosyvoice/cli/cosyvoice.py CHANGED
File without changes
speech/cosyvoice/cli/frontend.py CHANGED
File without changes
speech/cosyvoice/cli/model.py CHANGED
File without changes
speech/cosyvoice/dataset/__init__.py CHANGED
File without changes
speech/cosyvoice/dataset/dataset.py CHANGED
File without changes
speech/cosyvoice/dataset/processor.py CHANGED
@@ -21,11 +21,146 @@ import torchaudio
21
  from torch.nn.utils.rnn import pad_sequence
22
  import torch.nn.functional as F
23
  import pyworld as pw
24
-
 
 
25
 
26
  AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
27
 
28
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
  def parquet_opener(data, mode='train', tts_data={}):
30
  """ Give url or local file, return file descriptor
31
  Inplace operation.
 
21
  from torch.nn.utils.rnn import pad_sequence
22
  import torch.nn.functional as F
23
  import pyworld as pw
24
+ import glob
25
+ import os
26
+ import json
27
 
28
  AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
29
 
30
 
31
def individual_file_opener(data, mode='train', tts_data=None):
    """Load training samples from individual files instead of parquet shards.

    Args:
        data: Iterable[dict], each carrying a 'src' key that is either
            a directory containing wav files or a path to a JSON index
            file with a top-level 'data' list of file-info dicts.
        mode: 'train' yields samples directly; any other value is treated
            as TTS inference mode and fans samples out over ``tts_data``.
        tts_data: Optional dict mapping utt id -> list of tts texts.
            Defaults to an empty dict (a fresh one per call, avoiding the
            shared mutable-default pitfall of ``tts_data={}``).

    Yields:
        dict with the fields downstream processors expect: 'utt', 'spk',
        'audio_data', 'text', 'text_token', 'utt_embedding',
        'spk_embedding', 'speech_token', 'wav', plus any extra keys from
        the driving sample.
    """
    if tts_data is None:
        tts_data = {}
    for sample in data:
        assert 'src' in sample
        src = sample['src']

        if src.endswith('.json'):
            # Load a pre-built index file.
            with open(src, 'r') as f:
                index_data = json.load(f)
            file_list = index_data.get('data', [])
        else:
            # Scan the directory: try the expected two-level layout first,
            # then fall back to a full recursive search.
            wav_files = glob.glob(os.path.join(src, '*/*/*wav'))
            if not wav_files:
                wav_files = glob.glob(os.path.join(src, '**/*.wav'), recursive=True)

            file_list = []
            for wav_path in wav_files:
                # Derive companion-file paths from the wav stem. splitext is
                # used instead of str.replace('.wav', ...) so a '.wav'
                # substring earlier in the path cannot corrupt the result.
                stem = os.path.splitext(wav_path)[0]
                txt_path = stem + '.normalized.txt'
                embedding_path = stem + '_embedding.pt'
                token_path = stem + '_tokens.pt'

                if not os.path.exists(txt_path):
                    logging.warning(f'Text file not found for {wav_path}, skipping')
                    continue

                # Metadata: utt is the file stem; the speaker id is assumed
                # to be the leading underscore-separated token.
                utt = os.path.basename(stem)
                spk = utt.split('_')[0]

                # Speaker embeddings live in a 'spk_embeddings' dir next to
                # src, or inside src itself.
                spk_embed_dir = os.path.join(os.path.dirname(src), 'spk_embeddings')
                if not os.path.exists(spk_embed_dir):
                    spk_embed_dir = os.path.join(src, 'spk_embeddings')
                spk_embedding_path = os.path.join(spk_embed_dir, f'{spk}_embedding.pt')

                file_list.append({
                    'utt': utt,
                    'spk': spk,
                    'wav': wav_path,
                    'text_path': txt_path,
                    'embedding_path': embedding_path,
                    'token_path': token_path,
                    'spk_embedding_path': spk_embedding_path,
                })

        # Process each discovered/indexed file; failures are logged and
        # skipped so one bad file does not abort the whole epoch.
        for file_info in file_list:
            try:
                # Raw audio bytes are passed through; decoding happens in a
                # later processor stage.
                with open(file_info['wav'], 'rb') as f:
                    audio_data = f.read()

                with open(file_info['text_path'], 'r') as f:
                    text = ''.join(line.strip() for line in f)

                # Utterance embedding; fall back to a zero vector so the
                # pipeline keeps running when precomputation is incomplete.
                if os.path.exists(file_info['embedding_path']):
                    # map_location keeps CPU-only dataloader workers from
                    # failing on tensors that were saved from GPU.
                    utt_embedding = torch.load(file_info['embedding_path'], map_location='cpu')
                    if isinstance(utt_embedding, torch.Tensor):
                        utt_embedding = utt_embedding.tolist()
                else:
                    logging.warning(f"Utterance embedding not found: {file_info['embedding_path']}")
                    utt_embedding = [0.0] * 192  # assumes 192-dim embeddings — TODO confirm

                if os.path.exists(file_info['token_path']):
                    speech_token = torch.load(file_info['token_path'], map_location='cpu')
                    if isinstance(speech_token, torch.Tensor):
                        speech_token = speech_token.tolist()
                else:
                    logging.warning(f"Token file not found: {file_info['token_path']}")
                    speech_token = []

                if os.path.exists(file_info['spk_embedding_path']):
                    spk_embedding = torch.load(file_info['spk_embedding_path'], map_location='cpu')
                    if isinstance(spk_embedding, torch.Tensor):
                        spk_embedding = spk_embedding.tolist()
                else:
                    logging.warning(f"Speaker embedding not found: {file_info['spk_embedding_path']}")
                    spk_embedding = utt_embedding  # utterance embedding as fallback

                sample_dict = {
                    'utt': file_info['utt'],
                    'spk': file_info['spk'],
                    'audio_data': audio_data,
                    'text': text,
                    'text_token': [],  # filled later by the tokenize processor
                    'utt_embedding': utt_embedding,
                    'spk_embedding': spk_embedding,
                    'speech_token': speech_token,
                    'wav': file_info['wav'],  # original path kept for reference
                }

                # Propagate any extra metadata from the driving sample.
                for key, value in sample.items():
                    if key not in sample_dict:
                        sample_dict[key] = value

                if mode == 'train':
                    yield sample_dict
                else:
                    # TTS inference: fan one sample out per tts text, or
                    # pass the sample through when no texts are registered.
                    if file_info['utt'] in tts_data:
                        for index, tts_text in enumerate(tts_data[file_info['utt']]):
                            yield {**sample_dict, 'tts_index': index, 'tts_text': tts_text}
                    else:
                        yield sample_dict

            except Exception as ex:
                logging.warning(f'Failed to process {file_info["wav"]}: {ex}')
+
164
  def parquet_opener(data, mode='train', tts_data={}):
165
  """ Give url or local file, return file descriptor
166
  Inplace operation.
speech/cosyvoice/flow/decoder.py CHANGED
File without changes
speech/cosyvoice/flow/flow.py CHANGED
File without changes
speech/cosyvoice/flow/flow_matching.py CHANGED
File without changes
speech/cosyvoice/flow/length_regulator.py CHANGED
File without changes
speech/cosyvoice/hifigan/discriminator.py CHANGED
File without changes
speech/cosyvoice/hifigan/f0_predictor.py CHANGED
File without changes
speech/cosyvoice/hifigan/generator.py CHANGED
File without changes
speech/cosyvoice/hifigan/hifigan.py CHANGED
File without changes
speech/cosyvoice/llm/llm.py CHANGED
File without changes
speech/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken CHANGED
File without changes
speech/cosyvoice/tokenizer/tokenizer.py CHANGED
File without changes
speech/cosyvoice/transformer/__init__.py CHANGED
File without changes
speech/cosyvoice/transformer/activation.py CHANGED
File without changes
speech/cosyvoice/transformer/attention.py CHANGED
File without changes
speech/cosyvoice/transformer/convolution.py CHANGED
File without changes
speech/cosyvoice/transformer/decoder.py CHANGED
File without changes
speech/cosyvoice/transformer/decoder_layer.py CHANGED
File without changes
speech/cosyvoice/transformer/embedding.py CHANGED
File without changes
speech/cosyvoice/transformer/encoder.py CHANGED
File without changes
speech/cosyvoice/transformer/encoder_layer.py CHANGED
File without changes
speech/cosyvoice/transformer/label_smoothing_loss.py CHANGED
File without changes
speech/cosyvoice/transformer/positionwise_feed_forward.py CHANGED
File without changes
speech/cosyvoice/transformer/subsampling.py CHANGED
File without changes
speech/cosyvoice/transformer/upsample_encoder.py CHANGED
File without changes
speech/cosyvoice/utils/__init__.py CHANGED
File without changes
speech/cosyvoice/utils/class_utils.py CHANGED
File without changes
speech/cosyvoice/utils/common.py CHANGED
File without changes
speech/cosyvoice/utils/executor.py CHANGED
File without changes
speech/cosyvoice/utils/file_utils.py CHANGED
File without changes
speech/cosyvoice/utils/frontend_utils.py CHANGED
File without changes
speech/cosyvoice/utils/losses.py CHANGED
File without changes
speech/cosyvoice/utils/mask.py CHANGED
File without changes
speech/cosyvoice/utils/scheduler.py CHANGED
File without changes
speech/cosyvoice/utils/train_utils.py CHANGED
File without changes
speech/cosyvoice2.yaml CHANGED
@@ -129,6 +129,9 @@ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
129
  !ref <mel_spec_transform1>
130
  ]
131
 
 
 
 
132
  # processor functions
133
  parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
134
  get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
@@ -176,25 +179,11 @@ padding: !name:cosyvoice.dataset.processor.padding
176
 
177
  # dataset processor pipeline
178
  data_pipeline: [
179
- !ref <parquet_opener>,
180
- !ref <tokenize>,
181
- !ref <filter>,
182
- !ref <resample>,
183
- !ref <compute_fbank>,
184
- !ref <parse_embedding>,
185
- !ref <shuffle>,
186
- !ref <sort>,
187
- !ref <batch>,
188
- !ref <padding>,
189
- ]
190
- data_pipeline_gan: [
191
- !ref <parquet_opener>,
192
  !ref <tokenize>,
193
  !ref <filter>,
194
  !ref <resample>,
195
- !ref <truncate>,
196
  !ref <compute_fbank>,
197
- !ref <compute_f0>,
198
  !ref <parse_embedding>,
199
  !ref <shuffle>,
200
  !ref <sort>,
 
129
  !ref <mel_spec_transform1>
130
  ]
131
 
132
+ individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener
133
+
134
+
135
  # processor functions
136
  parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
137
  get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
 
179
 
180
  # dataset processor pipeline
181
  data_pipeline: [
182
+ !ref <individual_file_opener>,
 
 
 
 
 
 
 
 
 
 
 
 
183
  !ref <tokenize>,
184
  !ref <filter>,
185
  !ref <resample>,
 
186
  !ref <compute_fbank>,
 
187
  !ref <parse_embedding>,
188
  !ref <shuffle>,
189
  !ref <sort>,
speech/examples/magicdata-read/cosyvoice/local/prepare_data.py CHANGED
File without changes
speech/examples/magicdata-read/cosyvoice/tts_text.json CHANGED
File without changes