Spaces:
Sleeping
Sleeping
primepake
committed on
Commit
·
f7498a7
1
Parent(s):
6599f2a
data processing updates
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- speech/.gitignore +0 -0
- speech/.gitmodules +0 -0
- speech/asset/dingding.png +0 -0
- speech/cosyvoice/__init__.py +0 -0
- speech/cosyvoice/bin/export_jit.py +0 -0
- speech/cosyvoice/bin/export_onnx.py +0 -0
- speech/cosyvoice/cli/__init__.py +0 -0
- speech/cosyvoice/cli/cosyvoice.py +0 -0
- speech/cosyvoice/cli/frontend.py +0 -0
- speech/cosyvoice/cli/model.py +0 -0
- speech/cosyvoice/dataset/__init__.py +0 -0
- speech/cosyvoice/dataset/dataset.py +0 -0
- speech/cosyvoice/dataset/processor.py +136 -1
- speech/cosyvoice/flow/decoder.py +0 -0
- speech/cosyvoice/flow/flow.py +0 -0
- speech/cosyvoice/flow/flow_matching.py +0 -0
- speech/cosyvoice/flow/length_regulator.py +0 -0
- speech/cosyvoice/hifigan/discriminator.py +0 -0
- speech/cosyvoice/hifigan/f0_predictor.py +0 -0
- speech/cosyvoice/hifigan/generator.py +0 -0
- speech/cosyvoice/hifigan/hifigan.py +0 -0
- speech/cosyvoice/llm/llm.py +0 -0
- speech/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +0 -0
- speech/cosyvoice/tokenizer/tokenizer.py +0 -0
- speech/cosyvoice/transformer/__init__.py +0 -0
- speech/cosyvoice/transformer/activation.py +0 -0
- speech/cosyvoice/transformer/attention.py +0 -0
- speech/cosyvoice/transformer/convolution.py +0 -0
- speech/cosyvoice/transformer/decoder.py +0 -0
- speech/cosyvoice/transformer/decoder_layer.py +0 -0
- speech/cosyvoice/transformer/embedding.py +0 -0
- speech/cosyvoice/transformer/encoder.py +0 -0
- speech/cosyvoice/transformer/encoder_layer.py +0 -0
- speech/cosyvoice/transformer/label_smoothing_loss.py +0 -0
- speech/cosyvoice/transformer/positionwise_feed_forward.py +0 -0
- speech/cosyvoice/transformer/subsampling.py +0 -0
- speech/cosyvoice/transformer/upsample_encoder.py +0 -0
- speech/cosyvoice/utils/__init__.py +0 -0
- speech/cosyvoice/utils/class_utils.py +0 -0
- speech/cosyvoice/utils/common.py +0 -0
- speech/cosyvoice/utils/executor.py +0 -0
- speech/cosyvoice/utils/file_utils.py +0 -0
- speech/cosyvoice/utils/frontend_utils.py +0 -0
- speech/cosyvoice/utils/losses.py +0 -0
- speech/cosyvoice/utils/mask.py +0 -0
- speech/cosyvoice/utils/scheduler.py +0 -0
- speech/cosyvoice/utils/train_utils.py +0 -0
- speech/cosyvoice2.yaml +4 -15
- speech/examples/magicdata-read/cosyvoice/local/prepare_data.py +0 -0
- speech/examples/magicdata-read/cosyvoice/tts_text.json +0 -0
speech/.gitignore
CHANGED
|
File without changes
|
speech/.gitmodules
CHANGED
|
File without changes
|
speech/asset/dingding.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
speech/cosyvoice/__init__.py
CHANGED
|
File without changes
|
speech/cosyvoice/bin/export_jit.py
CHANGED
|
File without changes
|
speech/cosyvoice/bin/export_onnx.py
CHANGED
|
File without changes
|
speech/cosyvoice/cli/__init__.py
CHANGED
|
File without changes
|
speech/cosyvoice/cli/cosyvoice.py
CHANGED
|
File without changes
|
speech/cosyvoice/cli/frontend.py
CHANGED
|
File without changes
|
speech/cosyvoice/cli/model.py
CHANGED
|
File without changes
|
speech/cosyvoice/dataset/__init__.py
CHANGED
|
File without changes
|
speech/cosyvoice/dataset/dataset.py
CHANGED
|
File without changes
|
speech/cosyvoice/dataset/processor.py
CHANGED
|
@@ -21,11 +21,146 @@ import torchaudio
|
|
| 21 |
from torch.nn.utils.rnn import pad_sequence
|
| 22 |
import torch.nn.functional as F
|
| 23 |
import pyworld as pw
|
| 24 |
-
|
|
|
|
|
|
|
| 25 |
|
| 26 |
AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def parquet_opener(data, mode='train', tts_data={}):
|
| 30 |
""" Give url or local file, return file descriptor
|
| 31 |
Inplace operation.
|
|
|
|
| 21 |
from torch.nn.utils.rnn import pad_sequence
|
| 22 |
import torch.nn.functional as F
|
| 23 |
import pyworld as pw
|
| 24 |
+
import glob
|
| 25 |
+
import os
|
| 26 |
+
import json
|
| 27 |
|
| 28 |
AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
|
| 29 |
|
| 30 |
|
| 31 |
+
def individual_file_opener(data, mode='train', tts_data={}):
    """Load training samples from individual files instead of parquet shards.

    Args:
        data: Iterable[{src}] where ``src`` is either:
            - a path to a JSON index file (top-level key ``'data'`` holding a
              list of file-info dicts), or
            - a path to a directory tree containing ``.wav`` files.
        mode: ``'train'`` yields one sample per file; any other value is TTS
            mode and consults ``tts_data``.
        tts_data: Dict mapping utt id -> list of texts to synthesize (TTS mode).

    Yields:
        Dict with the fields downstream processors expect: ``utt``, ``spk``,
        ``audio_data``, ``text``, ``text_token``, ``utt_embedding``,
        ``spk_embedding``, ``speech_token``, ``wav``.
    """
    for sample in data:
        assert 'src' in sample
        src = sample['src']

        if src.endswith('.json'):
            # JSON index: each entry is a ready-made file-info dict.
            with open(src, 'r') as f:
                index_data = json.load(f)
            file_list = index_data.get('data', [])
        else:
            # Directory: scan two levels deep first, then fall back to a full
            # recursive scan.
            # Fixed '*wav' -> '*.wav': the old pattern also matched names that
            # merely end in 'wav' (e.g. 'foowav') with no extension dot.
            wav_files = glob.glob(os.path.join(src, '*/*/*.wav'))
            if not wav_files:
                wav_files = glob.glob(os.path.join(src, '**/*.wav'), recursive=True)

            file_list = []
            for wav_path in wav_files:
                # Derive sibling-file paths from the suffix only; a bare
                # str.replace('.wav', ...) would also rewrite '.wav' occurring
                # mid-path (e.g. 'a.wav.bak/b.wav').
                base = wav_path[:-len('.wav')]
                txt_path = base + '.normalized.txt'
                embedding_path = base + '_embedding.pt'
                token_path = base + '_tokens.pt'

                if not os.path.exists(txt_path):
                    logging.warning(f'Text file not found for {wav_path}, skipping')
                    continue

                # utt is the file stem; speaker id is the prefix before '_'.
                utt = os.path.basename(base)
                spk = utt.split('_')[0]

                # Speaker embeddings live next to the data dir, or inside it.
                spk_embed_dir = os.path.join(os.path.dirname(src), 'spk_embeddings')
                if not os.path.exists(spk_embed_dir):
                    spk_embed_dir = os.path.join(src, 'spk_embeddings')
                spk_embedding_path = os.path.join(spk_embed_dir, f'{spk}_embedding.pt')

                file_list.append({
                    'utt': utt,
                    'spk': spk,
                    'wav': wav_path,
                    'text_path': txt_path,
                    'embedding_path': embedding_path,
                    'token_path': token_path,
                    'spk_embedding_path': spk_embedding_path,
                })

        # Process each file; failures are logged and skipped so one bad file
        # does not abort the whole pipeline.
        for file_info in file_list:
            try:
                with open(file_info['wav'], 'rb') as f:
                    audio_data = f.read()

                with open(file_info['text_path'], 'r') as f:
                    text = ''.join(l.strip() for l in f.readlines())

                # Utterance embedding; dummy zeros when missing.
                if os.path.exists(file_info['embedding_path']):
                    utt_embedding = torch.load(file_info['embedding_path'])
                    if isinstance(utt_embedding, torch.Tensor):
                        utt_embedding = utt_embedding.tolist()
                else:
                    logging.warning(f"Utterance embedding not found: {file_info['embedding_path']}")
                    utt_embedding = [0.0] * 192  # assumes 192-dim embeddings — TODO confirm

                # Discrete speech tokens; empty list when missing.
                if os.path.exists(file_info['token_path']):
                    speech_token = torch.load(file_info['token_path'])
                    if isinstance(speech_token, torch.Tensor):
                        speech_token = speech_token.tolist()
                else:
                    logging.warning(f"Token file not found: {file_info['token_path']}")
                    speech_token = []

                # Speaker embedding; fall back to the utterance embedding.
                if os.path.exists(file_info['spk_embedding_path']):
                    spk_embedding = torch.load(file_info['spk_embedding_path'])
                    if isinstance(spk_embedding, torch.Tensor):
                        spk_embedding = spk_embedding.tolist()
                else:
                    logging.warning(f"Speaker embedding not found: {file_info['spk_embedding_path']}")
                    spk_embedding = utt_embedding

                sample_dict = {
                    'utt': file_info['utt'],
                    'spk': file_info['spk'],
                    'audio_data': audio_data,
                    'text': text,
                    'text_token': [],  # Will be filled by tokenize processor
                    'utt_embedding': utt_embedding,
                    'spk_embedding': spk_embedding,
                    'speech_token': speech_token,
                    'wav': file_info['wav'],  # Keep original path for reference
                }

                # Copy over any additional fields from the original sample.
                for key, value in sample.items():
                    if key not in sample_dict:
                        sample_dict[key] = value

                if mode == 'train':
                    yield sample_dict
                else:
                    # TTS mode: expand one sample per requested text.
                    if file_info['utt'] in tts_data:
                        for index, tts_text in enumerate(tts_data[file_info['utt']]):
                            yield {**sample_dict, 'tts_index': index, 'tts_text': tts_text}
                    else:
                        yield sample_dict

            except Exception as ex:
                logging.warning(f'Failed to process {file_info["wav"]}: {ex}')
|
| 163 |
+
|
| 164 |
def parquet_opener(data, mode='train', tts_data={}):
|
| 165 |
""" Give url or local file, return file descriptor
|
| 166 |
Inplace operation.
|
speech/cosyvoice/flow/decoder.py
CHANGED
|
File without changes
|
speech/cosyvoice/flow/flow.py
CHANGED
|
File without changes
|
speech/cosyvoice/flow/flow_matching.py
CHANGED
|
File without changes
|
speech/cosyvoice/flow/length_regulator.py
CHANGED
|
File without changes
|
speech/cosyvoice/hifigan/discriminator.py
CHANGED
|
File without changes
|
speech/cosyvoice/hifigan/f0_predictor.py
CHANGED
|
File without changes
|
speech/cosyvoice/hifigan/generator.py
CHANGED
|
File without changes
|
speech/cosyvoice/hifigan/hifigan.py
CHANGED
|
File without changes
|
speech/cosyvoice/llm/llm.py
CHANGED
|
File without changes
|
speech/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken
CHANGED
|
File without changes
|
speech/cosyvoice/tokenizer/tokenizer.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/__init__.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/activation.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/attention.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/convolution.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/decoder.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/decoder_layer.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/embedding.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/encoder.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/encoder_layer.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/label_smoothing_loss.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/positionwise_feed_forward.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/subsampling.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/upsample_encoder.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/__init__.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/class_utils.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/common.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/executor.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/file_utils.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/frontend_utils.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/losses.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/mask.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/scheduler.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/train_utils.py
CHANGED
|
File without changes
|
speech/cosyvoice2.yaml
CHANGED
|
@@ -129,6 +129,9 @@ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
|
|
| 129 |
!ref <mel_spec_transform1>
|
| 130 |
]
|
| 131 |
|
|
|
|
|
|
|
|
|
|
| 132 |
# processor functions
|
| 133 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 134 |
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
|
|
@@ -176,25 +179,11 @@ padding: !name:cosyvoice.dataset.processor.padding
|
|
| 176 |
|
| 177 |
# dataset processor pipeline
|
| 178 |
data_pipeline: [
|
| 179 |
-
!ref <
|
| 180 |
-
!ref <tokenize>,
|
| 181 |
-
!ref <filter>,
|
| 182 |
-
!ref <resample>,
|
| 183 |
-
!ref <compute_fbank>,
|
| 184 |
-
!ref <parse_embedding>,
|
| 185 |
-
!ref <shuffle>,
|
| 186 |
-
!ref <sort>,
|
| 187 |
-
!ref <batch>,
|
| 188 |
-
!ref <padding>,
|
| 189 |
-
]
|
| 190 |
-
data_pipeline_gan: [
|
| 191 |
-
!ref <parquet_opener>,
|
| 192 |
!ref <tokenize>,
|
| 193 |
!ref <filter>,
|
| 194 |
!ref <resample>,
|
| 195 |
-
!ref <truncate>,
|
| 196 |
!ref <compute_fbank>,
|
| 197 |
-
!ref <compute_f0>,
|
| 198 |
!ref <parse_embedding>,
|
| 199 |
!ref <shuffle>,
|
| 200 |
!ref <sort>,
|
|
|
|
| 129 |
!ref <mel_spec_transform1>
|
| 130 |
]
|
| 131 |
|
| 132 |
+
individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener
|
| 133 |
+
|
| 134 |
+
|
| 135 |
# processor functions
|
| 136 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 137 |
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
|
|
|
|
| 179 |
|
| 180 |
# dataset processor pipeline
|
| 181 |
data_pipeline: [
|
| 182 |
+
!ref <individual_file_opener>,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
!ref <tokenize>,
|
| 184 |
!ref <filter>,
|
| 185 |
!ref <resample>,
|
|
|
|
| 186 |
!ref <compute_fbank>,
|
|
|
|
| 187 |
!ref <parse_embedding>,
|
| 188 |
!ref <shuffle>,
|
| 189 |
!ref <sort>,
|
speech/examples/magicdata-read/cosyvoice/local/prepare_data.py
CHANGED
|
File without changes
|
speech/examples/magicdata-read/cosyvoice/tts_text.json
CHANGED
|
File without changes
|