Spaces:
Sleeping
Sleeping
primepake
committed on
Commit
·
f7498a7
1
Parent(s):
6599f2a
data processing updates
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- speech/.gitignore +0 -0
- speech/.gitmodules +0 -0
- speech/asset/dingding.png +0 -0
- speech/cosyvoice/__init__.py +0 -0
- speech/cosyvoice/bin/export_jit.py +0 -0
- speech/cosyvoice/bin/export_onnx.py +0 -0
- speech/cosyvoice/cli/__init__.py +0 -0
- speech/cosyvoice/cli/cosyvoice.py +0 -0
- speech/cosyvoice/cli/frontend.py +0 -0
- speech/cosyvoice/cli/model.py +0 -0
- speech/cosyvoice/dataset/__init__.py +0 -0
- speech/cosyvoice/dataset/dataset.py +0 -0
- speech/cosyvoice/dataset/processor.py +136 -1
- speech/cosyvoice/flow/decoder.py +0 -0
- speech/cosyvoice/flow/flow.py +0 -0
- speech/cosyvoice/flow/flow_matching.py +0 -0
- speech/cosyvoice/flow/length_regulator.py +0 -0
- speech/cosyvoice/hifigan/discriminator.py +0 -0
- speech/cosyvoice/hifigan/f0_predictor.py +0 -0
- speech/cosyvoice/hifigan/generator.py +0 -0
- speech/cosyvoice/hifigan/hifigan.py +0 -0
- speech/cosyvoice/llm/llm.py +0 -0
- speech/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken +0 -0
- speech/cosyvoice/tokenizer/tokenizer.py +0 -0
- speech/cosyvoice/transformer/__init__.py +0 -0
- speech/cosyvoice/transformer/activation.py +0 -0
- speech/cosyvoice/transformer/attention.py +0 -0
- speech/cosyvoice/transformer/convolution.py +0 -0
- speech/cosyvoice/transformer/decoder.py +0 -0
- speech/cosyvoice/transformer/decoder_layer.py +0 -0
- speech/cosyvoice/transformer/embedding.py +0 -0
- speech/cosyvoice/transformer/encoder.py +0 -0
- speech/cosyvoice/transformer/encoder_layer.py +0 -0
- speech/cosyvoice/transformer/label_smoothing_loss.py +0 -0
- speech/cosyvoice/transformer/positionwise_feed_forward.py +0 -0
- speech/cosyvoice/transformer/subsampling.py +0 -0
- speech/cosyvoice/transformer/upsample_encoder.py +0 -0
- speech/cosyvoice/utils/__init__.py +0 -0
- speech/cosyvoice/utils/class_utils.py +0 -0
- speech/cosyvoice/utils/common.py +0 -0
- speech/cosyvoice/utils/executor.py +0 -0
- speech/cosyvoice/utils/file_utils.py +0 -0
- speech/cosyvoice/utils/frontend_utils.py +0 -0
- speech/cosyvoice/utils/losses.py +0 -0
- speech/cosyvoice/utils/mask.py +0 -0
- speech/cosyvoice/utils/scheduler.py +0 -0
- speech/cosyvoice/utils/train_utils.py +0 -0
- speech/cosyvoice2.yaml +4 -15
- speech/examples/magicdata-read/cosyvoice/local/prepare_data.py +0 -0
- speech/examples/magicdata-read/cosyvoice/tts_text.json +0 -0
speech/.gitignore
CHANGED
|
File without changes
|
speech/.gitmodules
CHANGED
|
File without changes
|
speech/asset/dingding.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
speech/cosyvoice/__init__.py
CHANGED
|
File without changes
|
speech/cosyvoice/bin/export_jit.py
CHANGED
|
File without changes
|
speech/cosyvoice/bin/export_onnx.py
CHANGED
|
File without changes
|
speech/cosyvoice/cli/__init__.py
CHANGED
|
File without changes
|
speech/cosyvoice/cli/cosyvoice.py
CHANGED
|
File without changes
|
speech/cosyvoice/cli/frontend.py
CHANGED
|
File without changes
|
speech/cosyvoice/cli/model.py
CHANGED
|
File without changes
|
speech/cosyvoice/dataset/__init__.py
CHANGED
|
File without changes
|
speech/cosyvoice/dataset/dataset.py
CHANGED
|
File without changes
|
speech/cosyvoice/dataset/processor.py
CHANGED
|
@@ -21,11 +21,146 @@ import torchaudio
|
|
| 21 |
from torch.nn.utils.rnn import pad_sequence
|
| 22 |
import torch.nn.functional as F
|
| 23 |
import pyworld as pw
|
| 24 |
-
|
|
|
|
|
|
|
| 25 |
|
| 26 |
AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
|
| 27 |
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
def parquet_opener(data, mode='train', tts_data={}):
|
| 30 |
""" Give url or local file, return file descriptor
|
| 31 |
Inplace operation.
|
|
|
|
| 21 |
from torch.nn.utils.rnn import pad_sequence
|
| 22 |
import torch.nn.functional as F
|
| 23 |
import pyworld as pw
|
| 24 |
+
import glob
|
| 25 |
+
import os
|
| 26 |
+
import json
|
| 27 |
|
| 28 |
AUDIO_FORMAT_SETS = {'flac', 'mp3', 'm4a', 'ogg', 'opus', 'wav', 'wma'}
|
| 29 |
|
| 30 |
|
| 31 |
+
def individual_file_opener(data, mode='train', tts_data={}):
    """Load training samples from individual files instead of parquet shards.

    Args:
        data: Iterable[{src}] where ``src`` is either:
            - a path to a JSON index file (top-level key ``'data'`` holding a
              list of file-info dicts), or
            - a path to a directory tree containing ``.wav`` files.
        mode: ``'train'`` yields one sample per file; any other value is TTS
            mode and consults ``tts_data``.
        tts_data: Dict mapping utt id -> list of texts to synthesize (TTS mode).

    Yields:
        Dict with the fields downstream processors expect: ``utt``, ``spk``,
        ``audio_data``, ``text``, ``text_token``, ``utt_embedding``,
        ``spk_embedding``, ``speech_token``, ``wav``.
    """
    for sample in data:
        assert 'src' in sample
        src = sample['src']

        if src.endswith('.json'):
            # JSON index: each entry is a ready-made file-info dict.
            with open(src, 'r') as f:
                index_data = json.load(f)
            file_list = index_data.get('data', [])
        else:
            # Directory: scan two levels deep first, then fall back to a full
            # recursive scan.
            # Fixed '*wav' -> '*.wav': the old pattern also matched names that
            # merely end in 'wav' (e.g. 'foowav') with no extension dot.
            wav_files = glob.glob(os.path.join(src, '*/*/*.wav'))
            if not wav_files:
                wav_files = glob.glob(os.path.join(src, '**/*.wav'), recursive=True)

            file_list = []
            for wav_path in wav_files:
                # Derive sibling-file paths from the suffix only; a bare
                # str.replace('.wav', ...) would also rewrite '.wav' occurring
                # mid-path (e.g. 'a.wav.bak/b.wav').
                base = wav_path[:-len('.wav')]
                txt_path = base + '.normalized.txt'
                embedding_path = base + '_embedding.pt'
                token_path = base + '_tokens.pt'

                if not os.path.exists(txt_path):
                    logging.warning(f'Text file not found for {wav_path}, skipping')
                    continue

                # utt is the file stem; speaker id is the prefix before '_'.
                utt = os.path.basename(base)
                spk = utt.split('_')[0]

                # Speaker embeddings live next to the data dir, or inside it.
                spk_embed_dir = os.path.join(os.path.dirname(src), 'spk_embeddings')
                if not os.path.exists(spk_embed_dir):
                    spk_embed_dir = os.path.join(src, 'spk_embeddings')
                spk_embedding_path = os.path.join(spk_embed_dir, f'{spk}_embedding.pt')

                file_list.append({
                    'utt': utt,
                    'spk': spk,
                    'wav': wav_path,
                    'text_path': txt_path,
                    'embedding_path': embedding_path,
                    'token_path': token_path,
                    'spk_embedding_path': spk_embedding_path,
                })

        # Process each file; failures are logged and skipped so one bad file
        # does not abort the whole pipeline.
        for file_info in file_list:
            try:
                with open(file_info['wav'], 'rb') as f:
                    audio_data = f.read()

                with open(file_info['text_path'], 'r') as f:
                    text = ''.join(l.strip() for l in f.readlines())

                # Utterance embedding; dummy zeros when missing.
                if os.path.exists(file_info['embedding_path']):
                    utt_embedding = torch.load(file_info['embedding_path'])
                    if isinstance(utt_embedding, torch.Tensor):
                        utt_embedding = utt_embedding.tolist()
                else:
                    logging.warning(f"Utterance embedding not found: {file_info['embedding_path']}")
                    utt_embedding = [0.0] * 192  # assumes 192-dim embeddings — TODO confirm

                # Discrete speech tokens; empty list when missing.
                if os.path.exists(file_info['token_path']):
                    speech_token = torch.load(file_info['token_path'])
                    if isinstance(speech_token, torch.Tensor):
                        speech_token = speech_token.tolist()
                else:
                    logging.warning(f"Token file not found: {file_info['token_path']}")
                    speech_token = []

                # Speaker embedding; fall back to the utterance embedding.
                if os.path.exists(file_info['spk_embedding_path']):
                    spk_embedding = torch.load(file_info['spk_embedding_path'])
                    if isinstance(spk_embedding, torch.Tensor):
                        spk_embedding = spk_embedding.tolist()
                else:
                    logging.warning(f"Speaker embedding not found: {file_info['spk_embedding_path']}")
                    spk_embedding = utt_embedding

                sample_dict = {
                    'utt': file_info['utt'],
                    'spk': file_info['spk'],
                    'audio_data': audio_data,
                    'text': text,
                    'text_token': [],  # Will be filled by tokenize processor
                    'utt_embedding': utt_embedding,
                    'spk_embedding': spk_embedding,
                    'speech_token': speech_token,
                    'wav': file_info['wav'],  # Keep original path for reference
                }

                # Copy over any additional fields from the original sample.
                for key, value in sample.items():
                    if key not in sample_dict:
                        sample_dict[key] = value

                if mode == 'train':
                    yield sample_dict
                else:
                    # TTS mode: expand one sample per requested text.
                    if file_info['utt'] in tts_data:
                        for index, tts_text in enumerate(tts_data[file_info['utt']]):
                            yield {**sample_dict, 'tts_index': index, 'tts_text': tts_text}
                    else:
                        yield sample_dict

            except Exception as ex:
                logging.warning(f'Failed to process {file_info["wav"]}: {ex}')
|
| 163 |
+
|
| 164 |
def parquet_opener(data, mode='train', tts_data={}):
|
| 165 |
""" Give url or local file, return file descriptor
|
| 166 |
Inplace operation.
|
speech/cosyvoice/flow/decoder.py
CHANGED
|
File without changes
|
speech/cosyvoice/flow/flow.py
CHANGED
|
File without changes
|
speech/cosyvoice/flow/flow_matching.py
CHANGED
|
File without changes
|
speech/cosyvoice/flow/length_regulator.py
CHANGED
|
File without changes
|
speech/cosyvoice/hifigan/discriminator.py
CHANGED
|
File without changes
|
speech/cosyvoice/hifigan/f0_predictor.py
CHANGED
|
File without changes
|
speech/cosyvoice/hifigan/generator.py
CHANGED
|
File without changes
|
speech/cosyvoice/hifigan/hifigan.py
CHANGED
|
File without changes
|
speech/cosyvoice/llm/llm.py
CHANGED
|
File without changes
|
speech/cosyvoice/tokenizer/assets/multilingual_zh_ja_yue_char_del.tiktoken
CHANGED
|
File without changes
|
speech/cosyvoice/tokenizer/tokenizer.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/__init__.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/activation.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/attention.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/convolution.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/decoder.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/decoder_layer.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/embedding.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/encoder.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/encoder_layer.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/label_smoothing_loss.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/positionwise_feed_forward.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/subsampling.py
CHANGED
|
File without changes
|
speech/cosyvoice/transformer/upsample_encoder.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/__init__.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/class_utils.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/common.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/executor.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/file_utils.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/frontend_utils.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/losses.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/mask.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/scheduler.py
CHANGED
|
File without changes
|
speech/cosyvoice/utils/train_utils.py
CHANGED
|
File without changes
|
speech/cosyvoice2.yaml
CHANGED
|
@@ -129,6 +129,9 @@ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
|
|
| 129 |
!ref <mel_spec_transform1>
|
| 130 |
]
|
| 131 |
|
|
|
|
|
|
|
|
|
|
| 132 |
# processor functions
|
| 133 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 134 |
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
|
|
@@ -176,25 +179,11 @@ padding: !name:cosyvoice.dataset.processor.padding
|
|
| 176 |
|
| 177 |
# dataset processor pipeline
|
| 178 |
data_pipeline: [
|
| 179 |
-
!ref <
|
| 180 |
-
!ref <tokenize>,
|
| 181 |
-
!ref <filter>,
|
| 182 |
-
!ref <resample>,
|
| 183 |
-
!ref <compute_fbank>,
|
| 184 |
-
!ref <parse_embedding>,
|
| 185 |
-
!ref <shuffle>,
|
| 186 |
-
!ref <sort>,
|
| 187 |
-
!ref <batch>,
|
| 188 |
-
!ref <padding>,
|
| 189 |
-
]
|
| 190 |
-
data_pipeline_gan: [
|
| 191 |
-
!ref <parquet_opener>,
|
| 192 |
!ref <tokenize>,
|
| 193 |
!ref <filter>,
|
| 194 |
!ref <resample>,
|
| 195 |
-
!ref <truncate>,
|
| 196 |
!ref <compute_fbank>,
|
| 197 |
-
!ref <compute_f0>,
|
| 198 |
!ref <parse_embedding>,
|
| 199 |
!ref <shuffle>,
|
| 200 |
!ref <sort>,
|
|
|
|
| 129 |
!ref <mel_spec_transform1>
|
| 130 |
]
|
| 131 |
|
| 132 |
+
individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener
|
| 133 |
+
|
| 134 |
+
|
| 135 |
# processor functions
|
| 136 |
parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
|
| 137 |
get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
|
|
|
|
| 179 |
|
| 180 |
# dataset processor pipeline
|
| 181 |
data_pipeline: [
|
| 182 |
+
!ref <individual_file_opener>,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 183 |
!ref <tokenize>,
|
| 184 |
!ref <filter>,
|
| 185 |
!ref <resample>,
|
|
|
|
| 186 |
!ref <compute_fbank>,
|
|
|
|
| 187 |
!ref <parse_embedding>,
|
| 188 |
!ref <shuffle>,
|
| 189 |
!ref <sort>,
|
speech/examples/magicdata-read/cosyvoice/local/prepare_data.py
CHANGED
|
File without changes
|
speech/examples/magicdata-read/cosyvoice/tts_text.json
CHANGED
|
File without changes
|