def _sibling_path(wav_path, suffix):
    """Return ``wav_path`` with its file extension swapped for ``suffix``.

    Only the final extension is touched, so a ``.wav`` substring elsewhere
    in the path (for example in a directory name) is left alone.
    """
    return os.path.splitext(wav_path)[0] + suffix


def _load_pt_as_list(path):
    """Load a ``.pt`` file and return tensor payloads as plain Python lists."""
    # NOTE(security): torch.load unpickles the file, so it must only be
    # pointed at trusted data.  map_location keeps tensors that were saved
    # from a GPU loadable inside CPU-only data-loader workers.
    payload = torch.load(path, map_location='cpu')
    if isinstance(payload, torch.Tensor):
        return payload.tolist()
    return payload


def _scan_wav_directory(src):
    """Build the per-utterance file table for every usable wav under ``src``."""
    wav_files = glob.glob(os.path.join(src, '*/*/*.wav'))
    if not wav_files:
        # Layout is not <src>/<a>/<b>/<utt>.wav: fall back to a full walk.
        wav_files = glob.glob(os.path.join(src, '**/*.wav'), recursive=True)

    # The speaker-embedding directory does not depend on the utterance, so
    # resolve it once: prefer a sibling of ``src``, then a child of ``src``.
    spk_embed_dir = os.path.join(os.path.dirname(src), 'spk_embeddings')
    if not os.path.exists(spk_embed_dir):
        spk_embed_dir = os.path.join(src, 'spk_embeddings')

    file_list = []
    for wav_path in wav_files:
        txt_path = _sibling_path(wav_path, '.normalized.txt')
        if not os.path.exists(txt_path):
            logging.warning(f'Text file not found for {wav_path}, skipping')
            continue

        # The speaker id is the part of the utterance id before the first '_'.
        utt = os.path.splitext(os.path.basename(wav_path))[0]
        spk = utt.split('_')[0]
        file_list.append({
            'utt': utt,
            'spk': spk,
            'wav': wav_path,
            'text_path': txt_path,
            'embedding_path': _sibling_path(wav_path, '_embedding.pt'),
            'token_path': _sibling_path(wav_path, '_tokens.pt'),
            'spk_embedding_path': os.path.join(spk_embed_dir, f'{spk}_embedding.pt'),
        })
    return file_list


def _read_sample(file_info, embedding_dim):
    """Read every file referenced by ``file_info`` into one sample dict."""
    with open(file_info['wav'], 'rb') as f:
        audio_data = f.read()

    # Lines are stripped and concatenated without a separator.
    with open(file_info['text_path'], 'r', encoding='utf-8') as f:
        text = ''.join(line.strip() for line in f)

    if os.path.exists(file_info['embedding_path']):
        utt_embedding = _load_pt_as_list(file_info['embedding_path'])
    else:
        logging.warning(f"Utterance embedding not found: {file_info['embedding_path']}")
        # Placeholder so downstream processors still receive a vector.
        utt_embedding = [0.0] * embedding_dim

    if os.path.exists(file_info['token_path']):
        speech_token = _load_pt_as_list(file_info['token_path'])
    else:
        logging.warning(f"Token file not found: {file_info['token_path']}")
        speech_token = []

    if os.path.exists(file_info['spk_embedding_path']):
        spk_embedding = _load_pt_as_list(file_info['spk_embedding_path'])
    else:
        logging.warning(f"Speaker embedding not found: {file_info['spk_embedding_path']}")
        # Fall back to the utterance-level embedding.
        spk_embedding = utt_embedding

    return {
        'utt': file_info['utt'],
        'spk': file_info['spk'],
        'audio_data': audio_data,
        'text': text,
        'text_token': [],  # placeholder; not populated by this opener
        'utt_embedding': utt_embedding,
        'spk_embedding': spk_embedding,
        'speech_token': speech_token,
        'wav': file_info['wav'],  # original path, kept for reference
    }


def individual_file_opener(data, mode='train', tts_data={}, embedding_dim=192):
    """Load samples from individual files instead of parquet shards.

    Args:
        data: Iterable of dicts with a ``src`` key.  ``src`` is either a
            directory that is scanned for ``*.wav`` files, or the path of a
            ``.json`` index whose top-level ``data`` list holds one dict per
            utterance with the keys ``utt``, ``spk``, ``wav``, ``text_path``,
            ``embedding_path``, ``token_path`` and ``spk_embedding_path``.
        mode: ``'train'`` yields one sample per utterance; any other value
            additionally expands utterances listed in ``tts_data``.
        tts_data: Mapping ``utt -> list of texts``; only read, never mutated.
        embedding_dim: Length of the all-zero utterance embedding used when
            an utterance has no ``_embedding.pt`` file.

    Yields:
        Dict with ``utt``, ``spk``, ``audio_data`` (raw file bytes), ``text``,
        ``text_token``, ``utt_embedding``, ``spk_embedding``, ``speech_token``
        and ``wav``, plus any extra keys of the incoming sample.  Outside
        train mode, utterances found in ``tts_data`` yield one copy per text
        with ``tts_index`` and ``tts_text`` added.
    """
    for sample in data:
        assert 'src' in sample
        src = sample['src']

        if src.endswith('.json'):
            with open(src, 'r', encoding='utf-8') as f:
                file_list = json.load(f).get('data', [])
        else:
            file_list = _scan_wav_directory(src)

        for file_info in file_list:
            # Only file loading is guarded: one unreadable utterance is
            # logged and skipped instead of aborting the whole epoch.
            try:
                sample_dict = _read_sample(file_info, embedding_dim)
            except Exception as ex:
                logging.warning(f'Failed to process {file_info["wav"]}: {ex}')
                continue

            # Carry over extra fields of the incoming sample without
            # overriding anything loaded above.
            for key, value in sample.items():
                sample_dict.setdefault(key, value)

            if mode != 'train' and file_info['utt'] in tts_data:
                for index, tts_text in enumerate(tts_data[file_info['utt']]):
                    yield {**sample_dict, 'tts_index': index, 'tts_text': tts_text}
            else:
                yield sample_dict
100644 new mode 100755 diff --git a/speech/cosyvoice/transformer/decoder.py b/speech/cosyvoice/transformer/decoder.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/transformer/decoder_layer.py b/speech/cosyvoice/transformer/decoder_layer.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/transformer/embedding.py b/speech/cosyvoice/transformer/embedding.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/transformer/encoder.py b/speech/cosyvoice/transformer/encoder.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/transformer/encoder_layer.py b/speech/cosyvoice/transformer/encoder_layer.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/transformer/label_smoothing_loss.py b/speech/cosyvoice/transformer/label_smoothing_loss.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/transformer/positionwise_feed_forward.py b/speech/cosyvoice/transformer/positionwise_feed_forward.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/transformer/subsampling.py b/speech/cosyvoice/transformer/subsampling.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/transformer/upsample_encoder.py b/speech/cosyvoice/transformer/upsample_encoder.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/utils/__init__.py b/speech/cosyvoice/utils/__init__.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/utils/class_utils.py b/speech/cosyvoice/utils/class_utils.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/utils/common.py b/speech/cosyvoice/utils/common.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/utils/executor.py b/speech/cosyvoice/utils/executor.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/utils/file_utils.py b/speech/cosyvoice/utils/file_utils.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/utils/frontend_utils.py b/speech/cosyvoice/utils/frontend_utils.py old mode 100644 new mode 
100755 diff --git a/speech/cosyvoice/utils/losses.py b/speech/cosyvoice/utils/losses.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/utils/mask.py b/speech/cosyvoice/utils/mask.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/utils/scheduler.py b/speech/cosyvoice/utils/scheduler.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice/utils/train_utils.py b/speech/cosyvoice/utils/train_utils.py old mode 100644 new mode 100755 diff --git a/speech/cosyvoice2.yaml b/speech/cosyvoice2.yaml old mode 100644 new mode 100755 index 551e42edc0258d9de2ee20a680e58ffe101af168..0f0b0c85466073525a506c62352653eedbbf4354 --- a/speech/cosyvoice2.yaml +++ b/speech/cosyvoice2.yaml @@ -129,6 +129,9 @@ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan !ref ] +individual_file_opener: !name:cosyvoice.dataset.processor.individual_file_opener + + # processor functions parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer @@ -176,25 +179,11 @@ padding: !name:cosyvoice.dataset.processor.padding # dataset processor pipeline data_pipeline: [ - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , - !ref , -] -data_pipeline_gan: [ - !ref , + !ref , !ref , !ref , !ref , - !ref , !ref , - !ref , !ref , !ref , !ref , diff --git a/speech/examples/magicdata-read/cosyvoice/local/prepare_data.py b/speech/examples/magicdata-read/cosyvoice/local/prepare_data.py old mode 100644 new mode 100755 diff --git a/speech/examples/magicdata-read/cosyvoice/tts_text.json b/speech/examples/magicdata-read/cosyvoice/tts_text.json old mode 100644 new mode 100755 diff --git a/speech/local/prepare_data.py b/speech/local/prepare_data.py old mode 100644 new mode 100755 diff --git a/speech/test_train.sh b/speech/test_train.sh new file mode 100755 index 0000000000000000000000000000000000000000..33ad670c31c05d7c5bba68a949e1479c72bf3d02 --- /dev/null +++ b/speech/test_train.sh @@ -0,0 
+1,110 @@ +#!/bin/bash +# Copyright 2024 Alibaba Inc. All Rights Reserved. + +stage=-1 +stop_stage=3 + +data_url=www.openslr.org/resources/60 +data_dir=data +pretrained_model_dir=./pretrained_models/CosyVoice2-0.5B + +if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then + echo "Data Download" + for part in test-clean; do + local/download_and_untar.sh ${data_dir} ${data_url} ${part} + done +fi + +# if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then +# echo "Data preparation, prepare wav.scp/text/utt2spk/spk2utt" +# for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do +# mkdir -p data/$x +# python local/prepare_data.py --src_dir $data_dir/LibriTTS/$x --des_dir data/$x +# done +# fi + +# if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then +# echo "Extract campplus speaker embedding, you will get spk2embedding.pt and utt2embedding.pt in data/$x dir" +# for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do +# tools/extract_embedding.py --dir data/$x \ +# --onnx_path $pretrained_model_dir/campplus.onnx +# done +# fi + +# if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then +# echo "Extract discrete speech token, you will get utt2speech_token.pt in data/$x dir" +# for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do +# tools/extract_speech_token.py --dir data/$x \ +# --onnx_path $pretrained_model_dir/speech_tokenizer_v2.onnx +# done +# fi + +# if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then +# echo "Prepare required parquet format data, you should have prepared wav.scp/text/utt2spk/spk2utt/utt2embedding.pt/spk2embedding.pt/utt2speech_token.pt" +# for x in train-clean-100 train-clean-360 train-other-500 dev-clean dev-other test-clean test-other; do +# mkdir -p data/$x/parquet +# tools/make_parquet_list.py --num_utts_per_parquet 1000 \ +# --num_processes 10 \ +# --src_dir data/$x \ +# --des_dir 
data/$x/parquet +# done +# fi + +# # train llm +# export CUDA_VISIBLE_DEVICES="0,1,2,3" +# num_gpus=$(echo $CUDA_VISIBLE_DEVICES | awk -F "," '{print NF}') +# job_id=1986 +# dist_backend="nccl" +# num_workers=2 +# prefetch=100 +# train_engine=torch_ddp +# if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then +# echo "Run train. We only support llm traning for now. If your want to train from scratch, please use conf/cosyvoice.fromscratch.yaml" +# if [ $train_engine == 'deepspeed' ]; then +# echo "Notice deepspeed has its own optimizer config. Modify conf/ds_stage2.json if necessary" +# fi +# cat data/{train-clean-100,train-clean-360,train-other-500}/parquet/data.list > data/train.data.list +# cat data/{dev-clean,dev-other}/parquet/data.list > data/dev.data.list +# # NOTE will update llm/hift training later +# for model in llm flow hifigan; do +# torchrun --nnodes=1 --nproc_per_node=$num_gpus \ +# --rdzv_id=$job_id --rdzv_backend="c10d" --rdzv_endpoint="localhost:1234" \ +# cosyvoice/bin/train.py \ +# --train_engine $train_engine \ +# --config conf/cosyvoice2.yaml \ +# --train_data data/train.data.list \ +# --cv_data data/dev.data.list \ +# --qwen_pretrain_path $pretrained_model_dir/CosyVoice-BlankEN \ +# --model $model \ +# --checkpoint $pretrained_model_dir/$model.pt \ +# --model_dir `pwd`/exp/cosyvoice2/$model/$train_engine \ +# --tensorboard_dir `pwd`/tensorboard/cosyvoice2/$model/$train_engine \ +# --ddp.dist_backend $dist_backend \ +# --num_workers ${num_workers} \ +# --prefetch ${prefetch} \ +# --pin_memory \ +# --use_amp \ +# --deepspeed_config ./conf/ds_stage2.json \ +# --deepspeed.save_states model+optimizer +# done +# fi + +# # average model +# average_num=5 +# if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then +# for model in llm flow hifigan; do +# decode_checkpoint=`pwd`/exp/cosyvoice/$model/$train_engine/${model}.pt +# echo "do model average and final checkpoint is $decode_checkpoint" +# python cosyvoice/bin/average_model.py \ +# --dst_model 
def validate_data(src_dir):
    """Check that the per-utterance files needed for training exist.

    For every ``*.wav`` under ``src_dir`` the sibling ``.normalized.txt``,
    ``_embedding.pt`` and ``_tokens.pt`` files are looked up, and for every
    speaker (the utterance id up to the first ``_``) a
    ``<spk>_embedding.pt`` is looked up in the ``spk_embeddings`` directory.
    A summary is printed to stdout.

    Args:
        src_dir: Directory containing audio files.

    Returns:
        True when at least one wav file was found and no text, embedding or
        token file is missing.  Missing speaker embeddings are reported but
        do not affect the result.
    """
    wav_files = glob.glob(os.path.join(src_dir, '*/*/*.wav'))
    if not wav_files:
        wav_files = glob.glob(os.path.join(src_dir, '**/*.wav'), recursive=True)

    print(f"Found {len(wav_files)} WAV files")
    if not wav_files:
        # A directory without any audio must not be reported as valid.
        print(f"No WAV files found under {src_dir}")
        return False

    # (report label, suffix replacing the '.wav' extension)
    required = (
        ('text', '.normalized.txt'),
        ('embedding', '_embedding.pt'),
        ('token', '_tokens.pt'),
    )
    missing = {label: [] for label, _ in required}
    speakers = set()

    for wav_path in tqdm(wav_files, desc="Validating files"):
        # splitext only strips the final extension, so '.wav' occurring in
        # a directory name cannot corrupt the sibling paths.
        stem = os.path.splitext(wav_path)[0]
        for label, suffix in required:
            if not os.path.exists(stem + suffix):
                missing[label].append(wav_path)
        speakers.add(os.path.basename(stem).split('_')[0])

    # Same lookup order as the training-time opener: a sibling of src_dir
    # first, then a child of src_dir.
    spk_embed_dir = os.path.join(os.path.dirname(src_dir), 'spk_embeddings')
    if not os.path.exists(spk_embed_dir):
        spk_embed_dir = os.path.join(src_dir, 'spk_embeddings')

    if os.path.exists(spk_embed_dir):
        missing_spk_embedding = sorted(
            spk for spk in speakers
            if not os.path.exists(os.path.join(spk_embed_dir, f'{spk}_embedding.pt'))
        )
    else:
        print(f"Speaker embedding directory not found: {spk_embed_dir}")
        missing_spk_embedding = sorted(speakers)

    print("\n=== Validation Results ===")
    print(f"Total WAV files: {len(wav_files)}")
    print(f"Total speakers: {len(speakers)}")
    print(f"Missing text files: {len(missing['text'])}")
    print(f"Missing embedding files: {len(missing['embedding'])}")
    print(f"Missing token files: {len(missing['token'])}")
    print(f"Missing speaker embeddings: {len(missing_spk_embedding)}")

    for label, _ in required:
        if missing[label]:
            print(f"\nFirst 5 missing {label} files:")
            for path in missing[label][:5]:
                print(f"  {path}")

    if missing_spk_embedding:
        print("\nFirst 5 missing speaker embeddings:")
        for spk in missing_spk_embedding[:5]:
            print(f"  {spk}")

    return not any(missing.values())


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--src_dir', type=str, required=True,
                        help='Source directory to validate')
    args = parser.parse_args()

    # SystemExit works even when the interactive ``exit`` helper from the
    # ``site`` module is unavailable (e.g. ``python -S``).
    raise SystemExit(0 if validate_data(args.src_dir) else 1)
old mode 100644 new mode 100755