| | import torch |
| | import torchaudio |
| | import argparse |
| | import os |
| | import sys |
| | from tqdm import tqdm |
| | from underthesea import sent_tokenize |
| |
|
| | from TTS.tts.configs.xtts_config import XttsConfig |
| | from TTS.tts.models.xtts import Xtts |
| |
|
def _build_arg_parser():
    """Create the command-line parser for the XTTS synthesis script."""
    parser = argparse.ArgumentParser(description='Text-to-Speech using XTTS model')
    parser.add_argument('--text', '-t', type=str, required=True,
                        help='Text to synthesize')
    parser.add_argument('--speaker', '-s', type=str, required=True,
                        help='Path to speaker audio file')
    parser.add_argument('--language', '-l', type=str, required=True,
                        help='Language code (e.g., "multi", "en", "es", etc.)')
    parser.add_argument('--output', '-o', type=str, default='output.wav',
                        help='Output audio file name (default: output.wav)')
    parser.add_argument('--model-checkpoint', type=str,
                        default='../export_checkpoint/best_model.pth',
                        help='Path to model checkpoint')
    parser.add_argument('--model-config', type=str,
                        default='../export_checkpoint/XTTS_v2.0_original_model_files/config.json',
                        help='Path to model config file')
    parser.add_argument('--model-vocab', type=str,
                        default='../export_checkpoint/XTTS_v2.0_original_model_files/vocab.json',
                        help='Path to model vocabulary file')
    return parser


def _find_missing_files(args):
    """Return one error message per required input path that does not exist."""
    required = (
        ("Speaker audio file", args.speaker),
        ("Model checkpoint", args.model_checkpoint),
        ("Model config", args.model_config),
        ("Model vocab", args.model_vocab),
    )
    return [
        f"Error: {label} not found: {path}"
        for label, path in required
        if not os.path.exists(path)
    ]


def _fail(*messages):
    """Print each message to stderr and terminate the process with status 1."""
    for message in messages:
        print(message, file=sys.stderr)
    sys.exit(1)


def main(argv=None):
    """Synthesize speech for the given text with an XTTS model and save it.

    The text is split into sentences, each sentence is synthesized with the
    voice conditioned on the speaker audio file, and the resulting chunks are
    concatenated and written to the output path as 16-bit PCM audio.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). When None, argparse reads them from ``sys.argv``.

    Raises:
        SystemExit: With status 1 when a required input file is missing or
            the text contains nothing to synthesize; argparse also exits on
            invalid arguments.
    """
    args = _build_arg_parser().parse_args(argv)

    # Report every missing input at once so the user can fix them in one go.
    missing = _find_missing_files(args)
    if missing:
        _fail(*missing)

    # Tokenize before loading the model: it is cheap, and an empty input
    # should fail fast instead of crashing later in torch.cat on an empty list.
    sentences = [s for s in sent_tokenize(args.text) if s.strip()]
    if not sentences:
        _fail("Error: No text to synthesize")

    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Using device: {device}")

    print("Loading model...")
    config = XttsConfig()
    config.load_json(args.model_config)
    model = Xtts.init_from_config(config)
    model.load_checkpoint(
        config,
        checkpoint_path=args.model_checkpoint,
        vocab_path=args.model_vocab,
        use_deepspeed=False,
    )
    model.to(device)
    print("Model loaded successfully!")

    print("Processing speaker audio...")
    gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
        audio_path=args.speaker,
        gpt_cond_len=model.config.gpt_cond_len,
        max_ref_length=model.config.max_ref_len,
        sound_norm_refs=model.config.sound_norm_refs,
    )

    print(f"Processing {len(sentences)} sentences...")

    wav_chunks = []
    for sentence in tqdm(sentences, desc="Generating audio"):
        result = model.inference(
            text=sentence,
            language=args.language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            temperature=0.1,
            length_penalty=1.0,
            repetition_penalty=10.0,
            top_k=10,
            top_p=0.3,
        )
        wav_chunks.append(torch.tensor(result["wav"]))

    # Join the per-sentence waveforms end to end and add a leading dimension
    # before handing the tensor to torchaudio.save.
    out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()

    # Create the destination directory up front so a missing folder does not
    # throw away the synthesis work at the very last step.
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    print(f"Saving audio to: {args.output}")
    torchaudio.save(
        args.output,
        out_wav,
        model.config.audio.output_sample_rate,
        encoding="PCM_S",
        bits_per_sample=16,
    )

    print("Audio generation completed successfully!")
| |
|
# Run the synthesis CLI only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()