"""Streaming text-to-speech demo for VoxCPM.

Loads a local VoxCPM checkpoint, synthesizes one sentence through the
streaming API with a voice prompt, reports how long generation took, and
writes the concatenated audio to a WAV file.
"""

import os
import time

import numpy as np
import soundfile as sf

# These variables are set before ``voxcpm`` is imported, as in the original
# script. NOTE(review): presumably the package reads them at import time to
# select the AX inference backend and locate the axmodel files -- confirm.
os.environ['AX_INFER'] = "True"
os.environ["AXMODEL_DIR"] = "./"

from voxcpm import VoxCPM  # noqa: E402  (must follow the os.environ setup)

# Sample rate passed to soundfile when saving.
# NOTE(review): hard-coded to 16 kHz; confirm it matches the model's output.
SAMPLE_RATE = 16000

model = VoxCPM.from_pretrained(
    "./VoxCPM-0.5B",
    zipenhancer_model_id="iic/speech_zipenhancer_ans_multiloss_16k_base",
)

# perf_counter() is monotonic, so the measured interval cannot be distorted
# by system clock adjustments the way time.time() can.
t1 = time.perf_counter()

chunks = []
for chunk in model.generate_streaming(
    text="Streaming text to speech is easy with VoxCPM!",
    prompt_wav_path="assets/en_woman1.mp3",
    prompt_text="But many of these southern girls have the same trouble, said Holly.",
    cfg_value=2.0,
    inference_timesteps=10,
    normalize=True,
    denoise=True,
    retry_badcase=True,
    retry_badcase_max_times=3,
    retry_badcase_ratio_threshold=6.0,
):
    chunks.append(chunk)

# np.concatenate() rejects an empty sequence with an unhelpful message, so
# fail early with the same exception type and a clear explanation.
if not chunks:
    raise ValueError("model.generate_streaming() yielded no audio chunks")
wav = np.concatenate(chunks)

# The timed region covers streaming generation plus concatenation.
t2 = time.perf_counter()
print(f"use time {t2-t1} s")

output_path = "output_streaming_ax.wav"
sf.write(output_path, wav, SAMPLE_RATE)
print(f"saved: {output_path}")