Instructions to use Kremon96/VoxCPM2 with libraries, inference providers, notebooks, and local apps. Follow these links to get started.
- Libraries
- VoxCPM
How to use Kremon96/VoxCPM2 with VoxCPM:
```python
import soundfile as sf
from voxcpm import VoxCPM

model = VoxCPM.from_pretrained("Kremon96/VoxCPM2")

wav = model.generate(
    text="VoxCPM is an innovative end-to-end TTS model from ModelBest, designed to generate highly expressive speech.",
    prompt_wav_path=None,  # optional: path to a prompt speech for voice cloning
    prompt_text=None,  # optional: reference text
    cfg_value=2.0,  # LM guidance on LocDiT, higher for better adherence to the prompt, but maybe worse
    inference_timesteps=10,  # LocDiT inference timesteps, higher for better result, lower for fast speed
    normalize=True,  # enable external TN tool
    denoise=True,  # enable external Denoise tool
    retry_badcase=True,  # enable retrying mode for some bad cases (unstoppable)
    retry_badcase_max_times=3,  # maximum retrying times
    retry_badcase_ratio_threshold=6.0,  # maximum length restriction for bad case detection (simple but effective), it could be adjusted for slow pace speech
)

sf.write("output.wav", wav, 16000)
print("saved: output.wav")
```
- Notebooks
- Google Colab
- Kaggle
| # ComfyUI-VoxCPMTTS | |
| # A clean, efficient ComfyUI custom node for VoxCPM TTS (Text-to-Speech) functionality. This implementation provides high-quality speech generation and voice cloning capabilities using the VoxCPM model. | |
| # | |
| # Models License Notice: | |
| # - VoxCPM1.5: Apache-2.0 License (https://huggingface.co/openbmb/VoxCPM1.5) | |
| # | |
| # This integration script follows GPL-3.0 License. | |
| # When using or modifying this code, please respect both the original model licenses | |
| # and this integration's license terms. | |
| # | |
| # Source: https://github.com/1038lab/ComfyUI-VoxCPMTTS | |
| import torch | |
| import numpy as np | |
| import os | |
| import folder_paths | |
| import comfy.model_management as model_management | |
| import soundfile as sf | |
| from huggingface_hub import snapshot_download | |
| from voxcpm import VoxCPM | |
# Repo id used when no model is requested; overridable via VOXCPM_MODEL_ID.
DEFAULT_MODEL_ID = os.environ.get("VOXCPM_MODEL_ID", "openbmb/VoxCPM1.5")
# Directory name for the default model; overridable via VOXCPM_MODEL_DIR,
# otherwise the last path component of the repo id.
DEFAULT_MODEL_DIRNAME = os.environ.get(
    "VOXCPM_MODEL_DIR",
    DEFAULT_MODEL_ID.rsplit("/", 1)[-1],
)

# Display name -> Hugging Face repo id.
MODEL_REPO_MAP = {
    "VoxCPM 1.5": "openbmb/VoxCPM1.5",
    "VoxCPM 0.5B": "openbmb/VoxCPM-0.5B",
}
# Display names in the order they are offered to the user.
MODEL_CHOICES = list(MODEL_REPO_MAP)

# The display name whose repo equals DEFAULT_MODEL_ID, or the first choice
# when the configured default is not one of the known repos.
DEFAULT_MODEL_CHOICE = next(
    filter(lambda choice: MODEL_REPO_MAP[choice] == DEFAULT_MODEL_ID, MODEL_CHOICES),
    MODEL_CHOICES[0],
)

# Loaded VoxCPM instances, keyed by (repo_id, target_dir).
_voxcpm_models = {}
# Lazily created speech-recognition model and the name of the backend that
# produced it ("faster_whisper" / "openai_whisper"); both start unset.
_asr_model = None
_asr_backend = None
def get_available_devices():
    """Return the device names offered to the user.

    The list always starts with "auto" and ends with "cpu"; "cuda" and
    "mps" are inserted in between (in that order) when torch reports the
    corresponding backend as available.
    """
    has_cuda = torch.cuda.is_available()
    # Older torch builds have no `torch.backends.mps` attribute at all.
    has_mps = hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
    accelerators = [
        name
        for name, present in (("cuda", has_cuda), ("mps", has_mps))
        if present
    ]
    return ["auto", *accelerators, "cpu"]
def get_best_device():
    """Pick the device used for the "auto" setting.

    Preference order: the device chosen by ComfyUI's model management when
    CUDA is available, then MPS, then CPU.
    """
    if torch.cuda.is_available():
        return model_management.get_torch_device()

    # Older torch builds have no `torch.backends.mps` attribute at all.
    mps_backend = getattr(torch.backends, "mps", None)
    if mps_backend is not None and mps_backend.is_available():
        return torch.device("mps")

    return torch.device("cpu")
def unload_model_instance(repo_id: str):
    """Drop every cached VoxCPM instance that was loaded from *repo_id*.

    Each matching instance has its ``tts_model`` (when present) moved to the
    CPU on a best-effort basis, is removed from the module-level cache, and
    finally ComfyUI is asked to release cached device memory.
    """
    # Cache keys are (repo_id, target_dir) tuples; collect first so the dict
    # is not mutated while it is being scanned.
    matching_keys = [key for key in _voxcpm_models if key[0] == repo_id]

    for key in matching_keys:
        cached = _voxcpm_models[key]
        try:
            if hasattr(cached, "tts_model"):
                cached.tts_model.to(torch.device("cpu"))
        except Exception:
            # Offloading is only a courtesy; the entry is dropped regardless.
            pass

    for key in matching_keys:
        _voxcpm_models.pop(key, None)

    model_management.soft_empty_cache()
def load_model(model_repo: str = DEFAULT_MODEL_ID, model_dirname: str = None):
    """Return a cached VoxCPM instance for *model_repo*, downloading it if needed.

    Args:
        model_repo: Hugging Face repo id; falls back to DEFAULT_MODEL_ID when
            empty.
        model_dirname: Directory name under ``<models_dir>/TTS`` to use for
            the weights. When omitted, the VOXCPM_MODEL_DIR environment
            variable is honoured for the default repo only, and every other
            repo uses the last component of its repo id.

    Returns:
        The VoxCPM instance stored in the module-level cache under
        ``(repo_id, target_dir)``.

    Raises:
        Exception: whatever ``snapshot_download`` or ``VoxCPM(...)`` raised;
            the error is logged and re-raised unchanged.
    """
    repo_id = model_repo or DEFAULT_MODEL_ID

    # The VOXCPM_MODEL_DIR override describes where the *default* model lives.
    # Applying it to every repo would make all models share one directory, so
    # a second model would find the folder non-empty, skip its download and
    # load the first model's weights.
    env_dirname = os.getenv("VOXCPM_MODEL_DIR")
    if model_dirname:
        dirname = model_dirname
    elif env_dirname and repo_id == DEFAULT_MODEL_ID:
        dirname = env_dirname
    else:
        dirname = repo_id.split("/")[-1]

    tts_dir = os.path.join(folder_paths.models_dir, "TTS")
    target_dir = os.path.join(tts_dir, dirname)
    cache_key = (repo_id, target_dir)

    if cache_key in _voxcpm_models:
        return _voxcpm_models[cache_key]

    # Download only when the target directory is missing or empty.
    if not os.path.exists(target_dir) or not os.listdir(target_dir):
        try:
            os.makedirs(tts_dir, exist_ok=True)
            os.makedirs(target_dir, exist_ok=True)
            snapshot_download(
                repo_id=repo_id,
                local_dir=target_dir,
                local_dir_use_symlinks=False
            )
        except Exception as e:
            print(f"[ERROR] Failed to download VoxCPM model: {str(e)}")
            raise

    try:
        _voxcpm_models[cache_key] = VoxCPM(
            voxcpm_model_path=target_dir,
            enable_denoiser=False,
            optimize=False,
        )
    except Exception as e:
        print(f"[ERROR] VoxCPM initialization failed: {str(e)}")
        raise

    return _voxcpm_models[cache_key]
class AILab_VoxCPMTTS:
    """Simplified VoxCPM text-to-speech node with fixed generation presets.

    Outputs ``(REFERENCE_TEXT, AUDIO)``: the reference transcript that was
    actually used (empty string when none) and the generated audio as a
    ``{"waveform", "sample_rate"}`` dict.
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node inputs; must be callable on the class itself."""
        available_devices = get_available_devices()
        return {
            "required": {
                "model": (MODEL_CHOICES, {"default": DEFAULT_MODEL_CHOICE, "tooltip": "Select VoxCPM model version"}),
                "text": ("STRING", {"multiline": True, "default": "Hello, this is VoxCPM TTS.", "tooltip": "Text to synthesize into speech"}),
            },
            "optional": {
                "reference_audio": ("AUDIO", {"tooltip": "Reference audio for voice cloning"}),
                "reference_text": ("STRING", {"multiline": True, "default": "", "tooltip": "Text transcript of reference audio (leave empty for auto-transcription)"}),
                "auto_transcribe_reference": ("BOOLEAN", {"default": False, "tooltip": "If reference text is empty, auto-run ASR to fill it"}),
                "unload_model": ("BOOLEAN", {"default": False, "tooltip": "Unload model after generation to free VRAM"}),
                "device": (available_devices, {"default": available_devices[0], "tooltip": "Device to run the model on"}),
                "seed": ("INT", {"default": -1, "min": -1, "max": 0xffffffffffffffff, "tooltip": "Random seed (-1 for random)"}),
            }
        }

    RETURN_TYPES = ("STRING", "AUDIO")
    RETURN_NAMES = ("REFERENCE_TEXT", "AUDIO")
    FUNCTION = "generate"
    CATEGORY = "🧪AILab/🔊TTS/VoxCPMTTS"

    @staticmethod
    def _resolve_device(device):
        """Map the user's device choice to a torch device (CPU as fallback)."""
        if device == "auto":
            return get_best_device()
        if device == "cuda" and torch.cuda.is_available():
            return model_management.get_torch_device()
        # "mps" is offered in the dropdown, so an explicit choice must be honoured.
        if device == "mps" and hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    @staticmethod
    def _prepare_prompt_waveform(waveform):
        """Reduce a reference waveform to a 3-D float CPU tensor with one batch entry."""
        # Index instead of squeeze(0): squeeze is a no-op on a leading
        # dimension larger than 1, which would make this loop spin forever.
        while waveform.dim() > 3:
            waveform = waveform[0]
        if waveform.dim() == 3:
            # Keep only the first batch entry.
            if waveform.shape[0] > 1:
                waveform = waveform[0:1, ...]
        elif waveform.dim() == 2:
            # Down-mix multiple rows to one by averaging.
            if waveform.shape[0] > 1:
                waveform = waveform.mean(dim=0, keepdim=True)
            waveform = waveform.unsqueeze(0)
        elif waveform.dim() == 1:
            waveform = waveform.unsqueeze(0).unsqueeze(0)
        else:
            waveform = waveform.reshape(1, 1, -1)
        return waveform.contiguous().float().cpu()

    @staticmethod
    def _remove_file(path):
        """Delete *path* if it exists, ignoring filesystem errors."""
        if path and os.path.exists(path):
            try:
                os.unlink(path)
            except OSError:
                pass

    @classmethod
    def _write_temp_wav(cls, waveform, sample_rate):
        """Write *waveform* to a new temporary .wav file and return its path.

        The caller owns the file and must delete it. If writing fails the
        temporary file is removed before the error propagates.
        """
        import tempfile

        samples = waveform.squeeze().cpu().numpy()
        if samples.ndim > 1:
            # Still multi-row after squeezing: use the first row only.
            samples = samples[0]
        # Reserve the name and close the handle before writing to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            path = tmp_file.name
        try:
            sf.write(path, samples, sample_rate)
        except Exception:
            cls._remove_file(path)
            raise
        return path

    @classmethod
    def _transcribe_reference(cls, waveform, sample_rate):
        """Transcribe the reference audio; return the text or None on failure.

        Prefers faster-whisper and falls back to openai-whisper. The loaded
        model is kept in the module-level ``_asr_model`` for later calls.
        """
        global _asr_model, _asr_backend
        tmp_wav = None
        try:
            tmp_wav = cls._write_temp_wav(waveform, sample_rate)
            if _asr_model is None:
                # Prefer faster-whisper; fallback to openai-whisper
                try:
                    from faster_whisper import WhisperModel  # type: ignore
                    asr_device = "cuda" if torch.cuda.is_available() else "cpu"
                    compute_type = "float16" if torch.cuda.is_available() else "int8"
                    model_name = os.environ.get("VOXCPM_ASR_MODEL", "small")
                    print(f"[ASR] Loading faster-whisper: {model_name} on {asr_device}/{compute_type}")
                    _asr_model = WhisperModel(model_name, device=asr_device, compute_type=compute_type)
                    _asr_backend = "faster_whisper"
                except Exception:
                    try:
                        import whisper  # type: ignore
                        model_name = os.environ.get("VOXCPM_ASR_MODEL", "small")
                        print(f"[ASR] Loading openai-whisper: {model_name}")
                        _asr_model = whisper.load_model(model_name)
                        _asr_backend = "openai_whisper"
                    except Exception as e_load:
                        print(f"[ASR] Failed to load ASR backends: {e_load}")
                        _asr_backend = None
            text_out = ""
            if _asr_backend == "faster_whisper":
                segments, _ = _asr_model.transcribe(tmp_wav, language=None, beam_size=1)
                text_out = " ".join([seg.text for seg in segments]).strip()
            elif _asr_backend == "openai_whisper":
                res = _asr_model.transcribe(tmp_wav)
                text_out = (res.get("text", "") or "").strip()
            print(f"[ASR] Transcription: {text_out[:120]}" + ("..." if len(text_out) > 120 else ""))
            return text_out or None
        except Exception as e:
            print(f"[ASR] Failed: {e}")
            return None
        finally:
            cls._remove_file(tmp_wav)

    def generate(self, model, text, device="auto", seed=-1, reference_audio=None, reference_text="", auto_transcribe_reference=False, unload_model=False):
        """Synthesize *text* and return ``(reference_text_used, audio_dict)``.

        Args:
            model: Display name from MODEL_CHOICES (or a raw repo id).
            text: Text to synthesize; must not be blank.
            device: "auto", "cuda", "mps" or "cpu"; unavailable choices fall
                back to CPU.
            seed: Torch RNG seed, or -1 to leave the RNG untouched.
            reference_audio: Optional dict with "waveform" and "sample_rate"
                used as the voice-cloning prompt.
            reference_text: Transcript of the reference audio.
            auto_transcribe_reference: Run ASR when no transcript is given.
            unload_model: Drop the model from the cache after generating.

        Raises:
            ValueError: if *text* is blank, or reference audio is given
                without a usable transcript.
        """
        # Validate first so a blank prompt does not trigger a model load.
        text = text.strip()
        if not text:
            raise ValueError("Please input text to synthesize.")

        # Hidden presets for simplified node
        cfg_value = 2.0
        inference_steps = 10
        max_length = 4096
        fade_in_ms = 20
        retry_attempts = 2
        retry_threshold = 8.0
        normalize = True

        if seed != -1:
            torch.manual_seed(seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed_all(seed)

        model_management.soft_empty_cache()
        repo_id = MODEL_REPO_MAP.get(model, MODEL_REPO_MAP.get(model.strip(), model))
        tts = load_model(model_repo=repo_id)
        # Output rate is chosen from the repo id: 44.1 kHz for "1.5", else 16 kHz.
        target_sr = 44100 if "1.5" in repo_id else 16000

        target_device = self._resolve_device(device)
        try:
            if hasattr(tts, "tts_model"):
                tts.tts_model.to(target_device)
                if hasattr(tts.tts_model, "audio_vae"):
                    tts.tts_model.audio_vae.to(target_device)
        except Exception as e:
            # Best effort: generation continues on whatever device the model is on.
            print(f"[WARN] Failed to move VoxCPM model to {target_device}: {e}")

        prompt_wav_path = None
        prompt_text = None
        tmp_prompt_file = None
        if reference_audio:
            prompt_waveform = self._prepare_prompt_waveform(reference_audio["waveform"])
            prompt_sample_rate = int(reference_audio["sample_rate"])
            if isinstance(reference_text, str) and reference_text.strip():
                prompt_text = reference_text.strip()
            elif auto_transcribe_reference:
                prompt_text = self._transcribe_reference(prompt_waveform, prompt_sample_rate)
            if prompt_text is None:
                raise ValueError("reference_text is required when providing reference_audio (or enable auto_transcribe_reference and ensure ASR succeeds)")
            try:
                tmp_prompt_file = self._write_temp_wav(prompt_waveform, prompt_sample_rate)
                prompt_wav_path = tmp_prompt_file
            except Exception as e:
                print(f"[WARN] Failed to write prompt waveform: {e}")
                prompt_wav_path = None

        try:
            wav = tts.generate(
                text=text,
                prompt_wav_path=prompt_wav_path,
                prompt_text=prompt_text,
                cfg_value=float(cfg_value),
                inference_timesteps=int(inference_steps),
                max_len=max_length,
                normalize=normalize,
                retry_badcase=retry_attempts > 0,
                retry_badcase_max_times=retry_attempts,
                retry_badcase_ratio_threshold=retry_threshold,
            )
        finally:
            self._remove_file(tmp_prompt_file)

        if isinstance(wav, np.ndarray):
            audio_tensor = torch.from_numpy(wav).float()
        else:
            audio_tensor = wav.float()
        # Bring the result to the CPU from any accelerator (CUDA or MPS).
        audio_tensor = audio_tensor.detach().cpu()
        if audio_tensor.dim() == 1:
            audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(0)
        elif audio_tensor.dim() == 2:
            audio_tensor = audio_tensor.unsqueeze(0)

        # Linear fade-in over the first samples; skipped on any error.
        try:
            n = int(max(0, fade_in_ms) * target_sr / 1000)
            if n > 0 and audio_tensor.shape[-1] > n:
                ramp = torch.linspace(0.0, 1.0, n, dtype=audio_tensor.dtype, device=audio_tensor.device)
                audio_tensor[..., :n] *= ramp
        except Exception as e:
            print(f"[WARN] Fade-in skipped: {e}")

        used_reference_text = prompt_text if isinstance(prompt_text, str) else ""
        result = (used_reference_text, {"waveform": audio_tensor, "sample_rate": target_sr})
        if unload_model:
            unload_model_instance(repo_id)
        return result
class AILab_VoxCPMTTS_Advanced:
    """VoxCPM text-to-speech node exposing all generation parameters.

    Outputs ``(AUDIO, STRING)``: the generated audio as a
    ``{"waveform", "sample_rate"}`` dict and the reference transcript that
    was actually used (empty string when none).
    """

    @classmethod
    def INPUT_TYPES(s):
        """Describe the node inputs; must be callable on the class itself."""
        available_devices = get_available_devices()
        return {
            "required": {
                "model": (MODEL_CHOICES, {"default": DEFAULT_MODEL_CHOICE, "tooltip": "Select VoxCPM model version"}),
                "text": ("STRING", {"multiline": True, "default": "Hello, this is VoxCPM TTS.", "tooltip": "Text to synthesize into speech"}),
                "cfg_value": ("FLOAT", {"default": 2.0, "min": 1.0, "max": 10.0, "step": 0.1, "tooltip": "Guidance scale: higher = more adherence to prompt, lower = more natural"}),
                "inference_steps": ("INT", {"default": 10, "min": 1, "max": 100, "step": 1, "tooltip": "Number of diffusion steps: higher = better quality, lower = faster"}),
                "max_length": ("INT", {"default": 4096, "min": 256, "max": 8192, "step": 256, "tooltip": "Maximum token length during generation"}),
                "normalize": ("BOOLEAN", {"default": True, "tooltip": "Enable text normalization for numbers, punctuation, etc."}),
            },
            "optional": {
                "reference_audio": ("AUDIO", {"tooltip": "Reference audio for voice cloning"}),
                "reference_text": ("STRING", {"multiline": True, "default": "", "tooltip": "Text transcript of reference audio (leave empty for auto-transcription)"}),
                "fade_in_ms": ("INT", {"default": 20, "min": 0, "max": 1000, "step": 5, "tooltip": "Fade-in duration to reduce initial artifacts (0-1000ms)"}),
                "auto_transcribe_reference": ("BOOLEAN", {"default": False, "tooltip": "Automatically transcribe reference audio when no text is provided"}),
                "show_transcription_log": ("BOOLEAN", {"default": True, "tooltip": "Show ASR transcription logs in console"}),
                "unload_model": ("BOOLEAN", {"default": False, "tooltip": "Unload model after generation to free VRAM"}),
                "retry_attempts": ("INT", {"default": 2, "min": 0, "max": 10, "step": 1, "tooltip": "Number of retry attempts for bad cases"}),
                "retry_threshold": ("FLOAT", {"default": 8.0, "min": 2.0, "max": 20.0, "step": 0.1, "tooltip": "Audio-to-text ratio threshold for retry detection"}),
                "device": (available_devices, {"default": available_devices[0], "tooltip": "Device to run the model on"}),
                "seed": ("INT", {"default": -1, "min": -1, "max": 0xffffffffffffffff, "tooltip": "Random seed (-1 for random)"}),
            }
        }

    RETURN_TYPES = ("AUDIO", "STRING")
    FUNCTION = "generate"
    CATEGORY = "🧪AILab/🔊TTS/VoxCPMTTS"

    @staticmethod
    def _resolve_device(device):
        """Map the user's device choice to a torch device (CPU as fallback)."""
        if device == "auto":
            return get_best_device()
        if device == "cuda" and torch.cuda.is_available():
            return model_management.get_torch_device()
        # "mps" is offered in the dropdown, so an explicit choice must be honoured.
        if device == "mps" and hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            return torch.device("mps")
        return torch.device("cpu")

    @staticmethod
    def _prepare_prompt_waveform(waveform):
        """Reduce a reference waveform to a 3-D float CPU tensor with one batch entry."""
        # Index instead of squeeze(0): squeeze is a no-op on a leading
        # dimension larger than 1, which would make this loop spin forever.
        while waveform.dim() > 3:
            waveform = waveform[0]
        if waveform.dim() == 3:
            # Keep only the first batch entry.
            if waveform.shape[0] > 1:
                waveform = waveform[0:1, ...]
        elif waveform.dim() == 2:
            # Down-mix multiple rows to one by averaging.
            if waveform.shape[0] > 1:
                waveform = waveform.mean(dim=0, keepdim=True)
            waveform = waveform.unsqueeze(0)
        elif waveform.dim() == 1:
            waveform = waveform.unsqueeze(0).unsqueeze(0)
        else:
            waveform = waveform.reshape(1, 1, -1)
        return waveform.contiguous().float().cpu()

    @staticmethod
    def _remove_file(path):
        """Delete *path* if it exists, ignoring filesystem errors."""
        if path and os.path.exists(path):
            try:
                os.unlink(path)
            except OSError:
                pass

    @classmethod
    def _write_temp_wav(cls, waveform, sample_rate):
        """Write *waveform* to a new temporary .wav file and return its path.

        The caller owns the file and must delete it. If writing fails the
        temporary file is removed before the error propagates.
        """
        import tempfile

        samples = waveform.squeeze().cpu().numpy()
        if samples.ndim > 1:
            # Still multi-row after squeezing: use the first row only.
            samples = samples[0]
        # Reserve the name and close the handle before writing to it.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
            path = tmp_file.name
        try:
            sf.write(path, samples, sample_rate)
        except Exception:
            cls._remove_file(path)
            raise
        return path

    @classmethod
    def _transcribe_reference(cls, waveform, sample_rate, verbose=True):
        """Transcribe the reference audio; return the text or None on failure.

        Prefers faster-whisper and falls back to openai-whisper. The loaded
        model is kept in the module-level ``_asr_model`` for later calls.
        Console output is emitted only when *verbose* is true.
        """
        global _asr_model, _asr_backend
        tmp_wav = None
        try:
            tmp_wav = cls._write_temp_wav(waveform, sample_rate)
            if _asr_model is None:
                try:
                    from faster_whisper import WhisperModel  # type: ignore
                    asr_device = "cuda" if torch.cuda.is_available() else "cpu"
                    compute_type = "float16" if torch.cuda.is_available() else "int8"
                    model_name = os.environ.get("VOXCPM_ASR_MODEL", "small")
                    if verbose:
                        print(f"[ASR] Loading faster-whisper: {model_name} on {asr_device}/{compute_type}")
                    _asr_model = WhisperModel(model_name, device=asr_device, compute_type=compute_type)
                    _asr_backend = "faster_whisper"
                except Exception:
                    try:
                        import whisper  # type: ignore
                        model_name = os.environ.get("VOXCPM_ASR_MODEL", "small")
                        if verbose:
                            print(f"[ASR] Loading openai-whisper: {model_name}")
                        _asr_model = whisper.load_model(model_name)
                        _asr_backend = "openai_whisper"
                    except Exception as e_load:
                        if verbose:
                            print(f"[ASR] Failed to load ASR backends: {e_load}")
                        _asr_backend = None
            text_out = ""
            if _asr_backend == "faster_whisper":
                segments, _ = _asr_model.transcribe(tmp_wav, language=None, beam_size=1)
                text_out = " ".join([seg.text for seg in segments]).strip()
            elif _asr_backend == "openai_whisper":
                res = _asr_model.transcribe(tmp_wav)
                text_out = (res.get("text", "") or "").strip()
            if verbose:
                print(f"[ASR] Transcription: {text_out[:120]}" + ("..." if len(text_out) > 120 else ""))
            return text_out or None
        except Exception as e:
            if verbose:
                print(f"[ASR] Failed: {e}")
            return None
        finally:
            cls._remove_file(tmp_wav)

    def generate(
        self,
        model,
        text,
        cfg_value,
        inference_steps,
        max_length,
        normalize,
        seed=-1,
        device="auto",
        retry_attempts=2,
        retry_threshold=8.0,
        reference_audio=None,
        reference_text="",
        fade_in_ms=20,
        auto_transcribe_reference=False,
        show_transcription_log=True,
        unload_model=False,
    ):
        """Synthesize *text* and return ``(audio_dict, reference_text_used)``.

        Args:
            model: Display name from MODEL_CHOICES (or a raw repo id).
            text: Text to synthesize; must not be blank.
            cfg_value: Guidance scale forwarded as ``cfg_value``.
            inference_steps: Forwarded as ``inference_timesteps``.
            max_length: Forwarded as ``max_len``.
            normalize: Forwarded as ``normalize``.
            seed: Torch RNG seed, or -1 to leave the RNG untouched.
            device: "auto", "cuda", "mps" or "cpu"; unavailable choices fall
                back to CPU.
            retry_attempts: Max retries; 0 disables retrying.
            retry_threshold: Forwarded as ``retry_badcase_ratio_threshold``.
            reference_audio: Optional dict with "waveform" and "sample_rate"
                used as the voice-cloning prompt.
            reference_text: Transcript of the reference audio.
            fade_in_ms: Length of the linear fade-in applied to the output.
            auto_transcribe_reference: Run ASR when no transcript is given.
            show_transcription_log: Print ASR/prompt diagnostics.
            unload_model: Drop the model from the cache after generating.

        Raises:
            ValueError: if *text* is blank, or reference audio is given
                without a usable transcript.
        """
        # Validate first so a blank prompt does not trigger a model load.
        text = text.strip()
        if not text:
            raise ValueError("Please input text to synthesize.")

        if seed != -1:
            torch.manual_seed(seed)
            if torch.cuda.is_available():
                torch.cuda.manual_seed_all(seed)

        model_management.soft_empty_cache()
        repo_id = MODEL_REPO_MAP.get(model, MODEL_REPO_MAP.get(model.strip(), model))
        tts = load_model(model_repo=repo_id)
        # Output rate is chosen from the repo id: 44.1 kHz for "1.5", else 16 kHz.
        target_sr = 44100 if "1.5" in repo_id else 16000

        target_device = self._resolve_device(device)
        try:
            if hasattr(tts, "tts_model"):
                tts.tts_model.to(target_device)
                if hasattr(tts.tts_model, "audio_vae"):
                    tts.tts_model.audio_vae.to(target_device)
        except Exception as e:
            # Best effort: generation continues on whatever device the model is on.
            print(f"[WARN] Failed to move VoxCPM model to {target_device}: {e}")

        prompt_wav_path = None
        prompt_text = None
        tmp_prompt_file = None
        if reference_audio:
            prompt_waveform = self._prepare_prompt_waveform(reference_audio["waveform"])
            prompt_sample_rate = int(reference_audio["sample_rate"])
            if isinstance(reference_text, str) and reference_text.strip():
                prompt_text = reference_text.strip()
            elif auto_transcribe_reference:
                prompt_text = self._transcribe_reference(
                    prompt_waveform, prompt_sample_rate, verbose=show_transcription_log
                )
            if prompt_text is None:
                if show_transcription_log:
                    print("[ERROR] reference_text is required when providing reference_audio (or enable auto_transcribe_reference and ensure ASR succeeds)")
                raise ValueError("reference_text is required when providing reference_audio (or enable auto_transcribe_reference and ensure ASR succeeds)")
            try:
                tmp_prompt_file = self._write_temp_wav(prompt_waveform, prompt_sample_rate)
                prompt_wav_path = tmp_prompt_file
            except Exception as e:
                if show_transcription_log:
                    print(f"[WARN] Failed to write prompt waveform: {e}")
                prompt_wav_path = None

        try:
            wav = tts.generate(
                text=text,
                prompt_wav_path=prompt_wav_path,
                prompt_text=prompt_text,
                cfg_value=float(cfg_value),
                inference_timesteps=int(inference_steps),
                max_len=max_length,
                normalize=normalize,
                retry_badcase=retry_attempts > 0,
                retry_badcase_max_times=retry_attempts,
                retry_badcase_ratio_threshold=retry_threshold,
            )
        finally:
            self._remove_file(tmp_prompt_file)

        if isinstance(wav, np.ndarray):
            audio_tensor = torch.from_numpy(wav).float()
        else:
            audio_tensor = wav.float()
        # Bring the result to the CPU from any accelerator (CUDA or MPS).
        audio_tensor = audio_tensor.detach().cpu()
        if audio_tensor.dim() == 1:
            audio_tensor = audio_tensor.unsqueeze(0).unsqueeze(0)
        elif audio_tensor.dim() == 2:
            audio_tensor = audio_tensor.unsqueeze(0)

        # Linear fade-in over the first samples; skipped on any error.
        try:
            n = int(max(0, fade_in_ms) * target_sr / 1000)
            if n > 0 and audio_tensor.shape[-1] > n:
                ramp = torch.linspace(0.0, 1.0, n, dtype=audio_tensor.dtype, device=audio_tensor.device)
                audio_tensor[..., :n] *= ramp
        except Exception as e:
            print(f"[WARN] Fade-in skipped: {e}")

        used_reference_text = prompt_text if isinstance(prompt_text, str) else ""
        result = ({"waveform": audio_tensor, "sample_rate": target_sr}, used_reference_text)
        if unload_model:
            unload_model_instance(repo_id)
        return result
# One row per node: (node id, implementing class, display name).
_NODE_TABLE = (
    ("AILab_VoxCPMTTS", AILab_VoxCPMTTS, "VoxCPM TTS"),
    ("AILab_VoxCPMTTS_Advanced", AILab_VoxCPMTTS_Advanced, "VoxCPM TTS (Advanced)"),
)

# Node id -> implementing class.
NODE_CLASS_MAPPINGS = {node_id: node_cls for node_id, node_cls, _ in _NODE_TABLE}

# Node id -> human-readable display name.
NODE_DISPLAY_NAME_MAPPINGS = {node_id: label for node_id, _, label in _NODE_TABLE}