"""IndexTTS2 model-family handler.

Declares the IndexTTS2 model definition and download manifest, builds the
pipeline in ``family_handler.load_model`` and validates prompts and generation
settings before a run.
"""

import math
import os
import re

import torch

from shared.mps import mps_device_or
from shared.utils import files_locator as fl

from .prompt_enhancers import TTS_MONOLOGUE_PROMPT, TTS_QWEN3_DIALOGUE_PROMPT

INDEX_TTS2_REPO_ID = "DeepBeepMeep/TTS"
INDEX_TTS2_FOLDER = "index_tts2"
INDEX_TTS2_MAIN_GPT_FILENAME = "index_tts2_gpt_fp16.safetensors"
INDEX_TTS2_QWEN_EMO_FOLDER = "qwen0.6bemo4-merge"
INDEX_TTS2_BIGVGAN_FOLDER = "bigvgan_v2_22khz_80band_256x"
INDEX_TTS2_W2V_BERT_FOLDER = "w2v-bert-2.0"
INDEX_TTS2_BIGVGAN_FILES = [
    "config.json",
    "bigvgan_generator.pt",
]
INDEX_TTS2_W2V_BERT_FILES = [
    "config.json",
    "preprocessor_config.json",
    "model_fp16.safetensors",
]
INDEX_TTS2_SHOW_LOAD_LOGS = False
INDEX_TTS2_ROOT_FILES = [
    "bpe.model",
    "feat1.pt",
    "feat2.pt",
    "s2mel.safetensors",
    "wav2vec2bert_stats.pt",
    "campplus_cn_common.bin",
    "index_tts2_semantic_codec.safetensors",
]
INDEX_TTS2_QWEN_EMO_FILES = [
    "Modelfile",
    "added_tokens.json",
    "chat_template.jinja",
    "config.json",
    "generation_config.json",
    "merges.txt",
    "model.safetensors",
    "special_tokens_map.json",
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.json",
]
INDEX_TTS2_DURATION_SLIDER = {
    "label": "Max duration (seconds)",
    "min": 1,
    "max": 600,
    "increment": 1,
    "default": 25,
}
INDEX_TTS2_AUDIO_PROMPT_TYPES = {
    "selection": ["A", "AB", "AB2"],
    "labels": {
        "A": "Voice cloning (1 reference audio)",
        "AB": "Voice + emotion (2 reference audios)",
        "AB2": "Dialogue (2 speaker reference audios)",
    },
    "letters_filter": "AB2",
    "default": "A",
}
INDEX_TTS2_AUTO_SPLIT_SETTING_ID = "auto_split_every_s"
INDEX_TTS2_AUTO_SPLIT_MIN_SECONDS = 5.0
INDEX_TTS2_AUTO_SPLIT_MAX_SECONDS = 90.0
INDEX_TTS2_CUSTOM_SETTINGS = []

# Matches dialogue tags such as "Speaker 1:" and captures the numeric speaker id.
_SPEAKER_TAG_RE = re.compile(r"Speaker\s*(\d+)\s*:", flags=re.IGNORECASE)


def _get_index_tts2_model_def():
    """Build a fresh IndexTTS2 model-definition dictionary.

    The slider, audio-prompt and custom-settings entries are copied from the
    module constants on every call (shallow copies for the dictionaries).
    """
    return {
        "audio_only": True,
        "image_outputs": False,
        "sliding_window": False,
        "guidance_max_phases": 0,
        "no_negative_prompt": True,
        "inference_steps": False,
        "temperature": True,
        "top_p_slider": True,
        "top_k_slider": True,
        "image_prompt_types_allowed": "",
        "supports_early_stop": True,
        "profiles_dir": ["index_tts2"],
        "duration_slider": dict(INDEX_TTS2_DURATION_SLIDER),
        "any_audio_prompt": True,
        "audio_prompt_choices": True,
        "audio_prompt_type_sources": dict(INDEX_TTS2_AUDIO_PROMPT_TYPES),
        "custom_settings": [one.copy() for one in INDEX_TTS2_CUSTOM_SETTINGS],
        "preserve_empty_prompt_lines": True,
        "pause_between_sentences": True,
        "audio_guide_label": "Speaker reference voice",
        "audio_guide2_label": "Speaker 2 voice / emotion reference (optional)",
        "alt_prompt": {
            "label": "Default Emotion Instruction (if none, emotion will be detected or set manually for each sentence)",
            "name": "Default Emotion Instruction",
            "placeholder": "happy,angry,sad,afraid,disgusted,melancholic,surprised,calm",
            "lines": 2,
        },
        "text_prompt_enhancer_instructions": TTS_MONOLOGUE_PROMPT,
        "text_prompt_enhancer_instructions1": TTS_QWEN3_DIALOGUE_PROMPT,
        "text_prompt_enhancer_max_tokens": 512,
        "text_prompt_enhancer_max_tokens1": 512,
        "prompt_enhancer_def": {
            "selection": ["T", "T1"],
            "labels": {
                "T": "A Speech based on current Prompt",
                "T1": "A Dialogue between two People based on current Prompt",
            },
            "default": "T",
        },
        "prompt_enhancer_button_label": "Write",
        "lm_engines": ["legacy", "cg", "vllm"],
        "compile": False,
    }


def _get_index_tts2_download_def():
    """Return the download manifest: repo id, source folders and their file lists.

    ``sourceFolderList`` and ``fileList`` are parallel lists; entry *i* of one
    belongs with entry *i* of the other.
    """
    return {
        "repoId": INDEX_TTS2_REPO_ID,
        # IndexTTS2 configs are bundled with source code in models/TTS/index_tts2/configs.
        "sourceFolderList": [
            INDEX_TTS2_FOLDER,
            INDEX_TTS2_QWEN_EMO_FOLDER,
            INDEX_TTS2_BIGVGAN_FOLDER,
            INDEX_TTS2_W2V_BERT_FOLDER,
        ],
        "fileList": [
            INDEX_TTS2_ROOT_FILES,
            INDEX_TTS2_QWEN_EMO_FILES,
            INDEX_TTS2_BIGVGAN_FILES,
            INDEX_TTS2_W2V_BERT_FILES,
        ],
    }


def _resolve_w2v_bert_dir():
    """Return the w2v-bert folder path, or ``None`` when it cannot be found.

    The files locator is asked first; the folder directly under the download
    location is used as a fallback.
    """
    located = fl.locate_folder(INDEX_TTS2_W2V_BERT_FOLDER, error_if_none=False)
    if located is not None:
        return located
    fallback = os.path.join(fl.get_download_location(), INDEX_TTS2_W2V_BERT_FOLDER)
    return fallback if os.path.isdir(fallback) else None


def _ensure_w2v_bert_fp16_file():
    """Return the path of the fp16 w2v-bert weights file.

    Raises:
        FileNotFoundError: If the w2v-bert folder cannot be resolved, or if
            ``model_fp16.safetensors`` is not present inside it.
    """
    w2v_dir = _resolve_w2v_bert_dir()
    if w2v_dir is None:
        raise FileNotFoundError(
            f"IndexTTS2 semantic folder '{INDEX_TTS2_W2V_BERT_FOLDER}' is missing. "
            "WanGP must download it from DeepBeepMeep/TTS."
        )
    fp16_path = os.path.join(w2v_dir, "model_fp16.safetensors")
    if not os.path.isfile(fp16_path):
        # The fp16 file is part of the download manifest, so a missing file is
        # reported explicitly instead of falling through to a broken load call.
        raise FileNotFoundError(
            f"IndexTTS2 semantic model file is missing at '{fp16_path}'. "
            f"Expected {INDEX_TTS2_REPO_ID}/{INDEX_TTS2_W2V_BERT_FOLDER}/model_fp16.safetensors."
        )
    return fp16_path


def _select_gpt_weights_path(model_filename):
    """Pick the GPT weights path to load, or ``None`` when nothing is available.

    ``model_filename`` may be a single name or a list/tuple, in which case only
    the first entry is considered. The default GPT filename is the fallback.
    When the selected file name contains ``_quanto_`` and its non-quantized
    sibling can be located, the sibling is returned instead.
    """
    if isinstance(model_filename, (list, tuple)):
        weights_candidate = model_filename[0] if len(model_filename) > 0 else None
    else:
        weights_candidate = model_filename

    gpt_weights_path = None
    if weights_candidate:
        # Keep the raw candidate when the locator cannot resolve it.
        gpt_weights_path = fl.locate_file(weights_candidate, error_if_none=False) or weights_candidate
    if gpt_weights_path is None:
        gpt_weights_path = fl.locate_file(INDEX_TTS2_MAIN_GPT_FILENAME, error_if_none=False)
    if gpt_weights_path is None:
        return None

    gpt_name = os.path.basename(gpt_weights_path)
    if "_quanto_" in gpt_name:
        non_quanto_name = gpt_name.replace("_quanto_fp16_int8", "_fp16").replace("_quanto_int8", "")
        non_quanto_path = fl.locate_file(non_quanto_name, error_if_none=False)
        if non_quanto_path is not None:
            return non_quanto_path
    return gpt_weights_path


def _quantized_filename(filename):
    """Derive the quantized-weights file name for ``filename``.

    Names that already contain ``quanto`` are returned unchanged.
    """
    if "quanto" in filename:
        return filename
    if "_fp16" in filename:
        return filename.replace("_fp16", "_quanto_fp16_int8")
    dot_pos = filename.rfind(".")
    if dot_pos >= 0:
        return f"{filename[:dot_pos]}_quanto_int8{filename[dot_pos:]}"
    return f"{filename}_quanto_int8.safetensors"


class family_handler:
    """Static hooks describing, loading and validating the IndexTTS2 model."""

    @staticmethod
    def query_supported_types():
        """Return the list of model types handled here."""
        return ["index_tts2"]

    @staticmethod
    def query_family_maps():
        """Return two empty mappings (no family maps are declared)."""
        return {}, {}

    @staticmethod
    def query_model_family():
        """Return the family identifier, ``"tts"``."""
        return "tts"

    @staticmethod
    def query_family_infos():
        """Return the family info mapping for the ``"tts"`` family."""
        return {"tts": (200, "TTS")}

    @staticmethod
    def register_lora_cli_args(parser, lora_root):
        """Register the ``--lora-dir-index-tts2`` option on ``parser``."""
        parser.add_argument(
            "--lora-dir-index-tts2",
            type=str,
            default=None,
            help=f"Path to a directory that contains IndexTTS2 settings (default: {os.path.join(lora_root, 'index_tts2')})",
        )

    @staticmethod
    def get_lora_dir(base_model_type, args, lora_root):
        """Return the CLI-provided directory, or ``<lora_root>/index_tts2``."""
        return getattr(args, "lora_dir_index_tts2", None) or os.path.join(lora_root, "index_tts2")

    @staticmethod
    def query_model_def(base_model_type, model_def):
        """Return the IndexTTS2 model definition (both arguments are unused)."""
        return _get_index_tts2_model_def()

    @staticmethod
    def query_model_files(computeList, base_model_type, model_def=None):
        """Return the IndexTTS2 download manifest (all arguments are unused)."""
        return _get_index_tts2_download_def()

    @staticmethod
    def load_model(
        model_filename,
        model_type,
        base_model_type,
        model_def,
        quantizeTransformer=False,
        text_encoder_quantization=None,
        dtype=None,
        VAE_dtype=None,
        mixed_precision_transformer=False,
        save_quantized=False,
        submodel_no_list=None,
        text_encoder_filename=None,
        profile=0,
        lm_decoder_engine="legacy",
        **kwargs,
    ):
        """Build the IndexTTS2 pipeline and its load definition.

        Only ``model_filename``, ``save_quantized``, ``profile`` and
        ``lm_decoder_engine`` are read; the remaining parameters are accepted
        but not used by this method.

        Returns:
            A ``(pipeline, load_def)`` tuple, where ``load_def`` holds the
            ``pipe`` module mapping, an empty ``coTenantsMap`` and, for
            profiles 2, 4 and 5, a ``budgets`` entry.

        Raises:
            FileNotFoundError: If no GPT weights file can be determined.
        """
        from .index_tts2.pipeline import IndexTTS2Pipeline

        # _ensure_w2v_bert_fp16_file()
        gpt_weights_path = _select_gpt_weights_path(model_filename)
        if gpt_weights_path is None:
            raise FileNotFoundError(
                f"IndexTTS2 main transformer file '{INDEX_TTS2_MAIN_GPT_FILENAME}' is missing. "
                "It must be provided in defaults model.URLs."
            )

        runtime_device = mps_device_or(torch.device("cpu"))
        pipeline = IndexTTS2Pipeline(
            ckpt_root=fl.get_download_location(),
            device=runtime_device,
            gpt_weights_path=gpt_weights_path,
            show_load_logs=INDEX_TTS2_SHOW_LOAD_LOGS,
            lm_decoder_engine=lm_decoder_engine,
        )
        if torch.cuda.is_available():
            pipeline.model.device = "cuda:0"

        pipe = {
            "transformer": pipeline.model.gpt,
            "transformer2": pipeline.model.s2mel,
            "vocoder": pipeline.model.bigvgan,
            "semantic_model": pipeline.model.semantic_model,
            "campplus_model": pipeline.model.campplus_model,
            "qwen_emo_model": pipeline.model.qwen_emo.model,
        }
        if str(lm_decoder_engine).strip().lower() in ("cg", "cudagraph", "vllm"):
            # NOTE(review): presumably a zero budget keeps the GPT fully resident
            # for these engines -- confirm against the offload library.
            pipe["transformer"]._budget = 0

        load_def = {
            "pipe": pipe,
            "coTenantsMap": {},
        }
        if int(profile) in (2, 4, 5):
            load_def["budgets"] = {"transformer2": 250}

        if save_quantized and gpt_weights_path:
            from mmgp import offload

            quant_filename = _quantized_filename(os.path.basename(gpt_weights_path))
            # NOTE(review): this existence check is applied whether or not the
            # name was rewritten above; the nesting was rebuilt from
            # whitespace-collapsed source -- confirm.
            if fl.locate_file(quant_filename, error_if_none=False) is None:
                quant_path = os.path.join(fl.get_download_location(), quant_filename)
                offload.save_model(pipeline.model.gpt, quant_path, do_quantize=True, config_file_path=None)

        return pipeline, load_def

    @staticmethod
    def fix_settings(base_model_type, settings_version, model_def, ui_defaults):
        """Fill in ``alt_prompt`` and ``audio_prompt_type`` when absent (in place)."""
        if "alt_prompt" not in ui_defaults:
            ui_defaults["alt_prompt"] = ""
        defaults = {
            "audio_prompt_type": "A",
        }
        for key, value in defaults.items():
            ui_defaults.setdefault(key, value)

    @staticmethod
    def update_default_settings(base_model_type, model_def, ui_defaults):
        """Overwrite ``ui_defaults`` (in place) with the IndexTTS2 default settings.

        ``duration_seconds`` comes from the model definition's duration slider
        default and falls back to 25.
        """
        duration_def = model_def.get("duration_slider", {})
        ui_defaults.update(
            {
                "prompt": (
                    "[fear] At the very beginning I was so afraid to speak.\n"
                    "[sadness] Nobody would talk to me. I felt so alone.\n"
                    "[disgust] They would just ignore me and pretend that I didnt exist\n"
                    "[happy] By chance I discovered this wonderful App, and now everything is different.\n"
                    "[anger] I have a new voice and now everybody will have no choice but to listen to my words !!!"
                ),
                "audio_prompt_type": "A",
                "alt_prompt": "",
                "repeat_generation": 1,
                "duration_seconds": duration_def.get("default", 25),
                "pause_seconds": 0.2,
                "video_length": 0,
                "num_inference_steps": 0,
                "negative_prompt": "",
                "temperature": 0.8,
                "top_p": 0.8,
                "top_k": 30,
                "multi_prompts_gen_type": "FG",
            }
        )

    @staticmethod
    def validate_generative_prompt(base_model_type, model_def, inputs, one_prompt):
        """Validate one prompt against the selected audio prompt mode.

        Returns:
            ``None`` when the prompt is acceptable, otherwise an error message.
        """
        if one_prompt is None or not str(one_prompt).strip():
            return "Prompt text cannot be empty for IndexTTS2."
        if inputs.get("audio_guide") is None:
            return "IndexTTS2 requires one reference voice audio file."

        # Reduce the raw selection to one of three modes; "2" takes precedence
        # over "B", which takes precedence over "A".
        raw_audio_prompt_type = str(inputs.get("audio_prompt_type", "A") or "A").upper()
        if "2" in raw_audio_prompt_type:
            audio_prompt_type = "2"
        elif "B" in raw_audio_prompt_type:
            audio_prompt_type = "AB"
        elif "A" in raw_audio_prompt_type:
            audio_prompt_type = "A"
        else:
            return "Unsupported audio prompt mode for IndexTTS2."

        speaker_matches = list(_SPEAKER_TAG_RE.finditer(str(one_prompt)))

        if audio_prompt_type == "AB" and inputs.get("audio_guide2") is None:
            return "Emotion mode requires a second reference audio file."

        if audio_prompt_type == "2":
            if inputs.get("audio_guide2") is None:
                return "Two-speaker mode requires a second speaker reference audio file."
            if not speaker_matches:
                return (
                    "Two-speaker mode requires prompt lines using Speaker 1: and Speaker 2: "
                    "(or any two numeric speaker IDs). For headless settings, keep "
                    "'multi_prompts_gen_type' = 'FG' so dialogue lines stay in one prompt."
                )
            speaker_ids = {int(match.group(1)) for match in speaker_matches}
            if len(speaker_ids) != 2:
                return (
                    "Two-speaker mode requires exactly two speaker IDs. Use Speaker 1: and Speaker 2:. "
                    "For headless settings, keep 'multi_prompts_gen_type' = 'FG'."
                )
        elif speaker_matches:
            return "Speaker-tag dialogue requires two-speaker mode (set audio prompt mode to Dialogue)."
        return None

    @staticmethod
    def validate_generative_settings(base_model_type, model_def, inputs):
        """Validate the duration and the optional auto-split custom setting.

        An accepted auto-split value is written back as a float; a blank string
        value removes the setting. Both changes are made in place on ``inputs``.

        Returns:
            ``None`` when the settings are acceptable, otherwise an error message.
        """
        try:
            duration = float(inputs.get("duration_seconds", 0))
        except (TypeError, ValueError, OverflowError):
            return "Max duration must be a number."
        if math.isnan(duration):
            # NaN compares False against everything and would slip past "<= 0".
            return "Max duration must be a number."
        if duration <= 0:
            return "Max duration must be greater than 0."

        custom_settings = inputs.get("custom_settings", None)
        if custom_settings is None:
            return None
        if not isinstance(custom_settings, dict):
            return "Custom settings must be a dictionary."

        raw_value = custom_settings.get(INDEX_TTS2_AUTO_SPLIT_SETTING_ID, None)
        if raw_value is None:
            return None
        if isinstance(raw_value, str):
            raw_value = raw_value.strip()
            if not raw_value:
                # A blank entry means "unset": drop it and normalise an empty dict to None.
                custom_settings.pop(INDEX_TTS2_AUTO_SPLIT_SETTING_ID, None)
                inputs["custom_settings"] = custom_settings if len(custom_settings) > 0 else None
                return None

        try:
            if isinstance(raw_value, bool):
                # bool is an int subclass; float(True) would silently succeed.
                raise ValueError()
            auto_split_seconds = float(raw_value)
        except (TypeError, ValueError, OverflowError):
            return (
                f"Auto Split Every s must be a number between "
                f"{int(INDEX_TTS2_AUTO_SPLIT_MIN_SECONDS)} and {int(INDEX_TTS2_AUTO_SPLIT_MAX_SECONDS)} seconds."
            )

        # Written as a positive range test so that NaN is rejected as well.
        if not (INDEX_TTS2_AUTO_SPLIT_MIN_SECONDS <= auto_split_seconds <= INDEX_TTS2_AUTO_SPLIT_MAX_SECONDS):
            return (
                f"Auto Split Every s must be between "
                f"{int(INDEX_TTS2_AUTO_SPLIT_MIN_SECONDS)} and {int(INDEX_TTS2_AUTO_SPLIT_MAX_SECONDS)} seconds."
            )

        custom_settings[INDEX_TTS2_AUTO_SPLIT_SETTING_ID] = auto_split_seconds
        inputs["custom_settings"] = custom_settings
        return None