import gradio as gr
# Import alias module before outetts to setup whisper redirection
import alias as _alias
import outetts
import json
import tempfile
import hashlib
import os
import re
from typing import Optional
from llama_cpp.llama import LlamaGrammar
from outetts.version.interface import InterfaceLLAMACPP
from outetts.models.info import MODEL_INFO
from outetts.utils import helpers
from huggingface_hub import hf_hub_download
import torch
from transformers import BitsAndBytesConfig
import spaces
import numpy as np
from collections import OrderedDict

# Available OuteTTS models based on the documentation (display name -> enum member)
MODELS = {v.value: v for v in outetts.Models.__members__.values()}

# Preferred GGUF quantization per model; anything not listed falls back to Q8_0.
MODEL_QUANTIZATION = {
    outetts.Models.VERSION_0_1_SIZE_350M: outetts.LlamaCppQuantization.FP16,
    outetts.Models.VERSION_0_2_SIZE_500M: outetts.LlamaCppQuantization.FP16,
    outetts.Models.VERSION_0_3_SIZE_500M: outetts.LlamaCppQuantization.FP16,
}

# Cache for speaker profiles to avoid re-transcribing the same audio.
# Keyed by "<interface_version>_<md5 of audio file>"; values are JSON strings.
speaker_cache = {}

# Token each interface version uses to separate words in the prompt text.
SPLIT_SYMBOL = {
    outetts.InterfaceVersion.V1: '<|space|>',
    outetts.InterfaceVersion.V2: '<|space|>',
    outetts.InterfaceVersion.V3: ' ',
}

# Tag marking the start of the text section in the decoded prompt.
# Named so the slicing below does not rely on the magic length 14.
TEXT_START_TAG = '<|text_start|>'


def word_to_grammar(word):
    """Return a GBNF grammar fragment that matches *word*.

    ASCII-only words become an exact quoted literal.  Words containing
    non-ASCII characters become a character class built from the word's
    unique characters (insertion order preserved), repeated one or more
    times.  NOTE(review): characters such as ']' or '\\' inside the class
    are not escaped — assumed not to occur in practice; confirm for
    exotic input.
    """
    if all(ord(c) < 128 for c in word):
        return f'"{word}"'
    return f'[{"".join(OrderedDict.fromkeys(word))}]+'


# Patch InterfaceLLAMACPP: keep the original _generate and inject a wrapper
# that constrains llama.cpp decoding with a grammar built from the prompt text.
InterfaceLLAMACPP._orig_generate = InterfaceLLAMACPP._generate


def ggml_generate(self, input_ids, config):
    """Grammar-constrained replacement for InterfaceLLAMACPP._generate.

    Extracts the text still to be spoken from the decoded prompt, builds a
    GBNF grammar that only admits the expected word/audio-token structure
    for interface versions V2 and V3, installs it in the generation config,
    then delegates to the original _generate.
    """
    tokenizer = self.prompt_processor.tokenizer
    split = SPLIT_SYMBOL.get(self.config.interface_version, ' ')
    prompt = tokenizer.decode(input_ids, skip_special_tokens=False)
    prompt_no_special = tokenizer.decode(input_ids, skip_special_tokens=True).strip()
    # No text section -> nothing to constrain; fall through to the original.
    if TEXT_START_TAG not in prompt:
        return self._orig_generate(input_ids, config)
    # Last line of the de-specialed prompt is the speaker reference transcript.
    speaker_text_last = prompt_no_special.split('\n').pop()
    text = prompt[prompt.index(TEXT_START_TAG) + len(TEXT_START_TAG):prompt.index('<|text_end|>')]
    # Strip the speaker transcript prefix so only the text to generate remains.
    gen_text = (
        text[text.index(speaker_text_last) + len(speaker_text_last):].strip(split)
        if speaker_text_last in text
        else text
    )
    words = [word_to_grammar(word) for word in gen_text.split(split)]
    if self.config.interface_version == outetts.InterfaceVersion.V2:
        config.additional_gen_config["grammar"] = LlamaGrammar.from_string(f"""\
root ::= NL? {' audioBlock '.join(words)} audioEnd NL EOS?
audioBlock ::= TIME CODE* space NL?
TEXT ::= [A-Za-z0-9 .,?!]+
EOS ::= "<|im_end|>"
emotionStart ::= "<|emotion_start|>"
emotionEnd ::= "<|emotion_end|>"
audioEnd ::= "<|audio_end|>"
space ::= "<|space|>"
WORD ::= {' | '.join(words)}
NL ::= [\\n]
TIME ::= "<|t_" DECIMAL "|>"
CODE ::= "<|" DIGITS "|>"
DIGITS ::= [0-9]+
DECIMAL ::= [0-9]+ "." [0-9]+
punch ::= "<|" [a-z_]+ "|>"
""")
    elif self.config.interface_version == outetts.InterfaceVersion.V3:
        config.additional_gen_config["grammar"] = LlamaGrammar.from_string(f"""\
root ::= leadWord wordBlock* audioEnd NL EOS?
leadWord ::= WORD audioBlock
wordBlock ::= wordStart WORD audioBlock
audioBlock ::= codeBlock wordEnd NL?
codeBlock ::= features TIME energy spectralCentroid pitch CODE CODES*
TEXT ::= [A-Za-z0-9.,!?]+
EOS ::= "<|im_end|>"
audioEnd ::= "<|audio_end|>"
wordStart ::= "<|word_start|>"
wordEnd ::= "<|word_end|>"
features ::= "<|features|>"
energy ::= "<|energy_" DIGITS "|>"
spectralCentroid ::= "<|spectral_centroid_" DIGITS "|>"
pitch ::= "<|pitch_" DIGITS "|>"
WORD ::= {' | '.join(words)}
NL ::= [\\n]
TIME ::= "<|t_" DECIMAL "|>"
CODE ::= "<|code|>"
CODES ::= CODE1 CODE2
CODE1 ::= "<|c1_" DIGITS "|>"
CODE2 ::= "<|c2_" DIGITS "|>"
DIGITS ::= [0-9]+
DECIMAL ::= [0-9]+ "." [0-9]+
""")
    return self._orig_generate(input_ids, config)


InterfaceLLAMACPP._generate = ggml_generate


def get_file_hash(file_path):
    """Calculate MD5 hash of a file for caching purposes (not security)."""
    hash_md5 = hashlib.md5()
    with open(file_path, "rb") as f:
        # Read in chunks so large audio files are not loaded into memory at once.
        while chunk := f.read(4096):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()


def try_ggml_model(model: outetts.Models, quantization: outetts.LlamaCppQuantization):
    """Build a llama.cpp ModelConfig for *model*, downloading the GGUF weights.

    Raises whatever hf_hub_download raises when the GGUF repo/file does not
    exist, letting the caller fall back to the HF backend.
    """
    model_config = MODEL_INFO[model]
    repo = f"OuteAI/{model.value}-GGUF"
    filename = f"{model.value}-{quantization.value}.gguf"
    model_path = hf_hub_download(
        repo_id=repo,
        filename=filename,
        local_dir=os.path.join(helpers.get_cache_dir(), "gguf"),
        local_files_only=False
    )
    generation_type = outetts.GenerationType.CHUNKED
    # if model_config['interface_version'] == outetts.InterfaceVersion.V3:
    #     generation_type = outetts.GenerationType.GUIDED_WORDS
    return outetts.ModelConfig(
        model_path=model_path,
        tokenizer_path=f"OuteAI/{model.value}",
        backend=outetts.Backend.LLAMACPP,
        n_gpu_layers=99,
        verbose=False,
        device=None,
        dtype=None,
        additional_model_config={},
        audio_codec_path=None,
        generation_type=generation_type,
        **model_config
    )


def get_interface(model_name: str):
    """Get interface instance for the model (no caching to avoid CUDA memory issues).

    Prefers the llama.cpp GGUF backend; falls back to the HF transformers
    backend (4-bit quantized on CUDA) when the GGUF setup fails.
    """
    model = MODELS[model_name]
    try:
        quantization = MODEL_QUANTIZATION.get(model, outetts.LlamaCppQuantization.Q8_0)
        config = try_ggml_model(model, quantization)
    except Exception:  # was bare `except:` — don't swallow KeyboardInterrupt/SystemExit
        has_cuda = torch.cuda.is_available()
        model_config = MODEL_INFO[model]
        config = outetts.ModelConfig(
            model_path=f"OuteAI/{model_name}",
            tokenizer_path=f"OuteAI/{model_name}",
            backend=outetts.Backend.HF,
            additional_model_config={
                "device_map": "auto" if has_cuda else "cpu",
                "quantization_config": BitsAndBytesConfig(
                    load_in_4bit=True,
                    llm_int8_enable_fp32_cpu_offload=True
                ) if has_cuda else None,
            },
            **model_config
        )
    # Initialize the interface
    interface = outetts.Interface(config=config)
    return interface
def get_or_create_speaker(interface, audio_file):
    """Get speaker from cache or create new one if not cached.

    Returns the speaker profile dict on success, or an error string
    prefixed with "❌" on failure (callers check for that prefix).
    """
    # Calculate file hash for caching; the key includes the interface
    # version because profiles are not interchangeable across versions.
    file_hash = get_file_hash(audio_file)
    cache_key = f"{interface.config.interface_version}_{file_hash}"

    # Check if speaker profile is already cached
    if cache_key in speaker_cache:
        print(f"✅ Using cached speaker profile for {os.path.basename(audio_file)}")
        return json.loads(speaker_cache[cache_key])

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # Create new speaker profile
    print(f"🔄 Creating new speaker profile for {os.path.basename(audio_file)}")
    try:
        speaker = interface.create_speaker(audio_file, whisper_model="large-v3-turbo", whisper_device=device)
        # Cache as a JSON string so cached copies cannot be mutated by callers.
        speaker_cache[cache_key] = json.dumps(speaker)
        print(f"💾 Cached speaker profile ({len(speaker_cache)} total cached)")
        return speaker
    except Exception as e:
        return f"❌ Error creating speaker profile: {str(e)}"


@spaces.GPU
def create_speaker_and_generate(model_name, audio_file, test_text: Optional[str] = None, temperature: float = 0.4):
    """Create speaker from audio and optionally generate test audio.

    Returns a (speaker_json_or_error_message, generated_audio_path_or_None)
    pair matching the two Gradio outputs.
    """
    if audio_file is None:
        # Return default values for startup/caching purposes
        return "Please upload an audio file to create a speaker profile.", None

    # Get interface (no caching to avoid CUDA memory issues)
    interface = get_interface(model_name)

    # Get or create speaker profile (with caching)
    speaker_result = get_or_create_speaker(interface, audio_file)

    # Check if speaker_result is an error message
    if isinstance(speaker_result, str) and speaker_result.startswith("❌"):
        return speaker_result, None

    # Convert speaker dict to formatted JSON
    speaker_json = json.dumps(speaker_result, indent=2, ensure_ascii=False)

    # Generate test audio if text is provided
    generated_audio = None
    if test_text and test_text.strip():
        output = interface.generate(
            config=outetts.GenerationConfig(
                text=test_text,
                speaker=speaker_result,
                sampler_config=outetts.SamplerConfig(
                    temperature=temperature
                ),
                max_length=MODEL_INFO[MODELS[model_name]]["max_seq_length"]
            )
        )
        # Save to a temporary file. Close the descriptor before writing so
        # output.save() works on platforms that disallow opening an
        # already-open file (the previous NamedTemporaryFile held it open).
        fd, generated_audio = tempfile.mkstemp(suffix=".wav")
        os.close(fd)
        output.save(generated_audio)

    return speaker_json, generated_audio


example_text = "Hello, this is a test of the OuteTTS speaker profile."

# Create the Gradio interface
demo = gr.Interface(
    fn=create_speaker_and_generate,
    inputs=[
        gr.Dropdown(
            choices=list(MODELS.keys()),
            value=list(MODELS.keys())[-1],
            label="Select OuteTTS Model",
            info="Choose the model variant to use"
        ),
        gr.Audio(
            label="Upload Reference Audio (Max 20 seconds)",
            type="filepath",
            sources=["upload", "microphone"]
        ),
        gr.Textbox(
            label="Test Text (Optional)",
            placeholder="Enter text to generate speech (leave empty to only create speaker profile)...",
            lines=3,
            value=None
        ),
        gr.Slider(
            minimum=0.1,
            maximum=1.0,
            step=0.1,
            value=0.4,
            label="Temperature",
            info="Controls randomness in generation"
        )
    ],
    outputs=[
        gr.Textbox(
            label="Speaker Profile (JSON)",
            lines=15,
            max_lines=20,
            show_copy_button=True
        ),
        gr.Audio(
            label="Generated Test Audio (if text provided)",
            type="filepath"
        )
    ],
    title="🎙️ OuteTTS Speaker Creator",
    description="Create and manage speaker profiles for OuteTTS text-to-speech synthesis. Upload audio to create a speaker profile, and optionally provide test text to generate sample audio.",
    theme=gr.themes.Soft(),
    examples=[
        ["OuteTTS-1.0-0.6B", None, example_text, 0.2],
        ["OuteTTS-0.3-500M", None, example_text, 0.2],
    ],
    cache_examples=False,
    flagging_mode="never"
)

if __name__ == "__main__":
    # Launch with optimized configuration for HuggingFace Spaces
    demo.launch(
        server_name="0.0.0.0",  # Allow external connections
        server_port=7860,
        share=False,  # Set to True if you want a public link
        show_api=True,  # Show API documentation
        show_error=True  # Show detailed error messages
    )