"""Gradio front end for MiraTTS voice cloning on Hugging Face Spaces."""

import contextlib
import logging
import os
import tempfile
from datetime import datetime

# NOTE(review): `spaces` is imported ahead of torch/mira on the assumption
# that ZeroGPU wants to be loaded before anything can initialise CUDA --
# confirm against the Hugging Face ZeroGPU documentation.
import spaces

import gradio as gr
import librosa
import numpy as np
import soundfile as sf
import torch

from mira.model import MiraTTS

# Configure logging for HF Spaces
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(levelname)s - %(message)s",
)
logger = logging.getLogger(__name__)

MODEL_DIR = "YatharthS/MiraTTS"
PROMPT_SAMPLE_RATE = 16000  # Reference audio is resampled to this rate
OUTPUT_SAMPLE_RATE = 48000  # Rate reported for generated audio
MAX_PROMPT_SECONDS = 30  # Limit reference audio length for memory
MIN_PROMPT_SECONDS = 0.5

MODEL = None


def initialize_model():
    """Return the shared MiraTTS model, creating it on first use.

    The instance is cached in the module-level ``MODEL`` so later calls
    return the same object.

    Raises:
        Exception: whatever ``MiraTTS(...)`` raises; it is logged and
            re-raised unchanged.
    """
    global MODEL
    if MODEL is not None:
        return MODEL

    logger.info("Initializing MiraTTS model...")
    try:
        # Initialize with HF Spaces compatible settings
        MODEL = MiraTTS(
            model_dir=MODEL_DIR,
            tp=1,  # Single GPU
            enable_prefix_caching=False,  # Disable for stability
            cache_max_entry_count=0.1,  # Reduced cache
        )
    except Exception as e:
        logger.error("Model initialization failed: %s", e)
        raise
    logger.info("Model initialized successfully")
    return MODEL


def _write_temp_wav(audio, sr):
    """Write *audio* to a uniquely named temporary WAV file and return its path."""
    # mkstemp gives every request its own file, so concurrent uploads that
    # share a filename cannot overwrite each other. The suffix is always
    # ".wav" because soundfile picks the container from the extension.
    fd, temp_path = tempfile.mkstemp(prefix="processed_", suffix=".wav")
    os.close(fd)
    try:
        sf.write(temp_path, audio, samplerate=sr)
    except Exception:
        with contextlib.suppress(OSError):
            os.remove(temp_path)
        raise
    return temp_path


def validate_audio_input(audio_path):
    """Validate the reference audio and write a normalised 16 kHz copy.

    At most the first 30 seconds are loaded. The clip must be at least
    0.5 seconds long and must not be silent. It is resampled to 16 kHz
    when needed and peak-normalised before being written out.

    Args:
        audio_path: Path to the uploaded or recorded reference audio.

    Returns:
        Tuple ``(temp_path, num_samples, sample_rate)`` describing the
        preprocessed file. The caller is responsible for deleting
        ``temp_path``.

    Raises:
        ValueError: if the file is missing, or if loading, validation or
            writing fails (the underlying error is chained).
    """
    if not audio_path or not os.path.exists(audio_path):
        raise ValueError("Audio file not found")

    try:
        # Load and validate audio
        audio, sr = librosa.load(audio_path, sr=None, duration=MAX_PROMPT_SECONDS)

        if len(audio) == 0:
            raise ValueError("Audio file is empty")

        # Ensure minimum length
        min_length = int(MIN_PROMPT_SECONDS * sr)
        if len(audio) < min_length:
            raise ValueError(f"Audio too short: {len(audio)/sr:.2f}s, minimum 0.5s required")

        # Resample to 16kHz
        if sr != PROMPT_SAMPLE_RATE:
            audio = librosa.resample(audio, orig_sr=sr, target_sr=PROMPT_SAMPLE_RATE)
            sr = PROMPT_SAMPLE_RATE

        # Normalize audio. A silent clip has a peak of zero; dividing by it
        # would fill the signal with NaNs, so reject it explicitly instead.
        peak = float(np.max(np.abs(audio)))
        if not np.isfinite(peak) or peak == 0.0:
            raise ValueError("Audio is silent or contains invalid samples")
        audio = audio / peak

        # Save preprocessed audio
        temp_path = _write_temp_wav(audio, sr)
        return temp_path, len(audio), sr
    except Exception as e:
        raise ValueError(f"Audio processing failed: {e}") from e


@spaces.GPU(duration=120)  # Swapping to zero-gpu
def generate_speech(text, prompt_audio_path):
    """Synthesise *text* in the voice of the reference audio.

    Args:
        text: Text to synthesise; must contain non-whitespace characters.
        prompt_audio_path: Path to the reference audio file.

    Returns:
        Tuple ``(audio, sample_rate)`` where ``audio`` is a NumPy array
        (float16 output is widened to float32) and ``sample_rate`` is
        48000.

    Raises:
        ValueError: for empty text, invalid reference audio, or when the
            model returns ``None`` from encoding or generation.
        Exception: any other error from the model; it is logged and
            re-raised unchanged.
    """
    processed_audio = None
    try:
        # Validate the cheap input first so an empty request does not
        # trigger a model load.
        if not text or not text.strip():
            raise ValueError("Text input is empty")

        # Initialize model if needed
        model = initialize_model()

        # Process audio
        processed_audio, length, sr = validate_audio_input(prompt_audio_path)
        logger.info("Audio processed: %.2fs at %sHz", length / sr, sr)

        # Encode audio
        context_tokens = model.encode_audio(processed_audio)
        if context_tokens is None:
            raise ValueError("Failed to encode reference audio")

        # Generate speech
        output_audio = model.generate(text, context_tokens)
        if output_audio is None:
            raise ValueError("Speech generation failed")

        # Process output
        if torch.is_tensor(output_audio):
            output_audio = output_audio.detach().cpu()
            # NumPy has no bfloat16, and float16 is widened later anyway,
            # so upcast half-precision tensors before converting.
            if output_audio.dtype in (torch.float16, torch.bfloat16):
                output_audio = output_audio.float()
            output_audio = output_audio.numpy()
        else:
            output_audio = np.asarray(output_audio)

        if output_audio.dtype == np.float16:
            output_audio = output_audio.astype(np.float32)

        return output_audio, OUTPUT_SAMPLE_RATE
    except Exception as e:
        logger.error("Generation error: %s", e)
        raise
    finally:
        # Clean up the preprocessed reference audio on success and failure.
        if processed_audio is not None:
            with contextlib.suppress(OSError):
                os.remove(processed_audio)


def voice_clone_interface(text, prompt_audio_upload, prompt_audio_record):
    """Gradio callback: run voice cloning and save the result as a WAV file.

    Args:
        text: Text to synthesise.
        prompt_audio_upload: File path of the uploaded reference audio, if any.
        prompt_audio_record: File path of the recorded reference audio, if any.
            The upload takes precedence when both are given.

    Returns:
        Tuple ``(output_path, status_message)``. ``output_path`` is ``None``
        when input is missing or generation fails; errors are reported in
        the status message rather than raised.
    """
    if not text or not text.strip():
        return None, "Please enter text to synthesize."

    prompt_audio = prompt_audio_upload or prompt_audio_record
    if not prompt_audio:
        return None, "Please upload or record reference audio."

    try:
        # Generate audio
        audio, sample_rate = generate_speech(text, prompt_audio)

        # Save output. Microseconds keep two generations that finish in the
        # same second from overwriting each other.
        os.makedirs("outputs", exist_ok=True)
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
        output_path = f"outputs/mira_tts_{timestamp}.wav"
        sf.write(output_path, audio, samplerate=sample_rate)
    except Exception as e:
        error_msg = f"Error: {str(e)}"
        logger.error(error_msg)
        return None, error_msg

    return output_path, "Generation successful!"


def build_interface():
    """Build and return the Gradio Blocks UI for the voice-cloning demo."""
    with gr.Blocks(title="MiraTTS - Voice Cloning") as demo:
        # NOTE(review): the heading/paragraph tags below are a minimal
        # reconstruction -- confirm against the intended header markup.
        gr.HTML(
            """
            <h1>MiraTTS Voice Cloning</h1>
            <p>High-quality voice synthesis with 100x realtime speed using optimized LMDeploy</p>
            """
        )

        with gr.Row():
            with gr.Column(scale=1):
                gr.Markdown("### Reference Audio")
                prompt_upload = gr.Audio(
                    sources="upload",
                    type="filepath",
                    label="Upload Reference Audio",
                )
                gr.Markdown("*Upload a clear audio sample (3-30 seconds, 16kHz+)*")

                prompt_record = gr.Audio(
                    sources="microphone",
                    type="filepath",
                    label="Record Reference Audio",
                )
                gr.Markdown("*Record directly in your browser*")

            with gr.Column(scale=1):
                gr.Markdown("### Text Input")
                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Enter the text you want to convert to speech...",
                    lines=4,
                    value="Hello! This is a demonstration of MiraTTS, an optimized text-to-speech model.",
                )

                generate_btn = gr.Button(
                    "Generate Speech",
                    variant="primary",
                )

        with gr.Row():
            with gr.Column():
                output_audio = gr.Audio(
                    label="Generated Speech",
                    type="filepath",
                    autoplay=True,
                )
                status_text = gr.Textbox(
                    label="Status",
                    interactive=False,
                    show_label=True,
                )

        # Examples
        gr.Markdown("### Example Usage")
        gr.Markdown(
            """
            1. **Upload or record** a reference audio (your target voice)
            2. **Enter text** you want to synthesize in that voice
            3. **Click generate** and wait for the result

            **Tips:**
            - Use clear reference audio without background noise
            - Keep reference audio between 3-30 seconds
            - Shorter text generates faster
            """
        )

        # Event handlers
        generate_btn.click(
            voice_clone_interface,
            inputs=[text_input, prompt_upload, prompt_record],
            outputs=[output_audio, status_text],
            show_progress=True,
        )

        # Clear function
        def clear_all():
            """Reset both audio inputs, the text box, the output and the status."""
            return None, None, "", None, "Ready for new generation"

        clear_btn = gr.Button("Clear All", variant="secondary")
        clear_btn.click(
            clear_all,
            outputs=[prompt_upload, prompt_record, text_input, output_audio, status_text],
        )

    return demo


if __name__ == "__main__":
    # The model is deliberately not pre-loaded at startup: that step was
    # removed for zero-gpu to work. It is loaded lazily by the first
    # generate_speech() call instead.

    # Launch interface
    demo = build_interface()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )