Mi-TTS

Paused

App Files Files Community

Translsis commited on Dec 24, 2025

Commit

39c69de

verified ·

1 Parent(s): 0b42a1d

Update app.py

Browse files

Files changed (1) hide show

app.py +280 -39

app.py CHANGED Viewed

@@ -4,15 +4,80 @@ import soundfile as sf
 import logging
 import argparse
 import gradio as gr
 from datetime import datetime
 from mira.model import MiraTTS
 MODEL = None
-def initialize_model(model_dir="YatharthS/MiraTTS"):
     """Load the MiraTTS model once at the beginning."""
     logging.info(f"Loading MiraTTS model from: {model_dir}")
     model = MiraTTS(model_dir)
     return model
 def generate_audio(text, prompt_audio_path):
@@ -26,8 +91,13 @@ def generate_audio(text, prompt_audio_path):
         # Encode the prompt audio
         context_tokens = MODEL.encode_audio(prompt_audio_path)
         # Generate audio
-        audio = MODEL.generate(text, context_tokens)
         # Convert to numpy array if it's a tensor and handle dtype
         if torch.is_tensor(audio):
@@ -44,7 +114,7 @@ def generate_audio(text, prompt_audio_path):
         logging.error(f"Error during generation: {e}")
         raise e
-def run_tts(text, prompt_audio_path, save_dir="results"):
     """Perform TTS inference and save the generated audio."""
     logging.info(f"Saving audio to: {save_dir}")
@@ -52,7 +122,7 @@ def run_tts(text, prompt_audio_path, save_dir="results"):
     os.makedirs(save_dir, exist_ok=True)
     # Generate unique filename using timestamp
-    timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
     save_path = os.path.join(save_dir, f"mira_tts_{timestamp}.wav")
     logging.info("Starting MiraTTS inference...")
@@ -64,36 +134,76 @@ def run_tts(text, prompt_audio_path, save_dir="results"):
     sf.write(save_path, audio, samplerate=sample_rate)
     logging.info(f"Audio saved at: {save_path}")
     return save_path
-def voice_clone_callback(text, prompt_audio_upload, prompt_audio_record):
     """Gradio callback for voice cloning using MiraTTS."""
     if not text.strip():
-        return None
     # Use uploaded audio or recorded audio
     prompt_audio = prompt_audio_upload if prompt_audio_upload else prompt_audio_record
     if not prompt_audio:
-        return None
     try:
-        audio_output_path = run_tts(text, prompt_audio)
-        return audio_output_path
     except Exception as e:
         logging.error(f"Error in voice cloning: {e}")
-        return None
-def voice_creation_callback(text, temperature, top_p, top_k):
     """Gradio callback for creating synthetic voice with custom parameters."""
     if not text.strip():
-        return None
     global MODEL
     if MODEL is None:
         MODEL = initialize_model()
     try:
         # Set custom generation parameters
         MODEL.set_params(
@@ -104,8 +214,9 @@ def voice_creation_callback(text, temperature, top_p, top_k):
             repetition_penalty=1.2
         )
-        # Use a default voice context (you may want to provide default audio files)
-        # Check multiple possible paths for example audio
         possible_paths = [
             "/models3/src/MiraTTS/models/MiraTTS/example1.wav",
             "models/MiraTTS/example1.wav",
@@ -119,9 +230,17 @@ def voice_creation_callback(text, temperature, top_p, top_k):
                 break
         if default_audio:
             # Generate audio with dtype conversion
             context_tokens = MODEL.encode_audio(default_audio)
-            audio = MODEL.generate(text, context_tokens)
             # Handle tensor conversion and dtype
             if torch.is_tensor(audio):
@@ -135,35 +254,95 @@ def voice_creation_callback(text, temperature, top_p, top_k):
             # Save the audio
             os.makedirs("results", exist_ok=True)
-            timestamp = datetime.now().strftime("%Y%m%d%H%M%S")
             save_path = os.path.join("results", f"mira_tts_creation_{timestamp}.wav")
             sf.write(save_path, audio, samplerate=48000)
-            return save_path
         else:
             logging.warning("No default audio found for voice creation")
-            return None
     except Exception as e:
         logging.error(f"Error in voice creation: {e}")
-        return None
 def build_ui():
     """Build the Gradio interface similar to SparkTTS."""
-    with gr.Blocks(title="MiraTTS Web Interface") as demo:
         # Title
         gr.HTML('<h1 style="text-align: center;">MiraTTS - High Quality Voice Synthesis</h1>')
         # Description
         gr.Markdown("""
         MiraTTS is a highly optimized Text-to-Speech model based on Spark-TTS with LMDeploy acceleration.
-        It provides over 100x realtime generation speed with high-quality 48kHz audio output.
         """)
         with gr.Tabs():
             # Voice Clone Tab
-            with gr.TabItem("Voice Clone"):
                 gr.Markdown("### Clone any voice using a reference audio sample")
                 with gr.Row():
@@ -186,18 +365,20 @@ def build_ui():
                 )
                 with gr.Row():
-                    clone_button = gr.Button("Generate Audio", variant="primary")
-                    clear_button = gr.Button("Clear")
                 audio_output_clone = gr.Audio(
                     label="Generated Audio",
                     autoplay=True
                 )
                 clone_button.click(
                     voice_clone_callback,
                     inputs=[text_input, prompt_audio_upload, prompt_audio_record],
-                    outputs=[audio_output_clone],
                 )
                 clear_button.click(
@@ -206,7 +387,7 @@ def build_ui():
                 )
             # Voice Creation Tab
-            with gr.TabItem("Voice Creation"):
                 gr.Markdown("### Create synthetic voices with custom parameters")
                 with gr.Row():
@@ -242,42 +423,90 @@ def build_ui():
                             )
                     with gr.Column():
-                        create_button = gr.Button("Create Voice", variant="primary")
                         audio_output_creation = gr.Audio(
                             label="Generated Audio",
                             autoplay=True
                         )
                 create_button.click(
                     voice_creation_callback,
                     inputs=[text_input_creation, temperature, top_p, top_k],
-                    outputs=[audio_output_creation],
                 )
             # About Tab
-            with gr.TabItem("About"):
-                gr.Markdown("""
                 ## About MiraTTS
                 MiraTTS is an optimized version of Spark-TTS with the following features:
                 - **Ultra-fast generation**: Over 100x realtime speed using LMDeploy optimization
                 - **High quality**: Generates crisp 48kHz audio outputs
-                - **Memory efficient**: Works within 6GB VRAM
-                - **Low latency**: As low as 100ms generation time
                 - **Voice cloning**: Clone any voice from a short audio sample
-                ### Model Information
-                - Base model: Spark-TTS-0.5B
-                - Optimization: LMDeploy + FlashSR
-                - Sample rate: 48kHz
-                - Model size: ~500M parameters
                 ### Usage Tips
                 - For voice cloning, use clear audio samples between 3-30 seconds
                 - Ensure reference audio is at least 16kHz quality
                 - Longer text inputs may require more memory
                 - Adjust generation parameters for different voice styles
                 """)
     return demo
@@ -291,6 +520,13 @@ def parse_arguments():
         default="YatharthS/MiraTTS",
         help="Path to the MiraTTS model directory or HuggingFace model ID"
     )
     parser.add_argument(
         "--server_name",
         type=str,
@@ -320,15 +556,20 @@ if __name__ == "__main__":
     # Parse arguments
     args = parse_arguments()
     # Initialize model
     logging.info("Initializing MiraTTS model...")
-    MODEL = initialize_model(args.model_dir)
     # Build and launch interface
     logging.info("Building Gradio interface...")
     demo = build_ui()
     logging.info(f"Launching web interface on {args.server_name}:{args.server_port}")
     demo.launch(
         server_name=args.server_name,
         server_port=args.server_port,

 import logging
 import argparse
 import gradio as gr
+import json
+import threading
+import queue
 from datetime import datetime
+from pathlib import Path
 from mira.model import MiraTTS
 MODEL = None
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+HISTORY_FILE = "generation_history.json"
+GENERATION_QUEUE = queue.Queue()
+PROCESSING_LOCK = threading.Lock()
+class GenerationHistory:
+    """Manage generation history with persistence."""
+    def __init__(self, history_file=HISTORY_FILE):
+        self.history_file = history_file
+        self.history = self.load_history()
+    def load_history(self):
+        """Load history from JSON file."""
+        if os.path.exists(self.history_file):
+            try:
+                with open(self.history_file, 'r', encoding='utf-8') as f:
+                    return json.load(f)
+            except Exception as e:
+                logging.error(f"Error loading history: {e}")
+                return []
+        return []
+    def save_history(self):
+        """Save history to JSON file."""
+        try:
+            with open(self.history_file, 'w', encoding='utf-8') as f:
+                json.dump(self.history, f, indent=2, ensure_ascii=False)
+        except Exception as e:
+            logging.error(f"Error saving history: {e}")
+    def add_entry(self, entry):
+        """Add a new entry to history."""
+        self.history.insert(0, entry)  # Add to beginning
+        # Keep only last 100 entries
+        if len(self.history) > 100:
+            self.history = self.history[:100]
+        self.save_history()
+    def get_history(self):
+        """Get all history entries."""
+        return self.history
+    def clear_history(self):
+        """Clear all history."""
+        self.history = []
+        self.save_history()
+# Global history manager
+HISTORY_MANAGER = GenerationHistory()
+def initialize_model(model_dir="YatharthS/MiraTTS", device=None):
     """Load the MiraTTS model once at the beginning."""
+    global DEVICE
+    if device:
+        DEVICE = device
     logging.info(f"Loading MiraTTS model from: {model_dir}")
+    logging.info(f"Using device: {DEVICE}")
     model = MiraTTS(model_dir)
+    # Move model to appropriate device
+    if hasattr(model, 'to'):
+        model = model.to(DEVICE)
     return model
 def generate_audio(text, prompt_audio_path):
         # Encode the prompt audio
         context_tokens = MODEL.encode_audio(prompt_audio_path)
+        # Move context tokens to device if needed
+        if torch.is_tensor(context_tokens):
+            context_tokens = context_tokens.to(DEVICE)
         # Generate audio
+        with torch.inference_mode() if DEVICE == "cpu" else torch.cuda.amp.autocast():
+            audio = MODEL.generate(text, context_tokens)
         # Convert to numpy array if it's a tensor and handle dtype
         if torch.is_tensor(audio):
         logging.error(f"Error during generation: {e}")
         raise e
+def run_tts(text, prompt_audio_path, save_dir="results", mode="clone"):
     """Perform TTS inference and save the generated audio."""
     logging.info(f"Saving audio to: {save_dir}")
     os.makedirs(save_dir, exist_ok=True)
     # Generate unique filename using timestamp
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
     save_path = os.path.join(save_dir, f"mira_tts_{timestamp}.wav")
     logging.info("Starting MiraTTS inference...")
     sf.write(save_path, audio, samplerate=sample_rate)
     logging.info(f"Audio saved at: {save_path}")
+    # Add to history
+    history_entry = {
+        "timestamp": datetime.now().isoformat(),
+        "text": text[:100] + "..." if len(text) > 100 else text,
+        "full_text": text,
+        "mode": mode,
+        "file_path": save_path,
+        "reference_audio": prompt_audio_path if mode == "clone" else None,
+        "device": DEVICE
+    }
+    HISTORY_MANAGER.add_entry(history_entry)
     return save_path
+def background_worker():
+    """Background worker to process generation tasks."""
+    while True:
+        try:
+            task = GENERATION_QUEUE.get()
+            if task is None:  # Poison pill to stop the worker
+                break
+            callback, args = task
+            callback(*args)
+        except Exception as e:
+            logging.error(f"Error in background worker: {e}")
+        finally:
+            GENERATION_QUEUE.task_done()
+# Start background worker thread
+worker_thread = threading.Thread(target=background_worker, daemon=True)
+worker_thread.start()
+def voice_clone_callback(text, prompt_audio_upload, prompt_audio_record, progress=gr.Progress()):
     """Gradio callback for voice cloning using MiraTTS."""
     if not text.strip():
+        return None, get_history_display()
     # Use uploaded audio or recorded audio
     prompt_audio = prompt_audio_upload if prompt_audio_upload else prompt_audio_record
     if not prompt_audio:
+        return None, get_history_display()
+    progress(0, desc="Initializing...")
     try:
+        progress(0.3, desc="Encoding audio...")
+        progress(0.6, desc="Generating speech...")
+        audio_output_path = run_tts(text, prompt_audio, mode="clone")
+        progress(1.0, desc="Complete!")
+        return audio_output_path, get_history_display()
     except Exception as e:
         logging.error(f"Error in voice cloning: {e}")
+        return None, get_history_display()
+def voice_creation_callback(text, temperature, top_p, top_k, progress=gr.Progress()):
     """Gradio callback for creating synthetic voice with custom parameters."""
     if not text.strip():
+        return None, get_history_display()
     global MODEL
     if MODEL is None:
         MODEL = initialize_model()
+    progress(0, desc="Initializing...")
     try:
         # Set custom generation parameters
         MODEL.set_params(
             repetition_penalty=1.2
         )
+        progress(0.3, desc="Loading default voice...")
+        # Use a default voice context
         possible_paths = [
             "/models3/src/MiraTTS/models/MiraTTS/example1.wav",
             "models/MiraTTS/example1.wav",
                 break
         if default_audio:
+            progress(0.6, desc="Generating speech...")
             # Generate audio with dtype conversion
             context_tokens = MODEL.encode_audio(default_audio)
+            # Move to device
+            if torch.is_tensor(context_tokens):
+                context_tokens = context_tokens.to(DEVICE)
+            with torch.inference_mode() if DEVICE == "cpu" else torch.cuda.amp.autocast():
+                audio = MODEL.generate(text, context_tokens)
             # Handle tensor conversion and dtype
             if torch.is_tensor(audio):
             # Save the audio
             os.makedirs("results", exist_ok=True)
+            timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
             save_path = os.path.join("results", f"mira_tts_creation_{timestamp}.wav")
             sf.write(save_path, audio, samplerate=48000)
+            # Add to history
+            history_entry = {
+                "timestamp": datetime.now().isoformat(),
+                "text": text[:100] + "..." if len(text) > 100 else text,
+                "full_text": text,
+                "mode": "creation",
+                "file_path": save_path,
+                "reference_audio": None,
+                "device": DEVICE,
+                "temperature": temperature,
+                "top_p": top_p,
+                "top_k": top_k
+            }
+            HISTORY_MANAGER.add_entry(history_entry)
+            progress(1.0, desc="Complete!")
+            return save_path, get_history_display()
         else:
             logging.warning("No default audio found for voice creation")
+            return None, get_history_display()
     except Exception as e:
         logging.error(f"Error in voice creation: {e}")
+        return None, get_history_display()
+def get_history_display():
+    """Get formatted history for display."""
+    history = HISTORY_MANAGER.get_history()
+    if not history:
+        return "No generation history yet."
+    display_text = "# Generation History\n\n"
+    for idx, entry in enumerate(history[:20]):  # Show last 20
+        timestamp = datetime.fromisoformat(entry['timestamp']).strftime("%Y-%m-%d %H:%M:%S")
+        mode = entry['mode'].capitalize()
+        text_preview = entry['text']
+        file_name = os.path.basename(entry['file_path'])
+        display_text += f"### {idx + 1}. {timestamp} - {mode}\n"
+        display_text += f"**Text:** {text_preview}\n"
+        display_text += f"**File:** `{file_name}`\n"
+        display_text += f"**Device:** {entry.get('device', 'N/A')}\n"
+        if entry.get('temperature'):
+            display_text += f"**Params:** T={entry.get('temperature')}, p={entry.get('top_p')}, k={entry.get('top_k')}\n"
+        display_text += "\n---\n\n"
+    return display_text
+def get_history_files():
+    """Get list of history files for download."""
+    history = HISTORY_MANAGER.get_history()
+    return [(entry['file_path'], os.path.basename(entry['file_path']))
+            for entry in history if os.path.exists(entry['file_path'])]
+def clear_history_callback():
+    """Clear generation history."""
+    HISTORY_MANAGER.clear_history()
+    return get_history_display(), []
 def build_ui():
     """Build the Gradio interface similar to SparkTTS."""
+    with gr.Blocks(title="MiraTTS Web Interface", theme=gr.themes.Soft()) as demo:
         # Title
         gr.HTML('<h1 style="text-align: center;">MiraTTS - High Quality Voice Synthesis</h1>')
+        # Device info
+        device_info = f"🖥️ Running on: **{DEVICE.upper()}**"
+        if DEVICE == "cuda":
+            device_info += f" (GPU: {torch.cuda.get_device_name(0)})"
+        gr.Markdown(device_info)
         # Description
         gr.Markdown("""
         MiraTTS is a highly optimized Text-to-Speech model based on Spark-TTS with LMDeploy acceleration.
+        It provides high-quality 48kHz audio output with background processing support.
         """)
         with gr.Tabs():
             # Voice Clone Tab
+            with gr.TabItem("🎤 Voice Clone"):
                 gr.Markdown("### Clone any voice using a reference audio sample")
                 with gr.Row():
                 )
                 with gr.Row():
+                    clone_button = gr.Button("🎵 Generate Audio", variant="primary")
+                    clear_button = gr.Button("🗑️ Clear")
                 audio_output_clone = gr.Audio(
                     label="Generated Audio",
                     autoplay=True
                 )
+                history_display_clone = gr.Markdown(get_history_display())
                 clone_button.click(
                     voice_clone_callback,
                     inputs=[text_input, prompt_audio_upload, prompt_audio_record],
+                    outputs=[audio_output_clone, history_display_clone],
                 )
                 clear_button.click(
                 )
             # Voice Creation Tab
+            with gr.TabItem("✨ Voice Creation"):
                 gr.Markdown("### Create synthetic voices with custom parameters")
                 with gr.Row():
                             )
                     with gr.Column():
+                        create_button = gr.Button("🎨 Create Voice", variant="primary")
                         audio_output_creation = gr.Audio(
                             label="Generated Audio",
                             autoplay=True
                         )
+                history_display_creation = gr.Markdown(get_history_display())
                 create_button.click(
                     voice_creation_callback,
                     inputs=[text_input_creation, temperature, top_p, top_k],
+                    outputs=[audio_output_creation, history_display_creation],
+                )
+            # History Tab
+            with gr.TabItem("📜 History"):
+                gr.Markdown("### Review and download previous generations")
+                with gr.Row():
+                    refresh_button = gr.Button("🔄 Refresh History", variant="secondary")
+                    clear_history_button = gr.Button("🗑️ Clear History", variant="stop")
+                history_display_main = gr.Markdown(get_history_display())
+                gr.Markdown("### Download Files")
+                file_browser = gr.File(
+                    label="Generated Audio Files",
+                    file_count="multiple",
+                    interactive=False
+                )
+                def refresh_history():
+                    files = get_history_files()
+                    return get_history_display(), [f[0] for f in files]
+                refresh_button.click(
+                    refresh_history,
+                    outputs=[history_display_main, file_browser]
+                )
+                clear_history_button.click(
+                    clear_history_callback,
+                    outputs=[history_display_main, file_browser]
+                )
+                # Auto-load files on tab open
+                demo.load(
+                    refresh_history,
+                    outputs=[history_display_main, file_browser]
                 )
             # About Tab
+            with gr.TabItem("ℹ️ About"):
+                gr.Markdown(f"""
                 ## About MiraTTS
                 MiraTTS is an optimized version of Spark-TTS with the following features:
                 - **Ultra-fast generation**: Over 100x realtime speed using LMDeploy optimization
                 - **High quality**: Generates crisp 48kHz audio outputs
+                - **Memory efficient**: Works within 6GB VRAM or on CPU
+                - **Low latency**: As low as 100ms generation time (GPU)
                 - **Voice cloning**: Clone any voice from a short audio sample
+                - **Background processing**: Non-blocking audio generation
+                - **Generation history**: Review and download all generated audio
+                ### Current Configuration
+                - **Device**: {DEVICE.upper()}
+                - **Base model**: Spark-TTS-0.5B
+                - **Optimization**: LMDeploy + FlashSR
+                - **Sample rate**: 48kHz
+                - **Model size**: ~500M parameters
                 ### Usage Tips
                 - For voice cloning, use clear audio samples between 3-30 seconds
                 - Ensure reference audio is at least 16kHz quality
                 - Longer text inputs may require more memory
                 - Adjust generation parameters for different voice styles
+                - CPU mode is slower but works without GPU
+                - Check the History tab to download previous generations
+                ### Performance Notes
+                - **GPU**: ~100-200ms per generation
+                - **CPU**: ~2-5 seconds per generation (depending on CPU)
                 """)
     return demo
         default="YatharthS/MiraTTS",
         help="Path to the MiraTTS model directory or HuggingFace model ID"
     )
+    parser.add_argument(
+        "--device",
+        type=str,
+        default=None,
+        choices=["cuda", "cpu"],
+        help="Device to run model on (default: auto-detect)"
+    )
     parser.add_argument(
         "--server_name",
         type=str,
     # Parse arguments
     args = parse_arguments()
+    # Set device if specified
+    if args.device:
+        DEVICE = args.device
     # Initialize model
     logging.info("Initializing MiraTTS model...")
+    MODEL = initialize_model(args.model_dir, args.device)
     # Build and launch interface
     logging.info("Building Gradio interface...")
     demo = build_ui()
     logging.info(f"Launching web interface on {args.server_name}:{args.server_port}")
+    logging.info(f"Device: {DEVICE}")
     demo.launch(
         server_name=args.server_name,
         server_port=args.server_port,