Spaces:

WJBSCUT
/

VoiceDemo

Runtime error

App Files Community

jerrybwang committed 14 days ago

Commit

ea52dd2

1 Parent(s): 289115a

11

Browse files

Files changed (3) hide show

README.md +5 -5
app.py +22 -18
requirements.txt +2 -1

README.md CHANGED Viewed

@@ -9,9 +9,9 @@ app_file: app.py
 pinned: false
 ---
-# FunAudioLLM/Fun-CosyVoice3 Text-to-Speech Demo
-A HuggingFace Space demo showcasing the FunAudioLLM/Fun-CosyVoice3 text-to-speech model.
 ## 🎯 Features
@@ -30,16 +30,16 @@ A HuggingFace Space demo showcasing the FunAudioLLM/Fun-CosyVoice3 text-to-speec
 ## 💡 Examples
 Try these example texts:
-- "Hello, welcome to the FunAudioLLM text-to-speech demo!"
 - "The quick brown fox jumps over the lazy dog."
 - "今天天气真好，适合出去散步。"
 ## 🔧 Technical Details
-- **Model**: FunAudioLLM/Fun-CosyVoice3
 - **Framework**: PyTorch + Transformers
 - **Interface**: Gradio
-- **Sample Rate**: 24kHz
 ## 📦 Dependencies

 pinned: false
 ---
+# Microsoft SpeechT5 Text-to-Speech Demo
+A HuggingFace Space demo showcasing the Microsoft SpeechT5 text-to-speech model.
 ## 🎯 Features
 ## 💡 Examples
 Try these example texts:
+- "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!"
 - "The quick brown fox jumps over the lazy dog."
 - "今天天气真好，适合出去散步。"
 ## 🔧 Technical Details
+- **Model**: Microsoft SpeechT5
 - **Framework**: PyTorch + Transformers
 - **Interface**: Gradio
+- **Sample Rate**: 16kHz
 ## 📦 Dependencies

app.py CHANGED Viewed

@@ -1,41 +1,45 @@
 import gradio as gr
 import torch
-from transformers import AutoProcessor, AutoModel
-import scipy.io.wavfile
 import numpy as np
-import io
-# Load FunAudioLLM/Fun-CosyVoice3 model
 def load_model():
     """Load the text-to-speech model"""
-    processor = AutoProcessor.from_pretrained("FunAudioLLM/Fun-CosyVoice3")
-    model = AutoModel.from_pretrained("FunAudioLLM/Fun-CosyVoice3")
-    return processor, model
 # Text-to-speech function
-def text_to_speech(text, processor, model):
-    """Convert text to speech using Fun-CosyVoice3 model"""
     try:
         # Process the input text
         inputs = processor(text=text, return_tensors="pt")
         # Generate speech
         with torch.no_grad():
-            speech = model.generate(**inputs)
         # Convert to numpy array and normalize
         speech = speech.cpu().numpy().squeeze()
         speech = speech / np.max(np.abs(speech)) * 0.8  # Normalize to prevent clipping
-        return speech, 24000  # Return audio data and sample rate
     except Exception as e:
         raise gr.Error(f"Error generating speech: {str(e)}")
 # Main function
 def main():
     # Load model once at startup
-    print("Loading FunAudioLLM/Fun-CosyVoice3 model...")
-    processor, model = load_model()
     print("Model loaded successfully!")
     def generate_speech(text):
@@ -44,7 +48,7 @@ def main():
             return None, "Please enter some text to convert to speech."
         try:
-            audio_data, sample_rate = text_to_speech(text, processor, model)
             # Return audio file
             return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"
@@ -52,11 +56,11 @@ def main():
             return None, f"Error: {str(e)}"
     # Create Gradio interface
-    with gr.Blocks(title="FunAudioLLM/Fun-CosyVoice3 Text-to-Speech") as demo:
         gr.Markdown("""
-        # 🎤 FunAudioLLM/Fun-CosyVoice3 Text-to-Speech
-        Convert your text to natural-sounding speech using the FunAudioLLM/Fun-CosyVoice3 model.
         """)
         with gr.Row():
@@ -76,7 +80,7 @@ def main():
         # Examples
         gr.Examples(
             examples=[
-                "Hello, welcome to the FunAudioLLM text-to-speech demo!",
                 "The quick brown fox jumps over the lazy dog.",
                 "Artificial intelligence is transforming the way we interact with technology.",
                 "今天天气真好，适合出去散步。"

 import gradio as gr
 import torch
+from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 import numpy as np
+# Load Microsoft SpeechT5 model
 def load_model():
     """Load the text-to-speech model"""
+    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
+    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
+    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
+    return processor, model, vocoder
 # Text-to-speech function
+def text_to_speech(text, processor, model, vocoder):
+    """Convert text to speech using SpeechT5 model"""
     try:
         # Process the input text
         inputs = processor(text=text, return_tensors="pt")
+        # Load speaker embeddings (use a default speaker)
+        from datasets import load_dataset
+        embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
+        speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
         # Generate speech
         with torch.no_grad():
+            speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
         # Convert to numpy array and normalize
         speech = speech.cpu().numpy().squeeze()
         speech = speech / np.max(np.abs(speech)) * 0.8  # Normalize to prevent clipping
+        return speech, 16000  # Return audio data and sample rate
     except Exception as e:
         raise gr.Error(f"Error generating speech: {str(e)}")
 # Main function
 def main():
     # Load model once at startup
+    print("Loading Microsoft SpeechT5 model...")
+    processor, model, vocoder = load_model()
     print("Model loaded successfully!")
     def generate_speech(text):
             return None, "Please enter some text to convert to speech."
         try:
+            audio_data, sample_rate = text_to_speech(text, processor, model, vocoder)
             # Return audio file
             return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"
             return None, f"Error: {str(e)}"
     # Create Gradio interface
+    with gr.Blocks(title="Microsoft SpeechT5 Text-to-Speech") as demo:
         gr.Markdown("""
+        # 🎤 Microsoft SpeechT5 Text-to-Speech
+        Convert your text to natural-sounding speech using the Microsoft SpeechT5 model.
         """)
         with gr.Row():
         # Examples
         gr.Examples(
             examples=[
+                "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!",
                 "The quick brown fox jumps over the lazy dog.",
                 "Artificial intelligence is transforming the way we interact with technology.",
                 "今天天气真好，适合出去散步。"

requirements.txt CHANGED Viewed

@@ -2,4 +2,5 @@ gradio==6.4.0
 torch>=2.0.0
 transformers>=4.35.0
 scipy>=1.10.0
-numpy>=1.24.0

 torch>=2.0.0
 transformers>=4.35.0
 scipy>=1.10.0
+numpy>=1.24.0
+datasets>=2.10.0