jerrybwang committed on
Commit
ea52dd2
·
1 Parent(s): 289115a
Files changed (3) hide show
  1. README.md +5 -5
  2. app.py +22 -18
  3. requirements.txt +2 -1
README.md CHANGED
@@ -9,9 +9,9 @@ app_file: app.py
9
  pinned: false
10
  ---
11
 
12
- # FunAudioLLM/Fun-CosyVoice3 Text-to-Speech Demo
13
 
14
- A HuggingFace Space demo showcasing the FunAudioLLM/Fun-CosyVoice3 text-to-speech model.
15
 
16
  ## 🎯 Features
17
 
@@ -30,16 +30,16 @@ A HuggingFace Space demo showcasing the FunAudioLLM/Fun-CosyVoice3 text-to-speec
30
  ## 💡 Examples
31
 
32
  Try these example texts:
33
- - "Hello, welcome to the FunAudioLLM text-to-speech demo!"
34
  - "The quick brown fox jumps over the lazy dog."
35
  - "今天天气真好,适合出去散步。"
36
 
37
  ## 🔧 Technical Details
38
 
39
- - **Model**: FunAudioLLM/Fun-CosyVoice3
40
  - **Framework**: PyTorch + Transformers
41
  - **Interface**: Gradio
42
- - **Sample Rate**: 24kHz
43
 
44
  ## 📦 Dependencies
45
 
 
9
  pinned: false
10
  ---
11
 
12
+ # Microsoft SpeechT5 Text-to-Speech Demo
13
 
14
+ A HuggingFace Space demo showcasing the Microsoft SpeechT5 text-to-speech model.
15
 
16
  ## 🎯 Features
17
 
 
30
  ## 💡 Examples
31
 
32
  Try these example texts:
33
+ - "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!"
34
  - "The quick brown fox jumps over the lazy dog."
35
  - "今天天气真好,适合出去散步。"
36
 
37
  ## 🔧 Technical Details
38
 
39
+ - **Model**: Microsoft SpeechT5
40
  - **Framework**: PyTorch + Transformers
41
  - **Interface**: Gradio
42
+ - **Sample Rate**: 16kHz
43
 
44
  ## 📦 Dependencies
45
 
app.py CHANGED
@@ -1,41 +1,45 @@
1
  import gradio as gr
2
  import torch
3
- from transformers import AutoProcessor, AutoModel
4
- import scipy.io.wavfile
5
  import numpy as np
6
- import io
7
 
8
- # Load FunAudioLLM/Fun-CosyVoice3 model
9
  def load_model():
10
  """Load the text-to-speech model"""
11
- processor = AutoProcessor.from_pretrained("FunAudioLLM/Fun-CosyVoice3")
12
- model = AutoModel.from_pretrained("FunAudioLLM/Fun-CosyVoice3")
13
- return processor, model
 
14
 
15
  # Text-to-speech function
16
- def text_to_speech(text, processor, model):
17
- """Convert text to speech using Fun-CosyVoice3 model"""
18
  try:
19
  # Process the input text
20
  inputs = processor(text=text, return_tensors="pt")
21
 
 
 
 
 
 
22
  # Generate speech
23
  with torch.no_grad():
24
- speech = model.generate(**inputs)
25
 
26
  # Convert to numpy array and normalize
27
  speech = speech.cpu().numpy().squeeze()
28
  speech = speech / np.max(np.abs(speech)) * 0.8 # Normalize to prevent clipping
29
 
30
- return speech, 24000 # Return audio data and sample rate
31
  except Exception as e:
32
  raise gr.Error(f"Error generating speech: {str(e)}")
33
 
34
  # Main function
35
  def main():
36
  # Load model once at startup
37
- print("Loading FunAudioLLM/Fun-CosyVoice3 model...")
38
- processor, model = load_model()
39
  print("Model loaded successfully!")
40
 
41
  def generate_speech(text):
@@ -44,7 +48,7 @@ def main():
44
  return None, "Please enter some text to convert to speech."
45
 
46
  try:
47
- audio_data, sample_rate = text_to_speech(text, processor, model)
48
 
49
  # Return audio file
50
  return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"
@@ -52,11 +56,11 @@ def main():
52
  return None, f"Error: {str(e)}"
53
 
54
  # Create Gradio interface
55
- with gr.Blocks(title="FunAudioLLM/Fun-CosyVoice3 Text-to-Speech") as demo:
56
  gr.Markdown("""
57
- # 🎤 FunAudioLLM/Fun-CosyVoice3 Text-to-Speech
58
 
59
- Convert your text to natural-sounding speech using the FunAudioLLM/Fun-CosyVoice3 model.
60
  """)
61
 
62
  with gr.Row():
@@ -76,7 +80,7 @@ def main():
76
  # Examples
77
  gr.Examples(
78
  examples=[
79
- "Hello, welcome to the FunAudioLLM text-to-speech demo!",
80
  "The quick brown fox jumps over the lazy dog.",
81
  "Artificial intelligence is transforming the way we interact with technology.",
82
  "今天天气真好,适合出去散步。"
 
1
  import gradio as gr
2
  import torch
3
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 
4
  import numpy as np
 
5
 
6
+ # Load Microsoft SpeechT5 model
7
  def load_model():
8
  """Load the text-to-speech model"""
9
+ processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
10
+ model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
11
+ vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
12
+ return processor, model, vocoder
13
 
14
  # Text-to-speech function
15
+ def text_to_speech(text, processor, model, vocoder):
16
+ """Convert text to speech using SpeechT5 model"""
17
  try:
18
  # Process the input text
19
  inputs = processor(text=text, return_tensors="pt")
20
 
21
+ # Load speaker embeddings (use a default speaker)
22
+ from datasets import load_dataset
23
+ embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
24
+ speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
25
+
26
  # Generate speech
27
  with torch.no_grad():
28
+ speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
29
 
30
  # Convert to numpy array and normalize
31
  speech = speech.cpu().numpy().squeeze()
32
  speech = speech / np.max(np.abs(speech)) * 0.8 # Normalize to prevent clipping
33
 
34
+ return speech, 16000 # Return audio data and sample rate
35
  except Exception as e:
36
  raise gr.Error(f"Error generating speech: {str(e)}")
37
 
38
  # Main function
39
  def main():
40
  # Load model once at startup
41
+ print("Loading Microsoft SpeechT5 model...")
42
+ processor, model, vocoder = load_model()
43
  print("Model loaded successfully!")
44
 
45
  def generate_speech(text):
 
48
  return None, "Please enter some text to convert to speech."
49
 
50
  try:
51
+ audio_data, sample_rate = text_to_speech(text, processor, model, vocoder)
52
 
53
  # Return audio file
54
  return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"
 
56
  return None, f"Error: {str(e)}"
57
 
58
  # Create Gradio interface
59
+ with gr.Blocks(title="Microsoft SpeechT5 Text-to-Speech") as demo:
60
  gr.Markdown("""
61
+ # 🎤 Microsoft SpeechT5 Text-to-Speech
62
 
63
+ Convert your text to natural-sounding speech using the Microsoft SpeechT5 model.
64
  """)
65
 
66
  with gr.Row():
 
80
  # Examples
81
  gr.Examples(
82
  examples=[
83
+ "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!",
84
  "The quick brown fox jumps over the lazy dog.",
85
  "Artificial intelligence is transforming the way we interact with technology.",
86
  "今天天气真好,适合出去散步。"
requirements.txt CHANGED
@@ -2,4 +2,5 @@ gradio==6.4.0
2
  torch>=2.0.0
3
  transformers>=4.35.0
4
  scipy>=1.10.0
5
- numpy>=1.24.0
 
 
2
  torch>=2.0.0
3
  transformers>=4.35.0
4
  scipy>=1.10.0
5
+ numpy>=1.24.0
6
+ datasets>=2.10.0