jerrybwang
committed on
Commit
·
ea52dd2
1
Parent(s):
289115a
11
Browse files- README.md +5 -5
- app.py +22 -18
- requirements.txt +2 -1
README.md
CHANGED
|
@@ -9,9 +9,9 @@ app_file: app.py
|
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
-
#
|
| 13 |
|
| 14 |
-
A HuggingFace Space demo showcasing the
|
| 15 |
|
| 16 |
## 🎯 Features
|
| 17 |
|
|
@@ -30,16 +30,16 @@ A HuggingFace Space demo showcasing the FunAudioLLM/Fun-CosyVoice3 text-to-speec
|
|
| 30 |
## 💡 Examples
|
| 31 |
|
| 32 |
Try these example texts:
|
| 33 |
-
- "Hello, welcome to the
|
| 34 |
- "The quick brown fox jumps over the lazy dog."
|
| 35 |
- "今天天气真好，适合出去散步。"
|
| 36 |
|
| 37 |
## 🔧 Technical Details
|
| 38 |
|
| 39 |
-
- **Model**:
|
| 40 |
- **Framework**: PyTorch + Transformers
|
| 41 |
- **Interface**: Gradio
|
| 42 |
-
- **Sample Rate**:
|
| 43 |
|
| 44 |
## 📦 Dependencies
|
| 45 |
|
|
|
|
| 9 |
pinned: false
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Microsoft SpeechT5 Text-to-Speech Demo
|
| 13 |
|
| 14 |
+
A HuggingFace Space demo showcasing the Microsoft SpeechT5 text-to-speech model.
|
| 15 |
|
| 16 |
## 🎯 Features
|
| 17 |
|
|
|
|
| 30 |
## 💡 Examples
|
| 31 |
|
| 32 |
Try these example texts:
|
| 33 |
+
- "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!"
|
| 34 |
- "The quick brown fox jumps over the lazy dog."
|
| 35 |
- "今天天气真好，适合出去散步。"
|
| 36 |
|
| 37 |
## 🔧 Technical Details
|
| 38 |
|
| 39 |
+
- **Model**: Microsoft SpeechT5
|
| 40 |
- **Framework**: PyTorch + Transformers
|
| 41 |
- **Interface**: Gradio
|
| 42 |
+
- **Sample Rate**: 16kHz
|
| 43 |
|
| 44 |
## 📦 Dependencies
|
| 45 |
|
app.py
CHANGED
|
@@ -1,41 +1,45 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
-
from transformers import
|
| 4 |
-
import scipy.io.wavfile
|
| 5 |
import numpy as np
|
| 6 |
-
import io
|
| 7 |
|
| 8 |
-
# Load
|
| 9 |
def load_model():
|
| 10 |
"""Load the text-to-speech model"""
|
| 11 |
-
processor =
|
| 12 |
-
model =
|
| 13 |
-
|
|
|
|
| 14 |
|
| 15 |
# Text-to-speech function
|
| 16 |
-
def text_to_speech(text, processor, model):
|
| 17 |
-
"""Convert text to speech using
|
| 18 |
try:
|
| 19 |
# Process the input text
|
| 20 |
inputs = processor(text=text, return_tensors="pt")
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
# Generate speech
|
| 23 |
with torch.no_grad():
|
| 24 |
-
speech = model.
|
| 25 |
|
| 26 |
# Convert to numpy array and normalize
|
| 27 |
speech = speech.cpu().numpy().squeeze()
|
| 28 |
speech = speech / np.max(np.abs(speech)) * 0.8 # Normalize to prevent clipping
|
| 29 |
|
| 30 |
-
return speech,
|
| 31 |
except Exception as e:
|
| 32 |
raise gr.Error(f"Error generating speech: {str(e)}")
|
| 33 |
|
| 34 |
# Main function
|
| 35 |
def main():
|
| 36 |
# Load model once at startup
|
| 37 |
-
print("Loading
|
| 38 |
-
processor, model = load_model()
|
| 39 |
print("Model loaded successfully!")
|
| 40 |
|
| 41 |
def generate_speech(text):
|
|
@@ -44,7 +48,7 @@ def main():
|
|
| 44 |
return None, "Please enter some text to convert to speech."
|
| 45 |
|
| 46 |
try:
|
| 47 |
-
audio_data, sample_rate = text_to_speech(text, processor, model)
|
| 48 |
|
| 49 |
# Return audio file
|
| 50 |
return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"
|
|
@@ -52,11 +56,11 @@ def main():
|
|
| 52 |
return None, f"Error: {str(e)}"
|
| 53 |
|
| 54 |
# Create Gradio interface
|
| 55 |
-
with gr.Blocks(title="
|
| 56 |
gr.Markdown("""
|
| 57 |
-
# 🎤
|
| 58 |
|
| 59 |
-
Convert your text to natural-sounding speech using the
|
| 60 |
""")
|
| 61 |
|
| 62 |
with gr.Row():
|
|
@@ -76,7 +80,7 @@ def main():
|
|
| 76 |
# Examples
|
| 77 |
gr.Examples(
|
| 78 |
examples=[
|
| 79 |
-
"Hello, welcome to the
|
| 80 |
"The quick brown fox jumps over the lazy dog.",
|
| 81 |
"Artificial intelligence is transforming the way we interact with technology.",
|
| 82 |
"今天天气真好，适合出去散步。"
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
+
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
|
|
|
|
| 4 |
import numpy as np
|
|
|
|
| 5 |
|
| 6 |
+
# Load Microsoft SpeechT5 model
|
| 7 |
def load_model():
|
| 8 |
"""Load the text-to-speech model"""
|
| 9 |
+
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
|
| 10 |
+
model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
|
| 11 |
+
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
|
| 12 |
+
return processor, model, vocoder
|
| 13 |
|
| 14 |
# Text-to-speech function
|
| 15 |
+
def text_to_speech(text, processor, model, vocoder):
|
| 16 |
+
"""Convert text to speech using SpeechT5 model"""
|
| 17 |
try:
|
| 18 |
# Process the input text
|
| 19 |
inputs = processor(text=text, return_tensors="pt")
|
| 20 |
|
| 21 |
+
# Load speaker embeddings (use a default speaker)
|
| 22 |
+
from datasets import load_dataset
|
| 23 |
+
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
|
| 24 |
+
speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
|
| 25 |
+
|
| 26 |
# Generate speech
|
| 27 |
with torch.no_grad():
|
| 28 |
+
speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
|
| 29 |
|
| 30 |
# Convert to numpy array and normalize
|
| 31 |
speech = speech.cpu().numpy().squeeze()
|
| 32 |
speech = speech / np.max(np.abs(speech)) * 0.8 # Normalize to prevent clipping
|
| 33 |
|
| 34 |
+
return speech, 16000 # Return audio data and sample rate
|
| 35 |
except Exception as e:
|
| 36 |
raise gr.Error(f"Error generating speech: {str(e)}")
|
| 37 |
|
| 38 |
# Main function
|
| 39 |
def main():
|
| 40 |
# Load model once at startup
|
| 41 |
+
print("Loading Microsoft SpeechT5 model...")
|
| 42 |
+
processor, model, vocoder = load_model()
|
| 43 |
print("Model loaded successfully!")
|
| 44 |
|
| 45 |
def generate_speech(text):
|
|
|
|
| 48 |
return None, "Please enter some text to convert to speech."
|
| 49 |
|
| 50 |
try:
|
| 51 |
+
audio_data, sample_rate = text_to_speech(text, processor, model, vocoder)
|
| 52 |
|
| 53 |
# Return audio file
|
| 54 |
return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"
|
|
|
|
| 56 |
return None, f"Error: {str(e)}"
|
| 57 |
|
| 58 |
# Create Gradio interface
|
| 59 |
+
with gr.Blocks(title="Microsoft SpeechT5 Text-to-Speech") as demo:
|
| 60 |
gr.Markdown("""
|
| 61 |
+
# 🎤 Microsoft SpeechT5 Text-to-Speech
|
| 62 |
|
| 63 |
+
Convert your text to natural-sounding speech using the Microsoft SpeechT5 model.
|
| 64 |
""")
|
| 65 |
|
| 66 |
with gr.Row():
|
|
|
|
| 80 |
# Examples
|
| 81 |
gr.Examples(
|
| 82 |
examples=[
|
| 83 |
+
"Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!",
|
| 84 |
"The quick brown fox jumps over the lazy dog.",
|
| 85 |
"Artificial intelligence is transforming the way we interact with technology.",
|
| 86 |
"今天天气真好，适合出去散步。"
|
requirements.txt
CHANGED
|
@@ -2,4 +2,5 @@ gradio==6.4.0
|
|
| 2 |
torch>=2.0.0
|
| 3 |
transformers>=4.35.0
|
| 4 |
scipy>=1.10.0
|
| 5 |
-
numpy>=1.24.0
|
|
|
|
|
|
| 2 |
torch>=2.0.0
|
| 3 |
transformers>=4.35.0
|
| 4 |
scipy>=1.10.0
|
| 5 |
+
numpy>=1.24.0
|
| 6 |
+
datasets>=2.10.0
|