ruslanmv's picture
First commit
0a6371e
raw
history blame
2.36 kB
import gradio as gr
import numpy as np
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from io import BytesIO
import soundfile as sf
# Load models outside of function calls for efficiency
def load_models():
    """Load the pretrained SpeechT5 text-to-speech components.

    Returns:
        tuple: ``(model, processor, vocoder)`` -- the
        ``SpeechT5ForTextToSpeech`` model and its ``SpeechT5Processor``
        (both from ``microsoft/speecht5_tts``) and the ``SpeechT5HifiGan``
        vocoder (from ``microsoft/speecht5_hifigan``).
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    vocoder_checkpoint = "microsoft/speecht5_hifigan"

    tts_model = SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint)
    text_processor = SpeechT5Processor.from_pretrained(tts_checkpoint)
    hifigan_vocoder = SpeechT5HifiGan.from_pretrained(vocoder_checkpoint)
    return tts_model, text_processor, hifigan_vocoder
# Instantiate the model, processor and vocoder once at import time so that
# every call to text_to_speech reuses the same objects.
model, processor, vocoder = load_models()
# Load speaker embeddings
def get_speaker_embeddings():
    """Read the speaker embedding from disk and add a leading axis.

    Returns:
        torch.Tensor: the contents of
        ``cmu_us_clb_arctic-wav-arctic_a0144.npy`` (resolved relative to the
        current working directory) with a new dimension of size 1 inserted
        at position 0.
    """
    embedding_array = np.load("cmu_us_clb_arctic-wav-arctic_a0144.npy")
    embedding_tensor = torch.tensor(embedding_array)
    return embedding_tensor.unsqueeze(0)
# Load the speaker embedding once at import time; text_to_speech passes this
# same tensor to model.generate_speech for every text segment.
speaker_embeddings = get_speaker_embeddings()
# Function to convert text to speech
def text_to_speech(text, max_length=100):
    """Synthesize speech for ``text`` with SpeechT5 and the HiFi-GAN vocoder.

    The text is split into segments of at most ``max_length`` characters,
    each segment is synthesized independently, and the resulting waveforms
    are concatenated along their first axis.

    Args:
        text: The text to convert to speech.
        max_length: Maximum number of characters per segment. Defaults to
            100, the value that was previously hard-coded.

    Returns:
        tuple: ``(16000, waveform)`` on success, where ``waveform`` is a
        NumPy array holding the concatenated audio; ``(None, message)`` when
        the text is empty/blank or synthesis raised an exception, where
        ``message`` is a human-readable error string.
    """
    # Local import keeps this block self-contained; textwrap is stdlib.
    import textwrap

    sample_rate = 16000

    # Nothing to synthesize: report it through the same error channel as
    # other failures instead of returning an empty array as "success".
    if not text or not text.strip():
        return None, "Error in text-to-speech conversion: no text provided"

    try:
        # Split on whitespace so words are not cut in half at segment
        # boundaries; words longer than max_length are still hard-split, so
        # no segment ever exceeds max_length characters.
        segments = textwrap.wrap(text, width=max_length)

        waveforms = []
        with torch.no_grad():
            for segment in segments:
                inputs = processor(text=segment, return_tensors="pt")
                spectrogram = model.generate_speech(
                    inputs["input_ids"], speaker_embeddings
                )
                # Keep each segment as one array; extending a Python list
                # sample-by-sample is needlessly slow for long audio.
                waveforms.append(vocoder(spectrogram).numpy())

        return sample_rate, np.concatenate(waveforms)
    except Exception as e:
        # Top-level boundary for the UI: callers expect (None, message)
        # rather than a propagated exception.
        return None, f"Error in text-to-speech conversion: {e}"
# Gradio Interface
def gradio_interface(text):
    """Gradio callback: convert ``text`` to audio for the ``gr.Audio`` output.

    Args:
        text: The contents of the input textbox.

    Returns:
        tuple: ``(sample_rate, audio_data)`` as returned by
        ``text_to_speech`` when synthesis succeeded.

    Raises:
        gr.Error: If ``text_to_speech`` reported a failure. The error carries
            the message from ``text_to_speech`` so the user sees why no
            audio was produced instead of a silently empty output.
    """
    sample_rate, audio_data = text_to_speech(text)
    if sample_rate and isinstance(audio_data, np.ndarray):
        return sample_rate, audio_data
    # On failure text_to_speech puts its error message in the second slot;
    # surface it in the UI rather than discarding it.
    raise gr.Error(str(audio_data))
# Build the web UI: a multi-line textbox as input, an audio player as output,
# with gradio_interface as the callback connecting them.
interface = gr.Interface(
    fn=gradio_interface,
    title="Text to Voice",
    description=(
        "High Fidelity TTS. Visit <a href='https://ruslanmv.com/' "
        "target='_blank'>ruslanmv.com</a> for more information."
    ),
    inputs=gr.Textbox(lines=10, label="Enter text to convert to speech"),
    outputs=gr.Audio(label="Generated audio"),
)
# Start the Gradio server when this script runs.
interface.launch()