Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import numpy as np | |
| import torch | |
| from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan | |
| from io import BytesIO | |
| import soundfile as sf | |
| # Load models outside of function calls for efficiency | |
def load_models():
    """Load the pretrained SpeechT5 text-to-speech components.

    Returns:
        A ``(model, processor, vocoder)`` tuple: the ``SpeechT5ForTextToSpeech``
        model and ``SpeechT5Processor`` from the ``microsoft/speecht5_tts``
        checkpoint, and the ``SpeechT5HifiGan`` vocoder from
        ``microsoft/speecht5_hifigan``.
    """
    tts_checkpoint = "microsoft/speecht5_tts"
    vocoder_checkpoint = "microsoft/speecht5_hifigan"
    # Tuple elements are evaluated left to right, so the load order stays
    # model -> processor -> vocoder.
    return (
        SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint),
        SpeechT5Processor.from_pretrained(tts_checkpoint),
        SpeechT5HifiGan.from_pretrained(vocoder_checkpoint),
    )


# Instantiate once at import time so every request reuses the same objects.
model, processor, vocoder = load_models()
| # Load speaker embeddings | |
def get_speaker_embeddings():
    """Load the speaker embedding stored in the bundled ``.npy`` file.

    Returns:
        A ``torch.Tensor`` built from the array in
        ``cmu_us_clb_arctic-wav-arctic_a0144.npy`` with one extra leading
        dimension of size 1 (presumably the batch axis expected by the
        model -- confirm against the SpeechT5 docs).
    """
    xvector = torch.tensor(np.load("cmu_us_clb_arctic-wav-arctic_a0144.npy"))
    return torch.unsqueeze(xvector, 0)


# Loaded once at import time and shared by every synthesis call.
speaker_embeddings = get_speaker_embeddings()
| # Function to convert text to speech | |
def _split_text(text, max_length):
    """Split ``text`` into chunks of at most ``max_length`` characters.

    Chunks are built from whitespace-separated words so that a word is never
    cut in half unless the word alone is longer than ``max_length``. Runs of
    whitespace (including newlines) collapse to a single space.
    """
    chunks = []
    current = ""
    for word in text.split():
        # A word longer than the limit can never fit in any chunk: flush what
        # has been collected so far and hard-split the word itself.
        while len(word) > max_length:
            if current:
                chunks.append(current)
                current = ""
            chunks.append(word[:max_length])
            word = word[max_length:]
        candidate = f"{current} {word}" if current else word
        if len(candidate) <= max_length:
            current = candidate
        else:
            # ``current`` is non-empty here: a lone word always fits.
            chunks.append(current)
            current = word
    if current:
        chunks.append(current)
    return chunks


def text_to_speech(text, max_length=100):
    """Convert ``text`` to audio using the module-level SpeechT5 objects.

    The text is synthesized in chunks of at most ``max_length`` characters
    (split on word boundaries) and the resulting waveforms are joined.

    Args:
        text: The text to synthesize.
        max_length: Maximum number of characters sent to the model per chunk.
            Must be at least 1.

    Returns:
        ``(16000, waveform)`` on success, where ``waveform`` is a NumPy array
        holding the concatenated audio of all chunks, or ``(None, message)``
        when the input is empty or any step fails. This function does not
        raise; failures are reported through the second tuple element.
    """
    try:
        # Nothing to synthesize: report it instead of returning empty audio.
        if not text or not text.strip():
            return None, "Error in text-to-speech conversion: input text is empty"
        if max_length < 1:
            raise ValueError("max_length must be at least 1")

        waveforms = []
        with torch.no_grad():
            for segment in _split_text(text, max_length):
                inputs = processor(text=segment, return_tensors="pt")
                spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
                # Assumes the vocoder yields a 1-D waveform per segment, as the
                # original element-wise accumulation did -- TODO confirm.
                waveforms.append(vocoder(spectrogram).numpy())

        # One concatenation instead of extending a Python list sample by sample.
        return 16000, np.concatenate(waveforms)
    except Exception as e:
        # Top-level boundary for the UI: turn any failure into an error tuple.
        return None, f"Error in text-to-speech conversion: {e}"
| # Gradio Interface | |
def gradio_interface(text):
    """Adapt ``text_to_speech`` to the value expected by the audio output.

    Returns:
        The ``(sample_rate, audio_array)`` tuple when synthesis succeeded,
        otherwise ``None``.
    """
    rate, payload = text_to_speech(text)
    # On failure the rate is falsy and/or the payload is an error string
    # rather than an array; in that case there is no audio to return.
    if not rate or not isinstance(payload, np.ndarray):
        return None
    return rate, payload
# Single-page UI: a multi-line textbox as input and an audio component as
# output, wired to gradio_interface.
interface = gr.Interface(
    fn=gradio_interface,
    title="Text to Voice",
    # Fixed the user-facing typo "Hight" -> "High"; the rest of the text and
    # the HTML link are unchanged.
    description=(
        "High Fidelity TTS. Visit <a href='https://ruslanmv.com/' "
        "target='_blank'>ruslanmv.com</a> for more information."
    ),
    inputs=gr.Textbox(lines=10, label="Enter text to convert to speech"),
    outputs=gr.Audio(label="Generated audio"),
)

# Launch the app as soon as the module is executed.
interface.launch()