Hugging Face Space source listing (Space build status at capture time: Runtime error).
from io import StringIO

import numpy as np
import soundfile as sf
import streamlit as st
import torch
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
# Cache the heavy model objects across Streamlit script reruns: Streamlit
# re-executes this whole file on every widget interaction, so without the
# cache decorator all three checkpoints would be re-downloaded/re-loaded
# on each button click.
@st.cache_resource
def load_models():
    """Load the SpeechT5 TTS model, its text processor, and the HiFi-GAN vocoder.

    Returns:
        tuple: (model, processor, vocoder) ready for inference on CPU.
    """
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    return model, processor, vocoder

model, processor, vocoder = load_models()
# Cache the speaker embedding so the .npy file is read once, not on every
# Streamlit rerun.
@st.cache_resource
def get_speaker_embeddings():
    """Load the x-vector speaker embedding and add a batch dimension.

    Returns:
        torch.Tensor: embedding of shape (1, embedding_dim) — presumably a
        512-dim CMU ARCTIC x-vector, as SpeechT5 expects; confirm against
        the bundled .npy file.
    """
    speaker_embeddings = np.load("cmu_us_clb_arctic-wav-arctic_a0144.npy")
    return torch.tensor(speaker_embeddings).unsqueeze(0)

speaker_embeddings = get_speaker_embeddings()
def local_css(file_name):
    """Inject a local CSS file into the Streamlit page.

    Args:
        file_name (str): path to a CSS file relative to the app root.

    A missing stylesheet is cosmetic only, so it is reported as a warning
    instead of crashing the whole app (the original raised an unhandled
    FileNotFoundError and took the page down with it).
    """
    try:
        # Explicit encoding avoids platform-dependent default-codec surprises.
        with open(file_name, encoding="utf-8") as f:
            st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
    except FileNotFoundError:
        st.warning(f"Stylesheet not found: {file_name}")

local_css("style.css")
# Page header: title plus a one-line tagline under it.
st.title("Text-to-Voice Conversion")
st.markdown("Convert your text to speech using advanced AI models.")
def _split_text(text, max_length=100):
    """Split *text* into chunks of at most *max_length* characters.

    Splits on whitespace so words are kept intact (the original sliced at
    fixed 100-char offsets, cutting words in half and producing garbled
    audio at segment boundaries). A single word longer than max_length is
    hard-split as a last resort.
    """
    chunks = []
    current = ""
    for word in text.split():
        candidate = f"{current} {word}".strip()
        if len(candidate) <= max_length:
            current = candidate
            continue
        if current:
            chunks.append(current)
        while len(word) > max_length:
            chunks.append(word[:max_length])
            word = word[max_length:]
        current = word
    if current:
        chunks.append(current)
    return chunks

# Function to convert text to speech
def text_to_speech(text):
    """Synthesize *text* to one WAV file per segment.

    Args:
        text (str): input text; segmented to respect the model's
            input-length limits.

    Returns:
        list[str]: paths of the written segment WAV files (16 kHz);
        empty list when synthesis fails or *text* has no words.
    """
    try:
        audio_paths = []
        for segment in _split_text(text, max_length=100):
            inputs = processor(text=segment, return_tensors="pt")
            # Inference only — no_grad now also covers generate_speech,
            # which the original left tracking gradients for no reason.
            with torch.no_grad():
                spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
                speech = vocoder(spectrogram)
            audio_path = f"speech_segment_{len(audio_paths)}.wav"
            sf.write(audio_path, speech.numpy(), samplerate=16000)
            audio_paths.append(audio_path)
        return audio_paths
    except Exception as e:
        # Top-level UI boundary: surface the error to the user, return a
        # sentinel the callers already check.
        st.error(f"Error in text-to-speech conversion: {e}")
        return []
# Function to combine audio segments
def combine_audio_segments(paths):
    """Concatenate segment WAV files into a single "combined_speech.wav".

    Args:
        paths (list[str]): WAV files to join, in playback order. All are
            assumed to share one sample rate (they do — text_to_speech
            writes every segment at 16 kHz); the first file's rate is used.

    Returns:
        str | None: path of the combined file, or None when *paths* is
        empty (the original raised NameError on an unbound ``samplerate``
        in that case).
    """
    if not paths:
        return None
    segments = []
    samplerate = None
    for path in paths:
        data, samplerate = sf.read(path)
        segments.append(data)
    # np.concatenate is the O(n) vectorized form of the original
    # list.extend + np.array round-trip.
    sf.write("combined_speech.wav", np.concatenate(segments), samplerate)
    return "combined_speech.wav"
# Text Input
text = st.text_area("Type your text or upload a text file below.")

# Convert Button
if st.button("Convert"):
    if text:
        audio_paths = text_to_speech(text)
        # text_to_speech returns [] on failure; only proceed with real output.
        if audio_paths:
            combined_audio_path = combine_audio_segments(audio_paths)
            # Context manager closes the handle — the original leaked an
            # open file descriptor on every conversion.
            with open(combined_audio_path, 'rb') as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format='audio/wav')
    else:
        st.error("Please enter some text to convert.")
# File Uploader: alternative input path — convert an uploaded .txt file.
uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
if uploaded_file is not None:
    stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
    text = stringio.read()
    # Echo the file contents so the user can confirm what will be converted.
    st.write(text)
    # key=1 keeps this button distinct from the main "Convert" button.
    if st.button("Convert Uploaded File", key=1):
        audio_paths = text_to_speech(text)
        if audio_paths:
            combined_audio_path = combine_audio_segments(audio_paths)
            # Context manager closes the handle — the original leaked an
            # open file descriptor on every conversion.
            with open(combined_audio_path, 'rb') as audio_file:
                audio_bytes = audio_file.read()
            st.audio(audio_bytes, format='audio/wav')