Spaces:

ruslanmv
/

Text-to-Voice

Runtime error

App Files Files Community

ruslanmv committed on Jun 14, 2024

Commit

2dac140

1 Parent(s): a479ddf

updates

Browse files

Files changed (1) hide show

app.py +18 -15

app.py CHANGED Viewed

@@ -2,8 +2,8 @@ import streamlit as st
 import numpy as np
 import torch
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
-import soundfile as sf
 from io import StringIO
 # Load models outside of function calls for efficiency
 @st.cache_data
@@ -23,30 +23,31 @@ def get_speaker_embeddings():
 speaker_embeddings = get_speaker_embeddings()
-# Improved Styling (assuming style.css is present)
 def local_css(file_name):
     """Inject the contents of a CSS file into the Streamlit page.

     The file is read in full and wrapped in a ``<style>`` element that is
     handed to ``st.markdown``.

     Args:
         file_name: Path of the stylesheet to read. It is opened in text
             mode with no explicit encoding, so the platform default applies.

     Raises:
         OSError: If the file cannot be opened (for example, it is missing).
     """
     with open(file_name) as f:
         # The file text is interpolated as-is, without escaping.
         # NOTE(review): unsafe_allow_html=True is presumably required so the
         # <style> tag is rendered as HTML rather than shown as text - confirm.
         st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
-local_css("style.css")  # Apply custom CSS styles
-# Streamlit Layout
 st.title("Text-to-Voice Conversion")
 st.markdown("Convert your text to speech using advanced AI models.")
 # Function to convert text to speech
 def text_to_speech(text):
     try:
         max_length = 100  # Set a max length as per model's capability
         segments = [text[i:i+max_length] for i in range(0, len(text), max_length)]
         audio_paths = []
-        for i, segment in enumerate(segments):
             inputs = processor(text=segment, return_tensors="pt")
             spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
             with torch.no_grad():
                 speech = vocoder(spectrogram)
-                audio_path = f"speech_segment_{i}.wav"
                 sf.write(audio_path, speech.numpy(), samplerate=16000)
                 audio_paths.append(audio_path)
@@ -64,28 +65,30 @@ def combine_audio_segments(paths):
     sf.write("combined_speech.wav", np.array(combined_speech), samplerate)
     return "combined_speech.wav"
-# Text Input and Conversion Button
-text = st.text_area("Type your text here.")
 if st.button("Convert"):
     if text:
         audio_paths = text_to_speech(text)
         combined_audio_path = combine_audio_segments(audio_paths)
-        audio_bytes = open(combined_audio_path, 'rb').read()
         st.audio(audio_bytes, format='audio/wav')
     else:
         st.error("Please enter some text to convert.")
-# File Uploader and Conversion Button
-uploaded_file = st.file_uploader("Upload a text file here", type=['txt'])
 if uploaded_file is not None:
     stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
     text = stringio.read()
     st.write(text)
-    if st.button("Convert Uploaded File", key="upload"):
         audio_paths = text_to_speech(text)
         combined_audio_path = combine_audio_segments(audio_paths)
-        audio_bytes = open(combined_audio_path, 'rb').read()
-        st.audio(audio_bytes, format='audio/wav')

 import numpy as np
 import torch
 from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
 from io import StringIO
+import soundfile as sf
 # Load models outside of function calls for efficiency
 @st.cache_data
 speaker_embeddings = get_speaker_embeddings()
+# Improved Styling
 def local_css(file_name):
     """Inject the contents of a CSS file into the Streamlit page.

     The file is read in full and wrapped in a ``<style>`` element that is
     handed to ``st.markdown``.

     Args:
         file_name: Path of the stylesheet to read. It is opened in text
             mode with no explicit encoding, so the platform default applies.

     Raises:
         OSError: If the file cannot be opened (for example, it is missing).
     """
     with open(file_name) as f:
         # The file text is interpolated as-is, without escaping.
         # NOTE(review): unsafe_allow_html=True is presumably required so the
         # <style> tag is rendered as HTML rather than shown as text - confirm.
         st.markdown(f'<style>{f.read()}</style>', unsafe_allow_html=True)
+local_css("style.css")
+# Streamlined Layout
 st.title("Text-to-Voice Conversion")
 st.markdown("Convert your text to speech using advanced AI models.")
 # Function to convert text to speech
 def text_to_speech(text):
     try:
+        # Segment the text if it's too long
         max_length = 100  # Set a max length as per model's capability
         segments = [text[i:i+max_length] for i in range(0, len(text), max_length)]
         audio_paths = []
+        for segment in segments:
             inputs = processor(text=segment, return_tensors="pt")
             spectrogram = model.generate_speech(inputs["input_ids"], speaker_embeddings)
             with torch.no_grad():
                 speech = vocoder(spectrogram)
+                audio_path = f"speech_segment_{len(audio_paths)}.wav"
                 sf.write(audio_path, speech.numpy(), samplerate=16000)
                 audio_paths.append(audio_path)
     sf.write("combined_speech.wav", np.array(combined_speech), samplerate)
     return "combined_speech.wav"
+# Text Input
+text = st.text_area("Type your text or upload a text file below.")
+# Convert Button
 if st.button("Convert"):
     if text:
         audio_paths = text_to_speech(text)
         combined_audio_path = combine_audio_segments(audio_paths)
+        audio_file = open(combined_audio_path, 'rb')
+        audio_bytes = audio_file.read()
         st.audio(audio_bytes, format='audio/wav')
     else:
         st.error("Please enter some text to convert.")
+# File Uploader
+uploaded_file = st.file_uploader("Upload your text file here", type=['txt'])
 if uploaded_file is not None:
     stringio = StringIO(uploaded_file.getvalue().decode("utf-8"))
     text = stringio.read()
     st.write(text)
+    if st.button("Convert Uploaded File", key=1):
         audio_paths = text_to_speech(text)
         combined_audio_path = combine_audio_segments(audio_paths)
+        audio_file = open(combined_audio_path, 'rb')
+        audio_bytes = audio_file.read()
+        st.audio(audio_bytes, format='audio/wav')