rockerritesh committed on
Commit
98ed243
·
verified ·
1 Parent(s): 8242dde

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +172 -0
app.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
4
+ import soundfile as sf
5
+ import io
6
+ import numpy as np
7
+ import base64
8
+
9
# Page-level settings handed to Streamlit as keyword arguments.
_PAGE_SETTINGS = {
    "page_title": "Nepali Text-to-Speech Converter",
    "page_icon": "🎧",
    "layout": "centered",
}
st.set_page_config(**_PAGE_SETTINGS)

# Custom CSS injected into the page as raw HTML.
# NOTE(review): the second rule targets `.stTextInput`, while the only input
# widget created in this file is `st.text_area` -- confirm the selector
# matches the intended element.
_CUSTOM_CSS = """
<style>
.main {
padding: 2rem;
}
.stTextInput > div > div > input {
min-height: 100px;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
27
+
28
# Hugging Face Hub identifiers used by the loader below.
_TTS_MODEL_ID = "aryamanstha/speecht5_nepali_oslr43_oslr143"
_VOCODER_MODEL_ID = "microsoft/speecht5_hifigan"


@st.cache_resource
def _load_model_cached():
    """Load processor, TTS model and vocoder; cached by Streamlit.

    Exceptions are deliberately allowed to propagate. If they were caught
    here and a tuple of ``None`` returned, that failure tuple would be a
    normal return value and would itself be cached, so every later call
    would get the failure back without retrying.

    Returns:
        A ``(processor, model, vocoder, device)`` tuple, where ``device`` is
        the string ``"cuda"`` or ``"cpu"``.
    """
    processor = SpeechT5Processor.from_pretrained(_TTS_MODEL_ID)
    model = SpeechT5ForTextToSpeech.from_pretrained(_TTS_MODEL_ID)
    vocoder = SpeechT5HifiGan.from_pretrained(_VOCODER_MODEL_ID)

    # Move to GPU if available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)
    vocoder = vocoder.to(device)

    return processor, model, vocoder, device


def load_model():
    """Load and cache the model and processor.

    Thin, uncached wrapper around ``_load_model_cached`` so that only a
    successful load is cached and a failed load can be retried on the next
    call.

    Returns:
        ``(processor, model, vocoder, device)`` on success, or
        ``(None, None, None, None)`` after reporting the error with
        ``st.error`` on failure.
    """
    try:
        return _load_model_cached()
    except Exception as e:  # UI boundary: report the problem instead of crashing
        st.error(f"Error loading model: {str(e)}")
        return None, None, None, None


# Keep the cache-clearing hook reachable under the public name, as it was when
# ``load_model`` itself carried the ``st.cache_resource`` decorator.
load_model.clear = _load_model_cached.clear
45
+
46
def create_speaker_embedding():
    """Return the default speaker embedding: an all-zero tensor of shape (1, 512)."""
    # Allocate the batch dimension directly instead of unsqueezing a 1-D tensor.
    return torch.zeros((1, 512))
50
+
51
def generate_speech(text, processor, model, vocoder, speaker_embeddings, device,
                    sample_rate=16000):
    """Generate speech from text and return it as an in-memory WAV file.

    Args:
        text: The text to synthesize. Empty or whitespace-only text is
            rejected before the model is invoked.
        processor: Callable that tokenizes ``text`` into model inputs.
        model: Object exposing ``generate_speech(input_ids, speaker_embeddings,
            vocoder=...)``.
        vocoder: Vocoder passed through to ``model.generate_speech``.
        speaker_embeddings: Tensor of speaker embeddings; moved to ``device``.
        device: Device the inputs and embeddings are moved to.
        sample_rate: Sample rate written into the WAV header. Defaults to
            16000, the value that was previously hard-coded here.

    Returns:
        An ``io.BytesIO`` positioned at offset 0 containing the WAV data, or
        ``None`` if the text was blank or generation failed (the problem is
        reported through Streamlit in both cases).
    """
    # Guard against blank input so the model is never run on nothing.
    if not text or not text.strip():
        st.warning("Please enter some text to convert.")
        return None

    try:
        # Prepare input
        inputs = processor(text=text, return_tensors="pt").to(device)
        speaker_embeddings = speaker_embeddings.to(device)

        # Generate speech; gradients are not needed for inference.
        with torch.no_grad():
            speech = model.generate_speech(
                inputs["input_ids"], speaker_embeddings, vocoder=vocoder
            )

        # Convert to numpy
        speech = speech.cpu().numpy()

        # Save to BytesIO and rewind so callers can read from the start.
        audio_buffer = io.BytesIO()
        sf.write(audio_buffer, speech, samplerate=sample_rate, format='WAV')
        audio_buffer.seek(0)

        return audio_buffer
    except Exception as e:  # UI boundary: surface the failure to the user
        st.error(f"Error generating speech: {str(e)}")
        return None
73
+
74
def get_audio_player_html(audio_bytes):
    """Create an HTML audio player with the audio data embedded as base64.

    Args:
        audio_bytes: Either raw audio data (``bytes``, ``bytearray`` or
            ``memoryview``) or a binary file-like object. A file-like object
            is consumed with ``read()`` from its current position, so the
            caller must rewind it before reusing it.

    Returns:
        An HTML snippet containing an autoplaying ``<audio>`` element whose
        source is a ``data:audio/wav;base64,...`` URI.
    """
    # The parameter name suggests raw bytes, so accept those as well as the
    # file-like objects this function originally required.
    if isinstance(audio_bytes, (bytes, bytearray, memoryview)):
        raw = bytes(audio_bytes)
    else:
        raw = audio_bytes.read()

    audio_base64 = base64.b64encode(raw).decode()
    return (
        "\n"
        "<audio controls autoplay>\n"
        f'<source src="data:audio/wav;base64,{audio_base64}" type="audio/wav">\n'
        "Your browser does not support the audio element.\n"
        "</audio>\n"
    )
83
+
84
# Static Markdown shown by ``main``; kept at module level so the function body
# stays focused on control flow.
_INTRO_MARKDOWN = """
Convert Nepali text to speech using SpeechT5 model. Simply enter your text below and click 'Generate Speech'.
"""

_USAGE_MARKDOWN = """
1. Enter your Nepali text in the text area above
2. Click the 'Generate Speech' button
3. Wait for the audio to be generated
4. Use the audio player to listen to the generated speech
5. Download the audio file if desired

**Note**: For best results, enter clear and grammatically correct Nepali text.
"""


def main():
    """Render the text-to-speech page.

    Loads the model once per session (keeping the loaded objects in
    ``st.session_state``), collects text from the user, and on request
    generates speech, shows an audio player and offers the WAV for download.
    Returns early, after showing an error, if the model cannot be loaded.
    """
    st.title("🎤 Nepali Text-to-Speech Converter")

    # Add introduction
    st.markdown(_INTRO_MARKDOWN)

    # Initialize session state for tracking model loading
    if 'model_loaded' not in st.session_state:
        st.session_state.model_loaded = False

    # Load model
    if not st.session_state.model_loaded:
        with st.spinner("Loading model... This may take a few minutes..."):
            processor, model, vocoder, device = load_model()

        # Identity checks: `None in (...)` would compare with `==`, which
        # needlessly invokes equality on the model objects.
        if all(part is not None for part in (processor, model, vocoder)):
            st.session_state.model_loaded = True
            st.session_state.processor = processor
            st.session_state.model = model
            st.session_state.vocoder = vocoder
            st.session_state.device = device
            st.success("Model loaded successfully! 🚀")
        else:
            st.error("Failed to load model. Please refresh the page to try again.")
            return

    # Create text input area
    text_input = st.text_area(
        "Enter Nepali Text:",
        height=100,
        placeholder="तपाईंको नेपाली पाठ यहाँ लेख्नुहोस्..."
    )

    # Create speaker embedding
    speaker_embeddings = create_speaker_embedding()

    # Add generate button (the second, wider column is intentionally left empty)
    col1, _ = st.columns([1, 2])
    with col1:
        generate_button = st.button("🔊 Generate Speech")

    # Generate speech when button is clicked
    if generate_button:
        if not text_input or not text_input.strip():
            # Give feedback instead of silently ignoring the click.
            st.warning("Please enter some Nepali text first.")
        else:
            with st.spinner("Generating speech..."):
                audio_buffer = generate_speech(
                    text_input,
                    st.session_state.processor,
                    st.session_state.model,
                    st.session_state.vocoder,
                    speaker_embeddings,
                    st.session_state.device
                )

            if audio_buffer is not None:
                # Display audio player
                st.markdown("### Generated Speech:")
                st.markdown(get_audio_player_html(audio_buffer), unsafe_allow_html=True)

                # Add download button; rewind first because building the
                # player HTML consumed the buffer.
                audio_buffer.seek(0)
                st.download_button(
                    label="📥 Download Audio",
                    data=audio_buffer,
                    file_name="generated_speech.wav",
                    mime="audio/wav"
                )

    # Add usage instructions
    with st.expander("ℹ️ Usage Instructions"):
        st.markdown(_USAGE_MARKDOWN)

    # Add footer
    st.markdown("---")
    st.markdown(
        "Made with ❤️ using Streamlit and SpeechT5 | "
        "Model: [aryamanstha/speecht5_nepali_oslr43_oslr143](https://huggingface.co/aryamanstha/speecht5_nepali_oslr43_oslr143)"
    )


if __name__ == "__main__":
    main()