ajajali09 commited on
Commit
fc2d6c1
·
1 Parent(s): b2531e4

new streamlit app

Browse files
Files changed (4) hide show
  1. app.py +568 -332
  2. parameters.py +2 -1
  3. requirements.txt +4 -5
  4. utils.py +2 -2
app.py CHANGED
@@ -6,7 +6,6 @@ import utils
6
  import classes
7
  import json
8
  import random
9
- from st_audiorec import st_audiorec
10
  from S3_bucket import AWS
11
 
12
  aws = AWS()
@@ -33,366 +32,603 @@ if "pronunc_dict" not in st.session_state:
33
  if "voice_cache" not in st.session_state:
34
  st.session_state.voice_cache = {}
35
 
36
- # Streamlit UI
37
- st.set_page_config(page_title="Ori TTS & Voice Cloning", layout="wide")
38
- st.title("🎙️ Ori TTS & Voice Cloning System")
39
- st.markdown("Choose a default speaker or upload reference audio (min 5 sec), select a language, and enter text to generate speech")
40
-
41
- with st.sidebar:
42
- st.header("Models......")
43
- model = st.radio("Select Model", ["V1", "V2"])
44
- if model == "V1":
45
- st.header("Languages.....")
46
- language = st.selectbox("Select Language", list(utils.V1_LANGUAGES.keys()))
47
- st.header("Voice Settings.....")
48
- voice_mode = st.radio("Voice Selection Mode", ["Default Speaker", "Upload Audio"])
49
- if voice_mode == "Default Speaker":
50
- default_speaker = st.selectbox("Select Default Speaker", list(utils.V1_SPEAKERS[utils.V1_LANGUAGES[language]]))
51
- reference_audio = None
52
- else:
53
- st.info("Give a reference audio (min 5 seconds)")
54
- # audio_file = st.file_uploader("Reference Audio", type=['wav', 'mp3', 'flac'])
55
- # reference_audio = audio_file
56
- # default_speaker = None
57
- audio_source = st.radio(
58
- "Reference audio source",
59
- ["Upload file", "Record audio"],
60
- horizontal=True,
61
- key="v1_audio_source",
62
- )
63
-
64
- default_speaker = None
65
 
66
- if audio_source == "Upload file":
67
- reference_audio = st.file_uploader(
68
- "Upload Reference Audio",
69
- type=["wav", "mp3", "flac"],
70
- key="v1_file_uploader",
71
- )
72
- else: # Record audio
73
- reference_audio = st.audio_input(
74
- "Record Reference Audio",
75
- key="v1_audio_input",
76
- )
77
  else:
78
- st.header("Languages.....")
79
- language = st.selectbox("Select Language", list(utils.V2_LANGUAGES.keys()))
80
- st.header("Voice Settings.....")
81
- voice_mode = st.radio("Voice Selection Mode", ["Default Speaker", "Upload Audio"])
82
- if voice_mode == "Default Speaker":
83
- default_speaker = st.selectbox("Select Default Speaker", list(utils.V2_SPEAKERS[utils.V2_LANGUAGES[language]]))
84
- reference_audio = None
85
- else:
86
- st.info("Give a reference audio (min 5 seconds)")
87
- # audio_file = st.file_uploader("Reference Audio", type=['wav', 'mp3', 'flac'])
88
- # reference_audio = audio_file
89
- # default_speaker = None
90
- audio_source = st.radio(
91
- "Reference audio source",
92
- ["Upload file", "Record audio"],
93
- horizontal=True,
94
- key="v2_audio_source",
95
- )
96
 
97
- default_speaker = None
 
 
 
 
 
 
98
 
99
- if audio_source == "Upload file":
100
- reference_audio = st.file_uploader(
101
- "Upload Reference Audio",
102
- type=["wav", "mp3", "flac"],
103
- key="v2_file_uploader",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  else:
106
- reference_audio = st.audio_input(
107
- "Record Reference Audio",
108
- key="v2_audio_input",
 
 
 
 
 
 
109
  )
110
 
111
- with st.expander("Advanced Settings"):
112
- speech_rate = st.slider("Speech Rate", 0.25, 2.0, 1.0, 0.25)
113
- speed = st.slider("Speed", 0.5, 2.0, 1.0, 0.1)
114
- expressive = st.slider("Expressive", 0.0, 1.0, 0.1, 0.05)
115
- stability = st.slider("Stability", 0, 10, 1, 1)
116
- clarity = st.slider("Clarity", 0.0, 1.0, 0.1, 0.1)
117
- volume_level = st.slider("Volume Level", 0.5, 3.0, 1.0, 0.1)
118
- stitch_request = st.checkbox("Stitch Request ()", value=False)
119
-
120
-
121
- # Main content
122
- col1, col2 = st.columns([2, 1])
123
-
124
- with col1:
125
- if 'input_text' not in st.session_state:
126
- st.session_state['input_text'] = ''
127
- if 'set_random_next_run' not in st.session_state:
128
- st.session_state.set_random_next_run = False
129
- if 'pending_random_text' not in st.session_state:
130
- st.session_state.pending_random_text = ''
131
-
132
- input_text = st.text_area(
133
- "Input Text",
134
- key='input_text',
135
- placeholder="Enter the text you want to synthesize...",
136
- height=130
137
- )
138
 
139
- btn_col1, btn_col2 = st.columns(2)
140
- with btn_col1:
141
- random_btn = st.button("🎲 Random Text", use_container_width=True)
142
- with btn_col2:
143
- generate_btn = st.button("🎵 Generate Speech", type="primary", use_container_width=True)
144
-
145
- with col2:
146
- st.markdown("### Add Pronunciation Pair")
147
-
148
- key_col1, value_col2 = st.columns(2)
149
- with key_col1:
150
- pr_key = st.text_input(
151
- "Pronunciation key 👇",
152
- label_visibility="visible",
153
- disabled=False,
154
- placeholder="Enter word",
155
- key="pr_key",
156
- )
157
- with value_col2:
158
- pr_value = st.text_input(
159
- "Pronunciation value 👇",
160
- label_visibility="visible",
161
- disabled=False,
162
- placeholder="Enter correct pronunciation",
163
- key="pr_value",
164
- )
165
- add_pair = st.button("Add Pronunciation Pair", type='primary', use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
- if add_pair:
168
- if pr_key.strip() and pr_value.strip():
169
- st.session_state.pronunc_dict[pr_key.strip()] = pr_value.strip()
170
- st.success(f"Added pronunciation pair: {pr_key.strip()} → {pr_value.strip()}")
171
- # do NOT assign st.session_state.pr_key / pr_value here
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  else:
173
- st.warning("Both key and value are required to add a pronunciation pair.")
174
-
175
- st.markdown("""
176
- If the model mispronounces some word incorrectly, you can correct it by adding the term as the Pronunciation Key and its phonetic spelling as the Pronunciation Value. For example, if AI/Cholestrol isn't pronounced correctly, respell it as ए आई/colestrol: enter AI/Cholestrol in the Pronunciation Key field and ए आई/colestrol in the Pronunciation Value field, then click **Add Pronunciation Pair**.
177
- """)
178
-
179
- if random_btn:
180
- if language in utils.language_sentences.keys():
181
- random_text = random.choice(utils.language_sentences[language])
182
- st.session_state.pending_random_text = random_text
183
- st.session_state.set_random_next_run = True
184
- st.rerun()
185
- else:
186
- st.warning(f"No sample sentences available for {language}")
187
 
188
 
189
- if generate_btn:
190
 
191
- session_id = utils.generate_session_id()
192
- print(f"\n\nGenerate btn is pressed.....\nThis is the session ID : -{session_id}")
193
 
194
- # Validate pronunciation input
195
- # if pr_key.strip() and pr_value.strip():
196
- pronunciation_dict_str = st.session_state.pronunc_dict
197
- # else:
198
- # pronunciation_dict_str = {}
199
 
200
- input_text = st.session_state.input_text
201
- if not input_text.strip():
202
- st.error("Please enter text to synthesize")
203
- elif len(input_text) > 1000:
204
- st.error(f"Text length must be less than 1000 characters. Current length: {len(input_text)}")
205
- else:
206
- try:
207
- token = parameters.TTS_SECRET_KEY
208
-
209
- if model == "V1":
210
- language_code = utils.V1_LANGUAGES[language]
211
- else:
212
- language_code = utils.V2_LANGUAGES[language]
213
 
214
- user_id = parameters.user_id
215
- voice_path = None
216
- # Determine voice_id based on mode
217
- if voice_mode == "Default Speaker" and model == "V1":
218
- if language_code in list(utils.V1_SPEAKERS.keys()):
219
- voice_id = default_speaker
220
- status_msg = f"Using default speaker: {default_speaker} for {language}"
221
- else:
222
- st.error(f"Language {language} not available for {default_speaker}")
223
- st.stop()
224
- elif voice_mode == "Default Speaker" and model == "V2":
225
- if language_code in list(utils.V2_SPEAKERS.keys()):
226
- voice_id = default_speaker
227
- status_msg = f"Using default speaker: {default_speaker} for {language}"
228
- else:
229
- st.error(f"Language {language} not available for {default_speaker}")
230
- st.stop()
231
-
232
- else:
233
- if not reference_audio:
234
- st.warning("Please upload a reference audio file")
235
- st.stop()
236
- audio_hash = utils.get_audio_hash(reference_audio)
237
- cache_key = f"{audio_hash}_{language_code}_{model}"
238
-
239
- if cache_key in st.session_state.voice_cache:
240
- voice_id = st.session_state.voice_cache[cache_key]
241
- voice_path = cache_key
242
- status_msg = f"✓ Using cached voice ID for language: {language}"
243
  else:
244
- with st.spinner("Cloning voice..."):
245
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
246
- tmp_file.write(reference_audio.read())
247
- tmp_file.flush()
248
- if model == "V1":
249
- result = utils.v1_clone_voice(tmp_file.name, user_id, token, language_code )
250
- else:
251
- result = utils.v2_clone_voice(tmp_file.name, user_id, token)
252
- voice_id = result['voice_id']
253
- print(f"Voice Clone succesfully from mode {model} id is {voice_id}")
254
- reference_audio.seek(0)
255
- classes.upload_voice_clone_audio(reference_audio, voice_id)
256
- voice_path = cache_key
257
- st.session_state.voice_cache[cache_key] = voice_id
258
- status_msg = f"✓ Cloned voice successfully for language: {language}"
259
- # Generate speech
260
- with st.spinner("Generating speech..."):
261
- loop = asyncio.new_event_loop()
262
- asyncio.set_event_loop(loop)
263
 
264
- if model=="V1":
265
- sr, audio = loop.run_until_complete(
266
- utils.v1_generate_speech_async(
267
- session_id, voice_mode, voice_id, model, input_text, language_code, user_id,
268
- pronunciation_dict_str, speed, expressive, stability, clarity,
269
- volume_level, speech_rate, stitch_request
270
- )
271
- )
272
  else:
273
- sr, audio = loop.run_until_complete(
274
- utils.v2_generate_speech_async(
275
- session_id, voice_mode, voice_id, model, input_text, language_code, user_id,
276
- pronunciation_dict_str, speed, expressive, stability, clarity,
277
- volume_level, speech_rate, stitch_request
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  )
279
- )
280
- loop.close()
281
-
282
- # st.success(status_msg)
283
- # st.audio(audio, sample_rate=sr)
284
-
285
- # st.session_state.show_feedback = True
286
- # st.session_state.last_session_id = session_id
287
- # st.success(status_msg)
288
-
289
- # Store audio + session info in state, mark as available
290
- st.session_state.last_msg = status_msg
291
- st.session_state.last_audio = audio
292
- st.session_state.last_sr = sr
293
- st.session_state.last_session_id = session_id
294
- st.session_state.has_audio = True
295
- st.session_state.show_feedback = True
296
-
297
- except Exception as e:
298
- st.error(f"Error: {str(e)}")
299
- st.session_state.show_feedback = False
300
- st.markdown("---")
301
- st.markdown("### 🎧 Output & Feedback")
302
-
303
- # Column layout for audio + feedback
304
- a_col, f_col = st.columns([1, 1])
305
-
306
- with a_col:
307
- if st.session_state.has_audio and st.session_state.last_audio is not None:
308
- st.success(st.session_state.last_msg)
309
- st.audio(st.session_state.last_audio, sample_rate=st.session_state.last_sr)
310
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  st.markdown(
312
- "<div style='opacity:0.4; border:1px dashed #888; padding:0.75rem; text-align:center;'>"
313
- "Audio preview will appear here after you generate speech."
314
- "</div>",
315
  unsafe_allow_html=True,
316
  )
317
 
318
- with f_col:
319
- # Disabled / enabled based on has_audio flag
320
- disabled = not (st.session_state.show_feedback and st.session_state.has_audio)
321
-
322
- st.markdown(
323
- "<div style='opacity:{};'>".format("1.0" if not disabled else "0.4"),
324
- unsafe_allow_html=True,
325
- )
326
-
327
- rating_index = st.radio(
328
- "Rate this audio:",
329
- options=[0, 1, 2, 3, 4],
330
- format_func=lambda i: "⭐" * (i + 1),
331
- horizontal=True,
332
- index=None,
333
- key="rating_index",
334
- disabled=disabled,
335
- )
336
-
337
- feedback_msg = st.text_area(
338
- "✍️ Feedback (optional)",
339
- placeholder="Enter your feedback here...",
340
- height=80,
341
- key="feedback_msg",
342
- disabled=disabled,
343
- )
344
 
345
- submit_clicked = st.button(
346
- "📤 Submit Feedback",
347
- type="primary",
348
- disabled=disabled,
349
- key="submit_feedback_btn",
350
- use_container_width=True
351
- )
352
 
353
- st.markdown("</div>", unsafe_allow_html=True)
 
 
 
 
 
 
354
 
355
- if submit_clicked:
356
- if rating_index is None:
357
- st.warning("Please select a rating before submitting.")
358
- else:
359
- utils.update_rating(
360
- session_id=st.session_state.last_session_id,
361
- rating_index=rating_index,
362
- feedback_msg=feedback_msg or "",
363
- )
364
- # Optionally keep or reset feedback area
365
- st.session_state.show_feedback = False
366
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
- st.markdown("---")
369
- st.markdown("### How to Use This App")
370
 
371
- st.markdown("""
372
- **Step 1: Select Model**
373
- - Choose between **V1** or **V2** model from the sidebar
374
-
375
- **Step 2: Choose Language**
376
- - Select your desired language from the dropdown
377
-
378
- **Step 3: Select Voice Mode**
379
- - **Default Speaker**: Choose from pre-trained voices
380
- - **Upload Audio**: Upload your own reference audio (min 5 seconds) for voice cloning
381
-
382
- **Step 4: Enter Text**
383
- - Type or paste the text you want to convert to speech
384
-
385
- **Step 5: Adjust Settings (Optional)**
386
- - Expand "Advanced Settings" in sidebar to fine-tune:
387
- - Speech rate
388
- - Speed
389
- - Expressive
390
- - Other voice parameters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
 
392
- **Step 6: Generate**
393
- - Click the **"🎵 Generate Speech"** button
394
- - Wait for the audio to be generated
395
- - Play the audio directly in the browser
396
- """)
397
- st.markdown("---")
398
- st.caption("Ori TTS & Voice Cloning System | Powered by Oriserve")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import classes
7
  import json
8
  import random
 
9
  from S3_bucket import AWS
10
 
11
  aws = AWS()
 
32
  if "voice_cache" not in st.session_state:
33
  st.session_state.voice_cache = {}
34
 
35
+ if "page" not in st.session_state:
36
+ st.session_state.page = "Home"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ # Top nav (always visible)
39
+ col_h, col_u, col_a, _ = st.columns([0.2, 0.2, 0.2, 0.4])
40
+ with col_h:
41
+ if st.session_state.page == "Home":
42
+ if st.button("🏠 Home", key="nav_home", type='primary', use_container_width=True):
43
+ st.session_state.page = "Home"
 
 
 
 
 
44
  else:
45
+ if st.button("Home", key="nav_home", use_container_width=True):
46
+ st.session_state.page = "Home"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ with col_u:
49
+ if st.session_state.page == "Use":
50
+ if st.button("Use", key="nav_use", type='primary', use_container_width=True):
51
+ st.session_state.page = "Use"
52
+ else:
53
+ if st.button("How to use app", key="nav_use", use_container_width=True):
54
+ st.session_state.page = "Use"
55
 
56
+ with col_a:
57
+ if st.session_state.page == "About":
58
+ if st.button("ℹ️ About", key="nav_about", type='primary', use_container_width=True):
59
+ st.session_state.page = "About"
60
+ else:
61
+ if st.button("About", key="nav_about", use_container_width=True):
62
+ st.session_state.page = "About"
63
+
64
+ if st.session_state.page == "Home":
65
+ # Streamlit UI
66
+ st.set_page_config(page_title="Ori TTS & Voice Cloning", layout="wide")
67
+ st.title("🎙️ Ori TTS & Voice Cloning System")
68
+ st.markdown("Choose a default speaker or upload reference audio (min 5 sec), select a language, and enter text to generate speech")
69
+
70
+ with st.sidebar:
71
+ st.title("Home")
72
+ st.markdown("---")
73
+ st.header("Models......")
74
+ model = st.radio("Select Model", ["V1", "V2"])
75
+ if model == "V1":
76
+ st.header("Languages.....")
77
+ language = st.selectbox("Select Language", list(utils.V1_LANGUAGES.keys()))
78
+ st.header("Voice Settings.....")
79
+ voice_mode = st.radio("Voice Selection Mode", ["Default Speaker", "Upload Audio"])
80
+ if voice_mode == "Default Speaker":
81
+ default_speaker = st.selectbox("Select Default Speaker", list(utils.V1_SPEAKERS[utils.V1_LANGUAGES[language]]))
82
+ reference_audio = None
83
+ else:
84
+ st.info("Give a reference audio (min 5 seconds)")
85
+ # audio_file = st.file_uploader("Reference Audio", type=['wav', 'mp3', 'flac'])
86
+ # reference_audio = audio_file
87
+ # default_speaker = None
88
+ audio_source = st.radio(
89
+ "Reference audio source",
90
+ ["Upload file", "Record audio"],
91
+ horizontal=True,
92
+ key="v1_audio_source",
93
  )
94
+
95
+ default_speaker = None
96
+
97
+ if audio_source == "Upload file":
98
+ reference_audio = st.file_uploader(
99
+ "Upload Reference Audio",
100
+ type=["wav", "mp3", "flac"],
101
+ key="v1_file_uploader",
102
+ )
103
+ else: # Record audio
104
+ reference_audio = st.audio_input(
105
+ "Record Reference Audio",
106
+ key="v1_audio_input",
107
+ )
108
+ else:
109
+ st.header("Languages.....")
110
+ language = st.selectbox("Select Language", list(utils.V2_LANGUAGES.keys()))
111
+ st.header("Voice Settings.....")
112
+ voice_mode = st.radio("Voice Selection Mode", ["Default Speaker", "Upload Audio"])
113
+ if voice_mode == "Default Speaker":
114
+ default_speaker = st.selectbox("Select Default Speaker", list(utils.V2_SPEAKERS[utils.V2_LANGUAGES[language]]))
115
+ reference_audio = None
116
  else:
117
+ st.info("Give a reference audio (min 5 seconds)")
118
+ # audio_file = st.file_uploader("Reference Audio", type=['wav', 'mp3', 'flac'])
119
+ # reference_audio = audio_file
120
+ # default_speaker = None
121
+ audio_source = st.radio(
122
+ "Reference audio source",
123
+ ["Upload file", "Record audio"],
124
+ horizontal=True,
125
+ key="v2_audio_source",
126
  )
127
 
128
+ default_speaker = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
+ if audio_source == "Upload file":
131
+ reference_audio = st.file_uploader(
132
+ "Upload Reference Audio",
133
+ type=["wav", "mp3", "flac"],
134
+ key="v2_file_uploader",
135
+ )
136
+ else:
137
+ reference_audio = st.audio_input(
138
+ "Record Reference Audio",
139
+ key="v2_audio_input",
140
+ )
141
+
142
+ with st.expander("Advanced Settings"):
143
+ speech_rate = st.slider("Speech Rate", 0.25, 2.0, 1.0, 0.25)
144
+ speed = st.slider("Speed", 0.5, 2.0, 1.0, 0.1)
145
+ expressive = st.slider("Expressive", 0.0, 1.0, 0.1, 0.05)
146
+ stability = st.slider("Stability", 0, 10, 1, 1)
147
+ clarity = st.slider("Clarity", 0.0, 1.0, 0.1, 0.1)
148
+ volume_level = st.slider("Volume Level", 0.5, 3.0, 1.0, 0.1)
149
+ stitch_request = st.checkbox("Stitch Request ()", value=False)
150
+
151
+
152
+ # Main content
153
+ col1, col2 = st.columns([2, 1])
154
+
155
+ with col1:
156
+ if 'input_text' not in st.session_state:
157
+ st.session_state['input_text'] = ''
158
+ if 'set_random_next_run' not in st.session_state:
159
+ st.session_state.set_random_next_run = False
160
+ if 'pending_random_text' not in st.session_state:
161
+ st.session_state.pending_random_text = ''
162
+
163
+ input_text = st.text_area(
164
+ "Input Text",
165
+ key='input_text',
166
+ placeholder="Enter the text you want to synthesize...",
167
+ height=130
168
+ )
169
+
170
+ btn_col1, btn_col2 = st.columns(2)
171
+ with btn_col1:
172
+ random_btn = st.button("🎲 Random Text", use_container_width=True)
173
+ with btn_col2:
174
+ generate_btn = st.button("🎵 Generate Speech", type="primary", use_container_width=True)
175
+
176
+ with col2:
177
+ st.markdown("### Add Pronunciation Pair")
178
+
179
+ key_col1, value_col2 = st.columns(2)
180
+ with key_col1:
181
+ pr_key = st.text_input(
182
+ "Pronunciation key 👇",
183
+ label_visibility="visible",
184
+ disabled=False,
185
+ placeholder="Enter word",
186
+ key="pr_key",
187
+ )
188
+ with value_col2:
189
+ pr_value = st.text_input(
190
+ "Pronunciation value 👇",
191
+ label_visibility="visible",
192
+ disabled=False,
193
+ placeholder="Enter correct pronunciation",
194
+ key="pr_value",
195
+ )
196
+ add_pair = st.button("Add Pronunciation Pair", type='primary', use_container_width=True)
197
 
198
+ if add_pair:
199
+ if pr_key.strip() and pr_value.strip():
200
+ st.session_state.pronunc_dict[pr_key.strip()] = pr_value.strip()
201
+ st.success(f"Added pronunciation pair: {pr_key.strip()} → {pr_value.strip()}")
202
+ # do NOT assign st.session_state.pr_key / pr_value here
203
+ else:
204
+ st.warning("Both key and value are required to add a pronunciation pair.")
205
+
206
+ st.markdown("""
207
+ If the model mispronounces some word incorrectly, you can correct it by adding the term as the Pronunciation Key and its phonetic spelling as the Pronunciation Value. For example, if AI/Cholestrol isn't pronounced correctly, respell it as ए आई/colestrol: enter AI/Cholestrol in the Pronunciation Key field and ए आई/colestrol in the Pronunciation Value field, then click **Add Pronunciation Pair**.
208
+ """)
209
+
210
+ if random_btn:
211
+ if language in utils.language_sentences.keys():
212
+ random_text = random.choice(utils.language_sentences[language])
213
+ st.session_state.pending_random_text = random_text
214
+ st.session_state.set_random_next_run = True
215
+ st.rerun()
216
  else:
217
+ st.warning(f"No sample sentences available for {language}")
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
 
220
+ if generate_btn:
221
 
222
+ session_id = utils.generate_session_id()
223
+ print(f"\n\nGenerate btn is pressed.....\nThis is the session ID : -{session_id}")
224
 
225
+ # Validate pronunciation input
226
+ # if pr_key.strip() and pr_value.strip():
227
+ pronunciation_dict_str = st.session_state.pronunc_dict
228
+ # else:
229
+ # pronunciation_dict_str = {}
230
 
231
+ input_text = st.session_state.input_text
232
+ if not input_text.strip():
233
+ st.error("Please enter text to synthesize")
234
+ elif len(input_text) > 1000:
235
+ st.error(f"Text length must be less than 1000 characters. Current length: {len(input_text)}")
236
+ else:
237
+ try:
238
+ token = parameters.TTS_SECRET_KEY
 
 
 
 
 
239
 
240
+ if model == "V1":
241
+ language_code = utils.V1_LANGUAGES[language]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  else:
243
+ language_code = utils.V2_LANGUAGES[language]
244
+
245
+ user_id = parameters.user_id
246
+ voice_path = None
247
+ # Determine voice_id based on mode
248
+ if voice_mode == "Default Speaker" and model == "V1":
249
+ if language_code in list(utils.V1_SPEAKERS.keys()):
250
+ voice_id = default_speaker
251
+ status_msg = f"Using default speaker: {default_speaker} for {language}"
252
+ else:
253
+ st.error(f"Language {language} not available for {default_speaker}")
254
+ st.stop()
255
+ elif voice_mode == "Default Speaker" and model == "V2":
256
+ if language_code in list(utils.V2_SPEAKERS.keys()):
257
+ voice_id = default_speaker
258
+ status_msg = f"Using default speaker: {default_speaker} for {language}"
259
+ else:
260
+ st.error(f"Language {language} not available for {default_speaker}")
261
+ st.stop()
262
 
 
 
 
 
 
 
 
 
263
  else:
264
+ if not reference_audio:
265
+ st.warning("Please upload a reference audio file")
266
+ st.stop()
267
+ audio_hash = utils.get_audio_hash(reference_audio)
268
+ cache_key = f"{audio_hash}_{language_code}_{model}"
269
+
270
+ if cache_key in st.session_state.voice_cache:
271
+ voice_id = st.session_state.voice_cache[cache_key]
272
+ voice_path = cache_key
273
+ status_msg = f"✓ Using cached voice ID for language: {language}"
274
+ else:
275
+ with st.spinner("Cloning voice..."):
276
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
277
+ tmp_file.write(reference_audio.read())
278
+ tmp_file.flush()
279
+ if model == "V1":
280
+ result = utils.v1_clone_voice(tmp_file.name, user_id, token, language_code )
281
+ else:
282
+ result = utils.v2_clone_voice(tmp_file.name, user_id, token)
283
+ voice_id = result['voice_id']
284
+ print(f"Voice Clone succesfully from mode {model} id is {voice_id}")
285
+ reference_audio.seek(0)
286
+ classes.upload_voice_clone_audio(reference_audio, voice_id)
287
+ voice_path = cache_key
288
+ st.session_state.voice_cache[cache_key] = voice_id
289
+ status_msg = f"✓ Cloned voice successfully for language: {language}"
290
+ # Generate speech
291
+ with st.spinner("Generating speech..."):
292
+ loop = asyncio.new_event_loop()
293
+ asyncio.set_event_loop(loop)
294
+
295
+ if model=="V1":
296
+ sr, audio = loop.run_until_complete(
297
+ utils.v1_generate_speech_async(
298
+ session_id, voice_mode, voice_id, model, input_text, language_code, user_id,
299
+ pronunciation_dict_str, speed, expressive, stability, clarity,
300
+ volume_level, speech_rate, stitch_request
301
+ )
302
  )
303
+ else:
304
+ sr, audio = loop.run_until_complete(
305
+ utils.v2_generate_speech_async(
306
+ session_id, voice_mode, voice_id, model, input_text, language_code, user_id,
307
+ pronunciation_dict_str, speed, expressive, stability, clarity,
308
+ volume_level, speech_rate, stitch_request
309
+ )
310
+ )
311
+ loop.close()
312
+
313
+ # st.success(status_msg)
314
+ # st.audio(audio, sample_rate=sr)
315
+
316
+ # st.session_state.show_feedback = True
317
+ # st.session_state.last_session_id = session_id
318
+ # st.success(status_msg)
319
+
320
+ # Store audio + session info in state, mark as available
321
+ st.session_state.last_msg = status_msg
322
+ st.session_state.last_audio = audio
323
+ st.session_state.last_sr = sr
324
+ st.session_state.last_session_id = session_id
325
+ st.session_state.has_audio = True
326
+ st.session_state.show_feedback = True
327
+
328
+ except Exception as e:
329
+ st.error(f"Error: {str(e)}")
330
+ st.session_state.show_feedback = False
331
+ st.markdown("---")
332
+ st.markdown("### 🎧 Output & Feedback")
333
+
334
+ # Column layout for audio + feedback
335
+ a_col, f_col = st.columns([1, 1])
336
+
337
+ with a_col:
338
+ if st.session_state.has_audio and st.session_state.last_audio is not None:
339
+ st.success(st.session_state.last_msg)
340
+ st.audio(st.session_state.last_audio, sample_rate=st.session_state.last_sr)
341
+ else:
342
+ st.markdown(
343
+ "<div style='opacity:0.4; border:1px dashed #888; padding:0.75rem; text-align:center;'>"
344
+ "Audio preview will appear here after you generate speech."
345
+ "</div>",
346
+ unsafe_allow_html=True,
347
+ )
348
+
349
+ with f_col:
350
+ # Disabled / enabled based on has_audio flag
351
+ disabled = not (st.session_state.show_feedback and st.session_state.has_audio)
352
+
353
  st.markdown(
354
+ "<div style='opacity:{};'>".format("1.0" if not disabled else "0.4"),
 
 
355
  unsafe_allow_html=True,
356
  )
357
 
358
+ rating_index = st.radio(
359
+ "Rate this audio:",
360
+ options=[0, 1, 2, 3, 4],
361
+ format_func=lambda i: "⭐" * (i + 1),
362
+ horizontal=True,
363
+ index=None,
364
+ key="rating_index",
365
+ disabled=disabled,
366
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
+ feedback_msg = st.text_area(
369
+ "✍️ Feedback (optional)",
370
+ placeholder="Enter your feedback here...",
371
+ height=80,
372
+ key="feedback_msg",
373
+ disabled=disabled,
374
+ )
375
 
376
+ submit_clicked = st.button(
377
+ "📤 Submit Feedback",
378
+ type="primary",
379
+ disabled=disabled,
380
+ key="submit_feedback_btn",
381
+ use_container_width=True
382
+ )
383
 
384
+ st.markdown("</div>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
385
 
386
+ if submit_clicked:
387
+ if rating_index is None:
388
+ st.warning("Please select a rating before submitting.")
389
+ else:
390
+ utils.update_rating(
391
+ session_id=st.session_state.last_session_id,
392
+ rating_index=rating_index,
393
+ feedback_msg=feedback_msg or "",
394
+ )
395
+ # Optionally keep or reset feedback area
396
+ st.session_state.show_feedback = False
397
 
398
+ st.markdown("---")
399
+ st.caption("Ori TTS & Voice Cloning System | Powered by Oriserve")
400
 
401
+ elif st.session_state.page == "Use":
402
+ with st.sidebar:
403
+ st.title("Use this app......")
404
+ st.markdown("---")
405
+ # About Page
406
+ st.markdown("### How to Use This App")
407
+
408
+ st.markdown("""
409
+ **Step 1: Select Model**
410
+ - Select between **V1** or **V2** model from the sidebar
411
+
412
+ **Step 2: 🌐 Select Language**
413
+ - Select your desired language from the dropdown
414
+
415
+ **Step 3: 🎤 Select Voice Mode**
416
+ - **Default Speaker**: Choose from pre-trained voices
417
+ - **Upload Audio**: Upload or Record your own reference audio (min 5 seconds) for voice cloning
418
+
419
+ **Step 4: ✍️ Enter Text**
420
+ - Type or paste the text you want to convert to speech
421
+ - Or you can select any random text by clicking on 🎲 Random Text button
422
+
423
+ **Step 5: ⚙️ Customize Voice Parameters (Optional)**
424
+ - Expand "Advanced Settings" in sidebar to fine-tune:
425
+ - Speech rate
426
+ - Speed
427
+ - Expressive
428
+ - Other voice parameters
429
+
430
+ **Step 6: 🎵 Generate Audio**
431
+
432
+ - Click the **"🎵 Generate Speech"** button
433
+ - Wait for the audio to be generated
434
+ - Play the audio directly in the browser
435
+
436
+ **Step7: Add Pronunciation Pair**
437
+
438
+ - <div>If the model mispronounces some word incorrectly,<br>
439
+ you can correct it by adding the term as the Pronunciation Key and <br>
440
+ its phonetical spelling as the Pronunciation Value. <br>
441
+ For example, if <i><b style="color:red">AI/Cholestrol</b></i> isn't pronounced correctly, respell it as <i><b style = "color:green">ए आई/colestrol</b></i>: <br>
442
+ enter <i><b style="color:red">AI/Cholestrol</b></i> in the Pronunciation Key field and <i><b style = "color:green">ए आई/colestrol</b></i> in the Pronunciation Value field, then click Add Pronunciation Pair.</div>
443
 
444
+ ** Provide Feedback**
445
+ - Rate the generated audio quality
446
+ - Give us your feedback
447
+ - Your feedback helps improve our system
448
+ """,
449
+ unsafe_allow_html=True)
450
+ st.markdown("---")
451
+ st.caption("Ori TTS & Voice Cloning System | Powered by Oriserve")
452
+
453
+ else:
454
+ with st.sidebar:
455
+ st.title("About Us......")
456
+ st.markdown("---")
457
+ # About Page
458
+ st.markdown(
459
+ """
460
+ <style>
461
+ .features-container {
462
+ display: grid;
463
+ grid-template-columns: repeat(2, 1fr);
464
+ gap: 20px;
465
+ }
466
+ .feature-block {
467
+ padding: 15px;
468
+ border-radius: 8px;
469
+ transition: background-color 0.3s ease;
470
+ min-height: 200px;
471
+ display: flex;
472
+ flex-direction: column;
473
+ justify-content: flex-start;
474
+ border: 1px solid #e0e0e0;
475
+ background-color: #111827;
476
+ color: #e5e7eb;
477
+ }
478
+ .feature-block:hover {
479
+ background-color: #EA580C;
480
+ cursor: pointer;
481
+ }
482
+ .feature-title {
483
+ font-size: 1.4em;
484
+ font-weight: bold;
485
+ margin-bottom: 10px;
486
+ }
487
+ .feature-list {
488
+ font-size: 1.05em;
489
+ margin-left: 20px;
490
+ list-style-type: none;
491
+ padding-left: 0;
492
+ }
493
+ .feature-list li {
494
+ margin: 8px 0;
495
+ }
496
+ .section-header {
497
+ font-size: 1.8em;
498
+ font-weight: bold;
499
+ margin: 25px 0 15px 0;
500
+ color: #38bdf8;
501
+ }
502
+ .intro-text {
503
+ font-size: 1.1em;
504
+ line-height: 1.4;
505
+ margin-bottom: 20px;
506
+ }
507
+ .footer {
508
+ margin-top: 20px;
509
+ padding: 15px;
510
+ border-radius: 8px;
511
+ transition: background-color 0.3s ease;
512
+ min-height: 150px;
513
+ display: flex;
514
+ flex-direction: column;
515
+ justify-content: flex-start;
516
+ border: 1px solid #e0e0e0;
517
+ background-color: #020617;
518
+ color: #e5e7eb;
519
+ }
520
+ .footer:hover{
521
+ background-color: #3f3f46;
522
+ }
523
+ .footer .feature-list a.hf-link {
524
+ color: #FFFFFF;
525
+ text-decoration: none;
526
+ transition: all 0.3s ease;
527
+ display: inline-block;
528
+ }
529
+ .footer .feature-list a.hf-link:hover {
530
+ color: #EA580C;
531
+ font-weight: 600;
532
+ transform: translateX(10px);
533
+ }
534
+ .footer .feature-list span {
535
+ color: #FFFFFF;
536
+ text-decoration: none;
537
+ transition: all 0.3s ease;
538
+ display: inline-block;
539
+ }
540
+ .footer .feature-list span:hover {
541
+ color: #EA580C;
542
+ font-weight: 600;
543
+ text-decoration: underline;
544
+ }
545
+ @media (max-width: 768px) {
546
+ .features-container {
547
+ grid-template-columns: 1fr;
548
+ }
549
+ }
550
+ </style>
551
+ <div style="text-align: center; font-size: 2.2em; font-weight: bold; margin-bottom: 20px;">
552
+ 🚀 Welcome to ORI Text-to-Speech
553
+ </div>
554
+ <div class="section-header">🌟 About Our Technology</div>
555
+ <div class="intro-text">
556
+ Greetings from Oriserve! We're excited to showcase our refined Text-to-Speech capabilities—powered by generative voice synthesis to deliver
557
+ <strong>natural-sounding</strong> and <strong>professionally tuned</strong> speech output.
558
+ </div>
559
+ <div class="section-header">✨ Key Features</div>
560
+ <div class="features-container">
561
+ <div class="feature-block">
562
+ <div class="feature-title">🎯 Core Capabilities</div>
563
+ <ul class="feature-list">
564
+ <li><strong>Robust voice models suited for production use</strong></li>
565
+ <li><strong>Optimized for English and Hindi, with multilingual expansion underway</strong></li>
566
+ <li><strong>Diverse voice styles for varied use cases</strong></li>
567
+ <li><strong>Responsive audio generation with practical latency</strong></li>
568
+ </ul>
569
+ </div>
570
+ <div class="feature-block">
571
+ <div class="feature-title">🛠️ Advanced Controls</div>
572
+ <ul class="feature-list">
573
+ <li><strong>Customizable voice parameters</strong></li>
574
+ <li><strong>Expressiveness adjustment options</strong></li>
575
+ <li><strong>Balance tuning for clarity and stability</strong></li>
576
+ </ul>
577
+ </div>
578
+ <div class="feature-block">
579
+ <div class="feature-title">💫 Special Features</div>
580
+ <ul class="feature-list">
581
+ <li><strong>Basic context understanding during synthesis</strong></li>
582
+ <li><strong>Text formatting optimized for speech</strong></li>
583
+ <li><strong>Improved handling of common pronunciation cases</strong></li>
584
+ </ul>
585
+ </div>
586
+ <div class="feature-block">
587
+ <div class="feature-title">⚡ Processing Capabilities</div>
588
+ <ul class="feature-list">
589
+ <li><strong>Near real-time synthesis performance</strong></li>
590
+ <li><strong>Optimized latency for interactive use</strong></li>
591
+ <li><strong>Audio streaming with first-byte latency as low as ~150 ms</strong></li>
592
+ </ul>
593
+ </div>
594
+ <div class="feature-block">
595
+ <div class="feature-title">🔊 Audio Quality</div>
596
+ <ul class="feature-list">
597
+ <li><strong>Clear and natural-sounding speech</strong></li>
598
+ <li><strong>Audio fidelity aligned with general production standards</strong></li>
599
+ <li><strong>Consistent synthesis across sessions</strong></li>
600
+ </ul>
601
+ </div>
602
+ <div class="feature-block">
603
+ <div class="feature-title">📈 Future Development</div>
604
+ <ul class="feature-list">
605
+ <li><strong>Continuous quality and performance updates</strong></li>
606
+ <li><strong>More expressive and natural voice styles in progress</strong></li>
607
+ <li><strong>Expanded language and dialect support coming soon</strong></li>
608
+ </ul>
609
+ </div>
610
+ <div class="feature-block">
611
+ <div class="feature-title">🚨 Disclaimer</div>
612
+ <ul class="feature-list">
613
+ <li><strong>The voices and utterances produced by this application are generated by an AI model.</strong></li>
614
+ <li><strong>By using the Voice Clone feature, you confirm you have the necessary rights to any uploaded audio.</strong></li>
615
+ <li><strong>We make no warranty—express or implied—on the accuracy, appropriateness, or quality of the generated speech.</strong></li>
616
+ </ul>
617
+ </div>
618
+ <div class="feature-block">
619
+ <div class="feature-title">How to Reach Us</div>
620
+ <ul class="feature-list">
621
+ <li><strong>Email : <span>ai-team@oriserve.com</span></strong></li>
622
+ <li><strong>Hugging Face : <a href="https://huggingface.co/Oriserve" class="hf-link">Oriserve Hugging Face</a></strong></li>
623
+ <li><strong>GitHub : <a href="https://github.com/OriserveAI" class="hf-link">OriserveAI GitHub</a></strong></li>
624
+ <li><strong>Website : <a href="https://oriserve.com/" class="hf-link">Oriserve website</a></strong></li>
625
+ </ul>
626
+ </div>
627
+ </div>
628
+ """,
629
+ unsafe_allow_html=True,
630
+ )
631
+ st.markdown("---")
632
+ st.caption("Ori TTS & Voice Cloning System | Powered by Oriserve")
633
+ pass
634
+
parameters.py CHANGED
@@ -20,4 +20,5 @@ s3_bucket_name = os.getenv("AWS_BUCKET_NAME")
20
  GLOBAL_PRONUNCIATION_DICT=os.getenv("GLOBAL_PRONUNCIATION_DICT")
21
  GLOBAL_PRONUNCIATION_DICT_PATH=f"s3://{s3_bucket_name}/{GLOBAL_PRONUNCIATION_DICT}"
22
  voice_clone_data_key = os.getenv("voice_clone_data_key")
23
- model="ori-tts-v1"
 
 
20
  GLOBAL_PRONUNCIATION_DICT=os.getenv("GLOBAL_PRONUNCIATION_DICT")
21
  GLOBAL_PRONUNCIATION_DICT_PATH=f"s3://{s3_bucket_name}/{GLOBAL_PRONUNCIATION_DICT}"
22
  voice_clone_data_key = os.getenv("voice_clone_data_key")
23
+ model_v1=os.getenv("MODEL_NAME_V1")
24
+ model_v2=os.getenv("MODEL_NAME_V2")
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
- gradio>=4.0.0
2
- gradio_toggle>=0.0.3
 
3
  pandas>=1.3.0
4
  numpy>=1.20.0
5
  librosa>=0.9.0
@@ -10,6 +11,4 @@ s3fs>=2022.1.0
10
  boto3>=1.20.0
11
  pytz>=2024.1
12
  pydantic==2.10.6
13
- openai
14
-
15
-
 
1
+ altair
2
+ pandas
3
+ streamlit
4
  pandas>=1.3.0
5
  numpy>=1.20.0
6
  librosa>=0.9.0
 
11
  boto3>=1.20.0
12
  pytz>=2024.1
13
  pydantic==2.10.6
14
+ openai
 
 
utils.py CHANGED
@@ -428,7 +428,7 @@ async def v1_generate_speech_async(
428
  # Use AsyncOpenAI streaming response (matches your original code)
429
  try:
430
  async with v1_client.audio.speech.with_streaming_response.create(
431
- model=parameters.model,
432
  voice=send_voice_id,
433
  input=[text],
434
  extra_body=extra_body,
@@ -519,7 +519,7 @@ async def v2_generate_speech_async(
519
  # Use AsyncOpenAI streaming response (matches your original code)
520
  try:
521
  async with v2_client.audio.speech.with_streaming_response.create(
522
- model="ori-tts-v2",
523
  voice=send_voice_id,
524
  input=[text],
525
  extra_body=extra_body
 
428
  # Use AsyncOpenAI streaming response (matches your original code)
429
  try:
430
  async with v1_client.audio.speech.with_streaming_response.create(
431
+ model=parameters.model_v1,
432
  voice=send_voice_id,
433
  input=[text],
434
  extra_body=extra_body,
 
519
  # Use AsyncOpenAI streaming response (matches your original code)
520
  try:
521
  async with v2_client.audio.speech.with_streaming_response.create(
522
+ model=parameters.model_v2,
523
  voice=send_voice_id,
524
  input=[text],
525
  extra_body=extra_body