ajajali09 commited on
Commit
fc2d6c1
·
1 Parent(s): b2531e4

new streamlit app

Browse files
Files changed (4) hide show
  1. app.py +568 -332
  2. parameters.py +2 -1
  3. requirements.txt +4 -5
  4. utils.py +2 -2
app.py CHANGED
@@ -6,7 +6,6 @@ import utils
6
  import classes
7
  import json
8
  import random
9
- from st_audiorec import st_audiorec
10
  from S3_bucket import AWS
11
 
12
  aws = AWS()
@@ -33,366 +32,603 @@ if "pronunc_dict" not in st.session_state:
33
  if "voice_cache" not in st.session_state:
34
  st.session_state.voice_cache = {}
35
 
36
- # Streamlit UI
37
- st.set_page_config(page_title="Ori TTS & Voice Cloning", layout="wide")
38
- st.title("🎙️ Ori TTS & Voice Cloning System")
39
- st.markdown("Choose a default speaker or upload reference audio (min 5 sec), select a language, and enter text to generate speech")
40
-
41
- with st.sidebar:
42
- st.header("Models......")
43
- model = st.radio("Select Model", ["V1", "V2"])
44
- if model == "V1":
45
- st.header("Languages.....")
46
- language = st.selectbox("Select Language", list(utils.V1_LANGUAGES.keys()))
47
- st.header("Voice Settings.....")
48
- voice_mode = st.radio("Voice Selection Mode", ["Default Speaker", "Upload Audio"])
49
- if voice_mode == "Default Speaker":
50
- default_speaker = st.selectbox("Select Default Speaker", list(utils.V1_SPEAKERS[utils.V1_LANGUAGES[language]]))
51
- reference_audio = None
52
- else:
53
- st.info("Give a reference audio (min 5 seconds)")
54
- # audio_file = st.file_uploader("Reference Audio", type=['wav', 'mp3', 'flac'])
55
- # reference_audio = audio_file
56
- # default_speaker = None
57
- audio_source = st.radio(
58
- "Reference audio source",
59
- ["Upload file", "Record audio"],
60
- horizontal=True,
61
- key="v1_audio_source",
62
- )
63
-
64
- default_speaker = None
65
 
66
- if audio_source == "Upload file":
67
- reference_audio = st.file_uploader(
68
- "Upload Reference Audio",
69
- type=["wav", "mp3", "flac"],
70
- key="v1_file_uploader",
71
- )
72
- else: # Record audio
73
- reference_audio = st.audio_input(
74
- "Record Reference Audio",
75
- key="v1_audio_input",
76
- )
77
  else:
78
- st.header("Languages.....")
79
- language = st.selectbox("Select Language", list(utils.V2_LANGUAGES.keys()))
80
- st.header("Voice Settings.....")
81
- voice_mode = st.radio("Voice Selection Mode", ["Default Speaker", "Upload Audio"])
82
- if voice_mode == "Default Speaker":
83
- default_speaker = st.selectbox("Select Default Speaker", list(utils.V2_SPEAKERS[utils.V2_LANGUAGES[language]]))
84
- reference_audio = None
85
- else:
86
- st.info("Give a reference audio (min 5 seconds)")
87
- # audio_file = st.file_uploader("Reference Audio", type=['wav', 'mp3', 'flac'])
88
- # reference_audio = audio_file
89
- # default_speaker = None
90
- audio_source = st.radio(
91
- "Reference audio source",
92
- ["Upload file", "Record audio"],
93
- horizontal=True,
94
- key="v2_audio_source",
95
- )
96
 
97
- default_speaker = None
 
 
 
 
 
 
98
 
99
- if audio_source == "Upload file":
100
- reference_audio = st.file_uploader(
101
- "Upload Reference Audio",
102
- type=["wav", "mp3", "flac"],
103
- key="v2_file_uploader",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
105
  else:
106
- reference_audio = st.audio_input(
107
- "Record Reference Audio",
108
- key="v2_audio_input",
 
 
 
 
 
 
109
  )
110
 
111
- with st.expander("Advanced Settings"):
112
- speech_rate = st.slider("Speech Rate", 0.25, 2.0, 1.0, 0.25)
113
- speed = st.slider("Speed", 0.5, 2.0, 1.0, 0.1)
114
- expressive = st.slider("Expressive", 0.0, 1.0, 0.1, 0.05)
115
- stability = st.slider("Stability", 0, 10, 1, 1)
116
- clarity = st.slider("Clarity", 0.0, 1.0, 0.1, 0.1)
117
- volume_level = st.slider("Volume Level", 0.5, 3.0, 1.0, 0.1)
118
- stitch_request = st.checkbox("Stitch Request ()", value=False)
119
-
120
-
121
- # Main content
122
- col1, col2 = st.columns([2, 1])
123
-
124
- with col1:
125
- if 'input_text' not in st.session_state:
126
- st.session_state['input_text'] = ''
127
- if 'set_random_next_run' not in st.session_state:
128
- st.session_state.set_random_next_run = False
129
- if 'pending_random_text' not in st.session_state:
130
- st.session_state.pending_random_text = ''
131
-
132
- input_text = st.text_area(
133
- "Input Text",
134
- key='input_text',
135
- placeholder="Enter the text you want to synthesize...",
136
- height=130
137
- )
138
 
139
- btn_col1, btn_col2 = st.columns(2)
140
- with btn_col1:
141
- random_btn = st.button("🎲 Random Text", use_container_width=True)
142
- with btn_col2:
143
- generate_btn = st.button("🎵 Generate Speech", type="primary", use_container_width=True)
144
-
145
- with col2:
146
- st.markdown("### Add Pronunciation Pair")
147
-
148
- key_col1, value_col2 = st.columns(2)
149
- with key_col1:
150
- pr_key = st.text_input(
151
- "Pronunciation key 👇",
152
- label_visibility="visible",
153
- disabled=False,
154
- placeholder="Enter word",
155
- key="pr_key",
156
- )
157
- with value_col2:
158
- pr_value = st.text_input(
159
- "Pronunciation value 👇",
160
- label_visibility="visible",
161
- disabled=False,
162
- placeholder="Enter correct pronunciation",
163
- key="pr_value",
164
- )
165
- add_pair = st.button("Add Pronunciation Pair", type='primary', use_container_width=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
- if add_pair:
168
- if pr_key.strip() and pr_value.strip():
169
- st.session_state.pronunc_dict[pr_key.strip()] = pr_value.strip()
170
- st.success(f"Added pronunciation pair: {pr_key.strip()} → {pr_value.strip()}")
171
- # do NOT assign st.session_state.pr_key / pr_value here
 
 
 
 
 
 
 
 
 
 
 
 
 
172
  else:
173
- st.warning("Both key and value are required to add a pronunciation pair.")
174
-
175
- st.markdown("""
176
- If the model mispronounces some word incorrectly, you can correct it by adding the term as the Pronunciation Key and its phonetic spelling as the Pronunciation Value. For example, if AI/Cholestrol isn't pronounced correctly, respell it as ए आई/colestrol: enter AI/Cholestrol in the Pronunciation Key field and ए आई/colestrol in the Pronunciation Value field, then click **Add Pronunciation Pair**.
177
- """)
178
-
179
- if random_btn:
180
- if language in utils.language_sentences.keys():
181
- random_text = random.choice(utils.language_sentences[language])
182
- st.session_state.pending_random_text = random_text
183
- st.session_state.set_random_next_run = True
184
- st.rerun()
185
- else:
186
- st.warning(f"No sample sentences available for {language}")
187
 
188
 
189
- if generate_btn:
190
 
191
- session_id = utils.generate_session_id()
192
- print(f"\n\nGenerate btn is pressed.....\nThis is the session ID : -{session_id}")
193
 
194
- # Validate pronunciation input
195
- # if pr_key.strip() and pr_value.strip():
196
- pronunciation_dict_str = st.session_state.pronunc_dict
197
- # else:
198
- # pronunciation_dict_str = {}
199
 
200
- input_text = st.session_state.input_text
201
- if not input_text.strip():
202
- st.error("Please enter text to synthesize")
203
- elif len(input_text) > 1000:
204
- st.error(f"Text length must be less than 1000 characters. Current length: {len(input_text)}")
205
- else:
206
- try:
207
- token = parameters.TTS_SECRET_KEY
208
-
209
- if model == "V1":
210
- language_code = utils.V1_LANGUAGES[language]
211
- else:
212
- language_code = utils.V2_LANGUAGES[language]
213
 
214
- user_id = parameters.user_id
215
- voice_path = None
216
- # Determine voice_id based on mode
217
- if voice_mode == "Default Speaker" and model == "V1":
218
- if language_code in list(utils.V1_SPEAKERS.keys()):
219
- voice_id = default_speaker
220
- status_msg = f"Using default speaker: {default_speaker} for {language}"
221
- else:
222
- st.error(f"Language {language} not available for {default_speaker}")
223
- st.stop()
224
- elif voice_mode == "Default Speaker" and model == "V2":
225
- if language_code in list(utils.V2_SPEAKERS.keys()):
226
- voice_id = default_speaker
227
- status_msg = f"Using default speaker: {default_speaker} for {language}"
228
- else:
229
- st.error(f"Language {language} not available for {default_speaker}")
230
- st.stop()
231
-
232
- else:
233
- if not reference_audio:
234
- st.warning("Please upload a reference audio file")
235
- st.stop()
236
- audio_hash = utils.get_audio_hash(reference_audio)
237
- cache_key = f"{audio_hash}_{language_code}_{model}"
238
-
239
- if cache_key in st.session_state.voice_cache:
240
- voice_id = st.session_state.voice_cache[cache_key]
241
- voice_path = cache_key
242
- status_msg = f"✓ Using cached voice ID for language: {language}"
243
  else:
244
- with st.spinner("Cloning voice..."):
245
- with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
246
- tmp_file.write(reference_audio.read())
247
- tmp_file.flush()
248
- if model == "V1":
249
- result = utils.v1_clone_voice(tmp_file.name, user_id, token, language_code )
250
- else:
251
- result = utils.v2_clone_voice(tmp_file.name, user_id, token)
252
- voice_id = result['voice_id']
253
- print(f"Voice Clone succesfully from mode {model} id is {voice_id}")
254
- reference_audio.seek(0)
255
- classes.upload_voice_clone_audio(reference_audio, voice_id)
256
- voice_path = cache_key
257
- st.session_state.voice_cache[cache_key] = voice_id
258
- status_msg = f"✓ Cloned voice successfully for language: {language}"
259
- # Generate speech
260
- with st.spinner("Generating speech..."):
261
- loop = asyncio.new_event_loop()
262
- asyncio.set_event_loop(loop)
263
 
264
- if model=="V1":
265
- sr, audio = loop.run_until_complete(
266
- utils.v1_generate_speech_async(
267
- session_id, voice_mode, voice_id, model, input_text, language_code, user_id,
268
- pronunciation_dict_str, speed, expressive, stability, clarity,
269
- volume_level, speech_rate, stitch_request
270
- )
271
- )
272
  else:
273
- sr, audio = loop.run_until_complete(
274
- utils.v2_generate_speech_async(
275
- session_id, voice_mode, voice_id, model, input_text, language_code, user_id,
276
- pronunciation_dict_str, speed, expressive, stability, clarity,
277
- volume_level, speech_rate, stitch_request
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
  )
279
- )
280
- loop.close()
281
-
282
- # st.success(status_msg)
283
- # st.audio(audio, sample_rate=sr)
284
-
285
- # st.session_state.show_feedback = True
286
- # st.session_state.last_session_id = session_id
287
- # st.success(status_msg)
288
-
289
- # Store audio + session info in state, mark as available
290
- st.session_state.last_msg = status_msg
291
- st.session_state.last_audio = audio
292
- st.session_state.last_sr = sr
293
- st.session_state.last_session_id = session_id
294
- st.session_state.has_audio = True
295
- st.session_state.show_feedback = True
296
-
297
- except Exception as e:
298
- st.error(f"Error: {str(e)}")
299
- st.session_state.show_feedback = False
300
- st.markdown("---")
301
- st.markdown("### 🎧 Output & Feedback")
302
-
303
- # Column layout for audio + feedback
304
- a_col, f_col = st.columns([1, 1])
305
-
306
- with a_col:
307
- if st.session_state.has_audio and st.session_state.last_audio is not None:
308
- st.success(st.session_state.last_msg)
309
- st.audio(st.session_state.last_audio, sample_rate=st.session_state.last_sr)
310
- else:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  st.markdown(
312
- "<div style='opacity:0.4; border:1px dashed #888; padding:0.75rem; text-align:center;'>"
313
- "Audio preview will appear here after you generate speech."
314
- "</div>",
315
  unsafe_allow_html=True,
316
  )
317
 
318
- with f_col:
319
- # Disabled / enabled based on has_audio flag
320
- disabled = not (st.session_state.show_feedback and st.session_state.has_audio)
321
-
322
- st.markdown(
323
- "<div style='opacity:{};'>".format("1.0" if not disabled else "0.4"),
324
- unsafe_allow_html=True,
325
- )
326
-
327
- rating_index = st.radio(
328
- "Rate this audio:",
329
- options=[0, 1, 2, 3, 4],
330
- format_func=lambda i: "⭐" * (i + 1),
331
- horizontal=True,
332
- index=None,
333
- key="rating_index",
334
- disabled=disabled,
335
- )
336
-
337
- feedback_msg = st.text_area(
338
- "✍️ Feedback (optional)",
339
- placeholder="Enter your feedback here...",
340
- height=80,
341
- key="feedback_msg",
342
- disabled=disabled,
343
- )
344
 
345
- submit_clicked = st.button(
346
- "📤 Submit Feedback",
347
- type="primary",
348
- disabled=disabled,
349
- key="submit_feedback_btn",
350
- use_container_width=True
351
- )
352
 
353
- st.markdown("</div>", unsafe_allow_html=True)
 
 
 
 
 
 
354
 
355
- if submit_clicked:
356
- if rating_index is None:
357
- st.warning("Please select a rating before submitting.")
358
- else:
359
- utils.update_rating(
360
- session_id=st.session_state.last_session_id,
361
- rating_index=rating_index,
362
- feedback_msg=feedback_msg or "",
363
- )
364
- # Optionally keep or reset feedback area
365
- st.session_state.show_feedback = False
366
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
- st.markdown("---")
369
- st.markdown("### How to Use This App")
370
 
371
- st.markdown("""
372
- **Step 1: Select Model**
373
- - Choose between **V1** or **V2** model from the sidebar
374
-
375
- **Step 2: Choose Language**
376
- - Select your desired language from the dropdown
377
-
378
- **Step 3: Select Voice Mode**
379
- - **Default Speaker**: Choose from pre-trained voices
380
- - **Upload Audio**: Upload your own reference audio (min 5 seconds) for voice cloning
381
-
382
- **Step 4: Enter Text**
383
- - Type or paste the text you want to convert to speech
384
-
385
- **Step 5: Adjust Settings (Optional)**
386
- - Expand "Advanced Settings" in sidebar to fine-tune:
387
- - Speech rate
388
- - Speed
389
- - Expressive
390
- - Other voice parameters
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
391
 
392
- **Step 6: Generate**
393
- - Click the **"🎵 Generate Speech"** button
394
- - Wait for the audio to be generated
395
- - Play the audio directly in the browser
396
- """)
397
- st.markdown("---")
398
- st.caption("Ori TTS & Voice Cloning System | Powered by Oriserve")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  import classes
7
  import json
8
  import random
 
9
  from S3_bucket import AWS
10
 
11
  aws = AWS()
 
32
  if "voice_cache" not in st.session_state:
33
  st.session_state.voice_cache = {}
34
 
35
+ if "page" not in st.session_state:
36
+ st.session_state.page = "Home"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
37
 
38
+ # Top nav (always visible)
39
+ col_h, col_u, col_a, _ = st.columns([0.2, 0.2, 0.2, 0.4])
40
+ with col_h:
41
+ if st.session_state.page == "Home":
42
+ if st.button("🏠 Home", key="nav_home", type='primary', use_container_width=True):
43
+ st.session_state.page = "Home"
 
 
 
 
 
44
  else:
45
+ if st.button("Home", key="nav_home", use_container_width=True):
46
+ st.session_state.page = "Home"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ with col_u:
49
+ if st.session_state.page == "Use":
50
+ if st.button("Use", key="nav_use", type='primary', use_container_width=True):
51
+ st.session_state.page = "Use"
52
+ else:
53
+ if st.button("How to use app", key="nav_use", use_container_width=True):
54
+ st.session_state.page = "Use"
55
 
56
+ with col_a:
57
+ if st.session_state.page == "About":
58
+ if st.button("ℹ️ About", key="nav_about", type='primary', use_container_width=True):
59
+ st.session_state.page = "About"
60
+ else:
61
+ if st.button("About", key="nav_about", use_container_width=True):
62
+ st.session_state.page = "About"
63
+
64
+ if st.session_state.page == "Home":
65
+ # Streamlit UI
66
+ st.set_page_config(page_title="Ori TTS & Voice Cloning", layout="wide")
67
+ st.title("🎙️ Ori TTS & Voice Cloning System")
68
+ st.markdown("Choose a default speaker or upload reference audio (min 5 sec), select a language, and enter text to generate speech")
69
+
70
+ with st.sidebar:
71
+ st.title("Home")
72
+ st.markdown("---")
73
+ st.header("Models......")
74
+ model = st.radio("Select Model", ["V1", "V2"])
75
+ if model == "V1":
76
+ st.header("Languages.....")
77
+ language = st.selectbox("Select Language", list(utils.V1_LANGUAGES.keys()))
78
+ st.header("Voice Settings.....")
79
+ voice_mode = st.radio("Voice Selection Mode", ["Default Speaker", "Upload Audio"])
80
+ if voice_mode == "Default Speaker":
81
+ default_speaker = st.selectbox("Select Default Speaker", list(utils.V1_SPEAKERS[utils.V1_LANGUAGES[language]]))
82
+ reference_audio = None
83
+ else:
84
+ st.info("Give a reference audio (min 5 seconds)")
85
+ # audio_file = st.file_uploader("Reference Audio", type=['wav', 'mp3', 'flac'])
86
+ # reference_audio = audio_file
87
+ # default_speaker = None
88
+ audio_source = st.radio(
89
+ "Reference audio source",
90
+ ["Upload file", "Record audio"],
91
+ horizontal=True,
92
+ key="v1_audio_source",
93
  )
94
+
95
+ default_speaker = None
96
+
97
+ if audio_source == "Upload file":
98
+ reference_audio = st.file_uploader(
99
+ "Upload Reference Audio",
100
+ type=["wav", "mp3", "flac"],
101
+ key="v1_file_uploader",
102
+ )
103
+ else: # Record audio
104
+ reference_audio = st.audio_input(
105
+ "Record Reference Audio",
106
+ key="v1_audio_input",
107
+ )
108
+ else:
109
+ st.header("Languages.....")
110
+ language = st.selectbox("Select Language", list(utils.V2_LANGUAGES.keys()))
111
+ st.header("Voice Settings.....")
112
+ voice_mode = st.radio("Voice Selection Mode", ["Default Speaker", "Upload Audio"])
113
+ if voice_mode == "Default Speaker":
114
+ default_speaker = st.selectbox("Select Default Speaker", list(utils.V2_SPEAKERS[utils.V2_LANGUAGES[language]]))
115
+ reference_audio = None
116
  else:
117
+ st.info("Give a reference audio (min 5 seconds)")
118
+ # audio_file = st.file_uploader("Reference Audio", type=['wav', 'mp3', 'flac'])
119
+ # reference_audio = audio_file
120
+ # default_speaker = None
121
+ audio_source = st.radio(
122
+ "Reference audio source",
123
+ ["Upload file", "Record audio"],
124
+ horizontal=True,
125
+ key="v2_audio_source",
126
  )
127
 
128
+ default_speaker = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
129
 
130
+ if audio_source == "Upload file":
131
+ reference_audio = st.file_uploader(
132
+ "Upload Reference Audio",
133
+ type=["wav", "mp3", "flac"],
134
+ key="v2_file_uploader",
135
+ )
136
+ else:
137
+ reference_audio = st.audio_input(
138
+ "Record Reference Audio",
139
+ key="v2_audio_input",
140
+ )
141
+
142
+ with st.expander("Advanced Settings"):
143
+ speech_rate = st.slider("Speech Rate", 0.25, 2.0, 1.0, 0.25)
144
+ speed = st.slider("Speed", 0.5, 2.0, 1.0, 0.1)
145
+ expressive = st.slider("Expressive", 0.0, 1.0, 0.1, 0.05)
146
+ stability = st.slider("Stability", 0, 10, 1, 1)
147
+ clarity = st.slider("Clarity", 0.0, 1.0, 0.1, 0.1)
148
+ volume_level = st.slider("Volume Level", 0.5, 3.0, 1.0, 0.1)
149
+ stitch_request = st.checkbox("Stitch Request ()", value=False)
150
+
151
+
152
+ # Main content
153
+ col1, col2 = st.columns([2, 1])
154
+
155
+ with col1:
156
+ if 'input_text' not in st.session_state:
157
+ st.session_state['input_text'] = ''
158
+ if 'set_random_next_run' not in st.session_state:
159
+ st.session_state.set_random_next_run = False
160
+ if 'pending_random_text' not in st.session_state:
161
+ st.session_state.pending_random_text = ''
162
+
163
+ input_text = st.text_area(
164
+ "Input Text",
165
+ key='input_text',
166
+ placeholder="Enter the text you want to synthesize...",
167
+ height=130
168
+ )
169
+
170
+ btn_col1, btn_col2 = st.columns(2)
171
+ with btn_col1:
172
+ random_btn = st.button("🎲 Random Text", use_container_width=True)
173
+ with btn_col2:
174
+ generate_btn = st.button("🎵 Generate Speech", type="primary", use_container_width=True)
175
+
176
+ with col2:
177
+ st.markdown("### Add Pronunciation Pair")
178
+
179
+ key_col1, value_col2 = st.columns(2)
180
+ with key_col1:
181
+ pr_key = st.text_input(
182
+ "Pronunciation key 👇",
183
+ label_visibility="visible",
184
+ disabled=False,
185
+ placeholder="Enter word",
186
+ key="pr_key",
187
+ )
188
+ with value_col2:
189
+ pr_value = st.text_input(
190
+ "Pronunciation value 👇",
191
+ label_visibility="visible",
192
+ disabled=False,
193
+ placeholder="Enter correct pronunciation",
194
+ key="pr_value",
195
+ )
196
+ add_pair = st.button("Add Pronunciation Pair", type='primary', use_container_width=True)
197
 
198
+ if add_pair:
199
+ if pr_key.strip() and pr_value.strip():
200
+ st.session_state.pronunc_dict[pr_key.strip()] = pr_value.strip()
201
+ st.success(f"Added pronunciation pair: {pr_key.strip()} → {pr_value.strip()}")
202
+ # do NOT assign st.session_state.pr_key / pr_value here
203
+ else:
204
+ st.warning("Both key and value are required to add a pronunciation pair.")
205
+
206
+ st.markdown("""
207
+ If the model mispronounces some word incorrectly, you can correct it by adding the term as the Pronunciation Key and its phonetic spelling as the Pronunciation Value. For example, if AI/Cholestrol isn't pronounced correctly, respell it as ए आई/colestrol: enter AI/Cholestrol in the Pronunciation Key field and ए आई/colestrol in the Pronunciation Value field, then click **Add Pronunciation Pair**.
208
+ """)
209
+
210
+ if random_btn:
211
+ if language in utils.language_sentences.keys():
212
+ random_text = random.choice(utils.language_sentences[language])
213
+ st.session_state.pending_random_text = random_text
214
+ st.session_state.set_random_next_run = True
215
+ st.rerun()
216
  else:
217
+ st.warning(f"No sample sentences available for {language}")
 
 
 
 
 
 
 
 
 
 
 
 
 
218
 
219
 
220
+ if generate_btn:
221
 
222
+ session_id = utils.generate_session_id()
223
+ print(f"\n\nGenerate btn is pressed.....\nThis is the session ID : -{session_id}")
224
 
225
+ # Validate pronunciation input
226
+ # if pr_key.strip() and pr_value.strip():
227
+ pronunciation_dict_str = st.session_state.pronunc_dict
228
+ # else:
229
+ # pronunciation_dict_str = {}
230
 
231
+ input_text = st.session_state.input_text
232
+ if not input_text.strip():
233
+ st.error("Please enter text to synthesize")
234
+ elif len(input_text) > 1000:
235
+ st.error(f"Text length must be less than 1000 characters. Current length: {len(input_text)}")
236
+ else:
237
+ try:
238
+ token = parameters.TTS_SECRET_KEY
 
 
 
 
 
239
 
240
+ if model == "V1":
241
+ language_code = utils.V1_LANGUAGES[language]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
242
  else:
243
+ language_code = utils.V2_LANGUAGES[language]
244
+
245
+ user_id = parameters.user_id
246
+ voice_path = None
247
+ # Determine voice_id based on mode
248
+ if voice_mode == "Default Speaker" and model == "V1":
249
+ if language_code in list(utils.V1_SPEAKERS.keys()):
250
+ voice_id = default_speaker
251
+ status_msg = f"Using default speaker: {default_speaker} for {language}"
252
+ else:
253
+ st.error(f"Language {language} not available for {default_speaker}")
254
+ st.stop()
255
+ elif voice_mode == "Default Speaker" and model == "V2":
256
+ if language_code in list(utils.V2_SPEAKERS.keys()):
257
+ voice_id = default_speaker
258
+ status_msg = f"Using default speaker: {default_speaker} for {language}"
259
+ else:
260
+ st.error(f"Language {language} not available for {default_speaker}")
261
+ st.stop()
262
 
 
 
 
 
 
 
 
 
263
  else:
264
+ if not reference_audio:
265
+ st.warning("Please upload a reference audio file")
266
+ st.stop()
267
+ audio_hash = utils.get_audio_hash(reference_audio)
268
+ cache_key = f"{audio_hash}_{language_code}_{model}"
269
+
270
+ if cache_key in st.session_state.voice_cache:
271
+ voice_id = st.session_state.voice_cache[cache_key]
272
+ voice_path = cache_key
273
+ status_msg = f"✓ Using cached voice ID for language: {language}"
274
+ else:
275
+ with st.spinner("Cloning voice..."):
276
+ with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
277
+ tmp_file.write(reference_audio.read())
278
+ tmp_file.flush()
279
+ if model == "V1":
280
+ result = utils.v1_clone_voice(tmp_file.name, user_id, token, language_code )
281
+ else:
282
+ result = utils.v2_clone_voice(tmp_file.name, user_id, token)
283
+ voice_id = result['voice_id']
284
+ print(f"Voice Clone succesfully from mode {model} id is {voice_id}")
285
+ reference_audio.seek(0)
286
+ classes.upload_voice_clone_audio(reference_audio, voice_id)
287
+ voice_path = cache_key
288
+ st.session_state.voice_cache[cache_key] = voice_id
289
+ status_msg = f"✓ Cloned voice successfully for language: {language}"
290
+ # Generate speech
291
+ with st.spinner("Generating speech..."):
292
+ loop = asyncio.new_event_loop()
293
+ asyncio.set_event_loop(loop)
294
+
295
+ if model=="V1":
296
+ sr, audio = loop.run_until_complete(
297
+ utils.v1_generate_speech_async(
298
+ session_id, voice_mode, voice_id, model, input_text, language_code, user_id,
299
+ pronunciation_dict_str, speed, expressive, stability, clarity,
300
+ volume_level, speech_rate, stitch_request
301
+ )
302
  )
303
+ else:
304
+ sr, audio = loop.run_until_complete(
305
+ utils.v2_generate_speech_async(
306
+ session_id, voice_mode, voice_id, model, input_text, language_code, user_id,
307
+ pronunciation_dict_str, speed, expressive, stability, clarity,
308
+ volume_level, speech_rate, stitch_request
309
+ )
310
+ )
311
+ loop.close()
312
+
313
+ # st.success(status_msg)
314
+ # st.audio(audio, sample_rate=sr)
315
+
316
+ # st.session_state.show_feedback = True
317
+ # st.session_state.last_session_id = session_id
318
+ # st.success(status_msg)
319
+
320
+ # Store audio + session info in state, mark as available
321
+ st.session_state.last_msg = status_msg
322
+ st.session_state.last_audio = audio
323
+ st.session_state.last_sr = sr
324
+ st.session_state.last_session_id = session_id
325
+ st.session_state.has_audio = True
326
+ st.session_state.show_feedback = True
327
+
328
+ except Exception as e:
329
+ st.error(f"Error: {str(e)}")
330
+ st.session_state.show_feedback = False
331
+ st.markdown("---")
332
+ st.markdown("### 🎧 Output & Feedback")
333
+
334
+ # Column layout for audio + feedback
335
+ a_col, f_col = st.columns([1, 1])
336
+
337
+ with a_col:
338
+ if st.session_state.has_audio and st.session_state.last_audio is not None:
339
+ st.success(st.session_state.last_msg)
340
+ st.audio(st.session_state.last_audio, sample_rate=st.session_state.last_sr)
341
+ else:
342
+ st.markdown(
343
+ "<div style='opacity:0.4; border:1px dashed #888; padding:0.75rem; text-align:center;'>"
344
+ "Audio preview will appear here after you generate speech."
345
+ "</div>",
346
+ unsafe_allow_html=True,
347
+ )
348
+
349
+ with f_col:
350
+ # Disabled / enabled based on has_audio flag
351
+ disabled = not (st.session_state.show_feedback and st.session_state.has_audio)
352
+
353
  st.markdown(
354
+ "<div style='opacity:{};'>".format("1.0" if not disabled else "0.4"),
 
 
355
  unsafe_allow_html=True,
356
  )
357
 
358
+ rating_index = st.radio(
359
+ "Rate this audio:",
360
+ options=[0, 1, 2, 3, 4],
361
+ format_func=lambda i: "⭐" * (i + 1),
362
+ horizontal=True,
363
+ index=None,
364
+ key="rating_index",
365
+ disabled=disabled,
366
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
367
 
368
+ feedback_msg = st.text_area(
369
+ "✍️ Feedback (optional)",
370
+ placeholder="Enter your feedback here...",
371
+ height=80,
372
+ key="feedback_msg",
373
+ disabled=disabled,
374
+ )
375
 
376
+ submit_clicked = st.button(
377
+ "📤 Submit Feedback",
378
+ type="primary",
379
+ disabled=disabled,
380
+ key="submit_feedback_btn",
381
+ use_container_width=True
382
+ )
383
 
384
+ st.markdown("</div>", unsafe_allow_html=True)
 
 
 
 
 
 
 
 
 
 
385
 
386
+ if submit_clicked:
387
+ if rating_index is None:
388
+ st.warning("Please select a rating before submitting.")
389
+ else:
390
+ utils.update_rating(
391
+ session_id=st.session_state.last_session_id,
392
+ rating_index=rating_index,
393
+ feedback_msg=feedback_msg or "",
394
+ )
395
+ # Optionally keep or reset feedback area
396
+ st.session_state.show_feedback = False
397
 
398
+ st.markdown("---")
399
+ st.caption("Ori TTS & Voice Cloning System | Powered by Oriserve")
400
 
401
+ elif st.session_state.page == "Use":
402
+ with st.sidebar:
403
+ st.title("Use this app......")
404
+ st.markdown("---")
405
+ # About Page
406
+ st.markdown("### How to Use This App")
407
+
408
+ st.markdown("""
409
+ **Step 1: Select Model**
410
+ - Select between **V1** or **V2** model from the sidebar
411
+
412
+ **Step 2: 🌐 Select Language**
413
+ - Select your desired language from the dropdown
414
+
415
+ **Step 3: 🎤 Select Voice Mode**
416
+ - **Default Speaker**: Choose from pre-trained voices
417
+ - **Upload Audio**: Upload or Record your own reference audio (min 5 seconds) for voice cloning
418
+
419
+ **Step 4: ✍️ Enter Text**
420
+ - Type or paste the text you want to convert to speech
421
+ - Or you can select any random text by clicking on 🎲 Random Text button
422
+
423
+ **Step 5: ⚙️ Customize Voice Parameters (Optional)**
424
+ - Expand "Advanced Settings" in sidebar to fine-tune:
425
+ - Speech rate
426
+ - Speed
427
+ - Expressive
428
+ - Other voice parameters
429
+
430
+ **Step 6: 🎵 Generate Audio**
431
+
432
+ - Click the **"🎵 Generate Speech"** button
433
+ - Wait for the audio to be generated
434
+ - Play the audio directly in the browser
435
+
436
+ **Step7: Add Pronunciation Pair**
437
+
438
+ - <div>If the model mispronounces some word incorrectly,<br>
439
+ you can correct it by adding the term as the Pronunciation Key and <br>
440
+ its phonetical spelling as the Pronunciation Value. <br>
441
+ For example, if <i><b style="color:red">AI/Cholestrol</b></i> isn't pronounced correctly, respell it as <i><b style = "color:green">ए आई/colestrol</b></i>: <br>
442
+ enter <i><b style="color:red">AI/Cholestrol</b></i> in the Pronunciation Key field and <i><b style = "color:green">ए आई/colestrol</b></i> in the Pronunciation Value field, then click Add Pronunciation Pair.</div>
443
 
444
+ ** Provide Feedback**
445
+ - Rate the generated audio quality
446
+ - Give us your feedback
447
+ - Your feedback helps improve our system
448
+ """,
449
+ unsafe_allow_html=True)
450
+ st.markdown("---")
451
+ st.caption("Ori TTS & Voice Cloning System | Powered by Oriserve")
452
+
453
+ else:
454
+ with st.sidebar:
455
+ st.title("About Us......")
456
+ st.markdown("---")
457
+ # About Page
458
+ st.markdown(
459
+ """
460
+ <style>
461
+ .features-container {
462
+ display: grid;
463
+ grid-template-columns: repeat(2, 1fr);
464
+ gap: 20px;
465
+ }
466
+ .feature-block {
467
+ padding: 15px;
468
+ border-radius: 8px;
469
+ transition: background-color 0.3s ease;
470
+ min-height: 200px;
471
+ display: flex;
472
+ flex-direction: column;
473
+ justify-content: flex-start;
474
+ border: 1px solid #e0e0e0;
475
+ background-color: #111827;
476
+ color: #e5e7eb;
477
+ }
478
+ .feature-block:hover {
479
+ background-color: #EA580C;
480
+ cursor: pointer;
481
+ }
482
+ .feature-title {
483
+ font-size: 1.4em;
484
+ font-weight: bold;
485
+ margin-bottom: 10px;
486
+ }
487
+ .feature-list {
488
+ font-size: 1.05em;
489
+ margin-left: 20px;
490
+ list-style-type: none;
491
+ padding-left: 0;
492
+ }
493
+ .feature-list li {
494
+ margin: 8px 0;
495
+ }
496
+ .section-header {
497
+ font-size: 1.8em;
498
+ font-weight: bold;
499
+ margin: 25px 0 15px 0;
500
+ color: #38bdf8;
501
+ }
502
+ .intro-text {
503
+ font-size: 1.1em;
504
+ line-height: 1.4;
505
+ margin-bottom: 20px;
506
+ }
507
+ .footer {
508
+ margin-top: 20px;
509
+ padding: 15px;
510
+ border-radius: 8px;
511
+ transition: background-color 0.3s ease;
512
+ min-height: 150px;
513
+ display: flex;
514
+ flex-direction: column;
515
+ justify-content: flex-start;
516
+ border: 1px solid #e0e0e0;
517
+ background-color: #020617;
518
+ color: #e5e7eb;
519
+ }
520
+ .footer:hover{
521
+ background-color: #3f3f46;
522
+ }
523
+ .footer .feature-list a.hf-link {
524
+ color: #FFFFFF;
525
+ text-decoration: none;
526
+ transition: all 0.3s ease;
527
+ display: inline-block;
528
+ }
529
+ .footer .feature-list a.hf-link:hover {
530
+ color: #EA580C;
531
+ font-weight: 600;
532
+ transform: translateX(10px);
533
+ }
534
+ .footer .feature-list span {
535
+ color: #FFFFFF;
536
+ text-decoration: none;
537
+ transition: all 0.3s ease;
538
+ display: inline-block;
539
+ }
540
+ .footer .feature-list span:hover {
541
+ color: #EA580C;
542
+ font-weight: 600;
543
+ text-decoration: underline;
544
+ }
545
+ @media (max-width: 768px) {
546
+ .features-container {
547
+ grid-template-columns: 1fr;
548
+ }
549
+ }
550
+ </style>
551
+ <div style="text-align: center; font-size: 2.2em; font-weight: bold; margin-bottom: 20px;">
552
+ 🚀 Welcome to ORI Text-to-Speech
553
+ </div>
554
+ <div class="section-header">🌟 About Our Technology</div>
555
+ <div class="intro-text">
556
+ Greetings from Oriserve! We're excited to showcase our refined Text-to-Speech capabilities—powered by generative voice synthesis to deliver
557
+ <strong>natural-sounding</strong> and <strong>professionally tuned</strong> speech output.
558
+ </div>
559
+ <div class="section-header">✨ Key Features</div>
560
+ <div class="features-container">
561
+ <div class="feature-block">
562
+ <div class="feature-title">🎯 Core Capabilities</div>
563
+ <ul class="feature-list">
564
+ <li><strong>Robust voice models suited for production use</strong></li>
565
+ <li><strong>Optimized for English and Hindi, with multilingual expansion underway</strong></li>
566
+ <li><strong>Diverse voice styles for varied use cases</strong></li>
567
+ <li><strong>Responsive audio generation with practical latency</strong></li>
568
+ </ul>
569
+ </div>
570
+ <div class="feature-block">
571
+ <div class="feature-title">🛠️ Advanced Controls</div>
572
+ <ul class="feature-list">
573
+ <li><strong>Customizable voice parameters</strong></li>
574
+ <li><strong>Expressiveness adjustment options</strong></li>
575
+ <li><strong>Balance tuning for clarity and stability</strong></li>
576
+ </ul>
577
+ </div>
578
+ <div class="feature-block">
579
+ <div class="feature-title">💫 Special Features</div>
580
+ <ul class="feature-list">
581
+ <li><strong>Basic context understanding during synthesis</strong></li>
582
+ <li><strong>Text formatting optimized for speech</strong></li>
583
+ <li><strong>Improved handling of common pronunciation cases</strong></li>
584
+ </ul>
585
+ </div>
586
+ <div class="feature-block">
587
+ <div class="feature-title">⚡ Processing Capabilities</div>
588
+ <ul class="feature-list">
589
+ <li><strong>Near real-time synthesis performance</strong></li>
590
+ <li><strong>Optimized latency for interactive use</strong></li>
591
+ <li><strong>Audio streaming with first-byte latency as low as ~150 ms</strong></li>
592
+ </ul>
593
+ </div>
594
+ <div class="feature-block">
595
+ <div class="feature-title">🔊 Audio Quality</div>
596
+ <ul class="feature-list">
597
+ <li><strong>Clear and natural-sounding speech</strong></li>
598
+ <li><strong>Audio fidelity aligned with general production standards</strong></li>
599
+ <li><strong>Consistent synthesis across sessions</strong></li>
600
+ </ul>
601
+ </div>
602
+ <div class="feature-block">
603
+ <div class="feature-title">📈 Future Development</div>
604
+ <ul class="feature-list">
605
+ <li><strong>Continuous quality and performance updates</strong></li>
606
+ <li><strong>More expressive and natural voice styles in progress</strong></li>
607
+ <li><strong>Expanded language and dialect support coming soon</strong></li>
608
+ </ul>
609
+ </div>
610
+ <div class="feature-block">
611
+ <div class="feature-title">🚨 Disclaimer</div>
612
+ <ul class="feature-list">
613
+ <li><strong>The voices and utterances produced by this application are generated by an AI model.</strong></li>
614
+ <li><strong>By using the Voice Clone feature, you confirm you have the necessary rights to any uploaded audio.</strong></li>
615
+ <li><strong>We make no warranty—express or implied—on the accuracy, appropriateness, or quality of the generated speech.</strong></li>
616
+ </ul>
617
+ </div>
618
+ <div class="feature-block">
619
+ <div class="feature-title">How to Reach Us</div>
620
+ <ul class="feature-list">
621
+ <li><strong>Email : <span>ai-team@oriserve.com</span></strong></li>
622
+ <li><strong>Hugging Face : <a href="https://huggingface.co/Oriserve" class="hf-link">Oriserve Hugging Face</a></strong></li>
623
+ <li><strong>GitHub : <a href="https://github.com/OriserveAI" class="hf-link">OriserveAI GitHub</a></strong></li>
624
+ <li><strong>Website : <a href="https://oriserve.com/" class="hf-link">Oriserve website</a></strong></li>
625
+ </ul>
626
+ </div>
627
+ </div>
628
+ """,
629
+ unsafe_allow_html=True,
630
+ )
631
+ st.markdown("---")
632
+ st.caption("Ori TTS & Voice Cloning System | Powered by Oriserve")
633
+ pass
634
+
parameters.py CHANGED
@@ -20,4 +20,5 @@ s3_bucket_name = os.getenv("AWS_BUCKET_NAME")
20
  GLOBAL_PRONUNCIATION_DICT=os.getenv("GLOBAL_PRONUNCIATION_DICT")
21
  GLOBAL_PRONUNCIATION_DICT_PATH=f"s3://{s3_bucket_name}/{GLOBAL_PRONUNCIATION_DICT}"
22
  voice_clone_data_key = os.getenv("voice_clone_data_key")
23
- model="ori-tts-v1"
 
 
20
  GLOBAL_PRONUNCIATION_DICT=os.getenv("GLOBAL_PRONUNCIATION_DICT")
21
  GLOBAL_PRONUNCIATION_DICT_PATH=f"s3://{s3_bucket_name}/{GLOBAL_PRONUNCIATION_DICT}"
22
  voice_clone_data_key = os.getenv("voice_clone_data_key")
23
+ model_v1=os.getenv("MODEL_NAME_V1")
24
+ model_v2=os.getenv("MODEL_NAME_V2")
requirements.txt CHANGED
@@ -1,5 +1,6 @@
1
- gradio>=4.0.0
2
- gradio_toggle>=0.0.3
 
3
  pandas>=1.3.0
4
  numpy>=1.20.0
5
  librosa>=0.9.0
@@ -10,6 +11,4 @@ s3fs>=2022.1.0
10
  boto3>=1.20.0
11
  pytz>=2024.1
12
  pydantic==2.10.6
13
- openai
14
-
15
-
 
1
+ altair
2
+ pandas
3
+ streamlit
4
  pandas>=1.3.0
5
  numpy>=1.20.0
6
  librosa>=0.9.0
 
11
  boto3>=1.20.0
12
  pytz>=2024.1
13
  pydantic==2.10.6
14
+ openai
 
 
utils.py CHANGED
@@ -428,7 +428,7 @@ async def v1_generate_speech_async(
428
  # Use AsyncOpenAI streaming response (matches your original code)
429
  try:
430
  async with v1_client.audio.speech.with_streaming_response.create(
431
- model=parameters.model,
432
  voice=send_voice_id,
433
  input=[text],
434
  extra_body=extra_body,
@@ -519,7 +519,7 @@ async def v2_generate_speech_async(
519
  # Use AsyncOpenAI streaming response (matches your original code)
520
  try:
521
  async with v2_client.audio.speech.with_streaming_response.create(
522
- model="ori-tts-v2",
523
  voice=send_voice_id,
524
  input=[text],
525
  extra_body=extra_body
 
428
  # Use AsyncOpenAI streaming response (matches your original code)
429
  try:
430
  async with v1_client.audio.speech.with_streaming_response.create(
431
+ model=parameters.model_v1,
432
  voice=send_voice_id,
433
  input=[text],
434
  extra_body=extra_body,
 
519
  # Use AsyncOpenAI streaming response (matches your original code)
520
  try:
521
  async with v2_client.audio.speech.with_streaming_response.create(
522
+ model=parameters.model_v2,
523
  voice=send_voice_id,
524
  input=[text],
525
  extra_body=extra_body