Spaces:

Oriserve
/

OriTTS

Sleeping

App Files Community

ajajali09 committed on May 13, 2025

Commit

ebe23c1

1 Parent(s): 0faef2a

release 14/05

Browse files

Files changed (4) hide show

.gitignore +4 -0
__pycache__/S3_bucket.cpython-310.pyc +0 -0
__pycache__/utils.cpython-310.pyc +0 -0
app.py +188 -142

.gitignore ADDED Viewed

	@@ -0,0 +1,4 @@

+__pycache__/
+*.pyc
+__pycache__/S3_bucket.cpython-310.pyc
+test.py

__pycache__/S3_bucket.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/S3_bucket.cpython-310.pyc and b/__pycache__/S3_bucket.cpython-310.pyc differ

__pycache__/utils.cpython-310.pyc CHANGED Viewed

Binary files a/__pycache__/utils.cpython-310.pyc and b/__pycache__/utils.cpython-310.pyc differ

app.py CHANGED Viewed

@@ -431,9 +431,9 @@ def tts_inference(
     refine_generation: bool = False,
     stream: bool = False,
 ) -> Union[Generator[Tuple[int, np.ndarray], None, None], Tuple[int, np.ndarray]]:
-    user_id = parameters.user_id
     if agent is None and recording_data is not None:
         audio_manager.update_current_recording(recording_data)
         clone_voice_name = process_voice_clone(recording_data, user_id)
@@ -441,132 +441,70 @@ def tts_inference(
             voice_name = str(clone_voice_name)
             print(f"The voice name, get from voice clone API:::--{voice_name}")
         else:
-            gr.Error("Sorry, we are facing some issues with cloning this voice.\nPlease reload the app and try again.")
-            print("Did not get any voice name from voice clone api:------")
     else:
         voice_name = [agents[agent]]
-    print("\nParameters Recieved:\n")
-    print("speechReqId", session_id)
-    print("text", [text])
-    print("language", [language_codes[language]])
-    print("voice_name", voice_name)
-    print("encoding", "default")
-    print("expressive", expressiveness)
-    print("stability", stability)
-    print("clarity", clarity)
-    print("speech_rate", speech_rate)
-    print("loudness", loudness)
-    print("refine_grneration", refine_generation)
-    print("\n\n")
-    try:
-        if not text or text.strip() == "":
-            raise gr.Error("Text input cannot be empty")
-        if len(text) > 1000:
-            raise gr.Error(
-                f"Text length must be less than 1000 characters. Current length: {len(text)}"
-            )
-        payload = json.dumps(
-            {
-                "speechReqId": session_id,
-                "user_id": user_id,
-                "text": [text],
-                "language": [language_codes[language]],
-                "voice_name": voice_name,
-                "encoding": "default",
-                "expressive": expressiveness,
-                "stability": stability,
-                "clarity": clarity,
-                "speech_rate": speech_rate,
-                "refine_generation": refine_generation,
-                "pronunciation_dict":pronunc_dict
-            }
-        )
-        with requests.post(
-            url=parameters.TTS_URL,
-            headers={"Authorization": f"Bearer {parameters.TTS_secret_key}"},
-            data=payload,
-            verify=False,
-            stream=True,
-        ) as response:
-            if response.status_code != 200:
-                print(
-                    f"API request failed with status {response.status_code} and error: {response.reason}"
-                )
-                log_initial_submission(
-                    code=response.status_code,
-                    session_id=session_id,
-                    language=language,
-                    input_method=None,
-                    agent_used=voice_name,
-                    voice_path=recording_data,
-                    text_input=text,
-                    expressiveness=expressiveness,
-                    stability=stability,
-                    clarity=clarity,
-                    speech_rate=speech_rate,
-                    loudness=loudness,
-                    refine_generation=refine_generation,
-                    err_code=response.status_code,
-                    err_msg=response.reason,
                 )
-                raise gr.Error(f"API Error: {response.status_code} - {response.reason}")
-            sample_rate = 24000
-            if stream:
-                # Streaming mode
-                last_chunk_time = time.time()
-                for chunk in response.iter_content(chunk_size=32768):
-                    chunk_received_time = time.time()
-                    chunk_delay = chunk_received_time - last_chunk_time
-                    last_chunk_time = chunk_received_time
-                    if chunk:
-                        start_processing_time = time.time()
-                        audio_chunk = np.frombuffer(chunk, dtype=np.int16)
-                        processing_time = time.time() - start_processing_time
-                        if len(audio_chunk) > 0:
-                            yield_time_start = time.time()
-                            yield (sample_rate, audio_chunk)
-                            yield_time_complete = time.time()
-                            print(
-                                f"Streaming chunk of size {len(audio_chunk)} - "
-                                f"Chunk delay: {chunk_delay:.4f}s, "
-                                f"Processing time: {processing_time:.4f}s, "
-                                f"Yielding time: {(yield_time_complete - yield_time_start):.4f}s"
-                            )
-            else:
-                # Non-streaming mode
-                start_time = time.time()
-                audio_bytes = b""
-                for chunk in response.iter_content(chunk_size=32768):
-                    if chunk:
-                        print(f"Streaming chunk of size {len(chunk)}")
-                        audio_bytes += chunk
-                if len(audio_bytes) > 0:
-                    complete_audio = np.frombuffer(audio_bytes, dtype=np.int16)
-                    processing_time = time.time() - start_time
-                    complete_audio = increase_volume(complete_audio, factor=loudness)
-                    yield (sample_rate, complete_audio)
-                    saved_path = save_generated_audio(complete_audio, session_id)
                     log_initial_submission(
                         code=response.status_code,
                         session_id=session_id,
                         language=language,
-                        input_method="Select Voice" if agent else "Voice clone",
                         agent_used=voice_name,
-                        voice_path=saved_path,
                         text_input=text,
                         expressiveness=expressiveness,
                         stability=stability,
@@ -574,15 +512,78 @@ def tts_inference(
                         speech_rate=speech_rate,
                         loudness=loudness,
                         refine_generation=refine_generation,
                     )
                 else:
-                    raise ValueError("No audio data received from API")
-    except requests.RequestException as e:
-        raise gr.Error(f"Network Error: Failed to connect to the API server - {str(e)}")
-    except Exception as e:
-        raise gr.Error(f"An unexpected error occurred: {str(e)}")
 def disable_rating_box():
@@ -599,7 +600,7 @@ def disable_rating_box():
 def tts_tab():
     with aws.fs.open(parameters.GLOBAL_PRONUNCIATION_DICT_PATH,'r') as f:
-        global_pronunc_dict = json.loads(f.read())
     pronunc_dict_state = gr.State(value=global_pronunc_dict)
     session_id = generate_session_id()
@@ -631,33 +632,33 @@ def tts_tab():
                         <div class="info-heading">🎯 Quick Start Guide</div>
                     """)
                 gr.Markdown("""
             🌐 **Select Language**
             * Choose from our listed languages
             🎤 **Choose Voice**
             * Select from the curated collection of high-quality voices
             * Each voice is optimized for natural speech patterns
             * You can give your own voice by clicking on Voice clone
             ✍️ **Enter Your Text**
             * Type or paste your text in the input box
             * Or you can give input by clicking Random Sentence
             ⚙️ **Customize Voice Parameters**
             * Fine-tune expressiveness for emotional depth
             * Adjust stability for consistent output
             * Control clarity for precise articulation
             🎵 **Generate Audio**
             * Click the generate button to create your audio
             * Processing typically takes a few seconds
             ⭐ **Provide Feedback**
             * Rate the generated audio quality
             * Give us your feedback
             * Your feedback helps improve our system
             💾 **Access Your Audio**
             * Download the generated audio for offline use
             """)
@@ -805,7 +806,7 @@ def tts_tab():
                     with gr.Row():
                         pronunc_dict_key_in = gr.Textbox(label="Pronunciation key",placeholder="Enter word")
                         pronunc_dict_key_out = gr.Textbox(label="Pronunciation Value",placeholder="Enter word with correct pronunciation")
                     add_btn = gr.Button("Add pronunciation pair", variant="primary")
                     add_btn.click(
@@ -875,6 +876,7 @@ def tts_tab():
                             <li>Add your pronunciation of any word that doesn't sound well</li>
                             <li>If you don't like the pronunciation of any word, then add your word with the key and value pair</li>
                             <li><b style = "color:red">Note:-</b>Pronunciation pairs are <i style="color:red">case sensitive</i></li>
                         </ul>
                     """)
@@ -1094,7 +1096,43 @@ def about_tab():
                     line-height: 1.4;
                     margin-bottom: 20px;
                 }
                 /* For mobile responsiveness */
                 @media (max-width: 768px) {
                     .features-container {
@@ -1103,7 +1141,7 @@ def about_tab():
                 }
             </style>
             <div style="text-align: center; font-size: 3em; font-weight: bold; margin-bottom: 20px;"> 🚀 Welcome to ORI Text-to-Speech </div>
             <div class="section-header">🌟 About Our Technology</div>
             <div class="intro-text">Greetings from Oriserve! We're excited to showcase our refined Text-to-Speech capabilities—powered by generative voice synthesis to deliver <strong>natural-sounding</strong> and <strong>professionally tuned</strong> speech output.</div>
@@ -1114,12 +1152,12 @@ def about_tab():
         <div class="feature-title">🎯 Core Capabilities</div>
         <ul class="feature-list">
             <li><strong>Robust voice models suited for production use</strong></li>
-            <li><strong>Optimized for English and Hindi, with multilingual expansion underway</strong></li>
             <li><strong>Diverse voice styles for varied use cases</strong></li>
             <li><strong>Responsive audio generation with practical latency</strong></li>
         </ul>
     </div>
     <div class="feature-block">
         <div class="feature-title">🛠️ Advanced Controls</div>
         <ul class="feature-list">
@@ -1128,7 +1166,7 @@ def about_tab():
             <li><strong>Balance tuning for clarity and stability</strong></li>
         </ul>
     </div>
     <div class="feature-block">
         <div class="feature-title">💫 Special Features</div>
         <ul class="feature-list">
@@ -1137,7 +1175,7 @@ def about_tab():
             <li><strong>Improved handling of common pronunciation cases</strong></li>
         </ul>
     </div>
     <div class="feature-block">
         <div class="feature-title">⚡ Processing Capabilities</div>
         <ul class="feature-list">
@@ -1146,7 +1184,7 @@ def about_tab():
             <li><strong>Audio streaming with first-byte latency as low as ~150 ms</strong></li>
         </ul>
     </div>
     <div class="feature-block">
         <div class="feature-title">🔊 Audio Quality</div>
         <ul class="feature-list">
@@ -1155,7 +1193,7 @@ def about_tab():
             <li><strong>Consistent synthesis across sessions</strong></li>
         </ul>
     </div>
     <div class="feature-block">
         <div class="feature-title">📈 Future Development</div>
         <ul class="feature-list">
@@ -1164,6 +1202,15 @@ def about_tab():
             <li><strong>Expanded language and dialect support coming soon</strong></li>
         </ul>
     </div>
 </div>
             """
@@ -1171,12 +1218,11 @@ def about_tab():
         return gr.Markdown("")
 def initialize_app():
     # await audio_manager.load_agent_voices(agents)
     try:
-        with gr.Blocks(js=js) as demo:
             with gr.Tabs() as tabs:
                 with gr.Tab("🗣️ TTS"):
                     tts_tab()
@@ -1189,4 +1235,4 @@ def initialize_app():
 demo = initialize_app()
-demo.launch()

     refine_generation: bool = False,
     stream: bool = False,
 ) -> Union[Generator[Tuple[int, np.ndarray], None, None], Tuple[int, np.ndarray]]:
+    user_id = parameters.user_id
+    make_request = True
     if agent is None and recording_data is not None:
         audio_manager.update_current_recording(recording_data)
         clone_voice_name = process_voice_clone(recording_data, user_id)
             voice_name = str(clone_voice_name)
             print(f"The voice name, get from voice clone API:::--{voice_name}")
         else:
+            gr.Info("Sorry, we are facing some issues with cloning this voice.\nPlease reload the app and try again.", title='Error')
+            print("Did not get any voice name from voice clone api")
+            make_request=False
     else:
         voice_name = [agents[agent]]
+    if make_request:
+        print("\nParameters Recieved:\n")
+        print("speechReqId", session_id)
+        print("text", [text])
+        print("language", [language_codes[language]])
+        print("voice_name", voice_name)
+        print("encoding", "default")
+        print("expressive", expressiveness)
+        print("stability", stability)
+        print("clarity", clarity)
+        print("speech_rate", speech_rate)
+        print("loudness", loudness)
+        print("refine_grneration", refine_generation)
+        print("\n\n")
+        try:
+            if not text or text.strip() == "":
+                raise gr.Error("Text input cannot be empty")
+            if len(text) > 1000:
+                raise gr.Error(
+                    f"Text length must be less than 1000 characters. Current length: {len(text)}"
                 )
+            payload = json.dumps(
+                {
+                    "speechReqId": session_id,
+                    "user_id": user_id,
+                    "text": [text],
+                    "language": [language_codes[language]],
+                    "voice_name": voice_name,
+                    "encoding": "default",
+                    "expressive": expressiveness,
+                    "stability": stability,
+                    "clarity": clarity,
+                    "speech_rate": speech_rate,
+                    "refine_generation": refine_generation,
+                    "pronunciation_dict":pronunc_dict
+                }
+            )
+            with requests.post(
+                url=parameters.TTS_URL,
+                headers={"Authorization": f"Bearer {parameters.TTS_secret_key}"},
+                data=payload,
+                verify=False,
+                stream=True,
+            ) as response:
+                if response.status_code != 200:
+                    print(
+                        f"API request failed with status {response.status_code} and error: {response.reason}"
+                    )
                     log_initial_submission(
                         code=response.status_code,
                         session_id=session_id,
                         language=language,
+                        input_method=None,
                         agent_used=voice_name,
+                        voice_path=recording_data,
                         text_input=text,
                         expressiveness=expressiveness,
                         stability=stability,
                         speech_rate=speech_rate,
                         loudness=loudness,
                         refine_generation=refine_generation,
+                        err_code=response.status_code,
+                        err_msg=response.reason,
                     )
+                    raise gr.Error(f"API Error: {response.status_code} - {response.reason}")
+                sample_rate = 24000
+                if stream:
+                    # Streaming mode
+                    last_chunk_time = time.time()
+                    for chunk in response.iter_content(chunk_size=32768):
+                        chunk_received_time = time.time()
+                        chunk_delay = chunk_received_time - last_chunk_time
+                        last_chunk_time = chunk_received_time
+                        if chunk:
+                            start_processing_time = time.time()
+                            audio_chunk = np.frombuffer(chunk, dtype=np.int16)
+                            processing_time = time.time() - start_processing_time
+                            if len(audio_chunk) > 0:
+                                yield_time_start = time.time()
+                                yield (sample_rate, audio_chunk)
+                                yield_time_complete = time.time()
+                                print(
+                                    f"Streaming chunk of size {len(audio_chunk)} - "
+                                    f"Chunk delay: {chunk_delay:.4f}s, "
+                                    f"Processing time: {processing_time:.4f}s, "
+                                    f"Yielding time: {(yield_time_complete - yield_time_start):.4f}s"
+                                )
                 else:
+                    # Non-streaming mode
+                    start_time = time.time()
+                    audio_bytes = b""
+                    for chunk in response.iter_content(chunk_size=32768):
+                        if chunk:
+                            print(f"Streaming chunk of size {len(chunk)}")
+                            audio_bytes += chunk
+                    if len(audio_bytes) > 0:
+                        complete_audio = np.frombuffer(audio_bytes, dtype=np.int16)
+                        processing_time = time.time() - start_time
+                        complete_audio = increase_volume(complete_audio, factor=loudness)
+                        yield (sample_rate, complete_audio)
+                        saved_path = save_generated_audio(complete_audio, session_id)
+                        log_initial_submission(
+                            code=response.status_code,
+                            session_id=session_id,
+                            language=language,
+                            input_method="Select Voice" if agent else "Voice clone",
+                            agent_used=voice_name,
+                            voice_path=saved_path,
+                            text_input=text,
+                            expressiveness=expressiveness,
+                            stability=stability,
+                            clarity=clarity,
+                            speech_rate=speech_rate,
+                            loudness=loudness,
+                            refine_generation=refine_generation,
+                        )
+                    else:
+                        raise ValueError("No audio data received from API")
+        except requests.RequestException as e:
+            raise gr.Error(f"Network Error: Failed to connect to the API server - {str(e)}")
+        except Exception as e:
+            raise gr.Error(f"An unexpected error occurred: {str(e)}")
 def disable_rating_box():
 def tts_tab():
     with aws.fs.open(parameters.GLOBAL_PRONUNCIATION_DICT_PATH,'r') as f:
+        global_pronunc_dict = json.loads(f.read())
     pronunc_dict_state = gr.State(value=global_pronunc_dict)
     session_id = generate_session_id()
                         <div class="info-heading">🎯 Quick Start Guide</div>
                     """)
                 gr.Markdown("""
             🌐 **Select Language**
             * Choose from our listed languages
             🎤 **Choose Voice**
             * Select from the curated collection of high-quality voices
             * Each voice is optimized for natural speech patterns
             * You can give your own voice by clicking on Voice clone
             ✍️ **Enter Your Text**
             * Type or paste your text in the input box
             * Or you can give input by clicking Random Sentence
             ⚙️ **Customize Voice Parameters**
             * Fine-tune expressiveness for emotional depth
             * Adjust stability for consistent output
             * Control clarity for precise articulation
             🎵 **Generate Audio**
             * Click the generate button to create your audio
             * Processing typically takes a few seconds
             ⭐ **Provide Feedback**
             * Rate the generated audio quality
             * Give us your feedback
             * Your feedback helps improve our system
             💾 **Access Your Audio**
             * Download the generated audio for offline use
             """)
                     with gr.Row():
                         pronunc_dict_key_in = gr.Textbox(label="Pronunciation key",placeholder="Enter word")
                         pronunc_dict_key_out = gr.Textbox(label="Pronunciation Value",placeholder="Enter word with correct pronunciation")
                     add_btn = gr.Button("Add pronunciation pair", variant="primary")
                     add_btn.click(
                             <li>Add your pronunciation of any word that doesn't sound well</li>
                             <li>If you don't like the pronunciation of any word, then add your word with the key and value pair</li>
                             <li><b style = "color:red">Note:-</b>Pronunciation pairs are <i style="color:red">case sensitive</i></li>
+                            <li>If the model mispronounces some word incorrectly, you can correct it by adding the term as the Pronunciation Key and its phonetical  spelling as the Pronunciation Value. For example, if <i><b style="color:red">AI/Cholestrol</b></i> isn't pronounced correctly, respell it as <i><b style = "color:green">ए आई/colestrol</b></i>: enter <i><b style="color:red">AI/Cholestrol</b></i> in the Pronunciation Key field and <i><b style = "color:green">ए आई/colestrol</b></i> in the Pronunciation Value field, then click Add Pronunciation Pair.</li>
                         </ul>
                     """)
                     line-height: 1.4;
                     margin-bottom: 20px;
                 }
+                .footer {
+                    margin-top:10px;
+                    padding: 15px;
+                    border-radius: 8px;
+                    transition: background-color 0.3s ease;
+                    min-height: 200px; /* Consistent height */
+                    display: flex;
+                    flex-direction: column;
+                    justify-content: flex-start;
+                    border: 1px solid #e0e0e0;
+                }
+                .footer:hover{
+                    background-color: #3f3f46;
+                }
+                .footer .feature-list a.hf-link {
+                    color: #FFFF;
+                    text-decoration: none;
+                    transition: all 0.3s ease;
+                    display: inline-block;
+                }
+                .footer .feature-list a.hf-link:hover {
+                    color: #EA580C;
+                    font-weight: 600;
+                    transform: translateX(10px);
+                }
+                .footer .feature-list spam {
+                    color: #FFFF;
+                    text-decoration: none;
+                    transition: all 0.3s ease;
+                    display: inline-block;
+                }
+                .footer .feature-list spam:hover {
+                    color: #EA580C;
+                    font-weight: 600;
+                    text-decoration: underline;
+                }
                 /* For mobile responsiveness */
                 @media (max-width: 768px) {
                     .features-container {
                 }
             </style>
             <div style="text-align: center; font-size: 3em; font-weight: bold; margin-bottom: 20px;"> 🚀 Welcome to ORI Text-to-Speech </div>
             <div class="section-header">🌟 About Our Technology</div>
             <div class="intro-text">Greetings from Oriserve! We're excited to showcase our refined Text-to-Speech capabilities—powered by generative voice synthesis to deliver <strong>natural-sounding</strong> and <strong>professionally tuned</strong> speech output.</div>
         <div class="feature-title">🎯 Core Capabilities</div>
         <ul class="feature-list">
             <li><strong>Robust voice models suited for production use</strong></li>
+            <li><strong>Optimized for English and Hindi, with multilingual expansion underway</strong></li>
             <li><strong>Diverse voice styles for varied use cases</strong></li>
             <li><strong>Responsive audio generation with practical latency</strong></li>
         </ul>
     </div>
     <div class="feature-block">
         <div class="feature-title">🛠️ Advanced Controls</div>
         <ul class="feature-list">
             <li><strong>Balance tuning for clarity and stability</strong></li>
         </ul>
     </div>
     <div class="feature-block">
         <div class="feature-title">💫 Special Features</div>
         <ul class="feature-list">
             <li><strong>Improved handling of common pronunciation cases</strong></li>
         </ul>
     </div>
     <div class="feature-block">
         <div class="feature-title">⚡ Processing Capabilities</div>
         <ul class="feature-list">
             <li><strong>Audio streaming with first-byte latency as low as ~150 ms</strong></li>
         </ul>
     </div>
     <div class="feature-block">
         <div class="feature-title">🔊 Audio Quality</div>
         <ul class="feature-list">
             <li><strong>Consistent synthesis across sessions</strong></li>
         </ul>
     </div>
     <div class="feature-block">
         <div class="feature-title">📈 Future Development</div>
         <ul class="feature-list">
             <li><strong>Expanded language and dialect support coming soon</strong></li>
         </ul>
     </div>
+   <div class = "footer">
+        <div class="feature-title">How to Reach Us</div>
+        <ul class="feature-list">
+            <li><strong>Email : <spam>ai-team@oriserve.com</spam></strong></li>
+            <li><strong>Huggingface : <a href="https://huggingface.co/Oriserve" class="hf-link">Oriserve huggingface</a></strong></li>
+            <li><strong>GitHub : <a href="https://github.com/OriserveAI" class="hf-link">OriserveAI github</a></strong></li>
+            <li><strong>Website : <a href="https://oriserve.com/" class="hf-link">Oriserve website</a></strong></li>
+        </ul>
+    </div>
 </div>
             """
         return gr.Markdown("")
 def initialize_app():
     # await audio_manager.load_agent_voices(agents)
     try:
+        with gr.Blocks(js=js,css="footer{display:none !important}") as demo:
             with gr.Tabs() as tabs:
                 with gr.Tab("🗣️ TTS"):
                     tts_tab()
 demo = initialize_app()
+demo.launch(show_api=False)