Spaces:

Oriserve
/

OriTTS

Running

App Files Files Community

ajajali09 committed on May 15, 2025

Commit

0c5c2a7

1 Parent(s): ebe23c1

add the function to save the audio given by user and change the feedback format

Browse files

Files changed (4) hide show

.gitignore +6 -0
app.py +48 -58
classes.py +69 -0
parameters.py +3 -1

.gitignore CHANGED Viewed

@@ -2,3 +2,9 @@ __pycache__/
 *.pyc
 __pycache__/S3_bucket.cpython-310.pyc
 test.py

 *.pyc
 __pycache__/S3_bucket.cpython-310.pyc
 test.py
+orittsenv/
+.env
+.gradio/
+.ruff_cache
+*.ipynb
+*.csv

app.py CHANGED Viewed

@@ -9,7 +9,6 @@ from datetime import datetime
 from typing import Generator, Tuple, Union,Dict
 import urllib3
 import warnings
 import gradio as gr
 import numpy as np
 import pandas as pd
@@ -31,10 +30,8 @@ load_dotenv()
 aws = AWS()
 audio_manager = classes.AudioStateManager()
 def unpack_pkl_data(s3_key=parameters.pkl_data_key):
     exists = aws.check_if_exists(object_key=s3_key)
     if not exists:
@@ -65,7 +62,6 @@ else:
 def generate_session_id():
     sid = str(uuid.uuid4())
-    print(f"New session started with session IDs: {sid}")
     return sid
@@ -130,7 +126,7 @@ def save_generated_audio(audio_data, session_id):
     except Exception as e:
         print(f"Error saving generated audio: {e}")
         return None
 def ensure_csv_exists(sep="|"):
     s3_csv_file_key = parameters.feedback_csv_key
@@ -142,6 +138,7 @@ def ensure_csv_exists(sep="|"):
             "language",
             "input_method",
             "agent_used",
             "voice_path",
             "text_input",
             "expressiveness",
@@ -200,6 +197,7 @@ def log_initial_submission(
     language,
     input_method,
     agent_used,
     voice_path,
     text_input,
     expressiveness=1.0,
@@ -225,6 +223,7 @@ def log_initial_submission(
                     "language": [language],
                     "input_method": [input_method],
                     "agent_used": [agent_used if agent_used else "None"],
                     "voice_path": [voice_path if voice_path else "None"],
                     "text_input": [text_input if text_input else "None"],
                     "expressiveness": [expressiveness],
@@ -271,6 +270,7 @@ def log_initial_submission(
                     "language": [language],
                     "input_method": [input_method],
                     "agent_used": [agent_used if agent_used else "None"],
                     "text_input": [text_input if text_input else "None"],
                     "expressiveness": [expressiveness],
                     "stability": [stability],
@@ -381,40 +381,23 @@ def handle_input_pronunc_pair(key,value,pronunc_dict):
     else:
         gr.Error("Tried to set key value pair in pronunciation dict with empty value please check input")
-def process_voice_clone(filepath, user_id):
     """
-    Make the clone of given audio
-    Parameters:
-    -filepath(str): The given audio path
-    -user_id(str): User id
-    Returns:
-    -voice name(str):The voice name of the cloned voice
     """
-    try:
-        url = parameters.voice_clone_URL
-        payload = {"user_id": user_id}
-        files = [("audio", ("clone_req.mp3", open(filepath, "rb"), "audio/mpeg"))]
-        headers = {"Authorization": f"Bearer {parameters.TTS_secret_key}"}
-        response = requests.request(
-            "POST", url, headers=headers, data=payload, files=files
-        )
-        if response.status_code==200:
-            response = response.json()
-            response = response["voice_id"]
-            return response
-        elif response.status_code==422:
-            print(response.text)
-        else:
-            response = response.json()
-            return None
-    except Exception as e:
-        print(f"An Error occurred: {e}")
-        return None
 def tts_inference(
     session_id: str,
@@ -423,6 +406,7 @@ def tts_inference(
     pronunc_dict: Dict[str,str],
     agent: str = None,
     recording_data: str = None,
     expressiveness: float = 1.0,
     stability: int = 100,
     clarity: float = 1.0,
@@ -431,21 +415,16 @@ def tts_inference(
     refine_generation: bool = False,
     stream: bool = False,
 ) -> Union[Generator[Tuple[int, np.ndarray], None, None], Tuple[int, np.ndarray]]:
-    user_id = parameters.user_id
     make_request = True
-    if agent is None and recording_data is not None:
-        audio_manager.update_current_recording(recording_data)
-        clone_voice_name = process_voice_clone(recording_data, user_id)
-        if clone_voice_name is not None:
-            voice_name = str(clone_voice_name)
-            print(f"The voice name, get from voice clone API:::--{voice_name}")
-        else:
-            gr.Info("Sorry, we are facing some issues with cloning this voice.\nPlease reload the app and try again.", title='Error')
-            print("Did not get any voice name from voice clone api")
-            make_request=False
-    else:
         voice_name = [agents[agent]]
     if make_request:
         print("\nParameters Recieved:\n")
         print("speechReqId", session_id)
@@ -473,7 +452,7 @@ def tts_inference(
             payload = json.dumps(
                 {
                     "speechReqId": session_id,
-                    "user_id": user_id,
                     "text": [text],
                     "language": [language_codes[language]],
                     "voice_name": voice_name,
@@ -486,7 +465,6 @@ def tts_inference(
                     "pronunciation_dict":pronunc_dict
                 }
             )
             with requests.post(
                 url=parameters.TTS_URL,
                 headers={"Authorization": f"Bearer {parameters.TTS_secret_key}"},
@@ -504,6 +482,7 @@ def tts_inference(
                         language=language,
                         input_method=None,
                         agent_used=voice_name,
                         voice_path=recording_data,
                         text_input=text,
                         expressiveness=expressiveness,
@@ -567,6 +546,7 @@ def tts_inference(
                             language=language,
                             input_method="Select Voice" if agent else "Voice clone",
                             agent_used=voice_name,
                             voice_path=saved_path,
                             text_input=text,
                             expressiveness=expressiveness,
@@ -603,11 +583,15 @@ def tts_tab():
         global_pronunc_dict = json.loads(f.read())
     pronunc_dict_state = gr.State(value=global_pronunc_dict)
-    session_id = generate_session_id()
     with gr.Column(elem_classes="input-container"):
         session_id_component = gr.Textbox(
-            elem_id="session_ID", value=session_id, visible=False, label="Session ID"
         )
         # Create a 2-column layout for the main content
@@ -997,7 +981,13 @@ def tts_tab():
     generate_button.click(
         fn=lambda: (gr.update(interactive=False)), outputs=generate_button
-    ).success(fn=generate_session_id, outputs=session_id_component).success(
         fn=tts_inference,
         inputs=[
             session_id_component,
@@ -1006,6 +996,7 @@ def tts_tab():
             pronunc_dict_state,
             agent_dropdown,
             voice_recording,
             expressiveness_slider,
             stability_slider,
             clarity_slider,
@@ -1223,11 +1214,10 @@ def initialize_app():
     try:
         with gr.Blocks(js=js,css="footer{display:none !important}") as demo:
-            with gr.Tabs() as tabs:
-                with gr.Tab("🗣️ TTS"):
-                    tts_tab()
-                with gr.Tab("ℹ️ About"):
-                    about_tab()
         return demo
     except Exception as e:
         print(f"An Error occurred: {e}")

 from typing import Generator, Tuple, Union,Dict
 import urllib3
 import warnings
 import gradio as gr
 import numpy as np
 import pandas as pd
 aws = AWS()
 audio_manager = classes.AudioStateManager()
 def unpack_pkl_data(s3_key=parameters.pkl_data_key):
     exists = aws.check_if_exists(object_key=s3_key)
     if not exists:
 def generate_session_id():
     """Return a newly generated random UUID (version 4) rendered as a string."""
     return str(uuid.uuid4())
     except Exception as e:
         print(f"Error saving generated audio: {e}")
         return None
 def ensure_csv_exists(sep="|"):
     s3_csv_file_key = parameters.feedback_csv_key
             "language",
             "input_method",
             "agent_used",
+            "user_id",
             "voice_path",
             "text_input",
             "expressiveness",
     language,
     input_method,
     agent_used,
+    user_id,
     voice_path,
     text_input,
     expressiveness=1.0,
                     "language": [language],
                     "input_method": [input_method],
                     "agent_used": [agent_used if agent_used else "None"],
+                    "user_id": [user_id],
                     "voice_path": [voice_path if voice_path else "None"],
                     "text_input": [text_input if text_input else "None"],
                     "expressiveness": [expressiveness],
                     "language": [language],
                     "input_method": [input_method],
                     "agent_used": [agent_used if agent_used else "None"],
+                    "user_id":[user_id],
                     "text_input": [text_input if text_input else "None"],
                     "expressiveness": [expressiveness],
                     "stability": [stability],
     else:
         gr.Error("Tried to set key value pair in pronunciation dict with empty value please check input")
+def get_or_process_voice_clone(filepath:str=None, prev_filepath:str=None, prev_voice_id:str=None):
     """
+    Only call process_voice_clone() if the audio file path has changed.
     """
+    user_id = parameters.user_id
+    if filepath is None:
+        return None, None
+    elif filepath == prev_filepath and prev_voice_id is not None:
+        print("\n\nVoice recording unchanged. Reusing previous voice clone.")
+        return prev_filepath, prev_voice_id
+    clone_voice_name = classes.process_voice_clone(filepath, user_id)
+    if clone_voice_name is not None:
+        return filepath, clone_voice_name
+    else:
+        return filepath, None
 def tts_inference(
     session_id: str,
     pronunc_dict: Dict[str,str],
     agent: str = None,
     recording_data: str = None,
+    clone_voice_name:str = None,
     expressiveness: float = 1.0,
     stability: int = 100,
     clarity: float = 1.0,
     refine_generation: bool = False,
     stream: bool = False,
 ) -> Union[Generator[Tuple[int, np.ndarray], None, None], Tuple[int, np.ndarray]]:
     make_request = True
+    if agent is not None:
         voice_name = [agents[agent]]
+    elif clone_voice_name is not None:
+        voice_name = clone_voice_name
+    else:
+        make_request = False
     if make_request:
         print("\nParameters Recieved:\n")
         print("speechReqId", session_id)
             payload = json.dumps(
                 {
                     "speechReqId": session_id,
+                    "user_id": parameters.user_id,
                     "text": [text],
                     "language": [language_codes[language]],
                     "voice_name": voice_name,
                     "pronunciation_dict":pronunc_dict
                 }
             )
             with requests.post(
                 url=parameters.TTS_URL,
                 headers={"Authorization": f"Bearer {parameters.TTS_secret_key}"},
                         language=language,
                         input_method=None,
                         agent_used=voice_name,
+                        user_id=parameters.user_id,
                         voice_path=recording_data,
                         text_input=text,
                         expressiveness=expressiveness,
                             language=language,
                             input_method="Select Voice" if agent else "Voice clone",
                             agent_used=voice_name,
+                            user_id=parameters.user_id,
                             voice_path=saved_path,
                             text_input=text,
                             expressiveness=expressiveness,
         global_pronunc_dict = json.loads(f.read())
     pronunc_dict_state = gr.State(value=global_pronunc_dict)
+    prev_recording_state = gr.State(value=None)
+    voice_id_state = gr.State(value=None)
+    glob_session_id = generate_session_id()
     with gr.Column(elem_classes="input-container"):
         session_id_component = gr.Textbox(
+            elem_id="session_ID", value=glob_session_id, visible=False, label="Session ID"
         )
         # Create a 2-column layout for the main content
     generate_button.click(
         fn=lambda: (gr.update(interactive=False)), outputs=generate_button
+    ).success(
+        fn=generate_session_id, outputs=session_id_component
+    ).success(
+        fn=get_or_process_voice_clone,
+        inputs=[voice_recording, prev_recording_state, voice_id_state],
+        outputs=[prev_recording_state, voice_id_state]
+    ).success(
         fn=tts_inference,
         inputs=[
             session_id_component,
             pronunc_dict_state,
             agent_dropdown,
             voice_recording,
+            voice_id_state,
             expressiveness_slider,
             stability_slider,
             clarity_slider,
     try:
         with gr.Blocks(js=js,css="footer{display:none !important}") as demo:
+            with gr.Tab("🗣️ TTS"):
+                tts_tab()
+            with gr.Tab("ℹ️ About"):
+                about_tab()
         return demo
     except Exception as e:
         print(f"An Error occurred: {e}")

classes.py CHANGED Viewed

@@ -2,7 +2,75 @@ from dataclasses import dataclass
 from typing import Optional, Dict
 import librosa
 import numpy as np
 @dataclass
 class AudioInfo:
@@ -56,3 +124,4 @@ class AudioStateManager:
                 "duration": self.current_recording_info.duration,
             }
         return None

 from typing import Optional, Dict
 import librosa
 import numpy as np
+import parameters
+from datetime import datetime
+import gradio as gr
+import io
+import requests
+from S3_bucket import AWS
+aws = AWS()
+def upload_voice_clone_audio(audio_path:str,clone_id:str=None):
+    if clone_id is None:
+        clone_id="failed"
+    s3_folder = parameters.voice_clone_data_key
+    s3_key = f"{s3_folder}/{datetime.now().strftime("%Y_%b_%d_%H_%M_%S")}_{clone_id}.wav"
+    try:
+        with open(audio_path, "rb") as f:
+            audio_file = io.BytesIO(f.read())
+        aws.s3_upload_wav(obj=audio_file,s3_key=s3_key)
+        print(f"Uploaded to s3://{datetime.now().strftime("%Y_%b_%d_%H_%M_%S")}_{clone_id}.wav")
+        return s3_key
+    except Exception as e:
+        print(f"Error uploading voice clone audio: {e}")
+        return None
+def process_voice_clone(filepath, user_id):
+    """
+    Make the clone of given audio
+    Parameters:
+    -filepath(str): The given audio path
+    -user_id(str): User id
+    Returns:
+    -voice name(str):The voice name of the cloned voice
+    """
+    try:
+        url = parameters.voice_clone_URL
+        payload = {"user_id": user_id}
+        files = [("audio", ("clone_req.mp3", open(filepath, "rb"), "audio/mpeg"))]
+        headers = {"Authorization": f"Bearer {parameters.TTS_secret_key}"}
+        response = requests.request(
+            "POST", url, headers=headers, data=payload, files=files
+        )
+        if response.status_code==200:
+            response = response.json()
+            response = response["voice_id"]
+            upload_voice_clone_audio(filepath, response)
+            print("\n\nThe voice cloning is successful.")
+            return response
+        elif response.status_code==422:
+            upload_voice_clone_audio(filepath, response)
+            print(response.text)
+            gr.Info("Sorry, we are facing some issues with cloning this voice.\nPlease reload the app and try again.", title='Error')
+            print("Failed to clone the voice.")
+        else:
+            upload_voice_clone_audio(filepath, response)
+            response = response.json()
+            gr.Info("Sorry, we are facing some issues with cloning this voice.\nPlease reload the app and try again.", title='Error')
+            print("Failed to clone the voice.")
+            return None
+    except Exception as e:
+        print(f"An Error occurred: {e}")
+        return None
 @dataclass
 class AudioInfo:
                 "duration": self.current_recording_info.duration,
             }
         return None

parameters.py CHANGED Viewed

@@ -14,4 +14,6 @@ user_id = os.getenv("user_id")
 aws_config = os.getenv("aws_config")
 s3_bucket_name = os.getenv("AWS_BUCKET_NAME")
 GLOBAL_PRONUNCIATION_DICT=os.getenv("GLOBAL_PRONUNCIATION_DICT")
-GLOBAL_PRONUNCIATION_DICT_PATH=f"s3://{s3_bucket_name}/{GLOBAL_PRONUNCIATION_DICT}"

 aws_config = os.getenv("aws_config")
 s3_bucket_name = os.getenv("AWS_BUCKET_NAME")
 GLOBAL_PRONUNCIATION_DICT=os.getenv("GLOBAL_PRONUNCIATION_DICT")
+GLOBAL_PRONUNCIATION_DICT_PATH=f"s3://{s3_bucket_name}/{GLOBAL_PRONUNCIATION_DICT}"
+voice_clone_data_key = os.getenv("voice_clone_data_key")