ajajali09 committed on
Commit
0c5c2a7
·
1 Parent(s): ebe23c1

Add a function to save the audio provided by the user and change the feedback format

Browse files
Files changed (4) hide show
  1. .gitignore +6 -0
  2. app.py +48 -58
  3. classes.py +69 -0
  4. parameters.py +3 -1
.gitignore CHANGED
@@ -2,3 +2,9 @@ __pycache__/
2
  *.pyc
3
  __pycache__/S3_bucket.cpython-310.pyc
4
  test.py
 
 
 
 
 
 
 
2
  *.pyc
3
  __pycache__/S3_bucket.cpython-310.pyc
4
  test.py
5
+ orittsenv/
6
+ .env
7
+ .gradio/
8
+ .ruff_cache
9
+ *.ipynb
10
+ *.csv
app.py CHANGED
@@ -9,7 +9,6 @@ from datetime import datetime
9
  from typing import Generator, Tuple, Union,Dict
10
  import urllib3
11
  import warnings
12
-
13
  import gradio as gr
14
  import numpy as np
15
  import pandas as pd
@@ -31,10 +30,8 @@ load_dotenv()
31
 
32
  aws = AWS()
33
 
34
-
35
  audio_manager = classes.AudioStateManager()
36
 
37
-
38
  def unpack_pkl_data(s3_key=parameters.pkl_data_key):
39
  exists = aws.check_if_exists(object_key=s3_key)
40
  if not exists:
@@ -65,7 +62,6 @@ else:
65
 
66
  def generate_session_id():
67
  sid = str(uuid.uuid4())
68
- print(f"New session started with session IDs: {sid}")
69
  return sid
70
 
71
 
@@ -130,7 +126,7 @@ def save_generated_audio(audio_data, session_id):
130
  except Exception as e:
131
  print(f"Error saving generated audio: {e}")
132
  return None
133
-
134
 
135
  def ensure_csv_exists(sep="|"):
136
  s3_csv_file_key = parameters.feedback_csv_key
@@ -142,6 +138,7 @@ def ensure_csv_exists(sep="|"):
142
  "language",
143
  "input_method",
144
  "agent_used",
 
145
  "voice_path",
146
  "text_input",
147
  "expressiveness",
@@ -200,6 +197,7 @@ def log_initial_submission(
200
  language,
201
  input_method,
202
  agent_used,
 
203
  voice_path,
204
  text_input,
205
  expressiveness=1.0,
@@ -225,6 +223,7 @@ def log_initial_submission(
225
  "language": [language],
226
  "input_method": [input_method],
227
  "agent_used": [agent_used if agent_used else "None"],
 
228
  "voice_path": [voice_path if voice_path else "None"],
229
  "text_input": [text_input if text_input else "None"],
230
  "expressiveness": [expressiveness],
@@ -271,6 +270,7 @@ def log_initial_submission(
271
  "language": [language],
272
  "input_method": [input_method],
273
  "agent_used": [agent_used if agent_used else "None"],
 
274
  "text_input": [text_input if text_input else "None"],
275
  "expressiveness": [expressiveness],
276
  "stability": [stability],
@@ -381,40 +381,23 @@ def handle_input_pronunc_pair(key,value,pronunc_dict):
381
  else:
382
  gr.Error("Tried to set key value pair in pronunciation dict with empty value please check input")
383
 
384
- def process_voice_clone(filepath, user_id):
 
385
  """
386
- Make the clone of given audio
387
-
388
- Parameters:
389
- -filepath(str): The given audio path
390
- -user_id(str): User id
391
-
392
- Returns:
393
- -voice name(str):The voice name of the cloned voice
394
-
395
  """
396
- try:
397
- url = parameters.voice_clone_URL
398
- payload = {"user_id": user_id}
399
- files = [("audio", ("clone_req.mp3", open(filepath, "rb"), "audio/mpeg"))]
400
- headers = {"Authorization": f"Bearer {parameters.TTS_secret_key}"}
401
- response = requests.request(
402
- "POST", url, headers=headers, data=payload, files=files
403
- )
 
 
 
404
 
405
- if response.status_code==200:
406
-
407
- response = response.json()
408
- response = response["voice_id"]
409
- return response
410
- elif response.status_code==422:
411
- print(response.text)
412
- else:
413
- response = response.json()
414
- return None
415
- except Exception as e:
416
- print(f"An Error occurred: {e}")
417
- return None
418
 
419
  def tts_inference(
420
  session_id: str,
@@ -423,6 +406,7 @@ def tts_inference(
423
  pronunc_dict: Dict[str,str],
424
  agent: str = None,
425
  recording_data: str = None,
 
426
  expressiveness: float = 1.0,
427
  stability: int = 100,
428
  clarity: float = 1.0,
@@ -431,21 +415,16 @@ def tts_inference(
431
  refine_generation: bool = False,
432
  stream: bool = False,
433
  ) -> Union[Generator[Tuple[int, np.ndarray], None, None], Tuple[int, np.ndarray]]:
 
434
 
435
- user_id = parameters.user_id
436
  make_request = True
437
- if agent is None and recording_data is not None:
438
- audio_manager.update_current_recording(recording_data)
439
- clone_voice_name = process_voice_clone(recording_data, user_id)
440
- if clone_voice_name is not None:
441
- voice_name = str(clone_voice_name)
442
- print(f"The voice name, get from voice clone API:::--{voice_name}")
443
- else:
444
- gr.Info("Sorry, we are facing some issues with cloning this voice.\nPlease reload the app and try again.", title='Error')
445
- print("Did not get any voice name from voice clone api")
446
- make_request=False
447
- else:
448
  voice_name = [agents[agent]]
 
 
 
 
 
449
  if make_request:
450
  print("\nParameters Recieved:\n")
451
  print("speechReqId", session_id)
@@ -473,7 +452,7 @@ def tts_inference(
473
  payload = json.dumps(
474
  {
475
  "speechReqId": session_id,
476
- "user_id": user_id,
477
  "text": [text],
478
  "language": [language_codes[language]],
479
  "voice_name": voice_name,
@@ -486,7 +465,6 @@ def tts_inference(
486
  "pronunciation_dict":pronunc_dict
487
  }
488
  )
489
-
490
  with requests.post(
491
  url=parameters.TTS_URL,
492
  headers={"Authorization": f"Bearer {parameters.TTS_secret_key}"},
@@ -504,6 +482,7 @@ def tts_inference(
504
  language=language,
505
  input_method=None,
506
  agent_used=voice_name,
 
507
  voice_path=recording_data,
508
  text_input=text,
509
  expressiveness=expressiveness,
@@ -567,6 +546,7 @@ def tts_inference(
567
  language=language,
568
  input_method="Select Voice" if agent else "Voice clone",
569
  agent_used=voice_name,
 
570
  voice_path=saved_path,
571
  text_input=text,
572
  expressiveness=expressiveness,
@@ -603,11 +583,15 @@ def tts_tab():
603
  global_pronunc_dict = json.loads(f.read())
604
 
605
  pronunc_dict_state = gr.State(value=global_pronunc_dict)
606
- session_id = generate_session_id()
 
 
 
 
607
 
608
  with gr.Column(elem_classes="input-container"):
609
  session_id_component = gr.Textbox(
610
- elem_id="session_ID", value=session_id, visible=False, label="Session ID"
611
  )
612
 
613
  # Create a 2-column layout for the main content
@@ -997,7 +981,13 @@ def tts_tab():
997
 
998
  generate_button.click(
999
  fn=lambda: (gr.update(interactive=False)), outputs=generate_button
1000
- ).success(fn=generate_session_id, outputs=session_id_component).success(
 
 
 
 
 
 
1001
  fn=tts_inference,
1002
  inputs=[
1003
  session_id_component,
@@ -1006,6 +996,7 @@ def tts_tab():
1006
  pronunc_dict_state,
1007
  agent_dropdown,
1008
  voice_recording,
 
1009
  expressiveness_slider,
1010
  stability_slider,
1011
  clarity_slider,
@@ -1223,11 +1214,10 @@ def initialize_app():
1223
 
1224
  try:
1225
  with gr.Blocks(js=js,css="footer{display:none !important}") as demo:
1226
- with gr.Tabs() as tabs:
1227
- with gr.Tab("🗣️ TTS"):
1228
- tts_tab()
1229
- with gr.Tab("ℹ️ About"):
1230
- about_tab()
1231
  return demo
1232
  except Exception as e:
1233
  print(f"An Error occurred: {e}")
 
9
  from typing import Generator, Tuple, Union,Dict
10
  import urllib3
11
  import warnings
 
12
  import gradio as gr
13
  import numpy as np
14
  import pandas as pd
 
30
 
31
  aws = AWS()
32
 
 
33
  audio_manager = classes.AudioStateManager()
34
 
 
35
  def unpack_pkl_data(s3_key=parameters.pkl_data_key):
36
  exists = aws.check_if_exists(object_key=s3_key)
37
  if not exists:
 
62
 
63
def generate_session_id():
    """Return a newly generated random UUID (version 4) as a string."""
    return str(uuid.uuid4())
66
 
67
 
 
126
  except Exception as e:
127
  print(f"Error saving generated audio: {e}")
128
  return None
129
+
130
 
131
  def ensure_csv_exists(sep="|"):
132
  s3_csv_file_key = parameters.feedback_csv_key
 
138
  "language",
139
  "input_method",
140
  "agent_used",
141
+ "user_id",
142
  "voice_path",
143
  "text_input",
144
  "expressiveness",
 
197
  language,
198
  input_method,
199
  agent_used,
200
+ user_id,
201
  voice_path,
202
  text_input,
203
  expressiveness=1.0,
 
223
  "language": [language],
224
  "input_method": [input_method],
225
  "agent_used": [agent_used if agent_used else "None"],
226
+ "user_id": [user_id],
227
  "voice_path": [voice_path if voice_path else "None"],
228
  "text_input": [text_input if text_input else "None"],
229
  "expressiveness": [expressiveness],
 
270
  "language": [language],
271
  "input_method": [input_method],
272
  "agent_used": [agent_used if agent_used else "None"],
273
+ "user_id":[user_id],
274
  "text_input": [text_input if text_input else "None"],
275
  "expressiveness": [expressiveness],
276
  "stability": [stability],
 
381
  else:
382
  gr.Error("Tried to set key value pair in pronunciation dict with empty value please check input")
383
 
384
+
385
def get_or_process_voice_clone(filepath:str=None, prev_filepath:str=None, prev_voice_id:str=None):
    """
    Return the (recording path, voice id) pair for the current recording.

    classes.process_voice_clone() is only called when the recording path
    differs from the previous one, or when no voice id is stored for it yet.

    Parameters:
        -filepath(str): Path of the current recording, or None if there is none
        -prev_filepath(str): Recording path handled on the previous call
        -prev_voice_id(str): Voice id obtained for prev_filepath, or None

    Returns:
        -(path, voice_id): (None, None) when there is no recording; the
         previous pair when the recording is unchanged and already has a voice
         id; otherwise the new path together with whatever
         classes.process_voice_clone() returned (None when cloning failed).
    """
    # No recording supplied: nothing to clone and nothing to remember.
    if filepath is None:
        return None, None

    # Same recording as last time and it already has a voice id: reuse it.
    if filepath == prev_filepath and prev_voice_id is not None:
        print("\n\nVoice recording unchanged. Reusing previous voice clone.")
        return prev_filepath, prev_voice_id

    # process_voice_clone() yields None on failure, so its result can be
    # returned as-is; the stored voice id then stays None and the next call
    # with the same recording retries the clone.
    return filepath, classes.process_voice_clone(filepath, parameters.user_id)
400
 
 
 
 
 
 
 
 
 
 
 
 
 
 
401
 
402
  def tts_inference(
403
  session_id: str,
 
406
  pronunc_dict: Dict[str,str],
407
  agent: str = None,
408
  recording_data: str = None,
409
+ clone_voice_name:str = None,
410
  expressiveness: float = 1.0,
411
  stability: int = 100,
412
  clarity: float = 1.0,
 
415
  refine_generation: bool = False,
416
  stream: bool = False,
417
  ) -> Union[Generator[Tuple[int, np.ndarray], None, None], Tuple[int, np.ndarray]]:
418
+
419
 
 
420
  make_request = True
421
+ if agent is not None:
 
 
 
 
 
 
 
 
 
 
422
  voice_name = [agents[agent]]
423
+ elif clone_voice_name is not None:
424
+ voice_name = clone_voice_name
425
+ else:
426
+ make_request = False
427
+
428
  if make_request:
429
  print("\nParameters Recieved:\n")
430
  print("speechReqId", session_id)
 
452
  payload = json.dumps(
453
  {
454
  "speechReqId": session_id,
455
+ "user_id": parameters.user_id,
456
  "text": [text],
457
  "language": [language_codes[language]],
458
  "voice_name": voice_name,
 
465
  "pronunciation_dict":pronunc_dict
466
  }
467
  )
 
468
  with requests.post(
469
  url=parameters.TTS_URL,
470
  headers={"Authorization": f"Bearer {parameters.TTS_secret_key}"},
 
482
  language=language,
483
  input_method=None,
484
  agent_used=voice_name,
485
+ user_id=parameters.user_id,
486
  voice_path=recording_data,
487
  text_input=text,
488
  expressiveness=expressiveness,
 
546
  language=language,
547
  input_method="Select Voice" if agent else "Voice clone",
548
  agent_used=voice_name,
549
+ user_id=parameters.user_id,
550
  voice_path=saved_path,
551
  text_input=text,
552
  expressiveness=expressiveness,
 
583
  global_pronunc_dict = json.loads(f.read())
584
 
585
  pronunc_dict_state = gr.State(value=global_pronunc_dict)
586
+
587
+ prev_recording_state = gr.State(value=None)
588
+ voice_id_state = gr.State(value=None)
589
+
590
+ glob_session_id = generate_session_id()
591
 
592
  with gr.Column(elem_classes="input-container"):
593
  session_id_component = gr.Textbox(
594
+ elem_id="session_ID", value=glob_session_id, visible=False, label="Session ID"
595
  )
596
 
597
  # Create a 2-column layout for the main content
 
981
 
982
  generate_button.click(
983
  fn=lambda: (gr.update(interactive=False)), outputs=generate_button
984
+ ).success(
985
+ fn=generate_session_id, outputs=session_id_component
986
+ ).success(
987
+ fn=get_or_process_voice_clone,
988
+ inputs=[voice_recording, prev_recording_state, voice_id_state],
989
+ outputs=[prev_recording_state, voice_id_state]
990
+ ).success(
991
  fn=tts_inference,
992
  inputs=[
993
  session_id_component,
 
996
  pronunc_dict_state,
997
  agent_dropdown,
998
  voice_recording,
999
+ voice_id_state,
1000
  expressiveness_slider,
1001
  stability_slider,
1002
  clarity_slider,
 
1214
 
1215
  try:
1216
  with gr.Blocks(js=js,css="footer{display:none !important}") as demo:
1217
+ with gr.Tab("🗣️ TTS"):
1218
+ tts_tab()
1219
+ with gr.Tab("ℹ️ About"):
1220
+ about_tab()
 
1221
  return demo
1222
  except Exception as e:
1223
  print(f"An Error occurred: {e}")
classes.py CHANGED
@@ -2,7 +2,75 @@ from dataclasses import dataclass
2
  from typing import Optional, Dict
3
  import librosa
4
  import numpy as np
 
 
 
 
 
 
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
 
7
  @dataclass
8
  class AudioInfo:
@@ -56,3 +124,4 @@ class AudioStateManager:
56
  "duration": self.current_recording_info.duration,
57
  }
58
  return None
 
 
2
  from typing import Optional, Dict
3
  import librosa
4
  import numpy as np
5
+ import parameters
6
+ from datetime import datetime
7
+ import gradio as gr
8
+ import io
9
+ import requests
10
+ from S3_bucket import AWS
11
 
12
+ aws = AWS()
13
+
14
def upload_voice_clone_audio(audio_path:str,clone_id:str=None):
    """
    Upload the audio file used for a voice-clone request to S3.

    The object key has the form
    "<parameters.voice_clone_data_key>/<timestamp>_<clone_id>.wav".

    Parameters:
        -audio_path(str): Local path of the audio file to upload
        -clone_id(str): Identifier placed in the object name; "failed" is used
         when it is None

    Returns:
        -s3_key(str): Key the audio was uploaded under, or None if reading the
         file or uploading it raised an exception
    """
    if clone_id is None:
        clone_id = "failed"

    # Build the timestamp once, outside the f-string: reusing double quotes
    # inside a double-quoted f-string is a SyntaxError before Python 3.12, and
    # a second datetime.now() call could log a different name than was uploaded.
    timestamp = datetime.now().strftime("%Y_%b_%d_%H_%M_%S")
    s3_key = f"{parameters.voice_clone_data_key}/{timestamp}_{clone_id}.wav"
    try:
        with open(audio_path, "rb") as f:
            audio_file = io.BytesIO(f.read())
        aws.s3_upload_wav(obj=audio_file, s3_key=s3_key)
        print(f"Uploaded to s3 key: {s3_key}")
        return s3_key
    except Exception as e:
        # Best-effort archival: a failed upload must not break voice cloning.
        print(f"Error uploading voice clone audio: {e}")
        return None
29
+
30
def process_voice_clone(filepath, user_id):
    """
    Make the clone of given audio

    Sends the audio file to parameters.voice_clone_URL and archives the file
    through upload_voice_clone_audio(), tagged with the returned voice id on
    success and with the default "failed" tag otherwise.

    Parameters:
        -filepath(str): The given audio path
        -user_id(str): User id

    Returns:
        -voice name(str): The "voice_id" value from the response when the
         request returns status 200; None for any other status or when an
         exception occurs
    """
    try:
        # Keep the file open only for the duration of the request so the
        # handle is always closed, even if the request raises.
        with open(filepath, "rb") as audio_file:
            response = requests.post(
                parameters.voice_clone_URL,
                headers={"Authorization": f"Bearer {parameters.TTS_secret_key}"},
                data={"user_id": user_id},
                files=[("audio", ("clone_req.mp3", audio_file, "audio/mpeg"))],
            )

        if response.status_code == 200:
            voice_id = response.json()["voice_id"]
            upload_voice_clone_audio(filepath, voice_id)
            print("\n\nThe voice cloning is successful.")
            return voice_id

        # Any non-200 status is a failure. Archive the audio without a clone
        # id so it gets the "failed" tag instead of the Response object's repr.
        upload_voice_clone_audio(filepath)
        print(f"Voice clone request failed with status {response.status_code}: {response.text}")
        gr.Info("Sorry, we are facing some issues with cloning this voice.\nPlease reload the app and try again.", title='Error')
        print("Failed to clone the voice.")
        return None
    except Exception as e:
        # Callers treat None as "cloning failed", so report and return None.
        print(f"An Error occurred: {e}")
        return None
73
+
74
 
75
  @dataclass
76
  class AudioInfo:
 
124
  "duration": self.current_recording_info.duration,
125
  }
126
  return None
127
+
parameters.py CHANGED
@@ -14,4 +14,6 @@ user_id = os.getenv("user_id")
14
  aws_config = os.getenv("aws_config")
15
  s3_bucket_name = os.getenv("AWS_BUCKET_NAME")
16
  GLOBAL_PRONUNCIATION_DICT=os.getenv("GLOBAL_PRONUNCIATION_DICT")
17
- GLOBAL_PRONUNCIATION_DICT_PATH=f"s3://{s3_bucket_name}/{GLOBAL_PRONUNCIATION_DICT}"
 
 
 
14
  aws_config = os.getenv("aws_config")
15
  s3_bucket_name = os.getenv("AWS_BUCKET_NAME")
16
  GLOBAL_PRONUNCIATION_DICT=os.getenv("GLOBAL_PRONUNCIATION_DICT")
17
+ GLOBAL_PRONUNCIATION_DICT_PATH=f"s3://{s3_bucket_name}/{GLOBAL_PRONUNCIATION_DICT}"
18
+ voice_clone_data_key = os.getenv("voice_clone_data_key")
19
+