|
|
import streamlit as st |
|
|
from openai import AsyncOpenAI |
|
|
import numpy as np |
|
|
import io |
|
|
import soundfile as sf |
|
|
import requests |
|
|
import hashlib |
|
|
import json |
|
|
import pickle |
|
|
import pandas as pd |
|
|
import uuid |
|
|
import urllib3 |
|
|
from typing import Generator, Tuple, Union,Dict |
|
|
import warnings |
|
|
import pytz |
|
|
from datetime import datetime |
|
|
import parameters |
|
|
from S3_bucket import AWS |
|
|
|
|
|
# Silence urllib3's warnings and every other Python warning for this app.
urllib3.disable_warnings()
warnings.filterwarnings("ignore")

# Make sure the session slot exists before any later code reads it.
if "loaded_data" not in st.session_state:
    st.session_state.loaded_data = None

# Timezone used when timestamping log rows.
ist = pytz.timezone("Asia/Kolkata")

# Project S3 wrapper; exposes .fs, .s3, .bucket_name and helper methods
# used throughout this module.
aws = AWS()

# One OpenAI-compatible async client per TTS backend version.
v2_client = AsyncOpenAI(
    base_url=parameters.V2_TTS_URL,
    api_key=parameters.TTS_SECRET_KEY,
)
v1_client = AsyncOpenAI(
    base_url=parameters.V1_TTS_URL,
    api_key=parameters.TTS_SECRET_KEY,
)

# Shared pronunciation overrides, stored as a JSON document.
with aws.fs.open(parameters.GLOBAL_PRONUNCIATION_DICT_PATH, 'r') as pronunciation_file:
    global_pronunc_dict = json.load(pronunciation_file)
|
|
|
|
|
def get_audio_hash(file):
    """Return the MD5 hex digest of a seekable binary file object.

    The stream is rewound before reading so the whole content is hashed,
    and rewound again afterwards so the caller can read it from the start.
    The digest is meant as a cache key for voice cloning.
    """
    file.seek(0)
    digest = hashlib.md5(file.read()).hexdigest()
    file.seek(0)
    return digest
|
|
|
|
|
def generate_session_id():
    """Return a fresh random (version 4) UUID rendered as a string."""
    return str(uuid.uuid4())
|
|
|
|
|
def unpack_pkl_data(s3_key=parameters.pkl_data_key):
    """Download and unpickle the object stored under ``s3_key``.

    Args:
        s3_key: Key of the pickle file inside ``aws.bucket_name``.

    Returns:
        The unpickled object, or ``None`` when the key does not exist or
        when reading / unpickling fails (the error is printed).
    """
    if not aws.check_if_exists(object_key=s3_key):
        return None

    try:
        with aws.fs.open(f"s3://{aws.bucket_name}/{s3_key}", "rb") as handle:
            # NOTE(review): pickle.loads can execute arbitrary code; this is
            # only safe while the bucket contents are fully trusted.
            unpacked = pickle.loads(handle.read())
    except Exception as err:
        print(f"{err}")
        return None
    else:
        print("pkl unpack successful")
        return unpacked
|
|
|
|
|
# Streamlit re-executes this script on every interaction. Fetch and unpickle
# the configuration only while the session has no usable copy, instead of
# hitting S3 on every rerun. A failed load leaves the slot falsy, so the next
# rerun retries.
if not st.session_state.loaded_data:
    st.session_state.loaded_data = unpack_pkl_data()

if st.session_state.loaded_data:
    _loaded = st.session_state.loaded_data
    language_sentences = _loaded["language_sentences"]
    agents = _loaded["agents"]
    agents_name = agents.keys()
    V1_LANGUAGES = _loaded['V1_LANGUAGES']
    V2_LANGUAGES = _loaded['V2_LANGUAGES']
    V1_SPEAKERS = _loaded['V1_SPEAKERS']
    V2_SPEAKERS = _loaded['V2_SPEAKERS']
else:
    # Nothing below can work without the configuration; halt this run.
    st.stop()
|
|
|
|
|
|
|
|
def save_generated_audio(audio_data, session_id):
    """Upload generated audio samples to S3 under a unique key.

    Args:
        audio_data: Object exposing ``tobytes()`` (callers in this module
            pass the array returned by ``sf.read``).
        session_id: Session identifier used as the file-name prefix.

    Returns:
        The S3 key the audio was uploaded to, or ``None`` if the upload
        failed (the error is printed rather than silently discarded).
    """
    s3_key = f"{parameters.audio_data_key}/{session_id}_{uuid.uuid4()}.wav"

    try:
        # NOTE(review): tobytes() yields the raw sample buffer with no WAV
        # header; whether aws.s3_upload_wav adds one is not visible here.
        audio_file = io.BytesIO(audio_data.tobytes())
        aws.s3_upload_wav(obj=audio_file, s3_key=s3_key)
    except Exception as e:
        # Best-effort: callers treat None as "not saved", but leave a trace.
        print(f"Error: could not upload generated audio to {s3_key} - {e}")
        return None
    return s3_key
|
|
|
|
|
def audio_header_creater(audio, channels=1, sample_rate=8000, bits_per_sample=16):
    """Build the 44-byte canonical PCM WAV header for raw audio data.

    All integer fields are written little-endian, as the RIFF/WAVE format
    requires, independent of the host byte order.

    Args:
        audio: Raw audio payload; only ``len(audio)`` is used, as the size
            of the data chunk in bytes.
        channels: Number of audio channels.
        sample_rate: Samples per second.
        bits_per_sample: Bit depth of one sample.

    Returns:
        The header as ``bytes``; prepend it to ``audio`` to get a WAV file.
    """
    def le(value, width):
        # Unsigned little-endian encoding of one header field.
        return int(value).to_bytes(width, "little")

    data_size = len(audio)
    byte_rate = sample_rate * channels * bits_per_sample // 8
    block_align = channels * bits_per_sample // 8

    return b"".join((
        b"RIFF",
        le(data_size + 36, 4),   # bytes following this field
        b"WAVEfmt ",
        le(16, 4),               # size of the fmt chunk
        le(1, 2),                # audio format 1 = PCM
        le(channels, 2),
        le(sample_rate, 4),
        le(byte_rate, 4),
        le(block_align, 2),
        le(bits_per_sample, 2),
        b"data",
        le(data_size, 4),
    ))
|
|
|
|
|
def ensure_csv_exists(sep="|"):
    """Create the feedback CSV in S3 with its header row if it is missing.

    An existing file is left untouched, so a header written by an earlier
    run is not rewritten.

    Args:
        sep: Field separator used for the header row.

    Returns:
        The S3 key of the feedback CSV (``parameters.feedback_csv_key``).
    """
    s3_csv_file_key = parameters.feedback_csv_key
    if aws.check_if_exists(object_key=s3_csv_file_key):
        return s3_csv_file_key

    columns = [
        "timestamp",
        "session_id",
        "language",
        "input_method",
        "agent_used",
        "user_id",
        "voice_path",
        "text_input",
        "expressiveness",
        "stability",
        "clarity",
        "speech_rate",
        "loudness",
        "refine_generation",
        # The comma after "model_name" was missing, which silently merged
        # this and the next entry into one "model_namerating" column.
        "model_name",
        "rating",
        "feedback",
    ]

    csv_buffer = io.StringIO()
    pd.DataFrame(columns=columns).to_csv(csv_buffer, index=False, sep=sep)
    aws.s3.put_object(
        Key=s3_csv_file_key, Bucket=aws.bucket_name, Body=csv_buffer.getvalue()
    )
    return s3_csv_file_key
|
|
|
|
|
def ensure_error_logs_csv_exists(sep="|"):
    """Create the error-log CSV in S3 with its header row if it is missing.

    Args:
        sep: Field separator used for the header row.

    Returns:
        The S3 key of the error-log CSV (``parameters.err_csv_key``).
    """
    error_csv_key = parameters.err_csv_key

    # Nothing to do when the log file is already there.
    if aws.check_if_exists(object_key=error_csv_key):
        return error_csv_key

    header = [
        "timestamp",
        "err_code",
        "err_msg",
        "session_id",
        "language",
        "input_method",
        "text_input",
        "expressiveness",
        "stability",
        "clarity",
        "speech_rate",
        "loudness",
        "refine_generation",
        "model_name",
    ]

    # Serialise an empty frame so only the header line is written.
    buffer = io.StringIO()
    pd.DataFrame(columns=header).to_csv(buffer, index=False, sep=sep)
    aws.s3.put_object(
        Key=error_csv_key,
        Bucket=aws.bucket_name,
        Body=buffer.getvalue(),
    )
    return error_csv_key
|
|
|
|
|
def _append_row_to_s3_csv(s3_key, row, sep):
    """Append one record (a column -> value mapping) to the CSV at ``s3_key``."""
    new_row = pd.DataFrame({column: [value] for column, value in row.items()})

    if aws.check_if_exists(object_key=s3_key):
        with aws.fs.open(f"s3://{aws.bucket_name}/{s3_key}", "r") as f:
            existing_data = pd.read_csv(f, sep=sep)
        # concat aligns on column names; unknown columns are appended.
        updated_data = pd.concat([existing_data, new_row], ignore_index=True)
    else:
        updated_data = new_row

    # NOTE: the whole object is read, modified and written back, so two
    # sessions logging at the same moment can overwrite each other's row.
    csv_buffer = io.StringIO()
    updated_data.to_csv(csv_buffer, index=False, sep=sep)
    aws.s3.put_object(
        Key=s3_key, Bucket=aws.bucket_name, Body=csv_buffer.getvalue()
    )


def log_initial_submission(
    code: int,
    session_id,
    language,
    input_method,
    agent_used,
    user_id,
    voice_path,
    text_input,
    model_name,
    expressiveness=1.0,
    stability=100,
    clarity=1.0,
    speech_rate=1.0,
    loudness=1.0,
    refine_generation=False,
    err_code=None,
    err_msg=None,
    sep="|",
):
    """Record one generation attempt in the feedback or the error CSV on S3.

    Args:
        code: Status of the generation request. ``200`` appends a row to the
            feedback CSV; any other value appends a row to the error CSV.
        session_id, language, input_method, agent_used, user_id, voice_path,
        text_input, model_name: Request details stored in the row. Falsy
            ``agent_used`` / ``voice_path`` / ``text_input`` are stored as
            the string ``"None"``.
        expressiveness, stability, clarity, speech_rate, loudness,
        refine_generation: Generation settings stored in the row.
        err_code, err_msg: Failure details, written only to the error CSV.
        sep: Field separator of the CSV files.

    Returns:
        A status message. Failures while saving are reported in the
        returned string and never raised.
    """
    timestamp = datetime.now(ist).strftime("%Y-%m-%d %H:%M:%S")

    if code == 200:
        try:
            s3_csv_file = ensure_csv_exists(sep=sep)
            _append_row_to_s3_csv(
                s3_csv_file,
                {
                    "timestamp": timestamp,
                    "session_id": session_id,
                    "model_name": model_name,
                    "language": language,
                    "input_method": input_method,
                    "agent_used": agent_used if agent_used else "None",
                    "user_id": user_id,
                    "voice_path": voice_path if voice_path else "None",
                    "text_input": text_input if text_input else "None",
                    "expressiveness": expressiveness,
                    "stability": stability,
                    "clarity": clarity,
                    "speech_rate": speech_rate,
                    "loudness": loudness,
                    # Previously dropped even though the CSV has the column.
                    "refine_generation": refine_generation,
                    # Filled in later by update_rating.
                    "rating": None,
                },
                sep,
            )
            return "Audio generated and saved!"
        except Exception as e:
            return f"Error: Could not save data - {str(e)}"

    try:
        err_csv_file = ensure_error_logs_csv_exists(sep=sep)
        _append_row_to_s3_csv(
            err_csv_file,
            {
                "timestamp": timestamp,
                "err_code": err_code,
                "err_msg": err_msg,
                "session_id": session_id,
                "language": language,
                "input_method": input_method,
                "agent_used": agent_used if agent_used else "None",
                "user_id": user_id,
                "text_input": text_input if text_input else "None",
                "expressiveness": expressiveness,
                "stability": stability,
                "clarity": clarity,
                "speech_rate": speech_rate,
                "loudness": loudness,
                "refine_generation": refine_generation,
                "model_name": model_name,
            },
            sep,
        )
        return "Error logging complete!!!"
    except Exception as e:
        return f"Error: Could not save error data - {str(e)}"
|
|
|
|
|
def update_rating(session_id, rating_index, feedback_msg: str):
    """Store a star rating and feedback on the session's latest feedback row.

    Args:
        session_id: Session whose most recent row (by ``timestamp``) is
            updated.
        rating_index: Zero-based star index; ``rating_index + 1`` must be a
            rating from 1 to 5.
        feedback_msg: Free-text feedback; only the first 1000 characters
            are stored.

    Returns:
        A ``(message, widget)`` tuple on every path. ``widget`` is the
        result of ``st.success`` when the rating was saved and ``None``
        otherwise. Errors are reported in ``message``, not raised.
    """
    star_dict = {1: "⭐", 2: "⭐⭐", 3: "⭐⭐⭐", 4: "⭐⭐⭐⭐", 5: "⭐⭐⭐⭐⭐"}

    # Validate before touching S3: a missing selection (None) used to raise
    # outside the try block, and an out-of-range value was written to the
    # CSV before the star lookup failed.
    try:
        rating = int(rating_index + 1)
    except (TypeError, ValueError):
        rating = None
    if rating not in star_dict:
        return (
            f"Error: Could not update rating - invalid rating {rating_index!r}",
            None,
        )

    try:
        s3_csv_file = ensure_csv_exists(sep="|")

        if not aws.check_if_exists(object_key=s3_csv_file):
            # Same 2-tuple shape as every other return path.
            return "Error: No data found", None

        with aws.fs.open(f"s3://{aws.bucket_name}/{s3_csv_file}", "r") as f:
            df = pd.read_csv(f, sep="|")

        if session_id not in df["session_id"].values:
            return (
                f"Could not find Session {session_id} in tracks\nMake sure to press Generate button Once!!!"
            ), None

        latest_row_index = (
            df[df["session_id"] == session_id]
            .sort_values("timestamp", ascending=False)
            .index[0]
        )

        # A feedback column that is empty so far is parsed as float NaN;
        # make it object-typed so assigning text does not rely on implicit
        # upcasting.
        if "feedback" in df.columns:
            df["feedback"] = df["feedback"].astype(object)

        df.loc[latest_row_index, "rating"] = int(rating)
        df.loc[latest_row_index, "feedback"] = feedback_msg[:1000]

        csv_buffer = io.StringIO()
        df.to_csv(csv_buffer, index=False, sep="|")
        aws.s3.put_object(
            Key=s3_csv_file, Bucket=aws.bucket_name, Body=csv_buffer.getvalue()
        )

        message = f"Your rating of {star_dict[rating]} submitted successfully!!\nThank you for the feedback!!"
        return message, st.success(message)
    except Exception as e:
        return f"Error: Could not update rating - {str(e)}", None
|
|
|
|
|
|
|
|
def increase_volume(audio_array, factor=10):
    """Scale an audio signal and clip it to the valid sample range.

    Parameters:
    - audio_array (numpy.ndarray): The audio waveform, dtype int16 or float32.
    - factor (float): The amplification factor (>1 increases volume,
      <1 decreases it).

    Returns:
    - numpy.ndarray: The amplified audio with the same dtype as the input,
      clipped to [-32767, 32767] for int16 or [-1.0, 1.0] for float32.

    Raises:
    - ValueError: If the dtype is neither int16 nor float32.
    """
    if audio_array.dtype == np.int16:
        max_val = np.iinfo(np.int16).max
        # Multiply in float64: an int16 product wraps around before the clip
        # can see it, turning loud samples into noise.
        samples = audio_array.astype(np.float64)
    elif audio_array.dtype == np.float32:
        max_val = 1.0
        samples = audio_array
    else:
        raise ValueError("Unsupported audio format. Use int16 or float32.")

    amplified_audio = np.clip(samples * factor, -max_val, max_val)

    return amplified_audio.astype(audio_array.dtype)
|
|
|
|
|
|
|
|
def handle_input_pronunc_pair(key, value, pronunc_dict):
    """Add a word -> pronunciation pair to ``pronunc_dict``.

    Args:
        key: Word to override; must be truthy.
        value: Replacement pronunciation; must be truthy.
        pronunc_dict: Dictionary updated in place.

    Returns:
        ``pronunc_dict`` after the update, or ``None`` when ``key`` or
        ``value`` is empty (a warning is shown and nothing is changed).
    """
    if key and value:
        pronunc_dict[key] = value
        # Was st.Success, which does not exist and raised AttributeError
        # right after the dictionary had been updated.
        st.success(f"Succesfully added {key} into the pronunciation dictionary")
        return pronunc_dict

    st.warning("Tried to set key value pair in pronunciation dict with empty value please check input")
    # NOTE(review): callers that assign the result back will get None here.
    return None
|
|
|
|
|
|
|
|
def v2_clone_voice(audio_path, user_id, token):
    """Upload a reference recording to the V2 voice-cloning endpoint.

    Args:
        audio_path: Path of the reference audio file on local disk.
        user_id: Sent as the ``user_id`` form field.
        token: Bearer token for the ``Authorization`` header.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the request fails or returns an error status; the
            original ``requests`` error is chained as the cause.
        OSError: If ``audio_path`` cannot be opened.
    """
    url = parameters.V2_VOICE_CLONE_URL
    data = {"user_id": user_id}
    headers = {"Authorization": f"Bearer {token}"}
    try:
        # The context manager closes the file; it used to be left open.
        with open(audio_path, "rb") as audio_file:
            response = requests.post(
                url, data=data, files={"audio": audio_file}, headers=headers
            )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        st.warning("Something went wrong Voice cloing. Please try later.")
        raise Exception(f"Voice cloning failed: {e}") from e
|
|
|
|
|
def v1_clone_voice(audio_path, user_id, token, lang_code):
    """Upload a reference recording to the V1 voice-cloning endpoint.

    Args:
        audio_path: Path of the reference audio file on local disk.
        user_id: Sent as the ``user_id`` form field.
        token: Bearer token for the ``Authorization`` header.
        lang_code: Sent in the ``Language`` request header.

    Returns:
        The decoded JSON body of the response.

    Raises:
        Exception: If the request fails or returns an error status; the
            original ``requests`` error is chained as the cause.
        OSError: If ``audio_path`` cannot be opened.
    """
    url = parameters.V1_VOICE_CLONE_URL
    data = {"user_id": user_id}
    headers = {"Authorization": f"Bearer {token}", "Language": lang_code}
    try:
        # The context manager closes the file; it used to be left open.
        with open(audio_path, "rb") as audio_file:
            response = requests.post(
                url, data=data, files={"audio": audio_file}, headers=headers
            )
        response.raise_for_status()
        return response.json()
    except requests.exceptions.RequestException as e:
        st.warning("Something went wrong Voice cloing. Please try later.")
        raise Exception(f"Voice cloning failed: {e}") from e
|
|
|
|
|
async def v1_generate_speech_async(
    session_id: str,
    voice_mode: str,
    voice_id: str,
    model: str,
    text: str,
    language_code: str,
    user_id: str,
    pronunciation_dict: Dict[str, str],
    speed: float = 1.0,
    expressive: float = 0.1,
    stability: int = 1,
    clarity: float = 1.0,
    volume_level: float = 1.0,
    speech_rate: float = 1.0,
    stitch_request: bool = False,
) -> Union[Tuple[int, np.ndarray], None]:
    """Generate speech with the V1 TTS service and log the attempt.

    The audio is streamed, wrapped in a 16 kHz WAV header, decoded with
    soundfile, uploaded to S3 and recorded through
    ``log_initial_submission``.

    Args:
        session_id: Session the request belongs to (used for storage and logs).
        voice_mode: ``"Default Speaker"`` sends the voice as a one-item
            list; any other mode sends ``voice_id`` unchanged.
        voice_id: Speaker name or cloned-voice id. For default speakers in
            ``en`` / ``hi`` / ``hing`` it is looked up in ``agents`` first.
        model: Only recorded in the log as ``model_name``; the request
            itself always uses ``parameters.model_v1``.
        text: Text to synthesise.
        language_code: Sent in the request body and ``Language`` header.
        user_id: Sent in the request body and logged.
        pronunciation_dict: Pronunciation overrides sent with the request.
        speed, expressive, stability, clarity, volume_level, stitch_request:
            Generation settings forwarded in the request body.
        speech_rate: Only recorded in the log (``speed`` is what is sent).

    Returns:
        ``(sample_rate, samples)`` on success, or ``None`` when anything
        fails. Failures are printed, written to the error log and shown
        as a Streamlit warning.
    """
    extra_body = {
        "language": [language_code],
        "user_id": user_id,
        "speed": speed,
        "expressive": expressive,
        "stability": stability,
        "clarity": clarity,
        "volume_level": volume_level,
        "stitch_request": stitch_request,
        "pronunciation_dict": pronunciation_dict,
    }
    extra_headers = {
        "Language": language_code,
    }

    if voice_mode == "Default Speaker":
        if language_code in ['en', 'hi', 'hing']:
            send_voice_id = [agents.get(voice_id, "Unkown")]
        else:
            send_voice_id = [voice_id]
    else:
        send_voice_id = voice_id

    # Fields shared by the success row and the error row.
    log_fields = dict(
        session_id=session_id,
        language=language_code,
        input_method=voice_mode,
        agent_used=voice_id,
        user_id=user_id,
        text_input=text,
        model_name=model,
        expressiveness=expressive,
        stability=stability,
        clarity=clarity,
        speech_rate=speech_rate,
        loudness=volume_level,
    )

    try:
        audio_chunks = []
        async with v1_client.audio.speech.with_streaming_response.create(
            model=parameters.model_v1,
            voice=send_voice_id,
            input=[text],
            extra_body=extra_body,
            extra_headers=extra_headers,
        ) as response:
            async for chunk in response.iter_bytes(chunk_size=1024):
                audio_chunks.append(chunk)
            status_code = response.status_code

        audio_data = b''.join(audio_chunks)
        # assumes the service streams raw 16-bit mono PCM at 16 kHz — TODO confirm
        header = audio_header_creater(audio_data, sample_rate=16_000)
        aud, sr = sf.read(io.BytesIO(header + audio_data))

        saved_path = save_generated_audio(aud, session_id)
        log_initial_submission(code=status_code, voice_path=saved_path, **log_fields)
        return sr, aud
    except Exception as e:
        print(f"Error:- {e}")
        # Failures surface here as exceptions (presumably including HTTP
        # errors raised by the client — confirm), so this is the only place
        # the error log can be fed. Any non-200 code selects the error CSV.
        log_initial_submission(
            code=-1,
            voice_path=None,
            err_code=getattr(e, "status_code", None),
            err_msg=str(e),
            **log_fields,
        )
        st.warning("Something went wrong in Audios Generation. Pleace try later.")
        return None
|
|
|
|
|
async def v2_generate_speech_async(
    session_id: str,
    voice_mode: str,
    voice_id: str,
    model: str,
    text: str,
    language_code: str,
    user_id: str,
    pronunciation_dict: Dict[str, str],
    speed: float = 1.0,
    expressive: float = 0.1,
    stability: int = 1,
    clarity: float = 1.0,
    volume_level: float = 1.0,
    speech_rate: float = 1.0,
    stitch_request: bool = False,
) -> Union[Tuple[int, np.ndarray], None]:
    """Generate speech with the V2 TTS service and log the attempt.

    The audio is streamed, wrapped in a 24 kHz WAV header, decoded with
    soundfile, uploaded to S3 and recorded through
    ``log_initial_submission``.

    Args:
        session_id: Session the request belongs to (used for storage and logs).
        voice_mode: ``"Default Speaker"`` sends ``[voice_id]``; any other
            mode sends ``voice_id`` unchanged.
        voice_id: Speaker name or cloned-voice id.
        model: Printed and recorded in the log as ``model_name``; the
            request itself always uses ``parameters.model_v2``.
        text: Text to synthesise.
        language_code: Sent in the request body and logged.
        user_id: Sent in the request body and logged.
        pronunciation_dict: Pronunciation overrides sent with the request.
        speed, expressive, stability, clarity, volume_level, stitch_request:
            Generation settings forwarded in the request body.
        speech_rate: Only recorded in the log (``speed`` is what is sent).

    Returns:
        ``(sample_rate, samples)`` on success, or ``None`` when anything
        fails. Failures are printed, written to the error log and shown
        as a Streamlit warning.
    """
    extra_body = {
        "language": [language_code],
        'user_id': user_id,
        "speed": speed,
        "expressive": expressive,
        "stability": stability,
        "clarity": clarity,
        "volume_level": volume_level,
        "stitch_request": stitch_request,
        "pronunciation_dict": pronunciation_dict,
    }

    if voice_mode == "Default Speaker":
        send_voice_id = [voice_id]
    else:
        send_voice_id = voice_id

    print(f"\n\nPayload:::---\nModel:-{model}\nSpeaker:-{send_voice_id}\nText:-{text}\nExtra Body: {extra_body}")

    # Fields shared by the success row and the error row.
    log_fields = dict(
        session_id=session_id,
        language=language_code,
        input_method=voice_mode,
        agent_used=voice_id,
        user_id=user_id,
        text_input=text,
        model_name=model,
        expressiveness=expressive,
        stability=stability,
        clarity=clarity,
        speech_rate=speech_rate,
        loudness=volume_level,
    )

    try:
        audio_chunks = []
        async with v2_client.audio.speech.with_streaming_response.create(
            model=parameters.model_v2,
            voice=send_voice_id,
            input=[text],
            extra_body=extra_body,
        ) as response:
            async for chunk in response.iter_bytes(chunk_size=1024):
                audio_chunks.append(chunk)
            status_code = response.status_code

        audio_data = b''.join(audio_chunks)
        # assumes the service streams raw 16-bit mono PCM at 24 kHz — TODO confirm
        header = audio_header_creater(audio_data, sample_rate=24_000)
        aud, sr = sf.read(io.BytesIO(header + audio_data))

        saved_path = save_generated_audio(aud, session_id)
        log_initial_submission(code=status_code, voice_path=saved_path, **log_fields)
        return sr, aud
    except Exception as e:
        print(f'Error:-{e}')
        # Failures surface here as exceptions (presumably including HTTP
        # errors raised by the client — confirm), so this is the only place
        # the error log can be fed. Any non-200 code selects the error CSV.
        log_initial_submission(
            code=-1,
            voice_path=None,
            err_code=getattr(e, "status_code", None),
            err_msg=str(e),
            **log_fields,
        )
        st.warning("Something went wrong in Audios Generation. Pleace try later.")
        return None