# ChatLingo / app.py  (commit a144be7)
# Speech-to-text and text-to-speech chat app for practicing a foreign
# language with an AI persona.
from openai import OpenAI
from dotenv import load_dotenv
import MAIAI
# from deep_translator import GoogleTranslator
# import speech_recognition as sr
# import assemblyai as aai
# import pyttsx3
from gtts import gTTS
import gradio as gr
from gradio.themes.base import Base
# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()
# Maps human-readable language names (shown in the UI dropdowns) to the
# IETF/Google language tags expected by gTTS and translation backends.
# Several keys are deliberate aliases for the same tag (e.g. "中文",
# "简体中文", "Chinese", "Chinese (Simplified)" all map to "zh-CN") so
# free-typed dropdown values still resolve.
language_map = {
    "Afrikaans": "af",
    "Albanian": "sq",
    "Amharic": "am",
    "Arabic": "ar",
    "Armenian": "hy",
    "Azerbaijani": "az",
    "Basque": "eu",
    "Bengali": "bn",
    "Bosnian": "bs",
    "Bulgarian": "bg",
    "Catalan": "ca",
    "Cebuano": "ceb",
    "中文": "zh-CN",
    "繁体中文": "zh-TW",
    "简体中文": "zh-CN",
    "Chinese": "zh-CN",
    "Chinese (Simplified)": "zh-CN",
    "Chinese (Traditional)": "zh-TW",
    "Corsican": "co",
    "Croatian": "hr",
    "Czech": "cs",
    "Danish": "da",
    "Dutch": "nl",
    "English": "en",
    "Esperanto": "eo",
    "Estonian": "et",
    "Filipino": "fil",
    "Finnish": "fi",
    "French": "fr",
    "Frisian": "fy",
    "Galician": "gl",
    "Georgian": "ka",
    "German": "de",
    "Greek": "el",
    "Gujarati": "gu",
    "Haitian Creole": "ht",
    "Hausa": "ha",
    "Hawaiian": "haw",
    "Hebrew": "he",
    "Hindi": "hi",
    "Hmong": "hmn",
    "Hungarian": "hu",
    "Icelandic": "is",
    "Igbo": "ig",
    "Indonesian": "id",
    "Irish": "ga",
    "Italian": "it",
    "Japanese": "ja",
    "Javanese": "jv",
    "Kannada": "kn",
    "Kazakh": "kk",
    "Khmer": "km",
    "Kinyarwanda": "rw",
    "Korean": "ko",
    "Kurdish": "ku",
    "Kyrgyz": "ky",
    "Lao": "lo",
    "Latin": "la",
    "Latvian": "lv",
    "Lithuanian": "lt",
    "Luxembourgish": "lb",
    "Macedonian": "mk",
    "Malagasy": "mg",
    "Malay": "ms",
    "Malayalam": "ml",
    "Maltese": "mt",
    "Maori": "mi",
    "Marathi": "mr",
    "Mongolian": "mn",
    "Myanmar (Burmese)": "my",
    "Nepali": "ne",
    "Norwegian": "no",
    "Nyanja (Chichewa)": "ny",
    "Odia (Oriya)": "or",
    "Pashto": "ps",
    "Persian": "fa",
    "Polish": "pl",
    "Portuguese": "pt",
    "Punjabi": "pa",
    "Romanian": "ro",
    "Russian": "ru",
    "Samoan": "sm",
    "Scots Gaelic": "gd",
    "Serbian": "sr",
    "Sesotho": "st",
    "Shona": "sn",
    "Sindhi": "sd",
    "Sinhala (Sinhalese)": "si",
    "Slovak": "sk",
    "Slovenian": "sl",
    "Somali": "so",
    "Spanish": "es",
    "Sundanese": "su",
    "Swahili": "sw",
    "Swedish": "sv",
    "Tajik": "tg",
    "Tamil": "ta",
    "Tatar": "tt",
    "Telugu": "te",
    "Thai": "th",
    "Turkish": "tr",
    "Turkmen": "tk",
    "Ukrainian": "uk",
    "Urdu": "ur",
    "Uyghur": "ug",
    "Uzbek": "uz",
    "Vietnamese": "vi",
    "Welsh": "cy",
    "Xhosa": "xh",
    "Yiddish": "yi",
    "Yoruba": "yo",
    "Zulu": "zu"
}
# def pytts(input_text):
# if input_text:
# engine = pyttsx3.init()
# # # Get available voices and print them out
# # voices = engine.getProperty('voices')
# # for index, voice in enumerate(voices):
# # print(f"Voice {index}: {voice.id} - {voice.languages} - {voice.gender} - {voice.name}")
# # # Set voice (change index based on what is available on your system)
# # engine.setProperty('voice', voices[1].id) # Change the index to switch voices
# # # Set speech rate
# # rate = engine.getProperty('rate')
# # engine.setProperty('rate', rate - 50) # Decrease rate; increase to make it faster
# # # Set volume
# # volume = engine.getProperty('volume')
# # engine.setProperty('volume', volume + 0.25) # Increase volume; decrease to lower the volume
# # Speak text
# engine.say(input_text)
# engine.runAndWait()
def gtts(input_text, language='English'):
    """Synthesize speech for *input_text* with Google TTS (gTTS).

    Parameters
    ----------
    input_text : str
        Text to speak. If empty/falsy nothing is synthesized and ``None``
        is returned (the Gradio Audio output then simply stays empty).
    language : str
        Human-readable language name, mapped to an IETF tag via
        ``language_map``; unknown names fall back to English ('en').

    Returns
    -------
    str | None
        Path to the generated MP3 file, or ``None`` when there is no text.
    """
    import tempfile

    if not input_text:
        return None
    # Map the user-friendly language name to the IETF tag gTTS expects.
    lang = language_map.get(language, 'en')
    tts = gTTS(text=input_text, lang=lang, slow=False)
    # Write to a unique temp file instead of a shared hard-coded
    # "output.mp3": with concurrent Gradio sessions the fixed name let one
    # user's audio clobber (or be served to) another's. delete=False keeps
    # the file alive for Gradio to stream back.
    with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as f:
        audio_file = f.name
    tts.save(audio_file)
    return audio_file
# def assembly_speech_to_text(audio_file_path):
#     aai.settings.api_key = "<ASSEMBLYAI_API_KEY>"  # NOTE(review): a real key was committed here; revoke it and load from the environment instead
# transcriber = aai.Transcriber()
# transcript = transcriber.transcribe(audio_file_path)
# return transcript.text
# def google_speech_to_text(audio_file_path):
# if audio_file_path:
# recognizer = sr.Recognizer()
# with sr.AudioFile(audio_file_path) as source:
# audio_data = recognizer.record(source)
# try:
# text = recognizer.recognize_google(audio_data)
# return text
# except sr.UnknownValueError:
# return "Google Speech Recognition could not understand audio"
# except sr.RequestError as e:
# return f"Could not request results from Google Speech Recognition service; {e}"
def openai_speech_to_text(audio_file_path):
    """Transcribe an audio file to text with OpenAI Whisper.

    Parameters
    ----------
    audio_file_path : str | None
        Path to an audio file (e.g. from the Gradio microphone widget).
        If falsy, nothing is transcribed and ``None`` is returned.

    Returns
    -------
    str | None
        The transcript (``response_format="text"`` makes the API return a
        plain string), or ``None`` when no path was given.
    """
    if not audio_file_path:
        return None
    # OpenAI() reads OPENAI_API_KEY from the environment (see load_dotenv()).
    client = OpenAI()
    # Context manager closes the handle even if the API call raises;
    # the original opened the file and never closed it.
    with open(audio_file_path, "rb") as audio_file:
        transcription = client.audio.transcriptions.create(
            model="whisper-1",
            response_format="text",
            file=audio_file,
        )
    return transcription
def chat(text, history, native_language, language, persona, tone = "Casual", model = "gpt-4o-mini"):
    """One chat turn: grade the learner's text, reply in character, translate.

    Three MAIAI agents run per turn: a *teacher* that gives feedback in the
    learner's native language, a *responder* that answers in-character in the
    target language, and a *translator* that renders that answer back into
    the native language. (MAIAI's internals are not visible here — presumably
    each Task.execute() performs one LLM call; confirm against the MAIAI
    package.)

    Parameters:
        text: the learner's message (possibly a speech transcript).
        history: Gradio chatbot history, a list of (user, bot) tuples;
            mutated in place by appending this turn.
        native_language / language: human-readable names, e.g. "English".
        persona: character the responder role-plays.
        tone: "Casual" enables a slang-friendly prompt addendum.
        model: model name forwarded to MAIAI.Agent.

    Returns:
        ("", history, response) — empty string clears the input textbox,
        history updates the chatbot, and the raw target-language response
        feeds the hidden textbox used by the read-aloud button.
    """
    # Debug trace of the active settings.
    print(tone, native_language, language, persona)
    # if audio != None:
    #     text = speechtotext(audio)
    # Extra prompt clause injected into both teacher and responder prompts
    # when the casual tone is selected.
    casual = "This is in a casual, internet texting context, use of local slangs is encouraged." if tone == "Casual" else ""
    teacher = MAIAI.Agent(model=model, temperature=0.5, role=f"You are a {language} teacher teaching {native_language} speaking student.")
    responder = MAIAI.Agent(model=model, temperature=0.5, role=f"""You are {language} speaking {persona}. Respond to the user's text in {language}. Refer to Chat History for context. Keep the conversation going. {casual}""")
    translator = MAIAI.Agent(model=model, temperature=0.5, role=f"You are a language translator")
    # Grade/correct the learner's text; feedback is written in the
    # learner's native language.
    feedback_task = MAIAI.Task(
        agent=teacher,
        goal=f"""Text: {text}
Point out and translate any non-{language} from the text into {language}.
Correct any linguistic error in the text and give example driven feedback on how to improve the text.
You MUST give your feedback in {native_language}.
{casual}
"""
    )
    # In-character reply in the target language, conditioned on history.
    respond_task = MAIAI.Task(
        agent=responder,
        goal=f"""{text}
Respond to the text above in {language}.
Refer to Chat History for context.
Chat History: {history}"""
    )
    feedback = feedback_task.execute()
    response = respond_task.execute()
    # Translate the reply back so the learner can check their understanding.
    translate_task = MAIAI.Task(
        agent=translator,
        goal=f"translate {response} from {language} to {native_language}"
    )
    translation = translate_task.execute()
    # # Get IETF tags for target and native languages
    # native_lang = language_map.get(native_language, 'en')
    # # Translate the response to the target language
    # translated_response = GoogleTranslator(source='auto', target=native_lang).translate(response)
    # Markdown-formatted bot bubble: feedback block, then the persona's
    # reply with its translation in parentheses.
    output = f"""
***Feedback:***
{feedback}
-----------
***{persona}:***
{response}
({translation})
"""
    history.append((text, output))
    return "", history, response
# Sample Function Call ------------------------------------
# feedback,response = chat("Soy jugando Demonslayer! Y tu?", "English","Spanish","friendly lady",,casual_tone="Casual")
# print(f"""
# Feedback: {feedback}
# Reply: {response}
# """)
# Gradio Custom Chatbot -------------------------------------------------------
# Gradio Custom Chatbot -------------------------------------------------------
with gr.Blocks(fill_height=True, theme=Base()) as demo:
    chatbot = gr.Chatbot(
        elem_id="chatbot",
        bubble_full_width=False,
        scale=1,
    )
    with gr.Row():
        # Microphone/upload input; its transcript is written into chat_input
        # (wired below) so the user can review or edit before submitting.
        audio_input = gr.Audio(sources="microphone", type="filepath",
                               label="Speak or upload audio", scale=2)
        chat_input = gr.Textbox(interactive=True, scale=6)
        submit_button = gr.Button("Submit", scale=1)
    with gr.Row():
        # Hidden holder for the assistant's raw reply so it can be voiced.
        response = gr.Textbox(visible=False, label="Read out Chat Response")
        output_audio = gr.Audio(label="Reply Audio", type="filepath", scale=9)
        read_out_loud = gr.Button("Read Reply", scale=1)
    with gr.Accordion(label="Settings"):
        native_language = gr.components.Dropdown(choices=["English", "中文", "Spanish"], value="English", allow_custom_value=True, label="I speak")
        language = gr.components.Dropdown(choices=["English", "中文", "Spanish"], value="English", allow_custom_value=True, label="I want to learn")
        persona = gr.components.Textbox(value="LinguAI Chatbot", label="I want to talk to")
        tone = gr.components.Dropdown(choices=["Casual", "Formal"], value="Casual", label="Tone")
    # Explicit event wiring replaces the original reactive construction
    # (gr.Textbox(value=openai_speech_to_text, inputs=[gr.Audio(...)])),
    # which also invoked the transcriber once at app load with a None path.
    # Behavior is the same: new audio -> transcript appears in the textbox.
    audio_input.change(openai_speech_to_text, audio_input, chat_input)
    chat_input.submit(chat, [chat_input, chatbot, native_language, language, persona, tone], [chat_input, chatbot, response])
    submit_button.click(chat, [chat_input, chatbot, native_language, language, persona, tone], [chat_input, chatbot, response])
    read_out_loud.click(gtts, [response, language], output_audio)
demo.launch()