from openai import OpenAI
from dotenv import load_dotenv
import MAIAI
from gtts import gTTS
import gradio as gr
from gradio.themes.base import Base

# Load environment variables (e.g. OPENAI_API_KEY) from a local .env file.
load_dotenv()

# Map user-facing language names (as shown in the UI dropdowns) to the
# IETF language tags understood by gTTS. Several Chinese aliases map to
# the same tag on purpose so both English and native-script labels work.
language_map = {
    "Afrikaans": "af", "Albanian": "sq", "Amharic": "am", "Arabic": "ar",
    "Armenian": "hy", "Azerbaijani": "az", "Basque": "eu", "Bengali": "bn",
    "Bosnian": "bs", "Bulgarian": "bg", "Catalan": "ca", "Cebuano": "ceb",
    "中文": "zh-CN", "繁体中文": "zh-TW", "简体中文": "zh-CN",
    "Chinese": "zh-CN", "Chinese (Simplified)": "zh-CN",
    "Chinese (Traditional)": "zh-TW",
    "Corsican": "co", "Croatian": "hr", "Czech": "cs", "Danish": "da",
    "Dutch": "nl", "English": "en", "Esperanto": "eo", "Estonian": "et",
    "Filipino": "fil", "Finnish": "fi", "French": "fr", "Frisian": "fy",
    "Galician": "gl", "Georgian": "ka", "German": "de", "Greek": "el",
    "Gujarati": "gu", "Haitian Creole": "ht", "Hausa": "ha",
    "Hawaiian": "haw", "Hebrew": "he", "Hindi": "hi", "Hmong": "hmn",
    "Hungarian": "hu", "Icelandic": "is", "Igbo": "ig", "Indonesian": "id",
    "Irish": "ga", "Italian": "it", "Japanese": "ja", "Javanese": "jv",
    "Kannada": "kn", "Kazakh": "kk", "Khmer": "km", "Kinyarwanda": "rw",
    "Korean": "ko", "Kurdish": "ku", "Kyrgyz": "ky", "Lao": "lo",
    "Latin": "la", "Latvian": "lv", "Lithuanian": "lt",
    "Luxembourgish": "lb", "Macedonian": "mk", "Malagasy": "mg",
    "Malay": "ms", "Malayalam": "ml", "Maltese": "mt", "Maori": "mi",
    "Marathi": "mr", "Mongolian": "mn", "Myanmar (Burmese)": "my",
    "Nepali": "ne", "Norwegian": "no", "Nyanja (Chichewa)": "ny",
    "Odia (Oriya)": "or", "Pashto": "ps", "Persian": "fa", "Polish": "pl",
    "Portuguese": "pt", "Punjabi": "pa", "Romanian": "ro", "Russian": "ru",
    "Samoan": "sm", "Scots Gaelic": "gd", "Serbian": "sr", "Sesotho": "st",
    "Shona": "sn", "Sindhi": "sd", "Sinhala (Sinhalese)": "si",
    "Slovak": "sk", "Slovenian": "sl", "Somali": "so", "Spanish": "es",
    "Sundanese": "su", "Swahili": "sw", "Swedish": "sv", "Tajik": "tg",
    "Tamil": "ta", "Tatar": "tt", "Telugu": "te", "Thai": "th",
    "Turkish": "tr", "Turkmen": "tk", "Ukrainian": "uk", "Urdu": "ur",
    "Uyghur": "ug", "Uzbek": "uz", "Vietnamese": "vi", "Welsh": "cy",
    "Xhosa": "xh", "Yiddish": "yi", "Yoruba": "yo", "Zulu": "zu",
}


def gtts(input_text, language='English'):
    """Synthesize ``input_text`` to speech with Google TTS.

    Parameters
    ----------
    input_text : str
        The text to read aloud. Falsy input produces no audio.
    language : str
        A user-facing language name; looked up in ``language_map``
        (falls back to English for unknown names).

    Returns
    -------
    str | None
        Path to the saved MP3 file, or None when ``input_text`` is empty.
    """
    if input_text:
        # Map the user-friendly language name to the IETF tag;
        # default to 'en' if the language is not found.
        lang = language_map.get(language, 'en')
        tts = gTTS(text=input_text, lang=lang, slow=False)
        audio_file = "output.mp3"
        tts.save(audio_file)
        return audio_file
    # Explicit for clarity: empty input yields no audio file.
    return None
def openai_speech_to_text(audio_file_path):
    """Transcribe an audio file using OpenAI's Whisper API.

    Parameters
    ----------
    audio_file_path : str
        Path to the recorded audio file. Falsy input is a no-op.

    Returns
    -------
    str | None
        The transcript text (``response_format="text"``), or None when
        no path is supplied.
    """
    if audio_file_path:
        client = OpenAI()
        # Use a context manager so the file handle is always closed
        # (the original left the file open after the upload).
        with open(audio_file_path, "rb") as audio_file:
            transcription = client.audio.transcriptions.create(
                model="whisper-1",
                response_format="text",
                file=audio_file,
            )
        return transcription


def chat(text, history, native_language, language, persona, tone="Casual", model="gpt-4o-mini"):
    """Run one tutoring turn: feedback, in-language reply, and translation.

    Three MAIAI agents cooperate: a *teacher* critiques the learner's text
    in their native language, a *responder* (role-played as ``persona``)
    continues the conversation in the target language, and a *translator*
    renders that reply back into the native language.

    Parameters
    ----------
    text : str
        The learner's message in the target language.
    history : list
        Gradio chat history as (user, bot) tuples; mutated in place.
    native_language, language : str
        The learner's native language and the language being learned.
    persona : str
        Character the responder agent plays.
    tone : str
        "Casual" enables slang-friendly prompting; anything else is formal.
    model : str
        Chat model name passed to each MAIAI agent.

    Returns
    -------
    tuple
        ("", updated history, raw target-language response) — the empty
        string clears the Gradio input textbox; the raw response feeds
        the read-aloud button.
    """
    print(tone, native_language, language, persona)

    # Extra prompt clause injected only for the casual tone.
    casual = "This is in a casual, internet texting context, use of local slangs is encouraged." if tone == "Casual" else ""

    teacher = MAIAI.Agent(
        model=model,
        temperature=0.5,
        role=f"You are a {language} teacher teaching {native_language} speaking student.",
    )
    responder = MAIAI.Agent(
        model=model,
        temperature=0.5,
        role=f"""You are {language} speaking {persona}. Respond to the user's text in {language}. Refer to Chat History for context. Keep the conversation going. {casual}""",
    )
    translator = MAIAI.Agent(
        model=model,
        temperature=0.5,
        role=f"You are a language translator",
    )

    feedback_task = MAIAI.Task(
        agent=teacher,
        goal=f"""Text: {text}
Point out and translate any non-{language} from the text into {language}. Correct any linguistic error in the text and give example driven feedback on how to improve the text. You MUST give your feedback in {native_language}. {casual} """,
    )
    respond_task = MAIAI.Task(
        agent=responder,
        goal=f"""{text}
Respond to the text above in {language}. Refer to Chat History for context.
Chat History: {history}""",
    )

    feedback = feedback_task.execute()
    response = respond_task.execute()

    # Translation depends on the response, so it is built after execution.
    translate_task = MAIAI.Task(
        agent=translator,
        goal=f"translate {response} from {language} to {native_language}",
    )
    translation = translate_task.execute()

    output = f"""
***Feedback:*** {feedback}
-----------
***{persona}:*** {response} ({translation})
"""
    history.append((text, output))
    return "", history, response


# Gradio Custom Chatbot -------------------------------------------------------
with gr.Blocks(fill_height=True, theme=Base()) as demo:
    chatbot = gr.Chatbot(
        elem_id="chatbot",
        bubble_full_width=False,
        scale=1,
    )

    with gr.Row():
        # Textbox whose value is recomputed by transcribing the microphone
        # recording whenever the Audio input changes (Gradio's
        # value-as-function pattern with an `inputs` dependency).
        chat_input = gr.Textbox(
            interactive=True,
            scale=8,
            inputs=[gr.components.Audio(sources="microphone", type="filepath", label="Speak or upload audio")],
            value=openai_speech_to_text,
        )
        submit_button = gr.Button("Submit", scale=1)

    with gr.Row():
        # Hidden textbox holding the raw target-language reply for TTS.
        response = gr.Textbox(visible=False, label="Read out Chat Response")
        output_audio = gr.Audio(label="Reply Audio", type="filepath", scale=9)
        read_out_loud = gr.Button("Read Reply", scale=1)

    with gr.Accordion(label="Settings"):
        native_language = gr.components.Dropdown(choices=["English", "中文", "Spanish"], value="English", allow_custom_value=True, label="I speak")
        language = gr.components.Dropdown(choices=["English", "中文", "Spanish"], value="English", allow_custom_value=True, label="I want to learn")
        persona = gr.components.Textbox(value="LinguAI Chatbot", label="I want to talk to")
        tone = gr.components.Dropdown(choices=["Casual", "Formal"], value="Casual", label="Tone")

    # Enter key and Submit button both trigger a chat turn.
    chat_input.submit(chat, [chat_input, chatbot, native_language, language, persona, tone], [chat_input, chatbot, response])
    submit_button.click(chat, [chat_input, chatbot, native_language, language, persona, tone], [chat_input, chatbot, response])
    # Read the latest reply aloud in the target language.
    read_out_loud.click(gtts, [response, language], output_audio)

demo.launch()