import google.generativeai as genai import gradio as gr import numpy as np import soundfile as sf import time import uuid # ✅ API key inserted directly (only do this in dev or trusted environments) genai.configure(api_key="AIzaSyBas_7s1hD9cfAJuRHn-K4vrYZbqE-eXEE") PROMPT_TEMPLATE = """ You are a native speaker and expert linguist of the {language} language, specializing in pronunciation coaching. Your task is to analyze an audio recording of spoken {language}, compare it with the reference phrase, and provide a detailed pronunciation assessment. Input: 1. An audio file of spoken {language}. 2. A word, phrase, or sentence to compare with the audio. Your task: - Detect the phrase in the audio. - Compare pronunciation to the reference. - Identify errors in vowel sounds, consonant articulation, stress, intonation, linking, and missing words. - Provide recommendations for improvement. - Rate the overall pronunciation on a scale from 0% to 100%. If the audio does not contain the input phrase, say: "The audio does not contain the phrase." Your Output Format: Phrase (Input): {word_phrase} Phrase (Detected): [Detected phrase from audio] Comparison: [Similarities/differences] Problem Areas: [List and describe pronunciation issues] Recommendations for Improvement: [Personalized guidance per issue] Overall Pronunciation Rating: [XX]% """ def upload_audio(audio): sample_rate, data = audio data = np.array(data) guid_string = str(uuid.uuid4()) filename = f"media/{guid_string}.wav" if data.ndim == 2: data = data.T elif data.ndim != 1: return "Unexpected audio data format" sf.write(filename, data, sample_rate) ref = genai.upload_file(path=filename) return ref def create_prompt(language, word_phrase): return PROMPT_TEMPLATE.format(language=language, word_phrase=word_phrase) def evaluate_audio_pronunciation(audio_file_id, prompt, model="gemini-2.0-flash"): prompt = [prompt, audio_file_id] model = genai.GenerativeModel(model) response = model.generate_content(contents=prompt) total_token_count = response.usage_metadata.total_token_count return response.text, response.usage_metadata.prompt_token_count, total_token_count def orchestrate(audio, language, word_phrase, model): start_time = time.time() audio_file_id = upload_audio(audio) prompt = create_prompt(language, word_phrase) response, input_tokens, total_tokens = evaluate_audio_pronunciation( audio_file_id, prompt, model ) end_time = time.time() return response, f"{end_time - start_time:.2f} seconds", input_tokens, total_tokens, model ui_blocks = gr.Blocks() input_audio = gr.Audio( sources=["microphone", "upload"], waveform_options=gr.WaveformOptions( waveform_color="#01C6FF", waveform_progress_color="#0066B4", skip_length=2, show_controls=False, ), ) get_prompt_ui_block = gr.Interface( fn=orchestrate, inputs=[ input_audio, gr.Textbox(label="Language (e.g., Arabic, Spanish, French, Japanese)", lines=1), gr.Textbox(label="Word or Phrase to Compare", lines=1), gr.Radio( ["gemini-1.5-flash-8b", "gemini-2.0-flash", "gemini-2.0-flash-lite-preview-02-05", "gemini-1.5-flash"], info="Choose Gemini Model", ), ], outputs=[ gr.Textbox(label="Response"), gr.Textbox(label="Evaluation Time"), gr.Textbox(label="Input Tokens"), gr.Textbox(label="Total Tokens"), gr.Textbox(label="Model Used"), ], allow_flagging="never" ) with ui_blocks: gr.TabbedInterface( [get_prompt_ui_block], ["Multilingual Pronunciation Evaluation"] ) if __name__ == "__main__": ui_blocks.launch()