# myspeakapp / app.py
# Uploaded by shibly100 ("Upload 7 files", commit 499ba07) via Hugging Face Spaces.
import os
import time
import uuid

import google.generativeai as genai
import gradio as gr
import numpy as np
import soundfile as sf
# Configure the Gemini client from the environment rather than a hard-coded
# literal. SECURITY: a previous revision committed a real API key here — any
# key that ever appeared in source control should be treated as compromised
# and rotated. Failing fast with KeyError is deliberate: the app is unusable
# without credentials, so a clear startup error beats a confusing 401 later.
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])
# Prompt sent to Gemini for every evaluation. The two placeholders are filled
# by create_prompt(): {language} (target language name) and {word_phrase}
# (the reference text the learner attempted). The template also fixes the
# model's output format, which the UI displays verbatim.
PROMPT_TEMPLATE = """
You are a native speaker and expert linguist of the {language} language, specializing in pronunciation coaching. Your task is to analyze an audio recording of spoken {language}, compare it with the reference phrase, and provide a detailed pronunciation assessment.
Input:
1. An audio file of spoken {language}.
2. A word, phrase, or sentence to compare with the audio.
Your task:
- Detect the phrase in the audio.
- Compare pronunciation to the reference.
- Identify errors in vowel sounds, consonant articulation, stress, intonation, linking, and missing words.
- Provide recommendations for improvement.
- Rate the overall pronunciation on a scale from 0% to 100%.
If the audio does not contain the input phrase, say: "The audio does not contain the phrase."
Your Output Format:
Phrase (Input): {word_phrase}
Phrase (Detected): [Detected phrase from audio]
Comparison:
[Similarities/differences]
Problem Areas:
[List and describe pronunciation issues]
Recommendations for Improvement:
[Personalized guidance per issue]
Overall Pronunciation Rating:
[XX]%
"""
def upload_audio(audio):
    """Persist a Gradio audio tuple as a WAV file and upload it to Gemini.

    Parameters
    ----------
    audio : tuple[int, numpy.ndarray]
        ``(sample_rate, data)`` as produced by ``gr.Audio`` with numpy
        output; ``data`` is 1-D for mono or 2-D for multi-channel.

    Returns
    -------
    The file reference returned by ``genai.upload_file``, or an error
    string when the array has an unexpected number of dimensions.
    """
    sample_rate, data = audio
    data = np.asarray(data)
    if data.ndim == 2:
        # soundfile expects (frames, channels). Gradio already delivers
        # (samples, channels), so the unconditional transpose in the
        # original corrupted stereo clips; only flip when the array looks
        # channels-first (far fewer rows than columns).
        if data.shape[0] < data.shape[1]:
            data = data.T
    elif data.ndim != 1:
        return "Unexpected audio data format"
    # sf.write raises if the target directory does not exist — create it.
    os.makedirs("media", exist_ok=True)
    filename = f"media/{uuid.uuid4()}.wav"
    sf.write(filename, data, sample_rate)
    return genai.upload_file(path=filename)
def create_prompt(language, word_phrase):
    """Fill PROMPT_TEMPLATE with the target language and reference phrase."""
    fields = {"language": language, "word_phrase": word_phrase}
    return PROMPT_TEMPLATE.format(**fields)
def evaluate_audio_pronunciation(audio_file_id, prompt, model="gemini-2.0-flash"):
    """Send the assessment prompt plus uploaded audio to a Gemini model.

    Returns a ``(response_text, prompt_tokens, total_tokens)`` triple taken
    from the response and its usage metadata.
    """
    generative_model = genai.GenerativeModel(model)
    result = generative_model.generate_content(contents=[prompt, audio_file_id])
    usage = result.usage_metadata
    return result.text, usage.prompt_token_count, usage.total_token_count
def orchestrate(audio, language, word_phrase, model):
    """End-to-end pipeline: upload the clip, build the prompt, query Gemini.

    Returns the tuple the Gradio interface displays:
    (response text, elapsed-time string, input tokens, total tokens, model).
    """
    started = time.time()
    file_ref = upload_audio(audio)
    assessment_prompt = create_prompt(language, word_phrase)
    response, input_tokens, total_tokens = evaluate_audio_pronunciation(
        file_ref, assessment_prompt, model
    )
    elapsed = time.time() - started
    return response, f"{elapsed:.2f} seconds", input_tokens, total_tokens, model
# Top-level Gradio container; the tabbed interface is attached to it below.
ui_blocks = gr.Blocks()
# Audio input accepting either a microphone recording or an uploaded file;
# the waveform options only affect how the clip is rendered in the UI.
input_audio = gr.Audio(
    sources=["microphone", "upload"],
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)
# Main form: audio + language + phrase + model choice in, the five values
# returned by orchestrate() out (response, timing, token counts, model).
get_prompt_ui_block = gr.Interface(
    fn=orchestrate,
    inputs=[
        input_audio,
        gr.Textbox(label="Language (e.g., Arabic, Spanish, French, Japanese)", lines=1),
        gr.Textbox(label="Word or Phrase to Compare", lines=1),
        gr.Radio(
            ["gemini-1.5-flash-8b", "gemini-2.0-flash", "gemini-2.0-flash-lite-preview-02-05", "gemini-1.5-flash"],
            info="Choose Gemini Model",
        ),
    ],
    outputs=[
        gr.Textbox(label="Response"),
        gr.Textbox(label="Evaluation Time"),
        gr.Textbox(label="Input Tokens"),
        gr.Textbox(label="Total Tokens"),
        gr.Textbox(label="Model Used"),
    ],
    # Disable Gradio's built-in flagging button for this interface.
    allow_flagging="never"
)
# Mount the interface inside the Blocks container as a single named tab.
with ui_blocks:
    gr.TabbedInterface(
        [get_prompt_ui_block],
        ["Multilingual Pronunciation Evaluation"]
    )
# Launch the web app only when run as a script (not when imported).
if __name__ == "__main__":
    ui_blocks.launch()