Spaces:
Sleeping
Sleeping
File size: 3,785 Bytes
499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 e27f614 499ba07 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 | import google.generativeai as genai
import os
import time
import uuid

import gradio as gr
import numpy as np
import soundfile as sf
# The Gemini API key is read from the GOOGLE_API_KEY environment variable so
# that no credential is stored in the source file. Provide it as a deployment
# secret / environment variable before starting the app.
# NOTE: any key that has ever been committed in plain text should be treated
# as compromised and rotated.
genai.configure(api_key=os.environ.get("GOOGLE_API_KEY"))
# Instruction text sent to the model together with the uploaded audio.
# It is filled in with str.format, so it has exactly two placeholders:
#   {language}    - the language being spoken / assessed
#   {word_phrase} - the reference word, phrase, or sentence
# Any literal brace added to this text must be doubled ("{{" / "}}").
PROMPT_TEMPLATE = """
You are a native speaker and expert linguist of the {language} language, specializing in pronunciation coaching. Your task is to analyze an audio recording of spoken {language}, compare it with the reference phrase, and provide a detailed pronunciation assessment.
Input:
1. An audio file of spoken {language}.
2. A word, phrase, or sentence to compare with the audio.
Your task:
- Detect the phrase in the audio.
- Compare pronunciation to the reference.
- Identify errors in vowel sounds, consonant articulation, stress, intonation, linking, and missing words.
- Provide recommendations for improvement.
- Rate the overall pronunciation on a scale from 0% to 100%.
If the audio does not contain the input phrase, say: "The audio does not contain the phrase."
Your Output Format:
Phrase (Input): {word_phrase}
Phrase (Detected): [Detected phrase from audio]
Comparison:
[Similarities/differences]
Problem Areas:
[List and describe pronunciation issues]
Recommendations for Improvement:
[Personalized guidance per issue]
Overall Pronunciation Rating:
[XX]%
"""
def upload_audio(audio):
    """Write an audio payload to a WAV file under ``media/`` and upload it.

    Args:
        audio: ``(sample_rate, data)`` pair. ``data`` is converted with
            ``np.array`` and must be 1-D (mono) or 2-D (multi-channel).

    Returns:
        The file reference returned by ``genai.upload_file``.

    Raises:
        ValueError: If ``audio`` is ``None`` or the sample array is neither
            1-D nor 2-D. (Previously the 3-D+ case returned an error string,
            which the caller would then forward to the model as content.)
    """
    if audio is None:
        raise ValueError("No audio was provided")

    sample_rate, data = audio
    data = np.array(data)

    if data.ndim == 2:
        # soundfile.write expects (frames, channels). Transpose only when the
        # array is channels-first (fewer rows than columns); an unconditional
        # transpose would turn a (frames, channels) array into a file with
        # thousands of "channels".
        # NOTE(review): gr.Audio numpy payloads appear to be
        # (samples, channels) - confirm against the Gradio version in use.
        if data.shape[0] < data.shape[1]:
            data = data.T
    elif data.ndim != 1:
        raise ValueError("Unexpected audio data format")

    # The output directory is not guaranteed to exist on a fresh checkout.
    os.makedirs("media", exist_ok=True)
    filename = f"media/{uuid.uuid4()}.wav"
    sf.write(filename, data, sample_rate)
    return genai.upload_file(path=filename)
def create_prompt(language, word_phrase):
    """Return the coaching prompt for one language / reference-phrase pair.

    Args:
        language: Substituted for every ``{language}`` placeholder.
        word_phrase: Substituted for the ``{word_phrase}`` placeholder.

    Returns:
        ``PROMPT_TEMPLATE`` with both placeholders filled in.
    """
    substitutions = {"language": language, "word_phrase": word_phrase}
    return PROMPT_TEMPLATE.format(**substitutions)
def evaluate_audio_pronunciation(audio_file_id, prompt, model="gemini-2.0-flash"):
    """Ask a Gemini model to assess the uploaded audio against the prompt.

    Args:
        audio_file_id: Uploaded-file reference, sent after the prompt text.
        prompt: Instruction text, sent as the first content part.
        model: Model name passed to ``genai.GenerativeModel``.

    Returns:
        A 3-tuple ``(response_text, prompt_token_count, total_token_count)``
        taken from the response and its ``usage_metadata``.
    """
    contents = [prompt, audio_file_id]
    gemini = genai.GenerativeModel(model)
    reply = gemini.generate_content(contents=contents)
    # Token totals are read before the text, mirroring the original access order.
    total_tokens = reply.usage_metadata.total_token_count
    prompt_tokens_of = reply.usage_metadata
    return reply.text, prompt_tokens_of.prompt_token_count, total_tokens
def orchestrate(audio, language, word_phrase, model):
    """Run one full evaluation: upload audio, build prompt, query the model.

    Args:
        audio: Audio payload forwarded to ``upload_audio``.
        language: Language name used to fill the prompt.
        word_phrase: Reference word or phrase used to fill the prompt.
        model: Model name. A falsy value (for example ``None`` from an
            unselected radio) falls back to ``"gemini-2.0-flash"``.

    Returns:
        A 5-tuple ``(response_text, elapsed, input_tokens, total_tokens,
        model)`` where ``elapsed`` is formatted as ``"<seconds> seconds"``
        with two decimals.
    """
    # Passing None explicitly would bypass the callee's default argument,
    # so resolve the default here; this also keeps the reported model accurate.
    if not model:
        model = "gemini-2.0-flash"

    # perf_counter is monotonic, unlike time.time, so the measured duration
    # cannot be skewed by wall-clock adjustments.
    started = time.perf_counter()
    audio_file_id = upload_audio(audio)
    prompt = create_prompt(language, word_phrase)
    response, input_tokens, total_tokens = evaluate_audio_pronunciation(
        audio_file_id, prompt, model
    )
    elapsed = time.perf_counter() - started
    return response, f"{elapsed:.2f} seconds", input_tokens, total_tokens, model
# --- Gradio UI -------------------------------------------------------------
# Outer container; the tabbed interface is rendered into it below.
ui_blocks = gr.Blocks()

# Audio input accepting either a microphone recording or an uploaded file.
input_audio = gr.Audio(
    sources=["microphone", "upload"],
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)

# Form wired to orchestrate(): the four inputs map positionally onto
# (audio, language, word_phrase, model) and the five outputs onto the
# 5-tuple it returns.
get_prompt_ui_block = gr.Interface(
    fn=orchestrate,
    inputs=[
        input_audio,
        gr.Textbox(label="Language (e.g., Arabic, Spanish, French, Japanese)", lines=1),
        gr.Textbox(label="Word or Phrase to Compare", lines=1),
        gr.Radio(
            ["gemini-1.5-flash-8b", "gemini-2.0-flash", "gemini-2.0-flash-lite-preview-02-05", "gemini-1.5-flash"],
            # Pre-select a model so a submit without touching the radio does
            # not send None as the model name.
            value="gemini-2.0-flash",
            info="Choose Gemini Model",
        ),
    ],
    outputs=[
        gr.Textbox(label="Response"),
        gr.Textbox(label="Evaluation Time"),
        gr.Textbox(label="Input Tokens"),
        gr.Textbox(label="Total Tokens"),
        gr.Textbox(label="Model Used"),
    ],
    allow_flagging="never",
)

with ui_blocks:
    gr.TabbedInterface(
        [get_prompt_ui_block],
        ["Multilingual Pronunciation Evaluation"],
    )

if __name__ == "__main__":
    ui_blocks.launch()
|