File size: 3,785 Bytes
499ba07
e27f614
499ba07
 
 
 
e27f614
499ba07
 
e27f614
499ba07
 
e27f614
499ba07
 
 
e27f614
499ba07
 
 
 
 
 
e27f614
499ba07
e27f614
499ba07
 
 
e27f614
499ba07
 
e27f614
499ba07
 
e27f614
499ba07
 
e27f614
499ba07
 
e27f614
499ba07
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e27f614
 
499ba07
 
 
 
 
 
 
 
e27f614
 
499ba07
 
 
 
 
e27f614
 
499ba07
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import os
import time
import uuid

import google.generativeai as genai
import gradio as gr
import numpy as np
import soundfile as sf

# The Gemini API key is read from the environment so that no secret lives in
# source control. Export GOOGLE_API_KEY before starting the app.
# NOTE(review): a key was previously hard-coded on this line; treat it as
# compromised and revoke/rotate it.
_GEMINI_API_KEY = os.environ.get("GOOGLE_API_KEY")
if not _GEMINI_API_KEY:
    # Fail fast with an actionable message instead of a late API auth error.
    raise RuntimeError(
        "GOOGLE_API_KEY environment variable is not set; "
        "export it before launching the app."
    )
genai.configure(api_key=_GEMINI_API_KEY)

# Instruction text sent to the model alongside the uploaded audio.
# It contains two str.format placeholders, {language} and {word_phrase},
# which create_prompt() fills in. The bracketed items under "Your Output
# Format" (e.g. [XX]%) are literal text describing the expected reply layout,
# not format fields.
PROMPT_TEMPLATE = """
You are a native speaker and expert linguist of the {language} language, specializing in pronunciation coaching. Your task is to analyze an audio recording of spoken {language}, compare it with the reference phrase, and provide a detailed pronunciation assessment.

Input:
1. An audio file of spoken {language}.
2. A word, phrase, or sentence to compare with the audio.

Your task:
- Detect the phrase in the audio.
- Compare pronunciation to the reference.
- Identify errors in vowel sounds, consonant articulation, stress, intonation, linking, and missing words.
- Provide recommendations for improvement.
- Rate the overall pronunciation on a scale from 0% to 100%.

If the audio does not contain the input phrase, say: "The audio does not contain the phrase."

Your Output Format:
Phrase (Input): {word_phrase}
Phrase (Detected): [Detected phrase from audio]

Comparison:
[Similarities/differences]

Problem Areas:
[List and describe pronunciation issues]

Recommendations for Improvement:
[Personalized guidance per issue]

Overall Pronunciation Rating:
[XX]%
"""

def upload_audio(audio):
    """Write an audio clip to a scratch WAV file and upload it to Gemini.

    Args:
        audio: A ``(sample_rate, samples)`` tuple. ``samples`` is converted
            to a NumPy array and must be 1-D (mono) or 2-D (multi-channel).

    Returns:
        The file reference returned by ``genai.upload_file``.

    Raises:
        ValueError: If ``audio`` is ``None`` or the sample array is neither
            1-D nor 2-D.
    """
    if audio is None:
        # Happens when the form is submitted without recording/uploading.
        raise ValueError("No audio provided; record or upload a clip first.")

    sample_rate, data = audio
    data = np.asarray(data)

    if data.ndim == 2:
        # soundfile writes 2-D data as (frames, channels). Only transpose
        # when the array looks channel-first (fewer rows than columns);
        # an array that is already (frames, channels) is left untouched.
        # NOTE(review): heuristic assumes clips have more frames than
        # channels — confirm against the audio source.
        if data.shape[0] < data.shape[1]:
            data = data.T
    elif data.ndim != 1:
        # Raise instead of returning a message string, which the caller
        # would otherwise forward to the model as if it were a file ref.
        raise ValueError("Unexpected audio data format")

    # The scratch directory may not exist on a fresh checkout/deployment.
    os.makedirs("media", exist_ok=True)
    filename = f"media/{uuid.uuid4()}.wav"

    try:
        sf.write(filename, data, sample_rate)
        return genai.upload_file(path=filename)
    finally:
        # The local WAV is only a staging file; don't let them accumulate.
        if os.path.exists(filename):
            os.remove(filename)


def create_prompt(language, word_phrase):
    """Build the evaluation prompt for one request.

    Args:
        language: Value substituted for every ``{language}`` placeholder.
        word_phrase: Value substituted for the ``{word_phrase}`` placeholder.

    Returns:
        ``PROMPT_TEMPLATE`` with both placeholders filled in.
    """
    substitutions = {"language": language, "word_phrase": word_phrase}
    return PROMPT_TEMPLATE.format(**substitutions)


def evaluate_audio_pronunciation(audio_file_id, prompt, model="gemini-2.0-flash"):
    """Send the prompt and the uploaded audio to a Gemini model.

    Args:
        audio_file_id: Reference to the uploaded audio; passed to the model
            as the second content part, after the prompt text.
        prompt: Instruction text for the model.
        model: Name handed to ``genai.GenerativeModel``.

    Returns:
        A 3-tuple ``(response_text, prompt_token_count, total_token_count)``
        taken from the model response and its usage metadata.
    """
    gemini = genai.GenerativeModel(model)
    reply = gemini.generate_content(contents=[prompt, audio_file_id])

    usage = reply.usage_metadata
    total_tokens = usage.total_token_count
    return reply.text, usage.prompt_token_count, total_tokens


def orchestrate(audio, language, word_phrase, model):
    """Run one pronunciation evaluation end to end and time it.

    Uploads the audio, builds the prompt, queries the model, and measures
    the wall time of those three steps.

    Args:
        audio: Audio value forwarded unchanged to ``upload_audio``.
        language: Language name inserted into the prompt.
        word_phrase: Reference word or phrase inserted into the prompt.
        model: Gemini model name. ``None`` or an empty string (no choice
            made in the UI) falls back to ``"gemini-2.0-flash"``.

    Returns:
        A 5-tuple ``(response_text, elapsed, input_tokens, total_tokens,
        model)`` where ``elapsed`` is a string such as ``"1.23 seconds"``
        and ``model`` is the model name actually used.
    """
    # The model is always passed explicitly below, so the callee's own
    # default never applies; resolve a missing choice here. The fallback
    # matches the default of evaluate_audio_pronunciation.
    model = model or "gemini-2.0-flash"

    # perf_counter is monotonic, so the measured duration cannot go
    # negative if the system clock is adjusted mid-request.
    started = time.perf_counter()
    audio_file_id = upload_audio(audio)
    prompt = create_prompt(language, word_phrase)
    response, input_tokens, total_tokens = evaluate_audio_pronunciation(
        audio_file_id, prompt, model
    )
    elapsed = time.perf_counter() - started

    return response, f"{elapsed:.2f} seconds", input_tokens, total_tokens, model


# Top-level container that hosts the tabbed interface built below.
ui_blocks = gr.Blocks()

# Audio input: the user can either record from the microphone or upload a file.
input_audio = gr.Audio(
    sources=["microphone", "upload"],
    waveform_options=gr.WaveformOptions(
        waveform_color="#01C6FF",
        waveform_progress_color="#0066B4",
        skip_length=2,
        show_controls=False,
    ),
)

# Form wired to orchestrate(): the four inputs map positionally to its
# parameters (audio, language, word_phrase, model) and the five outputs map
# positionally to the 5-tuple it returns.
get_prompt_ui_block = gr.Interface(
    fn=orchestrate,
    inputs=[
        input_audio,
        gr.Textbox(label="Language (e.g., Arabic, Spanish, French, Japanese)", lines=1),
        gr.Textbox(label="Word or Phrase to Compare", lines=1),
        gr.Radio(
            ["gemini-1.5-flash-8b", "gemini-2.0-flash", "gemini-2.0-flash-lite-preview-02-05", "gemini-1.5-flash"],
            # Preselect a model so submitting the form without touching the
            # radio does not pass None as the model name.
            value="gemini-2.0-flash",
            info="Choose Gemini Model",
        ),
    ],
    outputs=[
        gr.Textbox(label="Response"),
        gr.Textbox(label="Evaluation Time"),
        gr.Textbox(label="Input Tokens"),
        gr.Textbox(label="Total Tokens"),
        gr.Textbox(label="Model Used"),
    ],
    allow_flagging="never"
)

# Mount the single interface as a tab inside the Blocks container.
with ui_blocks:
    gr.TabbedInterface(
        [get_prompt_ui_block],
        ["Multilingual Pronunciation Evaluation"]
    )

# Start the Gradio server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    ui_blocks.launch()