File size: 5,032 Bytes
296c2c3
 
 
 
 
 
17d89c7
 
7aebe7c
296c2c3
 
17d89c7
296c2c3
 
 
 
 
 
 
 
 
d8f63f1
296c2c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aa276b4
296c2c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ab65756
 
99c8c91
ab65756
296c2c3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import io
import gradio as gr
import numpy as np
from groq import Groq
from difflib import SequenceMatcher
import soundfile as sf
import os

# The Groq API key is read from the "Profero" environment variable
# (None when the variable is not set).
api_key = os.environ.get("Profero")

# Single Groq client reused for chat completions and audio transcription
client = Groq(api_key=api_key)

# Running tally for the current test. Both counters are module-level
# globals mutated by the functions in this file, so the tally is kept
# per process rather than per caller.
score = 0
attempts = 0

# Function to generate a word using Llama model
def generate_word():
    """Ask the Llama model for one word or short phrase to pronounce.

    The completion is requested as a stream; the streamed text fragments are
    concatenated, then surrounding whitespace and double quotes are removed.

    Returns:
        The generated word or phrase. If anything raises along the way, the
        string ``"Error generating word: <exception>"`` is returned instead
        of propagating the exception.
    """
    system_prompt = (
        "You are an experienced English professor with over 20 years "
        "experience teaching English and you are also a native speaker. "
        "You are trying to teach proper English pronunciation by generating "
        "words or phrases for user to pronounce and you judge them if it is "
        "correct or not. Make sure just a single word or a very concise "
        "phrase. Don't mention any other word apart from the word generated."
    )
    try:
        stream = client.chat.completions.create(
            model="llama-3.3-70b-versatile",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": "Generate a word for pronunciation."},
            ],
            temperature=1.4,
            max_tokens=4096,
            top_p=1,
            stream=True,
        )
        # Collect the non-empty text deltas of the streamed response
        fragments = []
        for chunk in stream:
            piece = chunk.choices[0].delta.content
            if piece:
                fragments.append(piece)
        return "".join(fragments).strip().strip('"')
    except Exception as exc:
        return f"Error generating word: {exc}"

# Function to check pronunciation
def check_pronunciation(audio, word):
    """Transcribe the user's audio and compare it with the expected word.

    Every call counts as one attempt: the module-level ``attempts`` counter
    is incremented before anything else happens. ``score`` is incremented
    when the transcription is similar enough to ``word``.

    Args:
        audio: Either a ``(sample_rate, samples)`` tuple (the value a
            ``gr.Audio(type="numpy")`` component passes), which is encoded as
            WAV at its own sample rate, or raw encoded audio bytes, which are
            forwarded unchanged under an ``.m4a`` filename.
        word: The word or phrase the user was asked to pronounce.

    Returns:
        A ``(result_text, score)`` pair. On any failure (missing audio,
        empty word, unsupported input, encoding or API error) ``result_text``
        is ``"Error checking pronunciation: <reason>"`` and ``score`` is
        returned unchanged, so the pair always matches the two outputs the
        function is wired to.
    """
    global score, attempts
    attempts += 1
    # Whitespace and punctuation ignored at both ends when comparing, so a
    # transcription such as " Cat." is not penalised against "cat".
    edge_chars = " \t\r\n.,;:!?\"'"
    try:
        if audio is None:
            raise ValueError("no audio was provided")
        expected = (word or "").strip(edge_chars)
        if not expected:
            raise ValueError("there is no word to compare against")

        if isinstance(audio, tuple):
            # Encode in memory at the recording's own sample rate; a fixed
            # on-disk filename would be shared by concurrent callers.
            sample_rate, samples = audio
            wav_buffer = io.BytesIO()
            sf.write(wav_buffer, samples, samplerate=sample_rate, format='WAV')
            upload = ("user_audio.wav", wav_buffer.getvalue())
        elif isinstance(audio, (bytes, bytearray, memoryview)):
            # Already-encoded audio is forwarded as-is
            upload = ("user_audio.m4a", bytes(audio))
        else:
            raise TypeError(f"unsupported audio input: {type(audio).__name__}")

        # Transcribe using Groq's Whisper API
        transcription = client.audio.transcriptions.create(
            file=upload,
            model="distil-whisper-large-v3-en",
            temperature=0.28,
            response_format="verbose_json",
        )
        transcription_text = transcription.text.strip()

        # Compare transcription with the expected word, case-insensitively
        similarity = SequenceMatcher(
            None,
            transcription_text.strip(edge_chars).lower(),
            expected.lower(),
        ).ratio()
        if similarity > 0.8:  # Threshold for correct pronunciation
            score += 1
            result_text = f"Correct! Expected: {word}. You said: {transcription_text}"
        else:
            result_text = f"Incorrect. Expected: {word}. You said: {transcription_text}"

        return result_text, score
    except Exception as e:
        # Two values, matching the success path and the two wired outputs
        return f"Error checking pronunciation: {e}", score

# Function to reset the test and display percentage
def reset_test():
    """Summarise the finished test and reset the counters.

    Returns:
        A string reporting ``score``/``attempts`` and the success percentage
        with two decimals (0 when no attempt was made). Both module-level
        counters are set back to zero before returning.
    """
    global score, attempts
    # Avoid dividing by zero when nothing has been attempted yet
    percentage = (score / attempts) * 100 if attempts > 0 else 0
    summary = f"Your final score is {score}/{attempts}. Percentage: {percentage:.2f}%"
    score, attempts = 0, 0
    return summary


# Gradio Interface
with gr.Blocks() as interface:
    gr.HTML("""
    <h1 style="text-align: center; font-weight: bold;">Profero</h1>
    <p style="text-align: center;">Profero is an interactive application designed to help users improve their English pronunciation skills. Users can practice pronouncing words generated by an advanced language model and receive immediate feedback on their performance. The application provides real-time transcription, scoring, and feedback to enhance learning and accuracy. You can upload a .WAV audio file or use your microphone to pronounce the word displayed. When using a microphone, make sure to trim your sound before submitting.</p>
    """)
    # The first word is generated once, while the UI is being built, and is
    # supplied through the constructor rather than by mutating the
    # component's attribute after creation.
    word_output = gr.Textbox(label="Word to Pronounce", value=generate_word())
    result_output = gr.Textbox(label="Result")
    score_output = gr.Textbox(label="Score")

    # Generate new word on button click
    word_button = gr.Button("Get New Word")
    word_button.click(fn=generate_word, outputs=word_output)

    # Audio input for pronunciation checking (microphone or uploaded file);
    # the handler receives the audio in Gradio's "numpy" form.
    audio_input = gr.Audio(type="numpy")
    submit_button = gr.Button("Submit Pronunciation")
    submit_button.click(
        fn=check_pronunciation,
        inputs=[audio_input, word_output],
        outputs=[result_output, score_output],
    )

    # Stop button: show the final score in the score box and reset the tally
    stop_button = gr.Button("Stop")
    stop_button.click(fn=reset_test, outputs=score_output)

interface.launch()