File size: 7,853 Bytes
31906ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa6cf9b
 
 
 
e3f109a
3ef02b3
e3f109a
 
3ef02b3
b54a482
 
3ef02b3
 
 
 
 
 
 
 
 
 
 
 
 
 
b54a482
3ef02b3
31906ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import os
import streamlit as st
import torch
import torch.nn.functional as F
import librosa
import speech_recognition as sr
# from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq

# # Load pretrained model and processor
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Initialize Groq client.
# SECURITY FIX: the API key was previously hard-coded in source (and therefore
# leaked — it must be revoked/rotated). Read it from the environment instead:
#   export GROQ_API_KEY="..."
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Transcribe an audio file (path or file-like object) to Arabic text.
def transcribe_audio(audio_file):
    """Return the Arabic transcription of *audio_file*, or None on failure.

    Uses Google's free speech API via the SpeechRecognition package.
    Returns None both when speech is unintelligible and when the API
    request itself fails; callers treat None as "transcription failed".
    """
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file) as source:
            # Read the whole file in one shot, then send it for recognition.
            captured = recognizer.record(source)
            return recognizer.recognize_google(captured, language='ar-SA')
    except (sr.UnknownValueError, sr.RequestError):
        # Unintelligible audio or API/network error — signal failure to caller.
        return None

# Function to convert Arabic text to Romanized text
def romanize_arabic(text):
    """Romanize known Azaan words in *text*, word by word.

    Each whitespace-separated word that appears in the mapping is replaced
    by its Roman transliteration; unknown words pass through unchanged.

    Parameters:
        text (str): Arabic text (e.g. a transcription of the Azaan).

    Returns:
        str: the text with known words transliterated, joined by single spaces.
    """
    # BUGFIX: the original dict literal repeated the keys "ู„ุง" and "ุงู„ุง";
    # duplicate keys in a literal silently overwrite earlier entries, so the
    # duplicates were dead weight and have been removed.
    romanized_mapping = {
        "ุงู„ู„ู‡": "Allahu",
        "ุงูƒุจุฑ": "akbar",
        "ุงุดู‡ุฏ": "Ashhadu",
        "ุงู†": "an",
        "ู„ุง": "la",
        "ุงู„ู‡": "ilaha",
        "ุงู„ุง": "illa",
        "ู…ุญู…ุฏ": "Muhammad",
        "ุฑุณูˆู„": "Rasul",
        "ุญูŠ": "Hayya",
        "ุนู„ู‰": "'ala",
        "ุงู„ุตู„ุงู‡": "as-salah",
        "ุงู„ูู„ุงุญ": "al-falah",
    }

    words = text.split()
    return ' '.join(romanized_mapping.get(word, word) for word in words)

# ---------------------------------------------------------------------------
# Wav2Vec2 embedding model setup.
# CLEANUP: this section previously re-imported torch, librosa (twice) and
# BytesIO (twice) — all already available from the imports above or below.
# Only the names genuinely first needed here are imported.
from io import BytesIO

from pydub import AudioSegment
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Load the pretrained checkpoint once at module import time so every call to
# get_audio_embedding() reuses the same weights (downloads on first run).
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

def get_audio_embedding(file_input):
    """Return a (1, hidden_size) Wav2Vec2 embedding for an audio file.

    Parameters:
        file_input (str | file-like): filesystem path, or a file-like object
            (e.g. a Streamlit UploadedFile). File-like inputs are rewound
            before reading, so it is safe to call this after the stream has
            already been consumed elsewhere.

    Returns:
        torch.Tensor: mean-pooled last hidden state, shape (1, hidden_size).

    Raises:
        ValueError: if the audio cannot be decoded or converted.
    """
    # Convert Streamlit file input to BytesIO if it's not a string path.
    if not isinstance(file_input, str):
        # BUGFIX: rewind first — an earlier transcription/playback step may
        # have consumed the stream, in which case .read() would return b"".
        file_input.seek(0)
        file_input = BytesIO(file_input.read())

    # Convert the audio to WAV using pydub (supports mp3, m4a, etc.).
    try:
        audio = AudioSegment.from_file(file_input)
        wav_io = BytesIO()
        audio.export(wav_io, format="wav")
        wav_io.seek(0)  # Move back to the start of the BytesIO object

        # Resample to 16 kHz, the rate Wav2Vec2-base-960h expects.
        # FIX: local renamed from `sr` to `sample_rate` so it no longer
        # shadows the module-level `speech_recognition as sr` import.
        audio_data, sample_rate = librosa.load(wav_io, sr=16000)
    except Exception as e:
        raise ValueError(f"Failed to process the audio file: {str(e)}")

    # Mean-pool the last hidden state over time for one fixed-size vector.
    inputs = feature_extractor(audio_data, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings


# Cosine similarity between two embedding tensors, as a Python float.
def compare_embeddings(embedding_1, embedding_2):
    """Return the scalar cosine similarity of two (1, D) embedding tensors."""
    score = F.cosine_similarity(embedding_1, embedding_2, dim=1)
    return score.item()

# Bag-of-words cosine similarity between two texts.
def compare_text_similarity(text1, text2):
    """Return the cosine similarity of the two texts' word-count vectors."""
    counts = CountVectorizer().fit_transform([text1, text2]).toarray()
    pairwise = cosine_similarity(counts)
    # Off-diagonal entry is the similarity between the two documents.
    return pairwise[0][1]

# LLM feedback function using Groq
def generate_llm_feedback(similarity_score):
    """Ask the Groq-hosted LLM for pronunciation feedback.

    Builds a prompt embedding *similarity_score* and score-band guidance,
    sends it as a single user message, and returns the model's reply text.
    Network call — requires a valid Groq API key.
    """
    feedback_prompt = f"""
    A user has just pronounced part of the Azaan, and the similarity score between their pronunciation and the reference Azaan is {similarity_score:.2f}.
    
    Based on this score:
    - If the score is above 0.9, the pronunciation is excellent.
    - If the score is between 0.7 and 0.9, the pronunciation is good but may need slight improvement.
    - If the score is below 0.7, the pronunciation requires significant improvement.
    
    Provide detailed feedback for the user about their pronunciation, considering their score of {similarity_score:.2f}.
    """

    conversation = [{"role": "user", "content": feedback_prompt}]
    response = groq_client.chat.completions.create(
        messages=conversation,
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content

# Custom CSS for styling the Streamlit page.
# Injected once at import time; `unsafe_allow_html=True` is required for
# Streamlit to render raw <style> markup instead of escaping it.
st.markdown(
    """
    <style>
    .main {
        background-color: #f5f5f5;
        font-family: 'Arial', sans-serif;
    }
    .title {
        text-align: center;
        color: #2a9d8f;
    }
    .subtitle {
        text-align: center;
        color: #264653;
    }
    .footer {
        text-align: center;
        font-size: 0.8em;
        color: #555;
    }
    .feedback {
        background-color: #e9c6c6;
        border-radius: 10px;
        padding: 20px;
        margin: 10px;
        box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
    }
    </style>
    """,
    unsafe_allow_html=True
)

# Streamlit UI
def main():
    """Render the app: upload audio, verify the words match the expert
    recording, score pronunciation similarity, and show LLM feedback.

    Expects the reference recording at the hard-coded path below.
    """
    st.title("๐Ÿ”” Azaan Pronunciation Evaluation")
    st.markdown("<h3 class='subtitle'>Welcome to the Azaan Pronunciation Evaluation!</h3>", unsafe_allow_html=True)

    st.subheader("Upload Your Audio")
    uploaded_file = st.file_uploader("Choose an audio file...", type=["wav", "mp3", "m4a"])

    if uploaded_file is not None:
        st.audio(uploaded_file, format='audio/wav')

        # Step 1: Transcribe expert audio and user audio
        expert_audio_path = "Hafiz muqeem.wav" # Change this to the correct path
        st.write("๐ŸŽค Step 1: Checking if the words match...")

        # BUGFIX: the upload is a file-like stream that may already have been
        # consumed; rewind before each fresh read or .read() returns nothing.
        uploaded_file.seek(0)
        user_text = transcribe_audio(uploaded_file)
        expert_text = transcribe_audio(expert_audio_path)

        if user_text and expert_text:
            st.write("โœ… Transcription successful!")
            st.write(f"**Expert Azaan Text:** {expert_text}")
            st.write(f"**Your Azaan Text:** {user_text}")

            # Step 2: Romanize and compare texts
            user_romanized = romanize_arabic(user_text)
            expert_romanized = romanize_arabic(expert_text)

            text_similarity = compare_text_similarity(user_romanized, expert_romanized)
            st.write(f"๐Ÿ“ Text Similarity Score: {text_similarity:.2f}")

            if text_similarity >= 0.1:
                st.success("โœ… Great! Your words match well enough. Now, let's evaluate your pronunciation.")

                # Step 3: Evaluate pronunciation similarity
                expert_embedding = get_audio_embedding(expert_audio_path)
                # BUGFIX: transcription above exhausted the stream — rewind
                # again before reading it for the embedding.
                uploaded_file.seek(0)
                user_embedding = get_audio_embedding(uploaded_file)

                pronunciation_similarity = compare_embeddings(expert_embedding, user_embedding)
                st.write(f"๐Ÿ”Š Pronunciation Similarity Score: {pronunciation_similarity:.2f}")

                # Get feedback
                feedback = generate_llm_feedback(pronunciation_similarity)
                st.markdown(f"<div class='feedback'>{feedback}</div>", unsafe_allow_html=True)
            else:
                st.warning("โš ๏ธ Your words do not match sufficiently. Please try again.")
        else:
            st.error("โŒ There was an error transcribing one or both audio files.")

    st.markdown("<div class='footer'>ยฉ 2024 Azaan Pronunciation Evaluation Tool</div>", unsafe_allow_html=True)

if __name__ == "__main__":
    main()