# Azaan Pronunciation Evaluation — Streamlit app.
# (Removed non-code page-scrape artifacts that preceded the source:
# viewer status lines, file-size line, commit hashes, line-number gutter.)
import os
import streamlit as st
import torch
import torch.nn.functional as F
import librosa
import speech_recognition as sr
# from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq
# Initialize the Groq client used for LLM feedback generation.
# SECURITY: the API key was previously hard-coded in source (a leaked secret —
# it should be revoked); load it from the environment instead so the credential
# never lives in version control.
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
def transcribe_audio(audio_file):
    """Transcribe an audio file to Arabic text using Google Speech Recognition.

    Parameters
    ----------
    audio_file : str or file-like
        Path to a WAV file, or an uploaded file object accepted by
        ``sr.AudioFile``.

    Returns
    -------
    str or None
        The recognized Arabic text, or None when the speech is
        unintelligible or the recognition service cannot be reached.
    """
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file) as source:
            # Capture the whole recording in one shot.
            captured = recognizer.record(source)
        return recognizer.recognize_google(captured, language='ar-SA')
    except (sr.UnknownValueError, sr.RequestError):
        # Both failure modes are reported to the caller as None.
        return None
# Function to convert Arabic text to Romanized text
def romanize_arabic(text):
    """Map known Arabic Azaan words to a Latin transliteration.

    Words not found in the mapping pass through unchanged, so the output
    always has the same number of words as the input.

    NOTE(review): the mapping literals in the original file were
    mojibake-garbled (one key was even split across physical lines — a
    syntax error) and contained duplicate keys; they have been reconstructed
    as the standard Azaan vocabulary — confirm against the intended wordlist.

    Parameters
    ----------
    text : str
        Whitespace-separated Arabic text.

    Returns
    -------
    str
        The text with recognized words replaced by their Romanized form.
    """
    romanized_mapping = {
        "الله": "Allahu",
        "اكبر": "akbar",
        "اشهد": "Ashhadu",
        "ان": "an",
        "لا": "la",
        "اله": "ilaha",
        "الا": "illa",
        "محمد": "Muhammad",
        "رسول": "Rasul",
        "حي": "Hayya",
        "على": "'ala",
        "الصلاه": "as-salah",
        "الفلاح": "al-falah",
    }
    return ' '.join(romanized_mapping.get(word, word) for word in text.split())
# Load the pretrained Wav2Vec2 feature extractor and model once at module
# import time; they are reused by every call to get_audio_embedding().
# NOTE(review): the original repeated `import torch`, `import librosa`, and
# `from io import BytesIO` here even though they are already imported at the
# top of the file — the duplicates have been consolidated (all names remain
# in scope from the top-of-file imports).
from io import BytesIO

from pydub import AudioSegment
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")
def get_audio_embedding(file_input):
    """Compute a fixed-size Wav2Vec2 embedding for an audio file.

    Parameters
    ----------
    file_input : str or file-like
        Path to an audio file, or a Streamlit ``UploadedFile`` object.

    Returns
    -------
    torch.Tensor
        Mean-pooled last hidden state, shape ``(1, hidden_size)``.

    Raises
    ------
    ValueError
        If the audio cannot be decoded or converted.
    """
    # Streamlit uploads are file-like objects; wrap their bytes so pydub
    # can seek freely.
    if not isinstance(file_input, str):
        file_input = BytesIO(file_input.read())
    try:
        # pydub handles many container formats (mp3, m4a, ...) and lets us
        # normalize everything to WAV in memory.
        audio = AudioSegment.from_file(file_input)
        wav_io = BytesIO()
        audio.export(wav_io, format="wav")
        wav_io.seek(0)  # rewind so librosa reads from the start
        # Wav2Vec2 expects 16 kHz input. The local is named `sample_rate`
        # (not `sr`) to avoid shadowing the speech_recognition module
        # imported as `sr` at the top of the file.
        audio_data, sample_rate = librosa.load(wav_io, sr=16000)
    except Exception as e:
        # Chain the original cause for easier debugging.
        raise ValueError(f"Failed to process the audio file: {str(e)}") from e
    # Convert audio to embeddings using Wav2Vec2.
    inputs = feature_extractor(audio_data, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings
def compare_embeddings(embedding_1, embedding_2):
    """Return the cosine similarity between two embedding tensors as a float.

    Both inputs are expected to be 2-D tensors whose rows are embeddings;
    similarity is computed along dim=1.
    """
    return F.cosine_similarity(embedding_1, embedding_2, dim=1).item()
# Function to calculate text similarity using Cosine Similarity
def compare_text_similarity(text1, text2):
    """Cosine similarity between the token-count vectors of two texts.

    Parameters
    ----------
    text1, text2 : str
        Texts to compare (here: Romanized transcriptions).

    Returns
    -------
    float
        Similarity in [0, 1]; 0.0 when no vocabulary can be extracted
        (empty or token-free input), where CountVectorizer would otherwise
        raise ValueError.
    """
    try:
        counts = CountVectorizer().fit_transform([text1, text2]).toarray()
    except ValueError:
        # "empty vocabulary" — blank or punctuation-only input.
        return 0.0
    # Off-diagonal entry of the 2x2 matrix = similarity between the two docs.
    return cosine_similarity(counts)[0][1]
def generate_llm_feedback(similarity_score):
    """Ask the Groq-hosted LLM for pronunciation feedback on a similarity score.

    Parameters
    ----------
    similarity_score : float
        Cosine similarity between user and reference embeddings.

    Returns
    -------
    str
        The model's feedback text.
    """
    feedback_prompt = f"""
    A user has just pronounced part of the Azaan, and the similarity score between their pronunciation and the reference Azaan is {similarity_score:.2f}.
    Based on this score:
    - If the score is above 0.9, the pronunciation is excellent.
    - If the score is between 0.7 and 0.9, the pronunciation is good but may need slight improvement.
    - If the score is below 0.7, the pronunciation requires significant improvement.
    Provide detailed feedback for the user about their pronunciation, considering their score of {similarity_score:.2f}.
    """
    response = groq_client.chat.completions.create(
        messages=[{"role": "user", "content": feedback_prompt}],
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content
# Custom CSS for styling
# Injected once at module import; styles the page background, headings,
# footer, and the `.feedback` card used to display LLM feedback in main().
st.markdown(
"""
<style>
.main {
background-color: #f5f5f5;
font-family: 'Arial', sans-serif;
}
.title {
text-align: center;
color: #2a9d8f;
}
.subtitle {
text-align: center;
color: #264653;
}
.footer {
text-align: center;
font-size: 0.8em;
color: #555;
}
.feedback {
background-color: #e9c6c6;
border-radius: 10px;
padding: 20px;
margin: 10px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
</style>
""",
unsafe_allow_html=True
)
# Streamlit UI
def main():
    """Streamlit entry point: upload audio, compare it against the expert
    recording (text match first, then pronunciation embedding), and show
    LLM-generated feedback.

    NOTE(review): several emoji/string literals in the original were
    mojibake-garbled (two were split across physical lines — syntax
    errors); they have been reconstructed with plausible emoji — confirm
    against the intended UI text.
    """
    st.title("🕌 Azaan Pronunciation Evaluation")
    st.markdown("<h3 class='subtitle'>Welcome to the Azaan Pronunciation Evaluation!</h3>", unsafe_allow_html=True)
    st.subheader("Upload Your Audio")
    uploaded_file = st.file_uploader("Choose an audio file...", type=["wav", "mp3", "m4a"])
    if uploaded_file is not None:
        # NOTE(review): format is hard-coded to WAV although mp3/m4a uploads
        # are allowed above — consider letting Streamlit infer the format.
        st.audio(uploaded_file, format='audio/wav')
        # Step 1: Transcribe expert audio and user audio.
        # TODO(review): make the reference recording path configurable
        # instead of hard-coding it.
        expert_audio_path = "Hafiz muqeem.wav"
        st.write("🎤 Step 1: Checking if the words match...")
        user_text = transcribe_audio(uploaded_file)
        expert_text = transcribe_audio(expert_audio_path)
        if user_text and expert_text:
            st.write("✅ Transcription successful!")
            st.write(f"**Expert Azaan Text:** {expert_text}")
            st.write(f"**Your Azaan Text:** {user_text}")
            # Step 2: Romanize and compare texts.
            user_romanized = romanize_arabic(user_text)
            expert_romanized = romanize_arabic(expert_text)
            text_similarity = compare_text_similarity(user_romanized, expert_romanized)
            st.write(f"📊 Text Similarity Score: {text_similarity:.2f}")
            # NOTE(review): 0.1 is a very permissive gate — almost any word
            # overlap passes; raise for stricter matching.
            if text_similarity >= 0.1:
                st.success("✅ Great! Your words match well enough. Now, let's evaluate your pronunciation.")
                # Step 3: Evaluate pronunciation similarity via embeddings.
                expert_embedding = get_audio_embedding(expert_audio_path)
                user_embedding = get_audio_embedding(uploaded_file)
                pronunciation_similarity = compare_embeddings(expert_embedding, user_embedding)
                st.write(f"🔊 Pronunciation Similarity Score: {pronunciation_similarity:.2f}")
                # Get LLM feedback and render it in the styled card.
                feedback = generate_llm_feedback(pronunciation_similarity)
                st.markdown(f"<div class='feedback'>{feedback}</div>", unsafe_allow_html=True)
            else:
                st.warning("⚠️ Your words do not match sufficiently. Please try again.")
        else:
            st.error("❌ There was an error transcribing one or both audio files.")
    st.markdown("<div class='footer'>© 2024 Azaan Pronunciation Evaluation Tool</div>", unsafe_allow_html=True)


if __name__ == "__main__":
    main()