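"""Streamlit app that scores a user's Azaan (call to prayer) recitation.

Pipeline: transcribe the uploaded audio and a reference recording with
Google Speech Recognition, compare the romanized transcripts, then compare
Wav2Vec2 audio embeddings and ask a Groq-hosted LLM for pronunciation feedback.
"""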
import os
from io import BytesIO

import streamlit as st
import torch
import torch.nn.functional as F
import librosa
import speech_recognition as sr
from pydub import AudioSegment
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model
from groq import Groq

# Load the pretrained Wav2Vec2 feature extractor and model once at import time
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Initialize the Groq client; read the API key from the environment
# instead of hard-coding a secret in the source
groq_client = Groq(api_key=os.environ["GROQ_API_KEY"])
# Function to transcribe audio into text
def transcribe_audio(audio_file):
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)  # Read the entire audio file
            text = recognizer.recognize_google(audio_data, language='ar-SA')  # Arabic transcription
            return text
    except sr.UnknownValueError:  # Speech was unintelligible
        return None
    except sr.RequestError:  # Recognition service was unreachable
        return None
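# Note: sr.AudioFile only reads WAV/AIFF/FLAC streams, so transcribing an
# uploaded mp3/m4a may fail at this step; the embedding path below converts
# formats with pydub, but this transcription step does not.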
# Function to convert Arabic text to Romanized text
def romanize_arabic(text):
    romanized_mapping = {
        "الله": "Allahu",
        "اكبر": "akbar",
        "اشهد": "Ashhadu",
        "ان": "an",
        "لا": "la",
        "اله": "ilaha",
        "الا": "illa",
        "محمد": "Muhammad",
        "رسول": "Rasul",
        "حي": "Hayya",
        "على": "'ala",
        "الصلاه": "as-salah",
        "الفلاح": "al-falah",
    }
    words = text.split()
    romanized_text = ' '.join(romanized_mapping.get(word, word) for word in words)
    return romanized_text
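# Example of the mapping in use (hypothetical transcript fragment):
#   romanize_arabic("الله اكبر")  ->  "Allahu akbar"
# Words outside the mapping pass through unchanged.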
# Function to convert an audio file into embeddings
def get_audio_embedding(file_input):
    # Convert Streamlit file input to BytesIO if it's not a string path
    if not isinstance(file_input, str):
        file_input.seek(0)  # Rewind: an earlier read (e.g. st.audio) may have consumed the stream
        file_input = BytesIO(file_input.read())
    try:
        # Read the audio with pydub (supports mp3, m4a, etc.) and convert it to WAV
        audio = AudioSegment.from_file(file_input)
        wav_io = BytesIO()
        audio.export(wav_io, format="wav")
        wav_io.seek(0)  # Move back to the start of the BytesIO object
        # Load the converted WAV with librosa, resampled to the 16 kHz rate Wav2Vec2 expects
        audio_data, sample_rate = librosa.load(wav_io, sr=16000)
    except Exception as e:
        raise ValueError(f"Failed to process the audio file: {str(e)}")
    # Convert the waveform into a single utterance-level embedding (mean over time)
    inputs = feature_extractor(audio_data, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings
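# Example usage (assumes the reference file from main() exists locally):
#   emb = get_audio_embedding("Hafiz muqeem.wav")
#   emb.shape  ->  torch.Size([1, 768]) for wav2vec2-base-960h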
# Function to calculate cosine similarity for embeddings
def compare_embeddings(embedding_1, embedding_2):
    similarity = F.cosine_similarity(embedding_1, embedding_2, dim=1)
    return similarity.item()
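# Sanity check with toy tensors (illustrative values, not real embeddings):
#   a = torch.tensor([[1.0, 0.0]]); b = torch.tensor([[0.0, 1.0]])
#   compare_embeddings(a, a)  ->  1.0   (same direction)
#   compare_embeddings(a, b)  ->  0.0   (orthogonal)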
# Function to calculate text similarity using Cosine Similarity
def compare_text_similarity(text1, text2):
    counts = CountVectorizer().fit_transform([text1, text2])  # bag-of-words count vectors
    cosine_sim = cosine_similarity(counts.toarray())
    return cosine_sim[0][1]  # off-diagonal entry: similarity between the two texts
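# Example on romanized transcripts (identical word counts give 1.0, disjoint words 0.0):
#   compare_text_similarity("Allahu akbar", "Allahu akbar")  ->  1.0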
# LLM feedback function using Groq
def generate_llm_feedback(similarity_score):
    feedback_prompt = f"""
    A user has just pronounced part of the Azaan, and the similarity score between their pronunciation and the reference Azaan is {similarity_score:.2f}.
    Based on this score:
    - If the score is above 0.9, the pronunciation is excellent.
    - If the score is between 0.7 and 0.9, the pronunciation is good but may need slight improvement.
    - If the score is below 0.7, the pronunciation requires significant improvement.
    Provide detailed feedback for the user about their pronunciation, considering their score of {similarity_score:.2f}.
    """
    chat_completion = groq_client.chat.completions.create(
        messages=[
            {
                "role": "user",
                "content": feedback_prompt,
            }
        ],
        model="llama3-8b-8192",
    )
    return chat_completion.choices[0].message.content
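# Example: generate_llm_feedback(0.85) sends a single user message to the
# llama3-8b-8192 model and returns its free-text critique as a string.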
# Custom CSS for styling
st.markdown(
    """
    <style>
    .main {
        background-color: #f5f5f5;
        font-family: 'Arial', sans-serif;
    }
    .title {
        text-align: center;
        color: #2a9d8f;
    }
    .subtitle {
        text-align: center;
        color: #264653;
    }
    .footer {
        text-align: center;
        font-size: 0.8em;
        color: #555;
    }
    .feedback {
        background-color: #e9c6c6;
        border-radius: 10px;
        padding: 20px;
        margin: 10px;
        box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
    }
    </style>
    """,
    unsafe_allow_html=True
)
# Streamlit UI
def main():
    st.title("🔔 Azaan Pronunciation Evaluation")
    st.markdown("<h3 class='subtitle'>Welcome to the Azaan Pronunciation Evaluation!</h3>", unsafe_allow_html=True)
    st.subheader("Upload Your Audio")
    uploaded_file = st.file_uploader("Choose an audio file...", type=["wav", "mp3", "m4a"])
    if uploaded_file is not None:
        st.audio(uploaded_file, format='audio/wav')
        # Step 1: Transcribe the expert audio and the user audio
        expert_audio_path = "Hafiz muqeem.wav"  # Change this to the correct path
        st.write("🎤 Step 1: Checking if the words match...")
        uploaded_file.seek(0)  # Rewind after st.audio consumed the stream
        user_text = transcribe_audio(uploaded_file)
        expert_text = transcribe_audio(expert_audio_path)
        if user_text and expert_text:
            st.write("✅ Transcription successful!")
            st.write(f"**Expert Azaan Text:** {expert_text}")
            st.write(f"**Your Azaan Text:** {user_text}")
            # Step 2: Romanize and compare the transcripts
            user_romanized = romanize_arabic(user_text)
            expert_romanized = romanize_arabic(expert_text)
            text_similarity = compare_text_similarity(user_romanized, expert_romanized)
            st.write(f"📝 Text Similarity Score: {text_similarity:.2f}")
            if text_similarity >= 0.1:
                st.success("✅ Great! Your words match well enough. Now, let's evaluate your pronunciation.")
                # Step 3: Evaluate pronunciation similarity via Wav2Vec2 embeddings
                expert_embedding = get_audio_embedding(expert_audio_path)
                user_embedding = get_audio_embedding(uploaded_file)
                pronunciation_similarity = compare_embeddings(expert_embedding, user_embedding)
                st.write(f"🔊 Pronunciation Similarity Score: {pronunciation_similarity:.2f}")
                # Step 4: Get LLM feedback on the score
                feedback = generate_llm_feedback(pronunciation_similarity)
                st.markdown(f"<div class='feedback'>{feedback}</div>", unsafe_allow_html=True)
            else:
                st.warning("⚠️ Your words do not match sufficiently. Please try again.")
        else:
            st.error("❌ There was an error transcribing one or both audio files.")
    st.markdown("<div class='footer'>© 2024 Azaan Pronunciation Evaluation Tool</div>", unsafe_allow_html=True)

if __name__ == "__main__":
    main()