Spaces:
Sleeping
Sleeping
| import os | |
| import streamlit as st | |
| import torch | |
| import torch.nn.functional as F | |
| import librosa | |
| import speech_recognition as sr | |
| # from transformers import Wav2Vec2Processor, Wav2Vec2Model | |
| from sklearn.feature_extraction.text import CountVectorizer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from groq import Groq | |
| # # Load pretrained model and processor | |
| # processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h") | |
| # model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h") | |
# Initialize the Groq client. The API key is read from the GROQ_API_KEY
# environment variable so that the secret is never committed with the source.
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))
# Function to transcribe audio into text
def transcribe_audio(audio_file):
    """Transcribe an Arabic (ar-SA) recording with Google's speech recognizer.

    Args:
        audio_file: A path to an audio file, or a file-like object (for
            example a Streamlit upload) that ``sr.AudioFile`` can open.

    Returns:
        The recognized text, or ``None`` when the audio cannot be read, the
        speech is unintelligible, or the recognition request fails.
    """
    # A file-like object may already have been read by an earlier consumer;
    # rewind it so the whole recording is transcribed.
    if hasattr(audio_file, "seek"):
        audio_file.seek(0)

    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file) as source:
            audio_data = recognizer.record(source)  # Read the entire audio file
        return recognizer.recognize_google(audio_data, language='ar-SA')
    except (sr.UnknownValueError, sr.RequestError, ValueError):
        # ValueError: sr.AudioFile could not read the data (e.g. an mp3/m4a
        # upload). All failures share the same "no transcription" result.
        return None
# Function to convert Arabic text to Romanized text

# Applied to each word before the dictionary lookup only: folds hamza/madda/
# wasla alef forms to bare alef, ta marbuta to ha and alef maqsura to ya, and
# deletes tashkeel, dagger alef and tatweel, so spelling variants of the same
# word share one dictionary key.
_ARABIC_LOOKUP_NORMALIZATION = str.maketrans(
    "\u0623\u0625\u0622\u0671\u0629\u0649",
    "\u0627\u0627\u0627\u0627\u0647\u064a",
    "\u064b\u064c\u064d\u064e\u064f\u0650\u0651\u0652\u0670\u0640",
)

# Keys are stored in the normalized form produced by the table above.
_ROMANIZED_MAPPING = {
    "الله": "Allahu",
    "اكبر": "akbar",
    "اشهد": "Ashhadu",
    "ان": "an",
    "لا": "la",
    "اله": "ilaha",
    "الا": "illa",
    "محمد": "Muhammad",
    "رسول": "Rasul",
    "حي": "Hayya",
    "علي": "'ala",
    "الصلاه": "as-salah",
    "الفلاح": "al-falah",
}


def romanize_arabic(text):
    """Replace known Azaan words in ``text`` with their Romanized spelling.

    The text is split on whitespace and each word is looked up (after
    normalization) in the Romanization table. Words without an entry are
    kept exactly as they appeared in the input.

    Args:
        text: Whitespace-separated Arabic text.

    Returns:
        The words joined by single spaces, with known words Romanized.
    """
    return ' '.join(
        _ROMANIZED_MAPPING.get(word.translate(_ARABIC_LOOKUP_NORMALIZATION), word)
        for word in text.split()
    )
# Dependencies for converting audio files into embeddings
| import torch | |
| from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model | |
| import librosa | |
# Load the pretrained Wav2Vec2 feature extractor and model.
# Streamlit re-executes this script on every user interaction, so the loader
# is cached to avoid re-instantiating the model on each rerun.
_WAV2VEC2_CHECKPOINT = "facebook/wav2vec2-base-960h"


@st.cache_resource
def _load_wav2vec2():
    """Return the ``(feature_extractor, model)`` pair for the checkpoint."""
    return (
        Wav2Vec2FeatureExtractor.from_pretrained(_WAV2VEC2_CHECKPOINT),
        Wav2Vec2Model.from_pretrained(_WAV2VEC2_CHECKPOINT),
    )


feature_extractor, model = _load_wav2vec2()
# In-memory buffer and format-conversion helpers used by get_audio_embedding
| from io import BytesIO | |
| import librosa | |
| # Updated function for Streamlit-compatible audio processing | |
| from io import BytesIO | |
| from pydub import AudioSegment | |
def get_audio_embedding(file_input):
    """Compute a mean-pooled Wav2Vec2 embedding for an audio file.

    Args:
        file_input: A string path, or a file-like object with ``read()``
            (such as a Streamlit upload). Any format pydub can decode is
            accepted; it is converted to WAV before loading.

    Returns:
        The model's ``last_hidden_state`` averaged over dimension 1.

    Raises:
        ValueError: If the audio cannot be decoded or loaded. The original
            error is attached as the cause.
    """
    # Convert Streamlit file input to BytesIO if it's not a string path
    if not isinstance(file_input, str):
        # An earlier consumer (e.g. transcription) may have left the stream
        # at its end; rewind so the complete recording is read.
        if hasattr(file_input, "seek"):
            file_input.seek(0)
        file_input = BytesIO(file_input.read())

    try:
        # Decode with pydub and re-export as WAV so librosa can load it
        audio = AudioSegment.from_file(file_input)
        wav_io = BytesIO()
        audio.export(wav_io, format="wav")
        wav_io.seek(0)  # Move back to the start of the BytesIO object
        # Named sample_rate to avoid shadowing the speech_recognition alias `sr`
        audio_data, sample_rate = librosa.load(wav_io, sr=16000)
    except Exception as e:
        raise ValueError(f"Failed to process the audio file: {str(e)}") from e

    # Convert audio to embeddings using Wav2Vec2
    inputs = feature_extractor(
        audio_data, sampling_rate=sample_rate, return_tensors="pt", padding=True
    )
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings
# Cosine similarity between two embedding tensors
def compare_embeddings(embedding_1: torch.Tensor, embedding_2: torch.Tensor) -> float:
    """Return the cosine similarity of two embeddings as a Python float.

    The similarity is taken along dimension 1. ``.item()`` requires the
    result to hold exactly one value, so each input must represent a single
    embedding.
    """
    return F.cosine_similarity(embedding_1, embedding_2, dim=1).item()
# Function to calculate text similarity using Cosine Similarity
def compare_text_similarity(text1, text2):
    """Return the cosine similarity of two texts' word-count vectors.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The similarity as a float. ``0.0`` is returned when neither text
        produces a token for the vectorizer.
    """
    try:
        counts = CountVectorizer().fit_transform([text1, text2])
    except ValueError:
        # CountVectorizer raises ValueError on an empty vocabulary, i.e. when
        # neither text contains a token; treat that as "no overlap".
        return 0.0
    # Row 0 vs row 1 of the pairwise similarity matrix.
    return float(cosine_similarity(counts)[0][1])
# Ask the Groq-hosted LLM for pronunciation feedback
def generate_llm_feedback(similarity_score):
    """Return LLM-written feedback text for a pronunciation similarity score.

    The score is formatted to two decimals and embedded in a prompt that
    describes the score bands; the prompt is sent as a single user message
    through the module-level ``groq_client``.

    Args:
        similarity_score: Numeric similarity between the user's and the
            reference recitation.

    Returns:
        The content of the first choice in the chat completion.
    """
    prompt = f"""
A user has just pronounced part of the Azaan, and the similarity score between their pronunciation and the reference Azaan is {similarity_score:.2f}.
Based on this score:
- If the score is above 0.9, the pronunciation is excellent.
- If the score is between 0.7 and 0.9, the pronunciation is good but may need slight improvement.
- If the score is below 0.7, the pronunciation requires significant improvement.
Provide detailed feedback for the user about their pronunciation, considering their score of {similarity_score:.2f}.
"""
    user_message = {"role": "user", "content": prompt}
    response = groq_client.chat.completions.create(
        messages=[user_message],
        model="llama3-8b-8192",
    )
    first_choice = response.choices[0]
    return first_choice.message.content
# Page-wide CSS, injected once through st.markdown with raw HTML enabled.
_PAGE_STYLES = """
<style>
.main {
background-color: #f5f5f5;
font-family: 'Arial', sans-serif;
}
.title {
text-align: center;
color: #2a9d8f;
}
.subtitle {
text-align: center;
color: #264653;
}
.footer {
text-align: center;
font-size: 0.8em;
color: #555;
}
.feedback {
background-color: #e9c6c6;
border-radius: 10px;
padding: 20px;
margin: 10px;
box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
}
</style>
"""
st.markdown(_PAGE_STYLES, unsafe_allow_html=True)
# Streamlit UI

# Minimum transcript similarity required before pronunciation is scored.
_MIN_TEXT_SIMILARITY = 0.1


def _evaluate_upload(uploaded_file):
    """Run transcription, word matching and pronunciation scoring for one upload."""
    import html  # local import: only needed to escape the LLM feedback

    # Use the upload's own MIME type; it may be mp3 or m4a rather than WAV.
    st.audio(uploaded_file, format=uploaded_file.type or 'audio/wav')

    # Step 1: Transcribe expert audio and user audio
    expert_audio_path = "Hafiz muqeem.wav"  # Change this to the correct path
    st.write("๐ค Step 1: Checking if the words match...")
    uploaded_file.seek(0)  # make sure the whole upload is transcribed
    user_text = transcribe_audio(uploaded_file)
    expert_text = transcribe_audio(expert_audio_path)
    if not (user_text and expert_text):
        st.error("โ There was an error transcribing one or both audio files.")
        return

    st.write("โ Transcription successful!")
    st.write(f"**Expert Azaan Text:** {expert_text}")
    st.write(f"**Your Azaan Text:** {user_text}")

    # Step 2: Romanize and compare texts
    user_romanized = romanize_arabic(user_text)
    expert_romanized = romanize_arabic(expert_text)
    text_similarity = compare_text_similarity(user_romanized, expert_romanized)
    st.write(f"๐ Text Similarity Score: {text_similarity:.2f}")
    if text_similarity < _MIN_TEXT_SIMILARITY:
        st.warning("โ ๏ธ Your words do not match sufficiently. Please try again.")
        return

    st.success("โ Great! Your words match well enough. Now, let's evaluate your pronunciation.")

    # Step 3: Evaluate pronunciation similarity
    try:
        expert_embedding = get_audio_embedding(expert_audio_path)
        # Transcription consumed the upload's stream; rewind before re-reading.
        uploaded_file.seek(0)
        user_embedding = get_audio_embedding(uploaded_file)
    except ValueError as exc:
        # get_audio_embedding reports undecodable audio as ValueError.
        st.error(str(exc))
        return
    pronunciation_similarity = compare_embeddings(expert_embedding, user_embedding)
    st.write(f"๐ Pronunciation Similarity Score: {pronunciation_similarity:.2f}")

    # Get feedback. The LLM text is escaped because it is interpolated into
    # raw HTML that Streamlit renders with unsafe_allow_html=True.
    feedback = generate_llm_feedback(pronunciation_similarity)
    st.markdown(f"<div class='feedback'>{html.escape(feedback)}</div>", unsafe_allow_html=True)


def main():
    """Render the page: header, audio uploader, evaluation results and footer."""
    st.title("๐ Azaan Pronunciation Evaluation")
    st.markdown("<h3 class='subtitle'>Welcome to the Azaan Pronunciation Evaluation!</h3>", unsafe_allow_html=True)
    st.subheader("Upload Your Audio")
    uploaded_file = st.file_uploader("Choose an audio file...", type=["wav", "mp3", "m4a"])
    if uploaded_file is not None:
        _evaluate_upload(uploaded_file)
    st.markdown("<div class='footer'>ยฉ 2024 Azaan Pronunciation Evaluation Tool</div>", unsafe_allow_html=True)


if __name__ == "__main__":
    main()