File size: 7,853 Bytes
31906ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fa6cf9b
 
 
 
e3f109a
3ef02b3
e3f109a
 
3ef02b3
b54a482
 
3ef02b3
 
 
 
 
 
 
 
 
 
 
 
 
 
b54a482
3ef02b3
31906ce
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
import os
import streamlit as st
import torch
import torch.nn.functional as F
import librosa
import speech_recognition as sr
# from transformers import Wav2Vec2Processor, Wav2Vec2Model
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from groq import Groq

# # Load pretrained model and processor
# processor = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base-960h")
# model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

# Initialize Groq client.
# SECURITY FIX: the API key was previously hard-coded in source (and therefore
# leaked — it must be revoked/rotated). Read it from the environment instead:
#   export GROQ_API_KEY="..."
groq_client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Transcribe an audio file (path or file-like object) to Arabic text.
def transcribe_audio(audio_file):
    """Return the Arabic transcription of *audio_file*, or None on failure.

    Uses Google's free speech API via the SpeechRecognition package.
    Returns None both when speech is unintelligible and when the API
    request itself fails; callers treat None as "transcription failed".
    """
    recognizer = sr.Recognizer()
    try:
        with sr.AudioFile(audio_file) as source:
            # Read the whole file in one shot, then send it for recognition.
            captured = recognizer.record(source)
            return recognizer.recognize_google(captured, language='ar-SA')
    except (sr.UnknownValueError, sr.RequestError):
        # Unintelligible audio or API/network error — signal failure to caller.
        return None

# Function to convert Arabic text to Romanized text
def romanize_arabic(text):
    """Romanize known Azaan words in *text*, word by word.

    Each whitespace-separated word that appears in the mapping is replaced
    by its Roman transliteration; unknown words pass through unchanged.

    Parameters:
        text (str): Arabic text (e.g. a transcription of the Azaan).

    Returns:
        str: the text with known words transliterated, joined by single spaces.
    """
    # BUGFIX: the original dict literal repeated the keys "ู„ุง" and "ุงู„ุง";
    # duplicate keys in a literal silently overwrite earlier entries, so the
    # duplicates were dead weight and have been removed.
    romanized_mapping = {
        "ุงู„ู„ู‡": "Allahu",
        "ุงูƒุจุฑ": "akbar",
        "ุงุดู‡ุฏ": "Ashhadu",
        "ุงู†": "an",
        "ู„ุง": "la",
        "ุงู„ู‡": "ilaha",
        "ุงู„ุง": "illa",
        "ู…ุญู…ุฏ": "Muhammad",
        "ุฑุณูˆู„": "Rasul",
        "ุญูŠ": "Hayya",
        "ุนู„ู‰": "'ala",
        "ุงู„ุตู„ุงู‡": "as-salah",
        "ุงู„ูู„ุงุญ": "al-falah",
    }

    words = text.split()
    return ' '.join(romanized_mapping.get(word, word) for word in words)

# ---------------------------------------------------------------------------
# Wav2Vec2 embedding model setup.
# CLEANUP: this section previously re-imported torch, librosa (twice) and
# BytesIO (twice) — all already available from the imports above or below.
# Only the names genuinely first needed here are imported.
from io import BytesIO

from pydub import AudioSegment
from transformers import Wav2Vec2FeatureExtractor, Wav2Vec2Model

# Load the pretrained checkpoint once at module import time so every call to
# get_audio_embedding() reuses the same weights (downloads on first run).
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained("facebook/wav2vec2-base-960h")
model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base-960h")

def get_audio_embedding(file_input):
    """Return a (1, hidden_size) Wav2Vec2 embedding for an audio file.

    Parameters:
        file_input (str | file-like): filesystem path, or a file-like object
            (e.g. a Streamlit UploadedFile). File-like inputs are rewound
            before reading, so it is safe to call this after the stream has
            already been consumed elsewhere.

    Returns:
        torch.Tensor: mean-pooled last hidden state, shape (1, hidden_size).

    Raises:
        ValueError: if the audio cannot be decoded or converted.
    """
    # Convert Streamlit file input to BytesIO if it's not a string path.
    if not isinstance(file_input, str):
        # BUGFIX: rewind first — an earlier transcription/playback step may
        # have consumed the stream, in which case .read() would return b"".
        file_input.seek(0)
        file_input = BytesIO(file_input.read())

    # Convert the audio to WAV using pydub (supports mp3, m4a, etc.).
    try:
        audio = AudioSegment.from_file(file_input)
        wav_io = BytesIO()
        audio.export(wav_io, format="wav")
        wav_io.seek(0)  # Move back to the start of the BytesIO object

        # Resample to 16 kHz, the rate Wav2Vec2-base-960h expects.
        # FIX: local renamed from `sr` to `sample_rate` so it no longer
        # shadows the module-level `speech_recognition as sr` import.
        audio_data, sample_rate = librosa.load(wav_io, sr=16000)
    except Exception as e:
        raise ValueError(f"Failed to process the audio file: {str(e)}")

    # Mean-pool the last hidden state over time for one fixed-size vector.
    inputs = feature_extractor(audio_data, sampling_rate=sample_rate, return_tensors="pt", padding=True)
    with torch.no_grad():
        embeddings = model(**inputs).last_hidden_state.mean(dim=1)
    return embeddings


# Cosine similarity between two embedding tensors, as a Python float.
def compare_embeddings(embedding_1, embedding_2):
    """Return the scalar cosine similarity of two (1, D) embedding tensors."""
    score = F.cosine_similarity(embedding_1, embedding_2, dim=1)
    return score.item()

# Bag-of-words cosine similarity between two texts.
def compare_text_similarity(text1, text2):
    """Return the cosine similarity of the two texts' word-count vectors."""
    counts = CountVectorizer().fit_transform([text1, text2]).toarray()
    pairwise = cosine_similarity(counts)
    # Off-diagonal entry is the similarity between the two documents.
    return pairwise[0][1]

# LLM feedback function using Groq
def generate_llm_feedback(similarity_score):
    """Ask the Groq-hosted LLM for pronunciation feedback.

    Builds a prompt embedding *similarity_score* and score-band guidance,
    sends it as a single user message, and returns the model's reply text.
    Network call — requires a valid Groq API key.
    """
    feedback_prompt = f"""
    A user has just pronounced part of the Azaan, and the similarity score between their pronunciation and the reference Azaan is {similarity_score:.2f}.
    
    Based on this score:
    - If the score is above 0.9, the pronunciation is excellent.
    - If the score is between 0.7 and 0.9, the pronunciation is good but may need slight improvement.
    - If the score is below 0.7, the pronunciation requires significant improvement.
    
    Provide detailed feedback for the user about their pronunciation, considering their score of {similarity_score:.2f}.
    """

    conversation = [{"role": "user", "content": feedback_prompt}]
    response = groq_client.chat.completions.create(
        messages=conversation,
        model="llama3-8b-8192",
    )
    return response.choices[0].message.content

# Custom CSS for styling the Streamlit page.
# Injected once at import time; `unsafe_allow_html=True` is required for
# Streamlit to render raw <style> markup instead of escaping it.
st.markdown(
    """
    <style>
    .main {
        background-color: #f5f5f5;
        font-family: 'Arial', sans-serif;
    }
    .title {
        text-align: center;
        color: #2a9d8f;
    }
    .subtitle {
        text-align: center;
        color: #264653;
    }
    .footer {
        text-align: center;
        font-size: 0.8em;
        color: #555;
    }
    .feedback {
        background-color: #e9c6c6;
        border-radius: 10px;
        padding: 20px;
        margin: 10px;
        box-shadow: 0 0 10px rgba(0, 0, 0, 0.1);
    }
    </style>
    """,
    unsafe_allow_html=True
)

# Streamlit UI
def main():
    """Render the app: upload audio, verify the words match the expert
    recording, score pronunciation similarity, and show LLM feedback.

    Expects the reference recording at the hard-coded path below.
    """
    st.title("๐Ÿ”” Azaan Pronunciation Evaluation")
    st.markdown("<h3 class='subtitle'>Welcome to the Azaan Pronunciation Evaluation!</h3>", unsafe_allow_html=True)

    st.subheader("Upload Your Audio")
    uploaded_file = st.file_uploader("Choose an audio file...", type=["wav", "mp3", "m4a"])

    if uploaded_file is not None:
        st.audio(uploaded_file, format='audio/wav')

        # Step 1: Transcribe expert audio and user audio
        expert_audio_path = "Hafiz muqeem.wav" # Change this to the correct path
        st.write("๐ŸŽค Step 1: Checking if the words match...")

        # BUGFIX: the upload is a file-like stream that may already have been
        # consumed; rewind before each fresh read or .read() returns nothing.
        uploaded_file.seek(0)
        user_text = transcribe_audio(uploaded_file)
        expert_text = transcribe_audio(expert_audio_path)

        if user_text and expert_text:
            st.write("โœ… Transcription successful!")
            st.write(f"**Expert Azaan Text:** {expert_text}")
            st.write(f"**Your Azaan Text:** {user_text}")

            # Step 2: Romanize and compare texts
            user_romanized = romanize_arabic(user_text)
            expert_romanized = romanize_arabic(expert_text)

            text_similarity = compare_text_similarity(user_romanized, expert_romanized)
            st.write(f"๐Ÿ“ Text Similarity Score: {text_similarity:.2f}")

            if text_similarity >= 0.1:
                st.success("โœ… Great! Your words match well enough. Now, let's evaluate your pronunciation.")

                # Step 3: Evaluate pronunciation similarity
                expert_embedding = get_audio_embedding(expert_audio_path)
                # BUGFIX: transcription above exhausted the stream — rewind
                # again before reading it for the embedding.
                uploaded_file.seek(0)
                user_embedding = get_audio_embedding(uploaded_file)

                pronunciation_similarity = compare_embeddings(expert_embedding, user_embedding)
                st.write(f"๐Ÿ”Š Pronunciation Similarity Score: {pronunciation_similarity:.2f}")

                # Get feedback
                feedback = generate_llm_feedback(pronunciation_similarity)
                st.markdown(f"<div class='feedback'>{feedback}</div>", unsafe_allow_html=True)
            else:
                st.warning("โš ๏ธ Your words do not match sufficiently. Please try again.")
        else:
            st.error("โŒ There was an error transcribing one or both audio files.")

    st.markdown("<div class='footer'>ยฉ 2024 Azaan Pronunciation Evaluation Tool</div>", unsafe_allow_html=True)

if __name__ == "__main__":
    main()