Spaces:

danarcat
/

PronunciationChecker

Sleeping

App Files Files Community

Karl El Hajal committed on Jan 17, 2025

Commit

4baa40f

1 Parent(s): d6fcab3

Added code + requirements

Browse files

Files changed (4) hide show

app.py +68 -0
audio_preprocessing.py +103 -0
pronunciation_checker.py +87 -0
requirements.txt +86 -0

app.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# -*- coding: utf-8 -*-
+# SPDX-FileContributor: Karl El Hajal
+# SPDX-FileContributor: Ali Dulaimi
+import gradio as gr
+import tempfile
+import matplotlib.pyplot as plt
+from src.pronunciation_checker import PronunciationChecker
+def check_pronunciation(reference_audio, input_audio):
+    pronunciation_checker = PronunciationChecker("microsoft/wavlm-large")
+    # Extract features from both audio files
+    layer = 6
+    ref_wav, sr = PronunciationChecker.preprocess_wav(reference_audio)
+    comparison_wav, _ = PronunciationChecker.preprocess_wav(input_audio)
+    # Check if waveforms are not empty
+    if ref_wav is None or comparison_wav is None:
+        raise ValueError("One or both of the waveforms are empty.")
+    # Extract features
+    ref_features, ref_wav, sr = pronunciation_checker.extract_features(ref_wav, layer)
+    input_features, comparison_wav, _ = pronunciation_checker.extract_features(comparison_wav, layer)
+    # Compute DTW
+    dist_matrix, path = PronunciationChecker.compute_dtw(ref_features, input_features)
+    # Check if DTW path is valid
+    if path is None or dist_matrix is None:
+        raise ValueError("DTW computation failed.")
+    PronunciationChecker.plot_waveform_with_overlay(ref_wav, sr, dist_matrix, path, "ref")
+    # Save the visualization to a temporary image file
+    with tempfile.NamedTemporaryFile(delete=False, suffix='.png') as tmp:
+        tmp_path = tmp.name
+        plt.savefig(tmp_path)
+        plt.close()
+    # Return the image file path for Gradio to display
+    return tmp_path
+pronunciation_checker = PronunciationChecker("microsoft/wavlm-large")
+# Create Gradio interface
+demo = gr.Interface(
+    fn=check_pronunciation,
+    inputs=[
+        gr.Audio(
+            type="filepath",
+            label="Reference Audio",
+            format="wav"
+        ),
+        gr.Audio(
+            type="filepath",
+            label="Input Audio",
+            format="wav"
+        ),
+    ],
+    outputs=gr.Image(type="filepath"),
+    title="Pronunciation Checker",
+    description="Compare pronunciation using WavLM and visualize with DTW overlays."
+)
+if __name__ == "__main__":
+    demo.launch(share=True, height=700)

audio_preprocessing.py ADDED Viewed

	@@ -0,0 +1,103 @@

+# SPDX-FileContributor: Karl El Hajal
+import numpy as np
+import webrtcvad
+from pydub import AudioSegment
+# Sample rate (Hz) the audio is resampled to before voice-activity detection.
+VAD_SR = 16000
+VAD_MODE = 3  # Aggressiveness level (0-3, where 3 is the most aggressive)
+VAD_FRAME_DURATION = 10  # Frame duration in milliseconds
+def get_speech_segments_webrtcvad(audio_array, sample_rate, frame_duration, vad_mode):
+    vad = webrtcvad.Vad(vad_mode)
+    # Convert the frame duration to samples
+    frame_duration_samples = int(sample_rate * frame_duration / 1000)
+    # Detect speech regions using VAD
+    speech_segments = []
+    start = -1
+    for i in range(0, len(audio_array), frame_duration_samples):
+        frame = audio_array[i : i + frame_duration_samples]
+        if len(frame) < 160:
+            is_speech = False
+        else:
+            frame = frame.tobytes()
+            is_speech = vad.is_speech(frame, sample_rate)
+        if is_speech and start == -1:
+            start = i
+        elif not is_speech and start != -1:
+            end = i
+            speech_segments.append((start, end))
+            start = -1
+    return speech_segments
+def get_start_end_using_vad(audio, sample_rate):
+    audio_array = np.array(audio.get_array_of_samples())
+    speech_segments = get_speech_segments_webrtcvad(audio_array, sample_rate, VAD_FRAME_DURATION, VAD_MODE)
+    if len(speech_segments) == 0:
+        speech_segments = get_speech_segments_webrtcvad(audio_array, sample_rate, VAD_FRAME_DURATION, VAD_MODE - 1)
+    start_sample = speech_segments[0][0]
+    end_sample = speech_segments[-1][1]
+    start_time = float(start_sample / VAD_SR)
+    end_time = float(end_sample / VAD_SR)
+    return start_time, end_time
+def trim_silences(audio, target_sr):
+    audio_copy = audio[:]
+    audio_copy = audio_copy.set_frame_rate(VAD_SR)
+    start_time, end_time = get_start_end_using_vad(audio_copy, VAD_SR)
+    start_sample_orig_sr = int(start_time * target_sr)
+    end_sample_orig_sr = int(end_time * target_sr)
+    filtered_audio_array = np.array(audio.get_array_of_samples())
+    filtered_audio_array = filtered_audio_array[start_sample_orig_sr:end_sample_orig_sr]
+    filtered_audio = AudioSegment(
+        filtered_audio_array.tobytes(),
+        frame_rate=target_sr,
+        sample_width=audio.sample_width,
+        channels=audio.channels,
+    )
+    return filtered_audio
+def match_target_amplitude(audio, target_dBFS):
+    change_in_dBFS = target_dBFS - audio.dBFS
+    return audio.apply_gain(change_in_dBFS)
+def process_wav(wav_path, target_sr, do_trim_silences=True):
+    audio = AudioSegment.from_file(wav_path)
+    # Convert audio to mono
+    if audio.channels > 1:
+        audio = audio.set_channels(1)
+    # Resample audio
+    audio = audio.set_frame_rate(target_sr)
+    # Convert the audio to 16-bit PCM format
+    audio = audio.set_sample_width(2)
+    # Remove silences
+    if do_trim_silences:
+        audio = trim_silences(audio, target_sr)
+    # Loudness normalization to -20dB
+    audio = match_target_amplitude(audio, -20.0)
+    return audio

pronunciation_checker.py ADDED Viewed

	@@ -0,0 +1,87 @@

+# SPDX-FileContributor: Karl El Hajal
+import torch
+import torchaudio
+import numpy as np
+import matplotlib.pyplot as plt
+from transformers import AutoFeatureExtractor, AutoModel
+from scipy.spatial.distance import cdist
+from dtw import accelerated_dtw
+from src.audio_preprocessing import process_wav
+class PronunciationChecker:
+    def __init__(self, model_name = "microsoft/wavlm-large"):
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.model_name = model_name
+        self.processor = AutoFeatureExtractor.from_pretrained(self.model_name)
+        self.model = AutoModel.from_pretrained(self.model_name).eval().to(self.device)
+    @staticmethod
+    def preprocess_wav(wav_path):
+        temp_audio_path = "temp.wav"
+        audio_segment = process_wav(wav_path, 16000, do_trim_silences=True)
+        audio_segment.export(temp_audio_path, format="wav")
+        wav, sr = torchaudio.load(temp_audio_path)
+        return wav, sr
+    def extract_features(self, wav, layer=None):
+        inputs = self.processor(wav.squeeze().to(self.device), sampling_rate=16000, return_tensors="pt", padding=True)
+        inputs = {key: val.to(self.device) for key, val in inputs.items()}
+        with torch.no_grad():
+            outputs = self.model(**inputs, output_hidden_states=True)
+        if layer is None:
+            features = outputs.last_hidden_state
+        else:
+            hidden_states = outputs.hidden_states
+            features = hidden_states[layer]
+        features = features.squeeze().cpu().numpy()
+        return features, wav.squeeze().cpu().numpy(), 16000
+    @staticmethod
+    def compute_dtw(ref_features, input_features):
+        # distance_metric = "euclidean"
+        distance_metric = "cosine"
+        dist_matrix = cdist(ref_features, input_features, metric=distance_metric)
+        _, _, acc, path = accelerated_dtw(ref_features, input_features, dist=distance_metric)
+        return dist_matrix, path
+    @staticmethod
+    def plot_waveform_with_overlay(wav, sr, dist_matrix, path, wav_type='ref'):
+        feature_stride = 320
+        time_ref = np.linspace(0, len(wav) / sr, len(wav))
+        fig, ax = plt.subplots(figsize=(15, 6))
+        # Plot the reference waveform
+        ax.plot(time_ref, wav, label="Waveform", color="blue", alpha=0.7)
+        # Overlay colors based on DTW distances
+        for (i, j) in zip(*path):
+            if wav_type == "ref":
+              index = i
+            else:
+              index = j
+            start_time = index * feature_stride / sr
+            end_time = (index + 1) * feature_stride / sr
+            dist = dist_matrix[i, j]
+            norm_dist = (dist - dist_matrix.min()) / (dist_matrix.max() - dist_matrix.min())
+            green_color = float(norm_dist<0.5)
+            red_color = float(norm_dist>=0.5)
+            # green_color = 1 - norm_dist
+            # red_color = norm_dist
+            color = (red_color, green_color, 0)  # Green to Red
+            ax.axvspan(start_time, end_time, facecolor=color, alpha=0.7)
+        ax.set_xlabel("Time (s)")
+        ax.set_ylabel("Amplitude")
+        ax.set_title("Waveform with DTW Distance Overlay")
+        ax.legend()
+        return fig

requirements.txt ADDED Viewed

	@@ -0,0 +1,86 @@

+aiofiles==23.2.1
+annotated-types==0.7.0
+anyio==4.8.0
+certifi==2024.12.14
+charset-normalizer==3.4.1
+click==8.1.8
+contourpy==1.3.1
+cycler==0.12.1
+dtw==1.4.0
+exceptiongroup==1.2.2
+fastapi==0.115.6
+ffmpy==0.5.0
+filelock==3.16.1
+fonttools==4.55.3
+fsspec==2024.12.0
+gradio==5.12.0
+gradio_client==1.5.4
+h11==0.14.0
+httpcore==1.0.7
+httpx==0.28.1
+huggingface-hub==0.27.1
+idna==3.10
+Jinja2==3.1.5
+kiwisolver==1.4.8
+markdown-it-py==3.0.0
+MarkupSafe==2.1.5
+matplotlib==3.10.0
+mdurl==0.1.2
+mpmath==1.3.0
+networkx==3.4.2
+numpy==2.2.1
+nvidia-cublas-cu12==12.4.5.8
+nvidia-cuda-cupti-cu12==12.4.127
+nvidia-cuda-nvrtc-cu12==12.4.127
+nvidia-cuda-runtime-cu12==12.4.127
+nvidia-cudnn-cu12==9.1.0.70
+nvidia-cufft-cu12==11.2.1.3
+nvidia-curand-cu12==10.3.5.147
+nvidia-cusolver-cu12==11.6.1.9
+nvidia-cusparse-cu12==12.3.1.170
+nvidia-nccl-cu12==2.21.5
+nvidia-nvjitlink-cu12==12.4.127
+nvidia-nvtx-cu12==12.4.127
+orjson==3.10.14
+packaging==24.2
+pandas==2.2.3
+pillow==11.1.0
+pip==22.0.2
+pydantic==2.10.5
+pydantic_core==2.27.2
+pydub==0.25.1
+Pygments==2.19.1
+pyparsing==3.2.1
+python-dateutil==2.9.0.post0
+python-multipart==0.0.20
+pytz==2024.2
+PyYAML==6.0.2
+regex==2024.11.6
+requests==2.32.3
+rich==13.9.4
+ruff==0.9.2
+safehttpx==0.1.6
+safetensors==0.5.2
+scipy==1.15.1
+semantic-version==2.10.0
+setuptools==59.6.0
+shellingham==1.5.4
+six==1.17.0
+sniffio==1.3.1
+starlette==0.41.3
+sympy==1.13.1
+tokenizers==0.21.0
+tomlkit==0.13.2
+torch==2.5.1
+torchaudio==2.5.1
+tqdm==4.67.1
+transformers==4.48.0
+triton==3.1.0
+typer==0.15.1
+typing_extensions==4.12.2
+tzdata==2024.2
+urllib3==2.3.0
+uvicorn==0.34.0
+webrtcvad==2.0.10
+websockets==14.1
+wheel==0.37.1