Spaces:
Sleeping
Sleeping
0ahkd1
committed on
Commit
·
d4d0c2c
1
Parent(s):
69e2bf6
refactored red-green decision code
Browse files- app.py +4 -5
- src/audio_preprocessing.py +20 -13
- src/pronunciation_checker.py +9 -38
app.py
CHANGED
|
@@ -26,17 +26,16 @@ def check_pronunciation(reference_audio, input_audio, threshold, wavlm_layer, la
|
|
| 26 |
|
| 27 |
log_timing("Start")
|
| 28 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 29 |
# Extract features from both audio files
|
| 30 |
ref_wav, sr = pronunciation_checker.preprocess_wav(reference_audio)
|
| 31 |
log_timing("Reference Audio Preprocessing")
|
| 32 |
|
| 33 |
comparison_wav, _ = pronunciation_checker.preprocess_wav(input_audio)
|
| 34 |
log_timing("Input Audio Preprocessing")
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
# ref_wav = denoise_audio(ref_wav)
|
| 38 |
-
# comparison_wav = denoise_audio(comparison_wav)
|
| 39 |
-
# log_timing("Audio Denoising")
|
| 40 |
|
| 41 |
# Check if waveforms are not empty
|
| 42 |
if ref_wav is None or comparison_wav is None:
|
|
|
|
| 26 |
|
| 27 |
log_timing("Start")
|
| 28 |
|
| 29 |
+
# ref_wav = denoise_audio(ref_wav)
|
| 30 |
+
input_audio = denoise_audio(input_audio)
|
| 31 |
+
log_timing("Input Audio Denoising")
|
| 32 |
+
|
| 33 |
# Extract features from both audio files
|
| 34 |
ref_wav, sr = pronunciation_checker.preprocess_wav(reference_audio)
|
| 35 |
log_timing("Reference Audio Preprocessing")
|
| 36 |
|
| 37 |
comparison_wav, _ = pronunciation_checker.preprocess_wav(input_audio)
|
| 38 |
log_timing("Input Audio Preprocessing")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 39 |
|
| 40 |
# Check if waveforms are not empty
|
| 41 |
if ref_wav is None or comparison_wav is None:
|
src/audio_preprocessing.py
CHANGED
|
@@ -4,7 +4,6 @@ import numpy as np
|
|
| 4 |
import webrtcvad
|
| 5 |
from pydub import AudioSegment
|
| 6 |
import subprocess
|
| 7 |
-
import tempfile
|
| 8 |
|
| 9 |
|
| 10 |
VAD_SR = 16000
|
|
@@ -106,19 +105,27 @@ def process_wav(wav_path, target_sr, do_trim_silences=True):
|
|
| 106 |
return audio
|
| 107 |
|
| 108 |
|
| 109 |
-
def
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
wav_distances = [0] * num_wav_frames
|
| 115 |
for (i, j) in zip(*path):
|
| 116 |
-
wav_distances[i] = dist_matrix[i, j]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
| 118 |
-
|
|
|
|
|
|
|
|
|
|
| 119 |
|
| 120 |
# Analyze normalized distances
|
| 121 |
-
num_red_segments =
|
| 122 |
total_segments = len(wav_distances)
|
| 123 |
red_percentage = num_red_segments / total_segments if total_segments > 0 else 0.0
|
| 124 |
|
|
@@ -128,9 +135,9 @@ def assess_pronunciation_quality(dist_matrix, path):
|
|
| 128 |
|
| 129 |
# Print debug information
|
| 130 |
print(f"Raw distance stats:")
|
| 131 |
-
print(f" Min distance: {min(
|
| 132 |
-
print(f" Max distance: {max(
|
| 133 |
-
print(f" Mean distance: {np.mean(
|
| 134 |
print(f"\nNormalized distance stats:")
|
| 135 |
print(f" Number of red segments (>= 0.5): {num_red_segments}")
|
| 136 |
print(f" Total segments: {total_segments}")
|
|
@@ -140,7 +147,7 @@ def assess_pronunciation_quality(dist_matrix, path):
|
|
| 140 |
|
| 141 |
|
| 142 |
def denoise_audio(input_audio_path):
|
| 143 |
-
|
| 144 |
output_audio_path = input_audio_path.replace(".wav", "_denoised.wav")
|
| 145 |
subprocess.run(["denoise", input_audio_path, output_audio_path, "--plot"], check=True)
|
| 146 |
|
|
|
|
| 4 |
import webrtcvad
|
| 5 |
from pydub import AudioSegment
|
| 6 |
import subprocess
|
|
|
|
| 7 |
|
| 8 |
|
| 9 |
VAD_SR = 16000
|
|
|
|
| 105 |
return audio
|
| 106 |
|
| 107 |
|
| 108 |
+
def get_red_green_segments(dist_matrix, path, wav_type='ref', threshold=0.3):
    """Classify DTW-aligned frames as red (poor) or green (good) by distance.

    Args:
        dist_matrix: 2-D pairwise distance matrix (reference frames x
            comparison frames); indexed as ``dist_matrix[i, j]``, so a
            numpy array is expected.
        path: DTW alignment path as a pair of parallel index sequences
            ``(ref_indices, comp_indices)``; consumed via ``zip(*path)``.
        wav_type: ``"ref"`` to score the reference frames (matrix rows);
            any other value scores the comparison frames (matrix columns).
        threshold: a frame whose aligned distance is >= threshold is red.

    Returns:
        Tuple ``(red_segments, green_segments, wav_distances)`` where the
        first two are lists of frame indices and ``wav_distances`` holds the
        per-frame aligned distance (0 for frames the path never visits).
    """
    if wav_type == "ref":
        num_wav_frames = len(dist_matrix)
    else:
        num_wav_frames = len(dist_matrix[0])
    wav_distances = [0] * num_wav_frames
    for (i, j) in zip(*path):
        # Bug fix: when scoring the comparison axis the buffer is sized by
        # the column count, so it must be indexed by j, not i — indexing by
        # i both mis-assigns distances and can raise IndexError for
        # non-square matrices. (Matches the pre-refactor plotting code:
        # index = i if wav_type == "ref" else j.)
        index = i if wav_type == "ref" else j
        wav_distances[index] = dist_matrix[i, j]

    red_segments = [i for i, d in enumerate(wav_distances) if d >= threshold]
    green_segments = [i for i, d in enumerate(wav_distances) if d < threshold]

    return red_segments, green_segments, wav_distances
|
| 121 |
|
| 122 |
+
|
| 123 |
+
def assess_pronunciation_quality(dist_matrix, path):
|
| 124 |
+
# _ is green_segments
|
| 125 |
+
red_segments, _, wav_distances = get_red_green_segments(dist_matrix, path, wav_type=None)
|
| 126 |
|
| 127 |
# Analyze normalized distances
|
| 128 |
+
num_red_segments = len(red_segments)
|
| 129 |
total_segments = len(wav_distances)
|
| 130 |
red_percentage = num_red_segments / total_segments if total_segments > 0 else 0.0
|
| 131 |
|
|
|
|
| 135 |
|
| 136 |
# Print debug information
|
| 137 |
print(f"Raw distance stats:")
|
| 138 |
+
print(f" Min distance: {min(wav_distances):.4f}")
|
| 139 |
+
print(f" Max distance: {max(wav_distances):.4f}")
|
| 140 |
+
print(f" Mean distance: {np.mean(wav_distances):.4f}")
|
| 141 |
print(f"\nNormalized distance stats:")
|
| 142 |
print(f" Number of red segments (>= 0.5): {num_red_segments}")
|
| 143 |
print(f" Total segments: {total_segments}")
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
def denoise_audio(input_audio_path):
    """Run the external ``denoise`` CLI tool on a wav file.

    The denoised audio is written next to the input as
    ``<name>_denoised.wav``.

    Args:
        input_audio_path: Path to the input ``.wav`` file, as a string.

    Raises:
        TypeError: If ``input_audio_path`` is not a string. (A plain
            ``assert`` is stripped under ``python -O``, so raise explicitly.)
        subprocess.CalledProcessError: If the ``denoise`` command fails
            (``check=True``).
    """
    if not isinstance(input_audio_path, str):
        raise TypeError("Input path must be a string")
    output_audio_path = input_audio_path.replace(".wav", "_denoised.wav")
    subprocess.run(["denoise", input_audio_path, output_audio_path, "--plot"], check=True)
|
| 153 |
|
src/pronunciation_checker.py
CHANGED
|
@@ -1,14 +1,13 @@
|
|
| 1 |
# SPDX-FileContributor: Karl El Hajal
|
| 2 |
|
| 3 |
import torch
|
| 4 |
-
import torchaudio
|
| 5 |
import numpy as np
|
| 6 |
import matplotlib.pyplot as plt
|
| 7 |
from transformers import AutoFeatureExtractor, AutoModel
|
| 8 |
from scipy.spatial.distance import cdist
|
| 9 |
from dtw import accelerated_dtw
|
| 10 |
|
| 11 |
-
from src.audio_preprocessing import process_wav
|
| 12 |
|
| 13 |
class PronunciationChecker:
|
| 14 |
def __init__(self, model_name = "microsoft/wavlm-large"):
|
|
@@ -76,43 +75,19 @@ class PronunciationChecker:
|
|
| 76 |
|
| 77 |
fig, ax = plt.subplots(3, 1, figsize=(15, 10), gridspec_kw={'height_ratios': [5, 1, 1]})
|
| 78 |
|
| 79 |
-
# Plot the reference waveform
|
| 80 |
ax[0].plot(time_ref, wav, label="Waveform", color="blue", alpha=0.7)
|
| 81 |
-
|
| 82 |
-
# DTW distance overlay
|
| 83 |
-
if wav_type == "ref":
|
| 84 |
-
num_wav_frames = len(dist_matrix)
|
| 85 |
-
else:
|
| 86 |
-
num_wav_frames = len(dist_matrix[0])
|
| 87 |
|
| 88 |
-
|
| 89 |
|
| 90 |
-
for
|
| 91 |
-
index = i if wav_type == "ref" else j
|
| 92 |
-
wav_distances[index] = dist_matrix[i, j]
|
| 93 |
-
|
| 94 |
-
# cur_index = -1
|
| 95 |
-
# for (i, j) in zip(*path):
|
| 96 |
-
# if wav_type == "ref":
|
| 97 |
-
# index = i
|
| 98 |
-
# else:
|
| 99 |
-
# index = j
|
| 100 |
-
# if index == cur_index:
|
| 101 |
-
# continue
|
| 102 |
-
# wav_distances[index] = dist_matrix[i, j]
|
| 103 |
-
# cur_index = index
|
| 104 |
-
|
| 105 |
-
# Overlay colors based on DTW distances
|
| 106 |
-
for index in range(0, num_wav_frames):
|
| 107 |
start_time = index * scaling_factor
|
| 108 |
end_time = (index + 1) * scaling_factor
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
green_color = float(norm_dist < threshold)
|
| 112 |
-
red_color = float(norm_dist >= threshold)
|
| 113 |
|
| 114 |
-
|
| 115 |
-
|
|
|
|
|
|
|
| 116 |
|
| 117 |
ax[0].set_xlabel("Time (s)")
|
| 118 |
ax[0].set_ylabel("Amplitude")
|
|
@@ -122,8 +97,6 @@ class PronunciationChecker:
|
|
| 122 |
ax[1].set_xlim(ax[0].get_xlim())
|
| 123 |
ax[2].set_xlim(ax[0].get_xlim())
|
| 124 |
|
| 125 |
-
print(input_number)
|
| 126 |
-
|
| 127 |
if labels_data:
|
| 128 |
for start, end, grapheme, *boolean_labels in labels_data:
|
| 129 |
ax[0].axvline(start, color='gray', linestyle='--', alpha=0.7)
|
|
@@ -143,6 +116,4 @@ class PronunciationChecker:
|
|
| 143 |
ax[2].set_title("Boolean Labels")
|
| 144 |
ax[2].grid(False)
|
| 145 |
|
| 146 |
-
return fig
|
| 147 |
-
|
| 148 |
-
|
|
|
|
| 1 |
# SPDX-FileContributor: Karl El Hajal
|
| 2 |
|
| 3 |
import torch
|
|
|
|
| 4 |
import numpy as np
|
| 5 |
import matplotlib.pyplot as plt
|
| 6 |
from transformers import AutoFeatureExtractor, AutoModel
|
| 7 |
from scipy.spatial.distance import cdist
|
| 8 |
from dtw import accelerated_dtw
|
| 9 |
|
| 10 |
+
from src.audio_preprocessing import process_wav, get_red_green_segments
|
| 11 |
|
| 12 |
class PronunciationChecker:
|
| 13 |
def __init__(self, model_name = "microsoft/wavlm-large"):
|
|
|
|
| 75 |
|
| 76 |
fig, ax = plt.subplots(3, 1, figsize=(15, 10), gridspec_kw={'height_ratios': [5, 1, 1]})
|
| 77 |
|
|
|
|
| 78 |
ax[0].plot(time_ref, wav, label="Waveform", color="blue", alpha=0.7)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 79 |
|
| 80 |
+
red_segments, green_segments, _ = get_red_green_segments(dist_matrix, path, threshold, wav_type=wav_type)
|
| 81 |
|
| 82 |
+
for index in green_segments:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
start_time = index * scaling_factor
|
| 84 |
end_time = (index + 1) * scaling_factor
|
| 85 |
+
ax[0].axvspan(start_time, end_time, facecolor=(0, 1, 0), alpha=0.5)
|
|
|
|
|
|
|
|
|
|
| 86 |
|
| 87 |
+
for index in red_segments:
|
| 88 |
+
start_time = index * scaling_factor
|
| 89 |
+
end_time = (index + 1) * scaling_factor
|
| 90 |
+
ax[0].axvspan(start_time, end_time, facecolor=(1, 0, 0), alpha=0.5)
|
| 91 |
|
| 92 |
ax[0].set_xlabel("Time (s)")
|
| 93 |
ax[0].set_ylabel("Amplitude")
|
|
|
|
| 97 |
ax[1].set_xlim(ax[0].get_xlim())
|
| 98 |
ax[2].set_xlim(ax[0].get_xlim())
|
| 99 |
|
|
|
|
|
|
|
| 100 |
if labels_data:
|
| 101 |
for start, end, grapheme, *boolean_labels in labels_data:
|
| 102 |
ax[0].axvline(start, color='gray', linestyle='--', alpha=0.7)
|
|
|
|
| 116 |
ax[2].set_title("Boolean Labels")
|
| 117 |
ax[2].grid(False)
|
| 118 |
|
| 119 |
+
return fig
|
|
|
|
|
|