File size: 4,903 Bytes
4baa40f
 
 
 
 
2021a20
 
4baa40f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21f3853
 
 
d4d0c2c
 
 
 
 
6ea1d37
 
d4d0c2c
 
 
 
 
 
6ea1d37
d4d0c2c
 
 
 
21f3853
 
d4d0c2c
6ea1d37
21f3853
 
 
 
 
 
 
 
d4d0c2c
 
 
21f3853
 
 
 
 
2021a20
 
 
 
d4d0c2c
2021a20
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
# SPDX-FileContributor: Karl El Hajal

import os
import subprocess

import numpy as np
import webrtcvad
from pydub import AudioSegment


VAD_SR = 16000
VAD_MODE = 3  # Aggressiveness level (0-3, where 3 is the most aggressive)
VAD_FRAME_DURATION = 10  # Frame duration in milliseconds

def get_speech_segments_webrtcvad(audio_array, sample_rate, frame_duration, vad_mode):
    """Detect speech regions in a mono 16-bit PCM sample array with WebRTC VAD.

    Args:
        audio_array: 1-D numpy array of int16 samples (mono).
        sample_rate: Sample rate in Hz (webrtcvad supports 8000/16000/32000/48000).
        frame_duration: VAD frame length in milliseconds (10, 20 or 30).
        vad_mode: Aggressiveness level, 0 (least) to 3 (most aggressive).

    Returns:
        List of (start_sample, end_sample) tuples, one per contiguous speech run.
    """
    vad = webrtcvad.Vad(vad_mode)

    # Convert the frame duration to samples.
    frame_duration_samples = int(sample_rate * frame_duration / 1000)

    # Scan fixed-size frames; open a segment on the first speech frame and
    # close it on the next non-speech frame.
    speech_segments = []
    start = -1  # sample index where the current speech run began; -1 = not in speech
    for i in range(0, len(audio_array), frame_duration_samples):
        frame = audio_array[i : i + frame_duration_samples]

        # webrtcvad requires complete frames, so treat a short trailing frame as
        # silence. (BUG FIX: was a hard-coded `160`, which only equals one frame
        # at 10 ms / 16 kHz; now derived from the actual frame size.)
        if len(frame) < frame_duration_samples:
            is_speech = False
        else:
            is_speech = vad.is_speech(frame.tobytes(), sample_rate)

        if is_speech and start == -1:
            start = i
        elif not is_speech and start != -1:
            speech_segments.append((start, i))
            start = -1

    # BUG FIX: a speech run that lasts until the end of the audio was previously
    # dropped because no silent frame followed to close the open segment.
    if start != -1:
        speech_segments.append((start, len(audio_array)))

    return speech_segments


def get_start_end_using_vad(audio, sample_rate):
    """Return (start_time, end_time) in seconds bounding the detected speech.

    Args:
        audio: pydub AudioSegment already resampled to `sample_rate`.
        sample_rate: Sample rate in Hz of `audio`.

    Returns:
        Tuple of floats (start_time, end_time) in seconds. If no speech is
        found even with a relaxed VAD mode, the whole clip is returned.
    """
    audio_array = np.array(audio.get_array_of_samples())

    speech_segments = get_speech_segments_webrtcvad(audio_array, sample_rate, VAD_FRAME_DURATION, VAD_MODE)
    if not speech_segments:
        # Retry with a less aggressive VAD mode before giving up.
        speech_segments = get_speech_segments_webrtcvad(audio_array, sample_rate, VAD_FRAME_DURATION, VAD_MODE - 1)

    if not speech_segments:
        # BUG FIX: previously raised IndexError on an empty list when no speech
        # was detected at all; fall back to keeping the entire clip.
        return 0.0, len(audio_array) / float(sample_rate)

    start_sample = speech_segments[0][0]
    end_sample = speech_segments[-1][1]

    # BUG FIX: convert with the actual `sample_rate` parameter rather than the
    # VAD_SR constant (identical for current callers, which pass VAD_SR, but
    # correct for any other rate).
    start_time = start_sample / float(sample_rate)
    end_time = end_sample / float(sample_rate)

    return start_time, end_time


def trim_silences(audio, target_sr):
    """Strip leading and trailing silence from a pydub AudioSegment.

    The segment is resampled to VAD_SR for speech detection only; the returned
    audio is cut from the original samples at `target_sr`.

    Args:
        audio: Mono pydub AudioSegment at `target_sr`.
        target_sr: Sample rate in Hz of `audio`.

    Returns:
        A new AudioSegment containing only the detected speech span.
    """
    # Run VAD on a 16 kHz copy; the original segment is left untouched.
    vad_copy = audio[:].set_frame_rate(VAD_SR)
    start_time, end_time = get_start_end_using_vad(vad_copy, VAD_SR)

    # Map the speech boundaries (seconds) back to sample indices at target_sr.
    first_sample = int(start_time * target_sr)
    last_sample = int(end_time * target_sr)

    trimmed_samples = np.array(audio.get_array_of_samples())[first_sample:last_sample]

    return AudioSegment(
        trimmed_samples.tobytes(),
        frame_rate=target_sr,
        sample_width=audio.sample_width,
        channels=audio.channels,
    )


def match_target_amplitude(audio, target_dBFS):
    """Return `audio` with gain applied so its loudness equals `target_dBFS`."""
    return audio.apply_gain(target_dBFS - audio.dBFS)


def process_wav(wav_path, target_sr, do_trim_silences=True):
    """Load an audio file and normalize it: mono, `target_sr`, 16-bit, -20 dBFS.

    Args:
        wav_path: Path to the input audio file.
        target_sr: Desired output sample rate in Hz.
        do_trim_silences: When True, strip leading/trailing silence via VAD.

    Returns:
        The processed pydub AudioSegment.
    """
    segment = AudioSegment.from_file(wav_path)

    # Downmix to mono when the source has multiple channels.
    if segment.channels > 1:
        segment = segment.set_channels(1)

    # Resample to the target rate, then force 16-bit PCM samples.
    segment = segment.set_frame_rate(target_sr).set_sample_width(2)

    # Optionally strip leading/trailing silence.
    if do_trim_silences:
        segment = trim_silences(segment, target_sr)

    # Loudness normalization to -20 dBFS.
    return match_target_amplitude(segment, -20.0)


def get_red_green_segments(dist_matrix, path, wav_type='ref', threshold=0.3):
    """Classify each frame of one aligned waveform as red (poor) or green (good).

    Args:
        dist_matrix: 2-D numpy array of pairwise frame distances
            (reference frames along rows, the other waveform along columns).
        path: Pair of index sequences (ref_indices, other_indices) describing
            the DTW alignment path.
        wav_type: "ref" scores the reference axis (rows); any other value
            scores the other axis (columns).
        threshold: Distances >= threshold are classified as "red".

    Returns:
        (red_segments, green_segments, wav_distances): red/green frame-index
        lists and the per-frame distance list for the chosen waveform.
    """
    scoring_ref = wav_type == "ref"
    num_wav_frames = len(dist_matrix) if scoring_ref else len(dist_matrix[0])

    wav_distances = [0] * num_wav_frames
    for (i, j) in zip(*path):
        # BUG FIX: when scoring the non-reference waveform, the frame index
        # along that axis is the column index j, not the row index i; indexing
        # by i mis-attributed distances and could raise IndexError whenever the
        # matrix has more rows than columns.
        frame_idx = i if scoring_ref else j
        wav_distances[frame_idx] = dist_matrix[i, j]

    red_segments = [k for k, d in enumerate(wav_distances) if d >= threshold]
    green_segments = [k for k, d in enumerate(wav_distances) if d < threshold]

    return red_segments, green_segments, wav_distances


def assess_pronunciation_quality(dist_matrix, path):
    """Score pronunciation quality as the fraction of well-aligned frames.

    Args:
        dist_matrix: 2-D numpy array of pairwise frame distances.
        path: DTW alignment path as (row_indices, col_indices).

    Returns:
        (quality_score, needs_repeat): quality_score in [0, 1] (1 = all frames
        well matched); needs_repeat is True when more than half the frames are
        poorly matched.
    """
    # NOTE(review): wav_type=None selects the non-reference axis in
    # get_red_green_segments (any value other than "ref" does) — confirm that
    # scoring the non-reference waveform is the intent here.
    red_segments, _, wav_distances = get_red_green_segments(dist_matrix, path, wav_type=None)

    num_red_segments = len(red_segments)
    total_segments = len(wav_distances)
    red_percentage = num_red_segments / total_segments if total_segments > 0 else 0.0

    # Quality is the complement of the poorly-matched fraction; ask for a
    # repeat when the majority of frames are bad.
    quality_score = 1 - red_percentage
    needs_repeat = red_percentage > 0.5

    # Debug information.
    print("Raw distance stats:")
    # BUG FIX: guard the min/max/mean prints — they raised ValueError on an
    # empty distance list even though the score above handled that case.
    if wav_distances:
        print(f"  Min distance: {min(wav_distances):.4f}")
        print(f"  Max distance: {max(wav_distances):.4f}")
        print(f"  Mean distance: {np.mean(wav_distances):.4f}")
    print("\nNormalized distance stats:")
    # BUG FIX: the message previously claimed the red threshold is 0.5, but
    # get_red_green_segments classifies red at >= 0.3 (its default threshold).
    print(f"  Number of red segments (>= 0.3): {num_red_segments}")
    print(f"  Total segments: {total_segments}")
    print(f"\nRed percentage: {red_percentage * 100:.2f}%")

    return quality_score, needs_repeat
    

def denoise_audio(input_audio_path):
    """Run the external `denoise` command on an audio file.

    Args:
        input_audio_path: Path to the input .wav file, as a string.

    Returns:
        Path of the denoised output file ("<stem>_denoised<ext>").

    Raises:
        TypeError: If input_audio_path is not a string.
        subprocess.CalledProcessError: If the denoise command exits non-zero.
    """
    # BUG FIX: validate with an explicit raise instead of `assert`, which is
    # stripped when Python runs with -O.
    if not isinstance(input_audio_path, str):
        raise TypeError("Input path must be a string")

    # BUG FIX: str.replace(".wav", ...) substituted EVERY ".wav" occurrence,
    # including ones inside directory names (e.g. "clips.wav/a.wav"); split
    # off the real extension instead.
    stem, ext = os.path.splitext(input_audio_path)
    output_audio_path = f"{stem}_denoised{ext}"

    subprocess.run(["denoise", input_audio_path, output_audio_path, "--plot"], check=True)

    return output_audio_path