| """ |
| Voice Clarity Score calculation module |
| """ |
|
|
| import librosa |
| import numpy as np |
| from typing import Dict, Any, List |
| import soundfile as sf |
|
|
def calculate_articulation(y: np.ndarray, sr: int) -> float:
    """
    Score articulation from the average spectral contrast of the signal.

    The magnitude STFT of ``y`` is fed to librosa's spectral-contrast feature,
    every value of the result is averaged into one number, and that number is
    rescaled linearly so that a mean contrast of 10 maps to 0 and a mean
    contrast of 50 maps to 100. Values outside that window are clamped.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate

    Returns:
        float: Articulation score (0-100)
    """
    # Window of mean-contrast values that is stretched onto the 0-100 scale.
    contrast_floor = 10
    contrast_ceiling = 50

    magnitude = np.abs(librosa.stft(y))
    contrast_bands = librosa.feature.spectral_contrast(S=magnitude, sr=sr)
    mean_contrast = np.mean(contrast_bands)

    scaled = (mean_contrast - contrast_floor) / (contrast_ceiling - contrast_floor) * 100

    # Clamp from below first, then from above, to stay within 0-100.
    lower_bounded = max(0, scaled)
    return min(100, lower_bounded)
|
|
def calculate_enunciation(y: np.ndarray, sr: int) -> float:
    """
    Score enunciation from mean spectral flatness and mean spectral centroid.

    Two sub-scores, each clamped to 0-100, are blended 70/30:

    * Flatness: a mean flatness of 0 gives 100 and the score falls linearly
      to 0 at a mean flatness of 0.5 or more.
    * Centroid: a mean centroid of exactly 2500 gives 100 and the score falls
      linearly to 0 once the centroid is 2000 or more away from that value.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate

    Returns:
        float: Enunciation score (0-100)
    """
    # Centroid value that earns the full centroid sub-score.
    ideal_centroid = 2500

    mean_flatness = np.mean(librosa.feature.spectral_flatness(y=y))
    mean_centroid = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))

    # Lower flatness yields a higher sub-score.
    flatness_raw = (0.5 - mean_flatness) / 0.5 * 100
    flatness_score = max(0, min(100, flatness_raw))

    # Distance from the ideal centroid, expressed as a fraction of 2000.
    deviation = abs(mean_centroid - ideal_centroid) / 2000
    centroid_raw = (1 - deviation) * 100
    centroid_score = max(0, min(100, centroid_raw))

    return (0.7 * flatness_score) + (0.3 * centroid_score)
|
|
def calculate_speech_pause_control(segments: List[Dict]) -> float:
    """
    Calculate how effectively pauses are integrated in speech.

    A pause is the gap between the ``"end"`` of one segment and the
    ``"start"`` of the next; gaps of 0.05 or less (including overlaps) are
    ignored. The score combines two things:

    * Variability: the standard deviation of the pause durations. A deviation
      below 0.1 scores 70, a deviation of exactly 0.1 scores 100, the score
      then falls linearly to 70 at 0.5 and on to 0 at 2.5 and beyond.
    * Long pauses: the fraction of pauses longer than 2.0 is multiplied by 50
      and subtracted from the variability score.

    The result is clamped to 0-100 and always returned as a builtin ``float``.

    Args:
        segments (List[Dict]): List of transcript segments with timing
            information. Each segment must provide ``"start"`` and ``"end"``.

    Returns:
        float: Speech pause control score (0-100). ``100.0`` when there are
        fewer than two segments or no gap exceeds 0.05.
    """
    if len(segments) < 2:
        return 100.0

    # Gap between each segment's end and the following segment's start.
    gaps = (
        following["start"] - current["end"]
        for current, following in zip(segments, segments[1:])
    )
    pause_durations = [gap for gap in gaps if gap > 0.05]

    if not pause_durations:
        return 100.0

    # Convert to a builtin float so no numpy scalar leaks into the result.
    pause_std = float(np.std(pause_durations))

    long_pauses = sum(1 for duration in pause_durations if duration > 2.0)
    long_pause_ratio = long_pauses / len(pause_durations)

    if pause_std < 0.1:
        std_score = 70.0
    elif pause_std < 0.5:
        std_score = 100.0 - ((pause_std - 0.1) / 0.4 * 30.0)
    else:
        std_score = max(0.0, 70.0 - ((pause_std - 0.5) / 2.0 * 70.0))

    long_pause_penalty = long_pause_ratio * 50.0

    # Float clamp bounds keep the return type consistent when saturating.
    return float(max(0.0, min(100.0, std_score - long_pause_penalty)))
|
|
def calculate_voice_clarity_score(y: np.ndarray, sr: int, segments: List[Dict]) -> Dict[str, Any]:
    """
    Compute the Voice Clarity Score (VCS) together with its breakdown.

    The VCS is a weighted sum of three component scores: articulation (0.45),
    enunciation (0.35) and speech pause control (0.2). An insight message
    chosen from the VCS is attached to the result.

    Args:
        y (np.ndarray): Audio signal
        sr (int): Sample rate
        segments (List[Dict]): List of transcript segments with timing information

    Returns:
        Dict[str, Any]: Dictionary with keys ``"VCS"`` (the weighted score),
        ``"components"`` (the three component scores by name) and
        ``"insight"`` (explanatory text for the score)
    """
    # Dict literals evaluate in order, so the components are computed
    # articulation first, then enunciation, then pause control.
    components = {
        "articulation": calculate_articulation(y, sr),
        "enunciation": calculate_enunciation(y, sr),
        "speech_pause_control": calculate_speech_pause_control(segments),
    }

    vcs = (
        (0.45 * components["articulation"])
        + (0.35 * components["enunciation"])
        + (0.2 * components["speech_pause_control"])
    )

    return {
        "VCS": vcs,
        "components": components,
        "insight": get_clarity_insight(vcs),
    }
|
|
def get_clarity_insight(vcs: float) -> str:
    """
    Pick the insight message that matches a Voice Clarity Score.

    The score is compared against descending lower bounds (85, 70, 50, 30);
    the first bound the score meets or exceeds selects the message. Scores
    that meet none of the bounds receive the lowest-tier message.

    Args:
        vcs (float): Voice Clarity Score (0-100)

    Returns:
        str: Insight text explaining the score
    """
    # (lower bound, message) pairs, ordered from the highest tier down.
    tiers = (
        (85, "Excellent voice clarity with precise articulation and well-controlled pauses. Speech is highly intelligible and pleasant to listen to."),
        (70, "Good voice clarity with clear pronunciation and generally appropriate pauses. Minor improvements could enhance overall clarity."),
        (50, "Moderate voice clarity with some articulation issues or irregular pauses. Focus on clearer pronunciation and more natural pausing."),
        (30, "Below average clarity with noticeable articulation problems or awkward pausing patterns. Consider speech exercises to improve clarity."),
    )

    for lower_bound, message in tiers:
        if vcs >= lower_bound:
            return message

    return "Speech clarity needs significant improvement. Articulation is unclear and pausing patterns disrupt intelligibility. Speech therapy exercises may be beneficial."