| from typing import List, Dict |
| import librosa |
| import numpy as np |
| import spacy |
| import math |
| from .filler_analyzer import detect_fillers |
|
|
def calc_srs(wpm: float, filler_count: int, long_pause_count: int, pitch_variation: float) -> float:
    """
    Speech Rate Stability (SRS): how steady the speaker's pace and rhythm are.

    The score blends two parts, each on a 0-100 scale:
      * pace consistency (45%): distance of ``wpm`` from 150 words per minute,
        with the distance capped at 30 before it is turned into a deduction;
      * stability (55%): 100 minus the average of three penalties
        (fillers, long pauses, pitch variation), each capped at 1.0.

    The blended value is clamped to the range 0-100 before being returned.
    """
    target_wpm = 150

    # Pace component: the deviation is capped at 30 wpm, so this part of the
    # score cannot drop below roughly 50.
    capped_deviation = min(30, abs(wpm - target_wpm))
    pace_score = max(0, 100 - (capped_deviation * 1.67))

    # Each penalty saturates at 1.0: 10 fillers, 5 long pauses, or a pitch
    # variation of 3.0 is already the maximum penalty.
    penalties = (
        min(filler_count / 10, 1.0),
        min(long_pause_count / 5, 1.0),
        min(pitch_variation / 3.0, 1.0),
    )
    stability_score = (1 - (sum(penalties) / 3)) * 100

    blended = (0.45 * pace_score) + (0.55 * stability_score)
    return min(100, max(0, blended))
|
|
def calculate_pas(transcript: str, segments: List[Dict], filler_count: int, duration: float) -> Dict[str, float]:
    """
    Calculate the Pause Appropriateness Score (PAS) and its components.

    Components (each on a 0-100 scale):
      * NPP - percentage of detected pauses that are "natural", i.e. that
        directly follow a segment ending in punctuation. It is 100 when no
        pause is detected at all.
      * AFW - filler-word score: 100 minus 1000 times the filler rate
        (fillers per word), clamped to 0-100, so a rate of 10% or more gives 0.
      * PAS - ``0.4 * NPP + 0.6 * AFW``.

    A pause is any silence longer than 0.5 seconds: between two consecutive
    segments, before the first segment, or after the last segment.

    Args:
        transcript (str): Full transcribed text; word count is taken from
            whitespace splitting.
        segments (List[Dict]): Segments with 'start', 'end' and 'text' keys.
        filler_count (int): Number of filler words in the transcript.
        duration (float): Total audio duration, in the same time unit as the
            segment 'start'/'end' values.

    Returns:
        Dict[str, float]: The keys "NPP", "AFW" and "PAS".

    Raises:
        ValueError: If the transcript or segments are empty, the duration is
            not positive, or the transcript contains no words.
    """
    if not transcript or not segments or duration <= 0:
        raise ValueError("Transcript, segments, and duration must be valid")

    # NOTE: an earlier revision loaded a spaCy model and parsed the transcript
    # here, but the parse result was never used; that per-call cost is gone.
    total_words = len(transcript.split())
    if total_words == 0:
        # A whitespace-only transcript is truthy but has no words.
        raise ValueError("No words found in transcript")

    # AFW: linear fall-off from 100 (no fillers) to 0 (>= 10% fillers).
    filler_rate = filler_count / total_words
    afw = max(0.0, min(100.0, 100.0 - (filler_rate * 1000)))

    pause_threshold = 0.5
    segment_texts = [seg["text"].strip() for seg in segments]
    segment_starts = [seg["start"] for seg in segments]
    segment_ends = [seg["end"] for seg in segments]

    total_pauses = 0
    natural_pauses = 0

    # Pauses between consecutive segments: natural when the segment before
    # the gap ends with sentence punctuation or a comma.
    for text, end, next_start in zip(segment_texts, segment_ends, segment_starts[1:]):
        if next_start - end > pause_threshold:
            total_pauses += 1
            if text.endswith((".", "!", "?", ",")):
                natural_pauses += 1

    # Leading silence counts as a pause but is never treated as natural.
    if segment_starts[0] > pause_threshold:
        total_pauses += 1

    # Trailing silence is natural only after sentence-final punctuation
    # (a comma does not qualify here, unlike for mid-speech pauses).
    if duration - segment_ends[-1] > pause_threshold:
        total_pauses += 1
        if segment_texts[-1].endswith((".", "!", "?")):
            natural_pauses += 1

    npp = 100.0 if total_pauses == 0 else (natural_pauses / total_pauses) * 100.0
    pas = (0.4 * npp) + (0.6 * afw)

    return {
        "NPP": npp,
        "AFW": afw,
        "PAS": pas
    }
|
|
def calculate_rcs(y: np.ndarray, sr: int, segments: List[Dict], duration: float) -> Dict[str, float]:
    """
    Compute the Rhythm Consistency Score (RCS) together with its two inputs.

    Components (each on a 0-100 scale):
      * STR - derived from the standard deviation of the intervals between
        detected onsets. The deviation is clamped to 0.1-0.5 and mapped
        linearly so that 0.1 gives 100 and 0.5 gives 0. With fewer than two
        onsets the score defaults to 100.
      * STW - percentage of transitions considered smooth. A transition
        between segments is smooth when the gap is at most 0.3; every
        word-to-word transition inside a segment is counted as smooth.
      * RCS - the plain average of STR and STW.

    Raises:
        ValueError: If the audio is empty, ``sr`` or ``duration`` is not
            positive, or ``segments`` is empty.
    """
    if y.size == 0 or sr <= 0 or duration <= 0 or not segments:
        raise ValueError("Audio signal, sampling rate, duration, and segments must be valid")

    hop = 256
    envelope = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop)
    onset_times = librosa.onset.onset_detect(onset_envelope=envelope, sr=sr, units='time', hop_length=hop)

    # Default covers the case where there are too few onsets to form an interval spread.
    str_score = 100.0
    if len(onset_times) > 1:
        interval_spread = np.std(np.diff(onset_times))
        interval_spread = min(max(interval_spread, 0.1), 0.5)
        str_score = 100.0 * (0.5 - interval_spread) / (0.5 - 0.1)
        str_score = max(0.0, min(100.0, str_score))

    # Segment-to-segment transitions: smooth when the silence is short enough.
    gap_limit = 0.3
    gaps = [following["start"] - current["end"] for current, following in zip(segments, segments[1:])]
    smooth_gaps = sum(1 for gap in gaps if gap <= gap_limit)

    # Word-to-word transitions inside a segment are all treated as smooth.
    inner_links = sum(max(len(seg["text"].strip().split()) - 1, 0) for seg in segments)

    transition_count = len(gaps) + inner_links
    smooth_count = smooth_gaps + inner_links

    if transition_count == 0:
        stw = 100.0
    else:
        stw = (smooth_count / transition_count) * 100.0

    rcs = (0.5 * str_score) + (0.5 * stw)

    return {"STR": str_score, "STW": stw, "RCS": rcs}
|
|
def calculate_vps(
    transcript: str,
    segments: List[Dict],
    filler_count: int,
    duration: float,
    wpm: float,
    long_pause_count: int,
    pitch_variation: float,
    y: np.ndarray,
    sr: int
) -> Dict[str, float]:
    """
    Compute the Voice Pacing Score (VPS) from its three sub-scores.

    Sub-scores:
    - SRS: Speech Rate Stability Score (from calc_srs)
    - PAS: Pause Appropriateness Score (from calculate_pas)
    - RCS: Rhythm Consistency Score (from calculate_rcs)

    VPS = (0.5 * SRS) + (0.3 * PAS) + (0.2 * RCS), clamped to 0-100.

    Args:
        transcript (str): Transcribed text.
        segments (List[Dict]): Transcript segments (e.g. Whisper output),
            each carrying 'start', 'end' and 'text'.
        filler_count (int): Number of filler words.
        duration (float): Audio duration (seconds).
        wpm (float): Words per minute.
        long_pause_count (int): Number of long pauses, as counted by the
            caller (described upstream as pauses longer than 1.0 s).
        pitch_variation (float): Pitch variation (described upstream as
            being in semitones).
        y (np.ndarray): Audio signal.
        sr (int): Sampling rate.

    Returns:
        Dict[str, float]: "SRS", "PAS", "NPP", "AFW", "RCS", "STR", "STW"
        and "VPS".

    Raises:
        ValueError: If the transcript, segments or audio are empty, or if
            the duration or sampling rate is not positive.
    """
    if (
        not transcript
        or not segments
        or duration <= 0
        or y.size == 0
        or sr <= 0
    ):
        raise ValueError("Invalid inputs")

    # Sub-scores are computed in a fixed order: rate stability, pauses, rhythm.
    srs = calc_srs(wpm, filler_count, long_pause_count, pitch_variation)
    pause_scores = calculate_pas(transcript, segments, filler_count, duration)
    rhythm_scores = calculate_rcs(y, sr, segments, duration)

    pas = pause_scores["PAS"]
    rcs = rhythm_scores["RCS"]

    weighted_total = (0.5 * srs) + (0.3 * pas) + (0.2 * rcs)

    return {
        "SRS": srs,
        "PAS": pas,
        "NPP": pause_scores["NPP"],
        "AFW": pause_scores["AFW"],
        "RCS": rcs,
        "STR": rhythm_scores["STR"],
        "STW": rhythm_scores["STW"],
        "VPS": max(0.0, min(100.0, weighted_total)),
    }