| """ |
| Compute fluency score from audio file using SRS and PAS calculations |
| """ |
|
|
| import librosa |
| import numpy as np |
| from typing import Dict, Any, Union |
| from .fluency import calc_srs, calculate_pas, calculate_fluency, get_fluency_insight |
| from .filler_analyzer import detect_fillers |
|
|
def compute_fluency_score(file_path: str, whisper_model) -> Dict[str, Any]:
    """
    Compute fluency score and its components from a speech sample.

    The audio is transcribed, then the transcript and the raw waveform are
    reduced to four delivery measures (words per minute, filler count, long
    pause count, pitch variation) that feed the SRS score. PAS is computed
    from the transcript and segments, and both are combined into the final
    fluency score.

    Args:
        file_path (str): Path to the audio file.
        whisper_model: Transcription model. Its ``transcribe(file_path)`` must
            return a mapping with a ``"text"`` string and a ``"segments"``
            list whose items expose ``"start"`` and ``"end"`` times in
            seconds (e.g. an OpenAI Whisper model).
            NOTE(review): a model whose ``transcribe`` returns something
            other than a mapping (faster-whisper's native API appears to)
            would need adapting by the caller -- confirm.

    Returns:
        dict: A dictionary containing fluency score, SRS, PAS, and component scores.

    Raises:
        ValueError: If the transcript or segment list is empty, or if the
            loaded audio has zero (or otherwise non-positive) duration.
    """
    result = whisper_model.transcribe(file_path)
    transcript = result.get("text", "").strip()
    segments = result.get("segments", [])

    if not transcript or not segments:
        raise ValueError("Empty transcript or segments from Whisper.")

    filler_count, _ = detect_fillers(transcript)

    # sr=None keeps the file's native sampling rate instead of resampling.
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0
    if duration <= 0:
        raise ValueError("Audio duration invalid or zero.")

    pitch_variation = _pitch_variation_semitones(y, sr)
    long_pause_count = _count_long_pauses(segments, duration)

    # duration is strictly positive here (validated above), so the division
    # needs no further guard.
    word_count = len(transcript.split())
    words_per_min = (word_count / duration) * 60.0

    srs_score = calc_srs(
        wpm=words_per_min,
        filler_count=filler_count,
        long_pause_count=long_pause_count,
        pitch_variation=pitch_variation,
    )

    pas_result = calculate_pas(
        transcript=transcript,
        segments=segments,
        filler_count=filler_count,
        duration=duration,
    )
    pas_score = pas_result["PAS"]

    fluency_result = calculate_fluency(srs=srs_score, pas=pas_score)
    fluency_score = fluency_result["score"]
    insight = get_fluency_insight(fluency_score)

    return {
        "fluency_score": fluency_score,
        "insight": insight,
        "SRS": srs_score,
        "PAS": pas_score,
        "components": {
            "wpm": words_per_min,
            "filler_count": filler_count,
            "long_pause_count": long_pause_count,
            "pitch_variation": pitch_variation,
            "word_count": word_count,
            "duration": duration,
            "pas_components": pas_result,
        },
        "transcript": transcript,
    }


def _pitch_variation_semitones(y: np.ndarray, sr: float) -> float:
    """Return the std-dev of the pYIN F0 track, in semitones around its median.

    Returns 0.0 when pYIN finds no voiced frame.
    """
    # Only the F0 track is needed; the voiced flags/probabilities are unused.
    f0, _, _ = librosa.pyin(
        y, sr=sr, fmin=80, fmax=400, frame_length=1024, hop_length=256, fill_na=np.nan)

    # Unvoiced frames come back as NaN (fill_na=np.nan); drop them.
    voiced_f0 = f0[~np.isnan(f0)]
    if voiced_f0.size == 0:
        return 0.0

    # Floor the reference pitch so the ratio below can never divide by zero.
    median_f0 = max(np.nanmedian(voiced_f0), 1e-6)
    semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
    return float(np.nanstd(semitone_diffs))


def _count_long_pauses(segments, duration: float, threshold: float = 1.0) -> int:
    """Count silences longer than *threshold* seconds between, before and after segments.

    *segments* must be non-empty; each item is indexed by "start" and "end".
    """
    # Gaps between each pair of consecutive segments.
    count = sum(
        1
        for prev, nxt in zip(segments, segments[1:])
        if nxt["start"] - prev["end"] > threshold
    )

    # Leading silence before the first segment.
    if segments[0]["start"] > threshold:
        count += 1
    # Trailing silence between the last segment and the end of the audio.
    if duration - segments[-1]["end"] > threshold:
        count += 1

    return count