| from .vers import calc_vers |
| import librosa |
| import numpy as np |
| import math |
| from .filler_analyzer import detect_fillers |
| from .find_valence import get_valence_score |
| from filler_count.filler_score import analyze_fillers |
| import pyworld |
|
|
def compute_vers_score(file_path: str, whisper_model, filler_count=None) -> dict:
    """
    Compute VERS (Vocal Emotional Regulation Score) and its components from a speech sample.

    The audio file is transcribed with ``whisper_model``, a set of acoustic
    and fluency features is extracted, and everything is handed to
    ``calc_vers``, whose result is returned unchanged.

    Args:
        file_path: Path to the audio file to analyze.
        whisper_model: Object exposing ``transcribe(path, word_timestamps=...,
            fp16=...)`` that returns a mapping with ``"text"`` and
            ``"segments"`` entries (each segment having ``"start"`` and
            ``"end"`` times in seconds).
        filler_count: Pre-computed number of filler words. When ``None`` it
            is obtained from ``analyze_fillers``.

    Returns:
        The value produced by ``calc_vers`` for the extracted features.
    """
    transcription = whisper_model.transcribe(file_path, word_timestamps=False, fp16=False)
    transcript = transcription.get("text", "").strip()
    segments = transcription.get("segments", [])

    # Only run the filler analysis when the caller did not supply a count.
    if filler_count is None:
        # NOTE(review): 'base' is presumably a model size name -- confirm
        # against analyze_fillers.
        filler_result = analyze_fillers(file_path, 'base', transcript)
        filler_count = filler_result.get("filler_count", 0)

    # Load at the file's native sample rate (sr=None disables resampling).
    y, sr = librosa.load(file_path, sr=None)
    duration = len(y) / sr if sr else 0.0

    # Loudness: mean level and spread of the frame-wise RMS, both in dB.
    # The 1e-6 offset keeps log10 finite on silent frames.
    rms = librosa.feature.rms(y=y)[0]
    mean_rms = float(np.mean(rms))
    mean_volume_db = 20 * math.log10(mean_rms + 1e-6) if mean_rms > 0 else -80.0
    volume_std = float(np.std(20 * np.log10(rms + 1e-6)))

    # Peak level in dB, with -80 dB as the floor for an all-zero signal.
    vol_max = float(np.max(np.abs(y))) if y.size > 0 else 0.0
    vol_max_db = 20 * math.log10(vol_max + 1e-6) if vol_max > 0 else -80.0

    # Pitch variation: standard deviation of voiced F0 in semitones around
    # the median. Skipped for an empty signal or a zero sample rate, where
    # the frame period below would be undefined.
    pitch_variation = 0.0
    if y.size > 0 and sr:
        samples = y.astype(np.float64)  # pyworld operates on float64 input
        coarse_f0, frame_times = pyworld.harvest(
            samples,
            sr,
            f0_floor=80.0,
            f0_ceil=400.0,
            frame_period=1000 * 256 / sr,  # hop of 256 samples, in ms
        )
        f0 = pyworld.stonemask(samples, coarse_f0, frame_times, sr)
        voiced_f0 = f0[f0 > 0]

        # Trim the extremes to reduce the influence of tracking errors.
        # Guarded: percentiles of an empty array are not meaningful, so the
        # trim is only applied when at least one voiced frame exists.
        if voiced_f0.size > 0:
            low, high = np.percentile(voiced_f0, [5, 95])
            voiced_f0 = voiced_f0[(voiced_f0 > low) & (voiced_f0 < high)]

        if voiced_f0.size > 0:
            median_f0 = max(np.median(voiced_f0), 1e-6)
            semitone_diffs = 12 * np.log2(voiced_f0 / median_f0)
            pitch_variation = float(np.std(semitone_diffs))

    # Long pauses: gaps longer than one second between consecutive
    # segments, plus leading and trailing silence longer than one second.
    long_pause_count = 0
    if segments:
        for current, following in zip(segments, segments[1:]):
            if following["start"] - current["end"] > 1.0:
                long_pause_count += 1
        if segments[0]["start"] > 1.0:
            long_pause_count += 1
        if duration - segments[-1]["end"] > 1.0:
            long_pause_count += 1

    # Speaking rate over the whole clip duration (pauses included).
    word_count = len(transcript.split())
    words_per_min = (word_count / duration) * 60.0 if duration > 0 else 0.0

    valence_scores = get_valence_score(file_path)

    return calc_vers(
        filler_count=filler_count,
        long_pause_count=long_pause_count,
        pitch_variation=pitch_variation,
        mean_volume_db=mean_volume_db,
        vol_max_db=vol_max_db,
        wpm=words_per_min,
        volume_std=volume_std,
        valence_scores=valence_scores,
    )
|
|