"""
Audio processing: convert to WAV 16kHz mono and extract librosa features.
Temp files are deleted immediately after feature extraction.
"""
import os
import tempfile
import numpy as np
from typing import Optional
def convert_to_wav(input_path: str) -> str:
    """Convert any audio format to WAV 16 kHz mono.

    Args:
        input_path: Path to the source audio file (any format
            pydub/ffmpeg can decode).

    Returns:
        Path to a newly created temporary WAV file. The caller is
        responsible for deleting it (see module docstring: temp files
        are removed after feature extraction).

    Raises:
        RuntimeError: If decoding or export fails for any reason; the
            original exception is attached as the cause.
    """
    wav_path = None
    try:
        from pydub import AudioSegment

        audio = AudioSegment.from_file(input_path)
        # Normalize to the 16 kHz mono format the feature extractor expects.
        audio = audio.set_frame_rate(16000).set_channels(1)
        wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(wav_fd)  # pydub writes via the path, not the open descriptor
        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        # Don't leak the temp file if export failed after mkstemp succeeded.
        if wav_path is not None and os.path.exists(wav_path):
            try:
                os.remove(wav_path)
            except OSError:
                pass
        raise RuntimeError(f"Audio conversion failed: {e}") from e
def extract_features(wav_path: str) -> dict:
    """Extract acoustic features from a 16 kHz mono WAV file.

    Args:
        wav_path: Path to a WAV file. librosa resamples to 16 kHz mono
            on load regardless of the file's native format.

    Returns:
        Dict with keys: pitch_mean, pitch_std, energy_raw, speech_rate,
        pause_count, avg_pause_duration, filler_rate, mfcc_features,
        duration_seconds.
    """
    import librosa

    y, sr = librosa.load(wav_path, sr=16000, mono=True)
    duration = librosa.get_duration(y=y, sr=sr)

    # -- Pitch ---------------------------------------------------------------
    f0, voiced_flag, _ = librosa.pyin(
        y, fmin=librosa.note_to_hz("C2"), fmax=librosa.note_to_hz("C7"), sr=sr
    )
    # Keep only voiced AND finite estimates: pyin fills unvoiced frames with
    # NaN, and a single NaN would poison np.mean/np.std (and thus the output).
    if f0 is not None and voiced_flag is not None:
        f0_clean = f0[np.asarray(voiced_flag, dtype=bool) & np.isfinite(f0)]
    else:
        f0_clean = np.array([])
    pitch_mean = float(np.mean(f0_clean)) if len(f0_clean) > 0 else 0.0
    pitch_std = float(np.std(f0_clean)) if len(f0_clean) > 0 else 0.0

    # -- Energy / RMS --------------------------------------------------------
    rms = librosa.feature.rms(y=y)[0]
    # Quadratic mean of the frame-wise RMS values -> one loudness scalar.
    energy_raw = float(np.sqrt(np.mean(rms ** 2)))

    # -- Tempo (speech rate proxy) -------------------------------------------
    tempo_arr, _ = librosa.beat.beat_track(y=y, sr=sr)
    # librosa >= 0.10 returns tempo as a 1-element ndarray, older versions a
    # scalar; atleast_1d handles both (including a 0-d array).
    tempo = (
        float(tempo_arr)
        if np.isscalar(tempo_arr)
        else float(np.atleast_1d(tempo_arr)[0])
    )

    # -- Pauses (silence detection) ------------------------------------------
    intervals = librosa.effects.split(y, top_db=30)
    pauses = []
    for i in range(1, len(intervals)):
        gap = (intervals[i][0] - intervals[i - 1][1]) / sr
        # Gaps shorter than 200 ms are normal articulation, not pauses.
        if gap > 0.2:
            pauses.append(gap)
    pause_count = len(pauses)
    avg_pause_duration = float(np.mean(pauses)) if pauses else 0.0

    # -- MFCCs ---------------------------------------------------------------
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_means = [round(float(np.mean(mfccs[i])), 4) for i in range(13)]

    # -- Filler rate (heuristic) ---------------------------------------------
    # NOTE(review): mean zero-crossing rate scaled by 2 is a rough proxy for
    # filler sounds (um, uh, er) -- it is not a validated measure.
    zcr = librosa.feature.zero_crossing_rate(y)[0]
    filler_rate = float(np.mean(zcr)) * 2  # scale to ~0-1 range

    return {
        "pitch_mean": round(pitch_mean, 2),
        "pitch_std": round(pitch_std, 2),
        "energy_raw": round(energy_raw, 6),
        "speech_rate": round(tempo, 2),
        "pause_count": pause_count,
        "avg_pause_duration": round(avg_pause_duration, 3),
        "filler_rate": round(filler_rate, 4),
        "mfcc_features": mfcc_means,
        "duration_seconds": round(duration, 2),
    }