InnerVoice / backend /services /audio_processor.py
E5K7's picture
Initial commit: InnerVoice MVP
bf04727
"""
Audio processing: convert to WAV 16kHz mono and extract librosa features.
Temp files are deleted immediately after feature extraction.
"""
import os
import tempfile
import numpy as np
from typing import Optional
def convert_to_wav(input_path: str) -> str:
    """Convert any audio format to WAV 16kHz mono. Returns path to WAV file.

    The audio is decoded with pydub, resampled to 16 kHz, down-mixed to a
    single channel and written to a freshly created temporary ``.wav`` file.
    The returned file is NOT deleted here; the caller owns it and should
    remove it when done.

    Args:
        input_path: Path to the source audio file.

    Returns:
        Path to the temporary WAV file.

    Raises:
        RuntimeError: If anything goes wrong, including pydub being
            unavailable. The original exception is attached as ``__cause__``.
    """
    wav_path = None
    try:
        # Imported lazily so a missing pydub surfaces as the same RuntimeError
        # as any other conversion failure.
        from pydub import AudioSegment

        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(16000).set_channels(1)

        # mkstemp returns an open descriptor; close it so the export can
        # reopen the path by name.
        wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(wav_fd)

        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        # Do not leave a half-written temp file behind when conversion fails.
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                pass
        raise RuntimeError(f"Audio conversion failed: {e}") from e
def extract_features(wav_path: str) -> dict:
    """
    Extract acoustic features from a 16kHz mono WAV file.

    The file is loaded through librosa with ``sr=16000, mono=True``.

    Args:
        wav_path: Path to the audio file to analyse.

    Returns:
        A dict with the following keys:

        - ``pitch_mean`` / ``pitch_std``: mean and standard deviation of the
          pYIN f0 estimate over voiced frames (0.0 when no frame is voiced).
        - ``energy_raw``: root-mean-square of the frame-wise RMS values.
        - ``speech_rate``: tempo reported by librosa's beat tracker, used as
          a rough proxy for speaking rate.
        - ``pause_count`` / ``avg_pause_duration``: number and mean length
          (seconds) of gaps longer than 0.2 s between non-silent intervals.
        - ``filler_rate``: twice the mean zero-crossing rate (heuristic).
        - ``mfcc_features``: per-coefficient means of 13 MFCCs.
        - ``duration_seconds``: length of the loaded signal in seconds.
    """
    import librosa

    y, sr = librosa.load(wav_path, sr=16000, mono=True)
    duration = librosa.get_duration(y=y, sr=sr)

    # ── Pitch ────────────────────────────────────────────────────────────────
    f0, voiced_flag, _ = librosa.pyin(
        y,
        fmin=librosa.note_to_hz("C2"),
        fmax=librosa.note_to_hz("C7"),
        sr=sr,
    )
    # Keep only frames flagged as voiced.
    f0_clean = f0[voiced_flag == 1] if voiced_flag is not None else np.array([])
    has_voiced = len(f0_clean) > 0
    pitch_mean = float(np.mean(f0_clean)) if has_voiced else 0.0
    pitch_std = float(np.std(f0_clean)) if has_voiced else 0.0

    # ── Energy / RMS ─────────────────────────────────────────────────────────
    rms = librosa.feature.rms(y=y)[0]
    energy_raw = float(np.sqrt(np.mean(rms ** 2)))

    # ── Tempo (speech rate proxy) ─────────────────────────────────────────────
    tempo_arr, _ = librosa.beat.beat_track(y=y, sr=sr)
    # The tempo may come back as a Python scalar, a 0-d array or a 1-d array;
    # atleast_1d normalises all three so the first element can be taken.
    tempo = float(np.atleast_1d(tempo_arr)[0])

    # ── Pauses (silence detection) ────────────────────────────────────────────
    # `intervals` holds (start, end) sample indices of non-silent regions; a
    # pause is the gap between one region's end and the next region's start.
    intervals = librosa.effects.split(y, top_db=30)
    gaps = (intervals[1:, 0] - intervals[:-1, 1]) / sr
    pauses = gaps[gaps > 0.2]
    pause_count = int(pauses.size)
    avg_pause_duration = float(np.mean(pauses)) if pause_count else 0.0

    # ── MFCCs ────────────────────────────────────────────────────────────────
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_means = [round(float(np.mean(coeff)), 4) for coeff in mfccs]

    # ── Filler Rate (estimate via zero crossing rate) ─────────────────────────
    # Heuristic only: this is the mean ZCR over the WHOLE signal (no
    # quiet-segment masking is applied), doubled as a rough rescale. The
    # result is not clamped, so it is not guaranteed to stay within 0–1.
    zcr = librosa.feature.zero_crossing_rate(y)[0]
    filler_rate = float(np.mean(zcr)) * 2

    return {
        "pitch_mean": round(pitch_mean, 2),
        "pitch_std": round(pitch_std, 2),
        "energy_raw": round(energy_raw, 6),
        "speech_rate": round(tempo, 2),
        "pause_count": pause_count,
        "avg_pause_duration": round(avg_pause_duration, 3),
        "filler_rate": round(filler_rate, 4),
        "mfcc_features": mfcc_means,
        "duration_seconds": round(duration, 2),
    }