File size: 3,798 Bytes
bf04727
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
"""
Audio processing: convert to WAV 16kHz mono and extract librosa features.
Temp files are deleted immediately after feature extraction.
"""
import os
import tempfile
import numpy as np
from typing import Optional


def convert_to_wav(input_path: str) -> str:
    """Convert any audio file to a 16 kHz mono WAV temp file.

    Args:
        input_path: Path to the source audio (any format pydub/ffmpeg can decode).

    Returns:
        Path to a newly created temporary WAV file. The caller owns the
        file and is responsible for deleting it.

    Raises:
        RuntimeError: If decoding or export fails; chained to the original
            exception so the underlying cause is preserved.
    """
    wav_path: Optional[str] = None
    try:
        # Local import: pydub is a heavy optional dependency.
        from pydub import AudioSegment

        audio = AudioSegment.from_file(input_path)
        # 16 kHz mono is the canonical input format for extract_features().
        audio = audio.set_frame_rate(16000).set_channels(1)
        wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(wav_fd)  # pydub reopens the path itself; don't leak the fd
        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        # Don't leak the temp file if anything after mkstemp() failed.
        if wav_path is not None and os.path.exists(wav_path):
            try:
                os.remove(wav_path)
            except OSError:
                pass
        # `from e` keeps the original traceback for debugging.
        raise RuntimeError(f"Audio conversion failed: {e}") from e


def extract_features(wav_path: str) -> dict:
    """
    Extract acoustic features from a WAV file (resampled to 16 kHz mono on load).

    Args:
        wav_path: Path to the audio file to analyze.

    Returns:
        Dict with:
            pitch_mean / pitch_std: F0 stats over voiced frames (Hz; 0.0 if
                nothing voiced was detected).
            energy_raw: RMS of the frame-level RMS envelope.
            speech_rate: beat-tracker tempo used as a speech-rate proxy (BPM).
            pause_count / avg_pause_duration: silent gaps > 0.2 s between
                non-silent intervals (top_db=30).
            filler_rate: heuristic estimate from mean zero-crossing rate.
            mfcc_features: per-coefficient means of 13 MFCCs.
            duration_seconds: total audio duration.
    """
    import librosa  # local import: heavy optional dependency

    y, sr = librosa.load(wav_path, sr=16000, mono=True)
    duration = librosa.get_duration(y=y, sr=sr)

    # ── Pitch ────────────────────────────────────────────────────────────────
    f0, voiced_flag, _ = librosa.pyin(
        y, fmin=librosa.note_to_hz("C2"), fmax=librosa.note_to_hz("C7"), sr=sr
    )
    if voiced_flag is not None:
        # Keep voiced frames only, then drop NaNs explicitly — pyin marks
        # unvoiced frames as NaN, and any stray NaN would poison mean/std.
        f0_clean = f0[voiced_flag == 1]
        f0_clean = f0_clean[~np.isnan(f0_clean)]
    else:
        f0_clean = np.array([])
    pitch_mean = float(np.mean(f0_clean)) if len(f0_clean) > 0 else 0.0
    pitch_std = float(np.std(f0_clean)) if len(f0_clean) > 0 else 0.0

    # ── Energy / RMS ─────────────────────────────────────────────────────────
    rms = librosa.feature.rms(y=y)[0]
    energy_raw = float(np.sqrt(np.mean(rms ** 2)))

    # ── Tempo (speech rate proxy) ─────────────────────────────────────────────
    tempo_arr, _ = librosa.beat.beat_track(y=y, sr=sr)
    # Depending on the librosa version, tempo may be a Python float, a 0-d
    # ndarray, or a 1-element ndarray. np.isscalar() is False for 0-d arrays
    # and indexing them with [0] raises; atleast_1d handles all three shapes.
    tempo = float(np.atleast_1d(tempo_arr)[0])

    # ── Pauses (silence detection) ────────────────────────────────────────────
    intervals = librosa.effects.split(y, top_db=30)
    pauses = []
    for i in range(1, len(intervals)):
        gap = (intervals[i][0] - intervals[i - 1][1]) / sr
        if gap > 0.2:  # ignore micro-gaps; only count perceptible pauses
            pauses.append(gap)
    pause_count = len(pauses)
    avg_pause_duration = float(np.mean(pauses)) if pauses else 0.0

    # ── MFCCs ────────────────────────────────────────────────────────────────
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_means = [round(float(np.mean(mfccs[i])), 4) for i in range(13)]

    # ── Filler Rate (estimate via zero crossing rate) ─────────────────────────
    # Heuristic: high ZCR in quiet segments is taken as a proxy for filler
    # sounds (um, uh, er). NOTE(review): this is an approximation, not a
    # validated filler detector.
    zcr = librosa.feature.zero_crossing_rate(y)[0]
    filler_rate = float(np.mean(zcr)) * 2  # scale to ~0–1 range

    return {
        "pitch_mean": round(pitch_mean, 2),
        "pitch_std": round(pitch_std, 2),
        "energy_raw": round(energy_raw, 6),
        "speech_rate": round(tempo, 2),
        "pause_count": pause_count,
        "avg_pause_duration": round(avg_pause_duration, 3),
        "filler_rate": round(filler_rate, 4),
        "mfcc_features": mfcc_means,
        "duration_seconds": round(duration, 2),
    }