Spaces:
Running
Running
| """ | |
| Audio processing: convert to WAV 16kHz mono and extract librosa features. | |
| Temp files are deleted immediately after feature extraction. | |
| """ | |
| import os | |
| import tempfile | |
| import numpy as np | |
| from typing import Optional | |
def convert_to_wav(input_path: str) -> str:
    """Convert an audio file to a 16 kHz mono WAV written to a new temp file.

    The input is decoded with pydub's ``AudioSegment.from_file``, resampled
    to 16000 Hz, down-mixed to one channel and exported as WAV.

    Args:
        input_path: Path to the source audio file.

    Returns:
        Path of the newly created temporary ``.wav`` file. This function does
        not delete it; the caller owns the file and must remove it.

    Raises:
        RuntimeError: If any step fails (including pydub being unavailable,
            since the import happens inside the guarded section). The
            original exception is attached as ``__cause__``.
    """
    wav_path = None
    try:
        # Imported lazily so the module can be loaded without pydub installed.
        from pydub import AudioSegment

        audio = AudioSegment.from_file(input_path)
        audio = audio.set_frame_rate(16000).set_channels(1)

        # mkstemp returns an open descriptor; close it right away because
        # export() opens the path on its own.
        wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
        os.close(wav_fd)

        audio.export(wav_path, format="wav")
        return wav_path
    except Exception as e:
        # Do not leave a half-written temp file behind when conversion fails
        # after the file has already been created.
        if wav_path is not None:
            try:
                os.remove(wav_path)
            except OSError:
                pass
        raise RuntimeError(f"Audio conversion failed: {e}") from e
def extract_features(wav_path: str) -> dict:
    """
    Extract acoustic features from an audio file with librosa.

    The file is loaded as 16 kHz mono (librosa resamples / down-mixes on load
    because ``sr=16000, mono=True`` is passed).

    Args:
        wav_path: Path to the audio file to analyse.

    Returns:
        A dict with these keys:
            pitch_mean, pitch_std: mean / standard deviation of the pYIN f0
                estimate over voiced frames, rounded to 2 decimals
                (0.0 when no voiced frame is found).
            energy_raw: root of the mean squared frame-wise RMS, 6 decimals.
            speech_rate: tempo reported by librosa's beat tracker, used here
                as a proxy for speech rate, 2 decimals.
            pause_count: number of gaps longer than 0.2 s between
                non-silent intervals (30 dB threshold).
            avg_pause_duration: mean length of those gaps in seconds,
                3 decimals (0.0 when there are none).
            filler_rate: twice the mean zero-crossing rate, 4 decimals.
            mfcc_features: list with the per-coefficient mean of 13 MFCCs,
                each rounded to 4 decimals.
            duration_seconds: clip duration, 2 decimals.
    """
    # Imported lazily so the module can be loaded without librosa installed.
    import librosa

    n_mfcc = 13

    y, sr = librosa.load(wav_path, sr=16000, mono=True)
    duration = librosa.get_duration(y=y, sr=sr)

    # -- Pitch ---------------------------------------------------------------
    f0, voiced_flag, _ = librosa.pyin(
        y, fmin=librosa.note_to_hz("C2"), fmax=librosa.note_to_hz("C7"), sr=sr
    )
    if voiced_flag is None:
        f0_clean = np.array([])
    else:
        # Keep voiced frames only; also drop non-finite values so a stray NaN
        # cannot turn the mean/std into NaN.
        voiced_mask = np.asarray(voiced_flag, dtype=bool) & np.isfinite(f0)
        f0_clean = f0[voiced_mask]
    has_pitch = f0_clean.size > 0
    pitch_mean = float(np.mean(f0_clean)) if has_pitch else 0.0
    pitch_std = float(np.std(f0_clean)) if has_pitch else 0.0

    # -- Energy / RMS --------------------------------------------------------
    rms = librosa.feature.rms(y=y)[0]
    energy_raw = float(np.sqrt(np.mean(rms ** 2)))

    # -- Tempo (speech rate proxy) -------------------------------------------
    tempo_raw, _ = librosa.beat.beat_track(y=y, sr=sr)
    # The tempo may come back as a plain float, a 0-d array or a 1-element
    # array; flatten so all of these are handled the same way.
    tempo_values = np.asarray(tempo_raw, dtype=float).ravel()
    tempo = float(tempo_values[0]) if tempo_values.size else 0.0

    # -- Pauses (silence detection) ------------------------------------------
    # Rows are (start_sample, end_sample) of non-silent intervals.
    intervals = np.asarray(librosa.effects.split(y, top_db=30)).reshape(-1, 2)
    # Gap between the end of one interval and the start of the next, seconds.
    gaps = (intervals[1:, 0] - intervals[:-1, 1]) / sr
    pauses = gaps[gaps > 0.2]
    pause_count = int(pauses.size)
    avg_pause_duration = float(np.mean(pauses)) if pause_count else 0.0

    # -- MFCCs ---------------------------------------------------------------
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mfcc_means = [round(float(np.mean(row)), 4) for row in mfccs]

    # -- Filler rate (zero-crossing-rate heuristic) ---------------------------
    # Mean ZCR over the whole signal, doubled. This is a rough heuristic for
    # filler sounds (um, uh, er); the value is not clamped to [0, 1].
    zcr = librosa.feature.zero_crossing_rate(y)[0]
    filler_rate = float(np.mean(zcr)) * 2

    return {
        "pitch_mean": round(pitch_mean, 2),
        "pitch_std": round(pitch_std, 2),
        "energy_raw": round(energy_raw, 6),
        "speech_rate": round(tempo, 2),
        "pause_count": pause_count,
        "avg_pause_duration": round(avg_pause_duration, 3),
        "filler_rate": round(filler_rate, 4),
        "mfcc_features": mfcc_means,
        "duration_seconds": round(duration, 2),
    }