Spaces:

E5K7
/

InnerVoice

Running

App Files Files Community

InnerVoice / backend /services /audio_processor.py

E5K7

Initial commit: InnerVoice MVP

bf04727 about 1 month ago

raw

history blame contribute delete

3.8 kB

	"""
	Audio processing: convert to WAV 16kHz mono and extract librosa features.
	Temp files are deleted immediately after feature extraction.
	"""
	import os
	import tempfile
	import numpy as np
	from typing import Optional


	def convert_to_wav(input_path: str) -> str:
	    """Convert an audio file to a 16 kHz mono WAV stored in a new temp file.

	    The input is decoded with pydub's ``AudioSegment.from_file``, set to a
	    16000 Hz frame rate and a single channel, then exported to a file created
	    with ``tempfile.mkstemp(suffix=".wav")``.

	    Args:
	        input_path: Path of the audio file to convert.

	    Returns:
	        Path of the newly created WAV file. This function does not delete it;
	        the caller is responsible for removing it when done.

	    Raises:
	        RuntimeError: If pydub cannot be imported or any conversion step
	            fails. The underlying exception is attached as ``__cause__``.
	    """
	    wav_path = None
	    try:
	        # Imported lazily (inside the try) so a missing pydub installation is
	        # reported through the same RuntimeError as any other failure.
	        from pydub import AudioSegment

	        audio = AudioSegment.from_file(input_path)
	        audio = audio.set_frame_rate(16000).set_channels(1)

	        # mkstemp returns an open descriptor; close it right away because the
	        # export below reopens the file by path.
	        wav_fd, wav_path = tempfile.mkstemp(suffix=".wav")
	        os.close(wav_fd)

	        audio.export(wav_path, format="wav")
	        return wav_path
	    except Exception as e:
	        # Do not leave a half-written temp file behind when the export (or
	        # closing the descriptor) fails after mkstemp already created it.
	        if wav_path is not None:
	            try:
	                os.remove(wav_path)
	            except OSError:
	                # Best-effort cleanup: the conversion error is what matters.
	                pass
	        raise RuntimeError(f"Audio conversion failed: {e}") from e


	def extract_features(wav_path: str) -> dict:
	    """Extract acoustic features from a WAV file using librosa.

	    The file is loaded with ``librosa.load(sr=16000, mono=True)`` and the
	    following features are computed from the resulting signal.

	    Args:
	        wav_path: Path of the audio file to analyse (expected to be the
	            16 kHz mono WAV produced by ``convert_to_wav``).

	    Returns:
	        A dict with these keys:

	        * ``pitch_mean`` / ``pitch_std``: mean and standard deviation of the
	          pYIN fundamental-frequency estimate over voiced frames, rounded to
	          2 decimals; ``0.0`` when no frame is voiced.
	        * ``energy_raw``: root of the mean squared frame-wise RMS, rounded to
	          6 decimals.
	        * ``speech_rate``: tempo estimate from ``librosa.beat.beat_track``,
	          used as a proxy for speech rate, rounded to 2 decimals.
	        * ``pause_count``: number of gaps longer than 0.2 s between
	          consecutive non-silent intervals (``top_db=30``).
	        * ``avg_pause_duration``: mean length in seconds of those gaps,
	          rounded to 3 decimals; ``0.0`` when there are none.
	        * ``filler_rate``: twice the mean zero-crossing rate, rounded to
	          4 decimals (heuristic, see the comment in the body).
	        * ``mfcc_features``: list with the mean of each of the 13 MFCC
	          coefficients, each rounded to 4 decimals.
	        * ``duration_seconds``: signal duration, rounded to 2 decimals.
	    """
	    import librosa

	    y, sr = librosa.load(wav_path, sr=16000, mono=True)
	    duration = librosa.get_duration(y=y, sr=sr)

	    # ── Pitch ────────────────────────────────────────────────────────────
	    f0, voiced_flag, _ = librosa.pyin(
	        y, fmin=librosa.note_to_hz("C2"), fmax=librosa.note_to_hz("C7"), sr=sr
	    )
	    voiced_f0 = f0[voiced_flag == 1] if voiced_flag is not None else np.array([])
	    # Drop any non-finite estimates so a stray NaN cannot poison the stats.
	    voiced_f0 = voiced_f0[np.isfinite(voiced_f0)]
	    if voiced_f0.size > 0:
	        pitch_mean = float(np.mean(voiced_f0))
	        pitch_std = float(np.std(voiced_f0))
	    else:
	        pitch_mean = 0.0
	        pitch_std = 0.0

	    # ── Energy / RMS ─────────────────────────────────────────────────────
	    rms = librosa.feature.rms(y=y)[0]
	    energy_raw = float(np.sqrt(np.mean(rms ** 2)))

	    # ── Tempo (speech rate proxy) ────────────────────────────────────────
	    # Depending on the librosa version the tempo comes back as a Python/NumPy
	    # scalar, a 0-d array or a 1-element array; atleast_1d handles all three
	    # (plain `tempo[0]` raises IndexError on a 0-d array).
	    tempo_raw, _ = librosa.beat.beat_track(y=y, sr=sr)
	    tempo_values = np.atleast_1d(tempo_raw).ravel()
	    tempo = float(tempo_values[0]) if tempo_values.size > 0 else 0.0

	    # ── Pauses (silence detection) ───────────────────────────────────────
	    # `intervals` holds (start, end) sample indices of non-silent regions;
	    # a pause is the gap between one region's end and the next one's start.
	    intervals = librosa.effects.split(y, top_db=30)
	    gaps = (
	        (following[0] - previous[1]) / sr
	        for previous, following in zip(intervals[:-1], intervals[1:])
	    )
	    pauses = [gap for gap in gaps if gap > 0.2]
	    pause_count = len(pauses)
	    avg_pause_duration = float(np.mean(pauses)) if pauses else 0.0

	    # ── MFCCs ────────────────────────────────────────────────────────────
	    # One row per coefficient; iterating rows keeps this in sync with n_mfcc.
	    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
	    mfcc_means = [round(float(np.mean(coeff)), 4) for coeff in mfccs]

	    # ── Filler rate (zero-crossing-rate heuristic) ───────────────────────
	    # Crude proxy for filler sounds (um, uh, er): the zero-crossing rate is
	    # averaged over ALL frames (not only quiet ones) and doubled. The result
	    # is not clamped, so it can exceed 1.0 for very noisy signals.
	    zcr = librosa.feature.zero_crossing_rate(y)[0]
	    filler_rate = float(np.mean(zcr)) * 2

	    return {
	        "pitch_mean": round(pitch_mean, 2),
	        "pitch_std": round(pitch_std, 2),
	        "energy_raw": round(energy_raw, 6),
	        "speech_rate": round(tempo, 2),
	        "pause_count": pause_count,
	        "avg_pause_duration": round(avg_pause_duration, 3),
	        "filler_rate": round(filler_rate, 4),
	        "mfcc_features": mfcc_means,
	        "duration_seconds": round(duration, 2),
	    }