# Author: Pranav Mishra
# Initial backend deployment - Flask API with ML models (commit 1772a46)
import numpy as np
import wave
import io
import logging
import subprocess
import tempfile
import os
from pathlib import Path
from typing import Tuple, Optional
logger = logging.getLogger(__name__)
def check_ffmpeg_available() -> bool:
    """Return True when an ffmpeg executable can be invoked on this host."""
    try:
        probe = subprocess.run(
            ['ffmpeg', '-version'],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (subprocess.SubprocessError, FileNotFoundError, subprocess.TimeoutExpired):
        # Missing binary, spawn failure, or a hung probe all mean "unavailable".
        return False
    return probe.returncode == 0
def convert_with_ffmpeg(audio_data: bytes, target_sr: int = 8000, target_format: str = 'wav') -> Optional[bytes]:
    """
    Convert audio to the target rate/format by shelling out to ffmpeg.

    Args:
        audio_data: Input audio bytes in any format ffmpeg can probe
        target_sr: Target sampling rate in Hz (default: 8000 Hz for ML models)
        target_format: Output container/format (default: wav)

    Returns:
        Converted audio bytes (mono, 16-bit PCM) or None if conversion fails
        or ffmpeg is not installed.
    """
    if not check_ffmpeg_available():
        logger.warning("ffmpeg not available for audio conversion")
        return None

    in_path = None
    out_path = None
    try:
        # Persist the input bytes so ffmpeg can probe the container itself.
        with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as src:
            src.write(audio_data)
            src.flush()
            in_path = src.name

        # Reserve an output filename only; '-y' lets ffmpeg overwrite it.
        with tempfile.NamedTemporaryFile(suffix=f'.{target_format}', delete=False) as dst:
            out_path = dst.name

        cmd = [
            'ffmpeg',
            '-i', in_path,
            '-ar', str(target_sr),    # resample to the target rate
            '-ac', '1',               # downmix to mono
            '-acodec', 'pcm_s16le',   # 16-bit PCM (standard for ML)
            '-f', target_format,      # output format
            '-loglevel', 'error',     # keep ffmpeg quiet
            '-y',                     # overwrite output
            out_path,
        ]
        logger.debug(f"Running ffmpeg conversion: {' '.join(cmd)}")

        proc = subprocess.run(cmd,
                              capture_output=True,
                              text=True,
                              timeout=30)
        if proc.returncode != 0:
            logger.error(f"ffmpeg conversion failed: {proc.stderr}")
            return None

        with open(out_path, 'rb') as f:
            converted_audio = f.read()
        logger.debug(f"ffmpeg conversion successful: "
                     f"{len(audio_data)} -> {len(converted_audio)} bytes "
                     f"({target_sr}Hz, mono, {target_format})")
        return converted_audio
    except Exception as e:
        logger.error(f"ffmpeg conversion error: {str(e)}")
        return None
    finally:
        # Best-effort removal of both scratch files.
        try:
            if in_path and os.path.exists(in_path):
                os.unlink(in_path)
            if out_path and os.path.exists(out_path):
                os.unlink(out_path)
        except Exception as cleanup_error:
            logger.warning(f"Failed to cleanup temp files: {cleanup_error}")
def convert_for_ml_models(audio_data: bytes, pipeline_type: str = 'mfcc') -> bytes:
    """
    Convert audio specifically for ML model requirements.

    Args:
        audio_data: Input audio bytes
        pipeline_type: ML pipeline type ('mfcc', 'mel_cnn', 'raw_cnn')

    Returns:
        Audio bytes optimized for the specific ML model
    """
    # Every pipeline expects the same input: 8 kHz, mono, 16-bit PCM.
    # Prefer ffmpeg because it yields the highest-quality resampling.
    via_ffmpeg = convert_with_ffmpeg(audio_data, target_sr=8000)
    if via_ffmpeg:
        logger.debug(f"Used ffmpeg for {pipeline_type} model audio conversion")
        return via_ffmpeg

    # ffmpeg unavailable or failed: fall back to the pure-Python path.
    logger.debug(f"Using fallback audio conversion for {pipeline_type} model")
    return convert_audio_format(audio_data)
def validate_audio_format(audio_data: bytes) -> bool:
    """
    Check whether the given bytes look like a decodable, supported WAV stream.

    Args:
        audio_data: Raw audio bytes

    Returns:
        True if format is supported, False otherwise
    """
    # A complete WAV header is 44 bytes; anything smaller cannot be valid.
    if len(audio_data) < 44:
        logger.debug(f"Audio data too small: {len(audio_data)} bytes (minimum 44 for WAV header)")
        return False

    # An all-zero prefix usually means an empty capture buffer.
    if audio_data[:20] == b'\x00' * 20:
        logger.error("Audio data appears to be empty/null bytes")
        return False

    # Every WAV stream begins with the RIFF chunk id.
    if not audio_data.startswith(b'RIFF'):
        logger.error(f"Audio data does not start with RIFF header. First 8 bytes: {audio_data[:8]}")
        if len(audio_data) > 20:
            logger.error(f"First 20 bytes as hex: {audio_data[:20].hex()}")
        return False

    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as wav_in:
            n_channels = wav_in.getnchannels()
            width = wav_in.getsampwidth()
            rate = wav_in.getframerate()
            n_frames = wav_in.getnframes()

            logger.debug(f"Audio format: {n_channels} channels, {width} bytes/sample, {rate} Hz, {n_frames} frames")

            # Thresholds are deliberately lenient so streaming chunks pass.
            if n_channels not in (1, 2):
                logger.warning(f"Unusual channel count: {n_channels}")
                return False
            if width not in (1, 2, 4):  # 8-bit, 16-bit, 32-bit samples
                logger.warning(f"Unusual sample width: {width}")
                return False
            if not 8000 <= rate <= 48000:
                logger.warning(f"Unusual frame rate: {rate}")
                return False
            if n_frames == 0:
                logger.warning("No audio frames found")
                return False
            return True
    except wave.Error as e:
        logger.error(f"WAV format error: {str(e)}")
        logger.error(f"Audio data size: {len(audio_data)} bytes")
        if len(audio_data) > 44:
            logger.error(f"WAV header bytes: {audio_data[:44].hex()}")
        return False
    except Exception as e:
        logger.error(f"Audio validation failed: {str(e)}")
        logger.error(f"Audio data size: {len(audio_data)} bytes")
        return False
def convert_audio_format(audio_data: bytes) -> bytes:
    """
    Convert various audio formats (WebM, OGG, MP3, etc.) to WAV format.

    Tries, in order: the dedicated WebM converter (for WebM input), pydub
    (handles most containers), the WebM converter again as a fallback, and
    finally the basic in-process WAV resampler.

    Args:
        audio_data: Input audio bytes in any supported format

    Returns:
        Converted audio bytes in WAV format

    Raises:
        Exception: If every conversion method fails
    """
    try:
        from .webm_converter import detect_audio_format, convert_webm_to_wav

        fmt = detect_audio_format(audio_data)
        logger.debug(f"Detected audio format: {fmt}")

        # WebM is the common case from browser MediaRecorder uploads.
        if fmt == 'webm':
            logger.info("Converting WebM audio to WAV (fallback method)")
            wav_bytes = convert_webm_to_wav(audio_data)
            if not wav_bytes:
                raise Exception("WebM conversion failed")
            return wav_bytes

        try:
            from pydub import AudioSegment
            import io

            # pydub handles WebM, OGG, MP3, etc.; normalize to 16 kHz mono 16-bit.
            segment = AudioSegment.from_file(io.BytesIO(audio_data))
            segment = segment.set_channels(1).set_frame_rate(16000).set_sample_width(2)
            wav_buffer = io.BytesIO()
            segment.export(wav_buffer, format="wav")
            return wav_buffer.getvalue()
        except ImportError:
            logger.warning("pydub not installed, falling back to basic WAV conversion")
            return convert_to_mono_16khz(audio_data)
        except Exception as e:
            logger.warning(f"pydub conversion failed: {str(e)}, trying fallback methods")
            if fmt in ['webm', 'unknown']:
                logger.info("Trying WebM fallback converter")
                wav_bytes = convert_webm_to_wav(audio_data)
                if wav_bytes:
                    return wav_bytes
            # Last resort: assume WAV input and resample in-process.
            return convert_to_mono_16khz(audio_data)
    except Exception as e:
        logger.error(f"All audio conversion methods failed: {str(e)}")
        raise Exception(f"Failed to convert audio format: {str(e)}")
def convert_to_mono_16khz(audio_data: bytes) -> bytes:
    """
    Convert audio to mono, 16kHz format suitable for speech recognition.

    Args:
        audio_data: Input audio bytes (WAV format, 16-bit PCM)

    Returns:
        Converted audio bytes in mono 16kHz WAV format

    Raises:
        Exception: If conversion fails (non-WAV input, unsupported sample width)
    """
    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as input_wav:
            frames = input_wav.readframes(input_wav.getnframes())
            channels = input_wav.getnchannels()
            sample_width = input_wav.getsampwidth()
            frame_rate = input_wav.getframerate()

        # Only 16-bit PCM is supported.
        if sample_width == 2:
            audio_array = np.frombuffer(frames, dtype=np.int16)
        else:
            raise Exception(f"Unsupported sample width: {sample_width}")

        # Downmix interleaved stereo to mono by averaging channels.
        if channels == 2:
            audio_array = audio_array.reshape(-1, 2)
            audio_array = np.mean(audio_array, axis=1).astype(np.int16)

        # Resample to 16 kHz with linear interpolation. The previous
        # nearest-sample approach was wrong for non-integer ratios
        # (e.g. 12 kHz input: int(1/0.75) == 1 meant no upsampling at
        # all, yielding wrong-speed audio); np.interp handles any ratio.
        if frame_rate != 16000 and len(audio_array) > 0:
            new_length = max(1, int(round(len(audio_array) * 16000 / frame_rate)))
            src_positions = np.arange(len(audio_array))
            dst_positions = np.linspace(0, len(audio_array) - 1, new_length)
            audio_array = np.interp(dst_positions, src_positions, audio_array).astype(np.int16)

        # Write the normalized stream back out as a WAV container.
        output = io.BytesIO()
        with wave.open(output, 'wb') as output_wav:
            output_wav.setnchannels(1)      # Mono
            output_wav.setsampwidth(2)      # 16-bit
            output_wav.setframerate(16000)  # 16kHz
            output_wav.writeframes(audio_array.tobytes())
        return output.getvalue()
    except Exception as e:
        logger.error(f"Audio conversion failed: {str(e)}")
        raise Exception(f"Failed to convert audio: {str(e)}")
def get_audio_duration(audio_data: bytes) -> float:
    """
    Get duration of audio in seconds.

    Args:
        audio_data: WAV audio bytes

    Returns:
        Duration in seconds (0.0 when the data cannot be parsed)
    """
    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as wav_in:
            return wav_in.getnframes() / wav_in.getframerate()
    except Exception as e:
        # Unparseable input is reported as zero-length rather than raised.
        logger.error(f"Failed to get audio duration: {str(e)}")
        return 0.0
def audio_to_numpy(audio_data: bytes) -> Tuple[np.ndarray, int]:
    """
    Convert WAV audio bytes to a normalized numpy array.

    Args:
        audio_data: WAV audio bytes (16-bit PCM)

    Returns:
        Tuple of (float32 audio array in [-1, 1], sample_rate)

    Raises:
        Exception: If the data is not parseable 16-bit WAV
    """
    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as wav_in:
            raw = wav_in.readframes(wav_in.getnframes())
            rate = wav_in.getframerate()
            n_channels = wav_in.getnchannels()
            width = wav_in.getsampwidth()

        if width != 2:
            raise Exception(f"Unsupported sample width: {width}")

        # Normalize int16 samples to float32 in [-1, 1].
        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32767.0

        # Collapse interleaved stereo into a single averaged channel.
        if n_channels == 2:
            samples = samples.reshape(-1, 2)
            samples = np.mean(samples, axis=1)

        return samples, rate
    except Exception as e:
        logger.error(f"Failed to convert audio to numpy: {str(e)}")
        raise Exception(f"Audio conversion failed: {str(e)}")
def create_test_audio(digit: str, duration: float = 1.0, sample_rate: int = 16000) -> bytes:
    """
    Create test audio data for development purposes.

    Generates a decaying two-tone chord whose frequencies depend on the
    requested digit, so each digit sounds distinct.

    Args:
        digit: Digit to simulate ('0'-'9')
        duration: Audio duration in seconds
        sample_rate: Sample rate in Hz

    Returns:
        WAV audio bytes (mono, 16-bit)

    Raises:
        Exception: If synthesis or WAV encoding fails
    """
    try:
        t = np.linspace(0, duration, int(sample_rate * duration), False)

        # Each digit maps to a distinct low/high frequency pair.
        freq_map = {
            '0': [400, 600],   # Low frequencies
            '1': [800, 1000],  # Higher frequencies
            '2': [600, 800],
            '3': [700, 900],
            '4': [500, 700],
            '5': [900, 1100],
            '6': [450, 650],
            '7': [750, 950],
            '8': [550, 750],
            '9': [850, 1050],
        }
        low_freq, high_freq = freq_map.get(digit, [440, 880])

        # Sum the two sines, then apply an exponential decay envelope.
        tone = np.sin(low_freq * 2.0 * np.pi * t) * 0.3 + np.sin(high_freq * 2.0 * np.pi * t) * 0.3
        tone = tone * np.exp(-3 * t)

        # Quantize to 16-bit PCM and wrap in a WAV container.
        pcm = (tone * 32767).astype(np.int16)
        buf = io.BytesIO()
        with wave.open(buf, 'wb') as wav_out:
            wav_out.setnchannels(1)
            wav_out.setsampwidth(2)
            wav_out.setframerate(sample_rate)
            wav_out.writeframes(pcm.tobytes())
        return buf.getvalue()
    except Exception as e:
        logger.error(f"Failed to create test audio: {str(e)}")
        raise Exception(f"Test audio creation failed: {str(e)}")