# Author: Pranav Mishra
# Initial backend deployment - Flask API with ML models (commit 1772a46)
import numpy as np
import wave
import io
import logging
import subprocess
import tempfile
import os
from pathlib import Path
from typing import Tuple, Optional
logger = logging.getLogger(__name__)
def check_ffmpeg_available() -> bool:
    """Return True when an ffmpeg executable can be invoked on this host."""
    try:
        probe = subprocess.run(
            ['ffmpeg', '-version'],
            capture_output=True,
            text=True,
            timeout=5,
        )
    except (subprocess.SubprocessError, FileNotFoundError, subprocess.TimeoutExpired):
        # Missing binary, spawn failure, or a hung probe all mean "unavailable".
        return False
    return probe.returncode == 0
def convert_with_ffmpeg(audio_data: bytes, target_sr: int = 8000, target_format: str = 'wav') -> Optional[bytes]:
    """
    Convert audio to the target rate/format by shelling out to ffmpeg.

    Args:
        audio_data: Input audio bytes in any format ffmpeg can probe
        target_sr: Target sampling rate in Hz (default: 8000 Hz for ML models)
        target_format: Output container/format (default: wav)

    Returns:
        Converted audio bytes (mono, 16-bit PCM) or None if conversion fails
        or ffmpeg is not installed.
    """
    if not check_ffmpeg_available():
        logger.warning("ffmpeg not available for audio conversion")
        return None

    in_path = None
    out_path = None
    try:
        # Persist the input bytes so ffmpeg can probe the container itself.
        with tempfile.NamedTemporaryFile(suffix='.input', delete=False) as src:
            src.write(audio_data)
            src.flush()
            in_path = src.name

        # Reserve an output filename only; '-y' lets ffmpeg overwrite it.
        with tempfile.NamedTemporaryFile(suffix=f'.{target_format}', delete=False) as dst:
            out_path = dst.name

        cmd = [
            'ffmpeg',
            '-i', in_path,
            '-ar', str(target_sr),    # resample to the target rate
            '-ac', '1',               # downmix to mono
            '-acodec', 'pcm_s16le',   # 16-bit PCM (standard for ML)
            '-f', target_format,      # output format
            '-loglevel', 'error',     # keep ffmpeg quiet
            '-y',                     # overwrite output
            out_path,
        ]
        logger.debug(f"Running ffmpeg conversion: {' '.join(cmd)}")

        proc = subprocess.run(cmd,
                              capture_output=True,
                              text=True,
                              timeout=30)
        if proc.returncode != 0:
            logger.error(f"ffmpeg conversion failed: {proc.stderr}")
            return None

        with open(out_path, 'rb') as f:
            converted_audio = f.read()
        logger.debug(f"ffmpeg conversion successful: "
                     f"{len(audio_data)} -> {len(converted_audio)} bytes "
                     f"({target_sr}Hz, mono, {target_format})")
        return converted_audio
    except Exception as e:
        logger.error(f"ffmpeg conversion error: {str(e)}")
        return None
    finally:
        # Best-effort removal of both scratch files.
        try:
            if in_path and os.path.exists(in_path):
                os.unlink(in_path)
            if out_path and os.path.exists(out_path):
                os.unlink(out_path)
        except Exception as cleanup_error:
            logger.warning(f"Failed to cleanup temp files: {cleanup_error}")
def convert_for_ml_models(audio_data: bytes, pipeline_type: str = 'mfcc') -> bytes:
    """
    Convert audio specifically for ML model requirements.

    Args:
        audio_data: Input audio bytes
        pipeline_type: ML pipeline type ('mfcc', 'mel_cnn', 'raw_cnn')

    Returns:
        Audio bytes optimized for the specific ML model
    """
    # Every pipeline expects the same input: 8 kHz, mono, 16-bit PCM.
    # Prefer ffmpeg because it yields the highest-quality resampling.
    via_ffmpeg = convert_with_ffmpeg(audio_data, target_sr=8000)
    if via_ffmpeg:
        logger.debug(f"Used ffmpeg for {pipeline_type} model audio conversion")
        return via_ffmpeg

    # ffmpeg unavailable or failed: fall back to the pure-Python path.
    logger.debug(f"Using fallback audio conversion for {pipeline_type} model")
    return convert_audio_format(audio_data)
def validate_audio_format(audio_data: bytes) -> bool:
    """
    Check whether the given bytes look like a decodable, supported WAV stream.

    Args:
        audio_data: Raw audio bytes

    Returns:
        True if format is supported, False otherwise
    """
    # A complete WAV header is 44 bytes; anything smaller cannot be valid.
    if len(audio_data) < 44:
        logger.debug(f"Audio data too small: {len(audio_data)} bytes (minimum 44 for WAV header)")
        return False

    # An all-zero prefix usually means an empty capture buffer.
    if audio_data[:20] == b'\x00' * 20:
        logger.error("Audio data appears to be empty/null bytes")
        return False

    # Every WAV stream begins with the RIFF chunk id.
    if not audio_data.startswith(b'RIFF'):
        logger.error(f"Audio data does not start with RIFF header. First 8 bytes: {audio_data[:8]}")
        if len(audio_data) > 20:
            logger.error(f"First 20 bytes as hex: {audio_data[:20].hex()}")
        return False

    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as wav_in:
            n_channels = wav_in.getnchannels()
            width = wav_in.getsampwidth()
            rate = wav_in.getframerate()
            n_frames = wav_in.getnframes()

            logger.debug(f"Audio format: {n_channels} channels, {width} bytes/sample, {rate} Hz, {n_frames} frames")

            # Thresholds are deliberately lenient so streaming chunks pass.
            if n_channels not in (1, 2):
                logger.warning(f"Unusual channel count: {n_channels}")
                return False
            if width not in (1, 2, 4):  # 8-bit, 16-bit, 32-bit samples
                logger.warning(f"Unusual sample width: {width}")
                return False
            if not 8000 <= rate <= 48000:
                logger.warning(f"Unusual frame rate: {rate}")
                return False
            if n_frames == 0:
                logger.warning("No audio frames found")
                return False
            return True
    except wave.Error as e:
        logger.error(f"WAV format error: {str(e)}")
        logger.error(f"Audio data size: {len(audio_data)} bytes")
        if len(audio_data) > 44:
            logger.error(f"WAV header bytes: {audio_data[:44].hex()}")
        return False
    except Exception as e:
        logger.error(f"Audio validation failed: {str(e)}")
        logger.error(f"Audio data size: {len(audio_data)} bytes")
        return False
def convert_audio_format(audio_data: bytes) -> bytes:
    """
    Convert various audio formats (WebM, OGG, MP3, etc.) to WAV format.

    Tries, in order: the dedicated WebM converter (for WebM input), pydub
    (handles most containers), the WebM converter again as a fallback, and
    finally the basic in-process WAV resampler.

    Args:
        audio_data: Input audio bytes in any supported format

    Returns:
        Converted audio bytes in WAV format

    Raises:
        Exception: If every conversion method fails
    """
    try:
        from .webm_converter import detect_audio_format, convert_webm_to_wav

        fmt = detect_audio_format(audio_data)
        logger.debug(f"Detected audio format: {fmt}")

        # WebM is the common case from browser MediaRecorder uploads.
        if fmt == 'webm':
            logger.info("Converting WebM audio to WAV (fallback method)")
            wav_bytes = convert_webm_to_wav(audio_data)
            if not wav_bytes:
                raise Exception("WebM conversion failed")
            return wav_bytes

        try:
            from pydub import AudioSegment
            import io

            # pydub handles WebM, OGG, MP3, etc.; normalize to 16 kHz mono 16-bit.
            segment = AudioSegment.from_file(io.BytesIO(audio_data))
            segment = segment.set_channels(1).set_frame_rate(16000).set_sample_width(2)
            wav_buffer = io.BytesIO()
            segment.export(wav_buffer, format="wav")
            return wav_buffer.getvalue()
        except ImportError:
            logger.warning("pydub not installed, falling back to basic WAV conversion")
            return convert_to_mono_16khz(audio_data)
        except Exception as e:
            logger.warning(f"pydub conversion failed: {str(e)}, trying fallback methods")
            if fmt in ['webm', 'unknown']:
                logger.info("Trying WebM fallback converter")
                wav_bytes = convert_webm_to_wav(audio_data)
                if wav_bytes:
                    return wav_bytes
            # Last resort: assume WAV input and resample in-process.
            return convert_to_mono_16khz(audio_data)
    except Exception as e:
        logger.error(f"All audio conversion methods failed: {str(e)}")
        raise Exception(f"Failed to convert audio format: {str(e)}")
def convert_to_mono_16khz(audio_data: bytes) -> bytes:
    """
    Convert audio to mono, 16kHz format suitable for speech recognition.

    Args:
        audio_data: Input audio bytes (WAV format, 16-bit PCM)

    Returns:
        Converted audio bytes in mono 16kHz WAV format

    Raises:
        Exception: If conversion fails (non-WAV input, unsupported sample width)
    """
    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as input_wav:
            frames = input_wav.readframes(input_wav.getnframes())
            channels = input_wav.getnchannels()
            sample_width = input_wav.getsampwidth()
            frame_rate = input_wav.getframerate()

        # Only 16-bit PCM is supported.
        if sample_width == 2:
            audio_array = np.frombuffer(frames, dtype=np.int16)
        else:
            raise Exception(f"Unsupported sample width: {sample_width}")

        # Downmix interleaved stereo to mono by averaging channels.
        if channels == 2:
            audio_array = audio_array.reshape(-1, 2)
            audio_array = np.mean(audio_array, axis=1).astype(np.int16)

        # Resample to 16 kHz with linear interpolation. The previous
        # nearest-sample approach was wrong for non-integer ratios
        # (e.g. 12 kHz input: int(1/0.75) == 1 meant no upsampling at
        # all, yielding wrong-speed audio); np.interp handles any ratio.
        if frame_rate != 16000 and len(audio_array) > 0:
            new_length = max(1, int(round(len(audio_array) * 16000 / frame_rate)))
            src_positions = np.arange(len(audio_array))
            dst_positions = np.linspace(0, len(audio_array) - 1, new_length)
            audio_array = np.interp(dst_positions, src_positions, audio_array).astype(np.int16)

        # Write the normalized stream back out as a WAV container.
        output = io.BytesIO()
        with wave.open(output, 'wb') as output_wav:
            output_wav.setnchannels(1)      # Mono
            output_wav.setsampwidth(2)      # 16-bit
            output_wav.setframerate(16000)  # 16kHz
            output_wav.writeframes(audio_array.tobytes())
        return output.getvalue()
    except Exception as e:
        logger.error(f"Audio conversion failed: {str(e)}")
        raise Exception(f"Failed to convert audio: {str(e)}")
def get_audio_duration(audio_data: bytes) -> float:
    """
    Get duration of audio in seconds.

    Args:
        audio_data: WAV audio bytes

    Returns:
        Duration in seconds (0.0 when the data cannot be parsed)
    """
    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as wav_in:
            return wav_in.getnframes() / wav_in.getframerate()
    except Exception as e:
        # Unparseable input is reported as zero-length rather than raised.
        logger.error(f"Failed to get audio duration: {str(e)}")
        return 0.0
def audio_to_numpy(audio_data: bytes) -> Tuple[np.ndarray, int]:
    """
    Convert WAV audio bytes to a normalized numpy array.

    Args:
        audio_data: WAV audio bytes (16-bit PCM)

    Returns:
        Tuple of (float32 audio array in [-1, 1], sample_rate)

    Raises:
        Exception: If the data is not parseable 16-bit WAV
    """
    try:
        with wave.open(io.BytesIO(audio_data), 'rb') as wav_in:
            raw = wav_in.readframes(wav_in.getnframes())
            rate = wav_in.getframerate()
            n_channels = wav_in.getnchannels()
            width = wav_in.getsampwidth()

        if width != 2:
            raise Exception(f"Unsupported sample width: {width}")

        # Normalize int16 samples to float32 in [-1, 1].
        samples = np.frombuffer(raw, dtype=np.int16).astype(np.float32) / 32767.0

        # Collapse interleaved stereo into a single averaged channel.
        if n_channels == 2:
            samples = samples.reshape(-1, 2)
            samples = np.mean(samples, axis=1)

        return samples, rate
    except Exception as e:
        logger.error(f"Failed to convert audio to numpy: {str(e)}")
        raise Exception(f"Audio conversion failed: {str(e)}")
def create_test_audio(digit: str, duration: float = 1.0, sample_rate: int = 16000) -> bytes:
    """
    Create test audio data for development purposes.

    Generates a decaying two-tone chord whose frequencies depend on the
    requested digit, so each digit sounds distinct.

    Args:
        digit: Digit to simulate ('0'-'9')
        duration: Audio duration in seconds
        sample_rate: Sample rate in Hz

    Returns:
        WAV audio bytes (mono, 16-bit)

    Raises:
        Exception: If synthesis or WAV encoding fails
    """
    try:
        t = np.linspace(0, duration, int(sample_rate * duration), False)

        # Each digit maps to a distinct low/high frequency pair.
        freq_map = {
            '0': [400, 600],   # Low frequencies
            '1': [800, 1000],  # Higher frequencies
            '2': [600, 800],
            '3': [700, 900],
            '4': [500, 700],
            '5': [900, 1100],
            '6': [450, 650],
            '7': [750, 950],
            '8': [550, 750],
            '9': [850, 1050],
        }
        low_freq, high_freq = freq_map.get(digit, [440, 880])

        # Sum the two sines, then apply an exponential decay envelope.
        tone = np.sin(low_freq * 2.0 * np.pi * t) * 0.3 + np.sin(high_freq * 2.0 * np.pi * t) * 0.3
        tone = tone * np.exp(-3 * t)

        # Quantize to 16-bit PCM and wrap in a WAV container.
        pcm = (tone * 32767).astype(np.int16)
        buf = io.BytesIO()
        with wave.open(buf, 'wb') as wav_out:
            wav_out.setnchannels(1)
            wav_out.setsampwidth(2)
            wav_out.setframerate(sample_rate)
            wav_out.writeframes(pcm.tobytes())
        return buf.getvalue()
    except Exception as e:
        logger.error(f"Failed to create test audio: {str(e)}")
        raise Exception(f"Test audio creation failed: {str(e)}")