Spaces:

aryan083
/

Speech-To-Text

Sleeping

App Files Files Community

Speech-To-Text / src /utils /audio_utils.py

aryan083

added basic ENGLISH STT

312e168 5 months ago

raw

history blame contribute delete

7.98 kB

	import librosa
	import numpy as np
	import soundfile as sf
	from typing import Union, Tuple, Optional
	import logging
	import os
	from pathlib import Path

	class AudioProcessor:
	    """
	    Audio processing utilities for speech-to-text preprocessing.

	    Provides helpers to load, clean, validate, inspect, convert and chunk
	    audio so that it is in good shape for transcription.
	    """

	    # Largest file (in bytes) that validate_audio() accepts.
	    MAX_FILE_SIZE_BYTES = 100 * 1024 * 1024

	    # Number of zero samples returned when no usable audio is available.
	    FALLBACK_SAMPLES = 1024

	    def __init__(self, target_sr: int = 16000):
	        """
	        Create a processor.

	        Args:
	            target_sr: Sample rate (Hz) that files are resampled to when
	                loaded by preprocess_audio, convert_audio_format and
	                split_audio.
	        """
	        self.target_sr = target_sr
	        self.logger = logging.getLogger(__name__)

	    def preprocess_audio(self, audio_input: Union[str, os.PathLike, np.ndarray],
	                         normalize: bool = True,
	                         trim_silence: bool = True,
	                         noise_reduction: bool = False) -> np.ndarray:
	        """
	        Preprocess audio for optimal speech recognition.

	        Args:
	            audio_input: Path to an audio file (``str`` or path-like), or a
	                numpy array of samples. Arrays are assumed to already be at
	                ``target_sr``; files are resampled to it while loading.
	            normalize: Whether to normalize audio amplitude
	            trim_silence: Whether to trim silence from beginning/end
	            noise_reduction: Whether to apply basic noise reduction

	        Returns:
	            Preprocessed audio as numpy array. If the audio is empty, a
	            buffer of ``FALLBACK_SAMPLES`` zeros is returned instead. If
	            processing fails, a non-empty input array is returned unchanged;
	            otherwise the same zero buffer is returned.
	        """
	        try:
	            # Accept pathlib.Path as well as str; previously a Path fell
	            # through to the "already an array" branch and failed.
	            if isinstance(audio_input, (str, os.PathLike)):
	                audio, _ = librosa.load(os.fspath(audio_input), sr=self.target_sr)
	            else:
	                audio = audio_input

	            # Bail out before the librosa calls, which raise on empty input.
	            if np.size(audio) == 0:
	                self.logger.warning("Audio is empty after preprocessing")
	                return np.zeros(self.FALLBACK_SAMPLES)

	            # Normalize audio
	            if normalize:
	                audio = librosa.util.normalize(audio)

	            # Trim silence
	            if trim_silence:
	                audio, _ = librosa.effects.trim(audio, top_db=20)

	            # Basic noise reduction using spectral gating
	            if noise_reduction:
	                audio = self._reduce_noise(audio)

	            # Trimming can remove everything from an all-quiet recording.
	            if np.size(audio) == 0:
	                self.logger.warning("Audio is empty after preprocessing")
	                return np.zeros(self.FALLBACK_SAMPLES)

	            return audio

	        except Exception as e:
	            self.logger.error(f"Audio preprocessing error: {e}")
	            # Best effort: hand back the caller's samples when there are
	            # any, otherwise a minimal silent buffer.
	            if isinstance(audio_input, np.ndarray) and audio_input.size > 0:
	                return audio_input
	            return np.zeros(self.FALLBACK_SAMPLES)

	    def _reduce_noise(self, audio: np.ndarray, noise_factor: float = 0.1) -> np.ndarray:
	        """
	        Simple noise reduction using spectral subtraction.

	        Args:
	            audio: Input audio signal
	            noise_factor: Factor for noise reduction (0.0 to 1.0)

	        Returns:
	            Noise-reduced audio with the same number of samples as the
	            input, or the input itself if the reduction fails.
	        """
	        try:
	            # Compute STFT and split it into magnitude and phase
	            stft = librosa.stft(audio)
	            clean_magnitude = self._spectral_subtract(np.abs(stft), noise_factor)

	            # Recombine the cleaned magnitude with the original phase
	            clean_stft = clean_magnitude * np.exp(1j * np.angle(stft))

	            # Pin the output length so the result lines up with the input
	            return librosa.istft(clean_stft, length=audio.shape[-1])

	        except Exception as e:
	            self.logger.warning(f"Noise reduction failed: {e}")
	            return audio

	    @staticmethod
	    def _spectral_subtract(magnitude: np.ndarray, noise_factor: float) -> np.ndarray:
	        """
	        Subtract an estimated noise floor from an STFT magnitude array.

	        The noise floor is the per-frequency mean of the leading frames
	        (a quarter of all frames, capped at 10, but always at least one).

	        Args:
	            magnitude: Magnitudes with time frames on the last axis
	            noise_factor: Fraction of the noise floor to subtract

	        Returns:
	            Magnitudes after subtraction, floored at 10% of the original
	            value so no bin is driven to zero or below.
	        """
	        total_frames = magnitude.shape[-1]
	        if total_frames == 0:
	            return magnitude

	        # At least one frame: averaging an empty slice yields NaN, which
	        # would then propagate through the whole reconstructed signal.
	        noise_frames = max(1, min(10, total_frames // 4))
	        noise_profile = np.mean(magnitude[..., :noise_frames], axis=-1, keepdims=True)

	        reduced = magnitude - noise_factor * noise_profile
	        return np.maximum(reduced, 0.1 * magnitude)

	    def validate_audio(self, audio_path: str) -> Tuple[bool, str]:
	        """
	        Validate audio file for processing.

	        Args:
	            audio_path: Path to audio file

	        Returns:
	            Tuple of (is_valid, message)
	        """
	        try:
	            if not os.path.exists(audio_path):
	                return False, "Audio file does not exist"

	            if not os.path.isfile(audio_path):
	                return False, "Audio path is not a file"

	            # Check file size
	            file_size = os.path.getsize(audio_path)
	            if file_size == 0:
	                return False, "Audio file is empty"

	            if file_size > self.MAX_FILE_SIZE_BYTES:
	                limit_mb = self.MAX_FILE_SIZE_BYTES // (1024 * 1024)
	                return False, f"Audio file too large (>{limit_mb}MB)"

	            # Decode only the first second; sr=None skips resampling, which
	            # is irrelevant to whether the file can be read.
	            try:
	                audio, _ = librosa.load(audio_path, sr=None, duration=1.0)
	                if len(audio) == 0:
	                    return False, "Audio file contains no audio data"
	            except Exception as e:
	                return False, f"Cannot load audio file: {str(e)}"

	            return True, "Audio file is valid"

	        except Exception as e:
	            return False, f"Audio validation error: {str(e)}"

	    def get_audio_info(self, audio_path: str) -> dict:
	        """
	        Get information about audio file.

	        Args:
	            audio_path: Path to audio file

	        Returns:
	            Dictionary with keys file_path, file_size_mb, duration_seconds,
	            sample_rate, channels, samples (per channel) and format; or a
	            dictionary with a single "error" key if the file cannot be read.
	        """
	        try:
	            # Get file info
	            file_size = os.path.getsize(audio_path)

	            # Keep the native rate and channel layout. With the default
	            # mono=True the channel count would always be reported as 1.
	            audio, sr = librosa.load(audio_path, sr=None, mono=False)

	            # Samples are on the last axis for mono and multi-channel alike
	            n_samples = audio.shape[-1]
	            n_channels = 1 if audio.ndim == 1 else audio.shape[0]

	            return {
	                "file_path": audio_path,
	                "file_size_mb": file_size / (1024 * 1024),
	                "duration_seconds": n_samples / sr,
	                "sample_rate": sr,
	                "channels": n_channels,
	                "samples": n_samples,
	                "format": Path(audio_path).suffix.lower()
	            }

	        except Exception as e:
	            return {
	                "error": f"Cannot get audio info: {str(e)}"
	            }

	    def convert_audio_format(self, input_path: str, output_path: str,
	                             target_format: str = "wav") -> bool:
	        """
	        Convert audio to different format.

	        The audio is resampled to ``target_sr`` while loading, so the
	        output file is written at that rate.

	        Args:
	            input_path: Input audio file path
	            output_path: Output audio file path
	            target_format: Target format (wav, flac, etc.); a leading dot
	                such as ".wav" is tolerated

	        Returns:
	            Success status
	        """
	        try:
	            # Load audio
	            audio, sr = librosa.load(input_path, sr=self.target_sr)

	            # Save in target format
	            sf.write(output_path, audio, sr, format=target_format.lstrip(".").upper())

	            return True

	        except Exception as e:
	            self.logger.error(f"Audio conversion error: {e}")
	            return False

	    def split_audio(self, audio_path: str, chunk_duration: int = 30) -> list:
	        """
	        Split long audio into chunks for processing.

	        Args:
	            audio_path: Path to audio file
	            chunk_duration: Duration of each chunk in seconds (must be
	                positive; fractional values are accepted)

	        Returns:
	            List of audio chunks as numpy arrays, or an empty list if the
	            file cannot be loaded or chunk_duration is invalid.
	        """
	        try:
	            # Load full audio
	            audio, sr = librosa.load(audio_path, sr=self.target_sr)
	            return self._chunk_signal(audio, sr, chunk_duration)

	        except Exception as e:
	            self.logger.error(f"Audio splitting error: {e}")
	            return []

	    @staticmethod
	    def _chunk_signal(audio: np.ndarray, sr: int, chunk_duration: float) -> list:
	        """
	        Slice a signal into consecutive chunks of chunk_duration seconds.

	        A chunk is kept when it is full-length, when it is longer than one
	        second, or when it is the first chunk (so a short recording is not
	        discarded entirely). Only a trailing remainder of one second or
	        less is dropped.

	        Args:
	            audio: Samples to split
	            sr: Sample rate of ``audio`` in Hz
	            chunk_duration: Chunk length in seconds

	        Returns:
	            List of chunks in their original order

	        Raises:
	            ValueError: If chunk_duration does not yield at least one sample
	        """
	        # range() needs a positive integer step
	        chunk_samples = int(round(chunk_duration * sr))
	        if chunk_samples <= 0:
	            raise ValueError("chunk_duration must be positive")

	        chunks = []
	        for start in range(0, len(audio), chunk_samples):
	            chunk = audio[start:start + chunk_samples]
	            is_full = len(chunk) == chunk_samples
	            if is_full or len(chunk) > sr or start == 0:
	                chunks.append(chunk)

	        return chunks