Spaces:

Edmon02
/

SpeechT5_hy

Runtime error

App Files Files Community

SpeechT5_hy / src /audio_processing.py

Edmon02

Enhance deployment and performance optimizations for TTS system

797f6a7 8 months ago

raw

history blame contribute delete

12.6 kB

	"""
	Audio Post-Processing Module
	============================

	Handles audio post-processing, optimization, and quality enhancement.
	Implements cross-fading, noise reduction, and dynamic range optimization.
	Optimized for Hugging Face Spaces deployment.
	"""

	import logging
	import time
	from typing import Tuple, List, Optional
	import numpy as np
	import scipy.signal
	from scipy.ndimage import gaussian_filter1d

	logger = logging.getLogger(__name__)


	class AudioProcessor:
	    """Post-processor for TTS output: crossfade, noise gate, compression, normalization.

	    Sample values are handled on the int16 scale (-32768..32767) and every
	    processing step returns an int16 array. The noise gate relies on
	    ``np.convolve``, so audio is expected to be one-dimensional.
	    """

	    def __init__(self,
	                 crossfade_duration: float = 0.1,
	                 sample_rate: int = 16000,
	                 apply_noise_gate: bool = True,
	                 normalize_audio: bool = True):
	        """
	        Initialize audio processor.

	        Args:
	            crossfade_duration: Duration of crossfade between chunks in seconds.
	                Negative values are treated as "no crossfade".
	            sample_rate: Audio sample rate in Hz (must be positive).
	            apply_noise_gate: Whether ``process_audio`` applies noise gating.
	            normalize_audio: Whether ``process_audio`` normalizes peak level.

	        Raises:
	            ValueError: If ``sample_rate`` is not positive.
	        """
	        if sample_rate <= 0:
	            raise ValueError(f"sample_rate must be positive, got {sample_rate}")

	        self.crossfade_duration = crossfade_duration
	        self.sample_rate = sample_rate
	        self.apply_noise_gate = apply_noise_gate
	        self.normalize_audio = normalize_audio

	        # Crossfade length in samples; clamp so a negative duration means 0.
	        self.crossfade_samples = max(0, int(crossfade_duration * sample_rate))

	        logger.info("AudioProcessor initialized with %ss crossfade", crossfade_duration)

	    @staticmethod
	    def _to_int16(samples: np.ndarray) -> np.ndarray:
	        """Round to nearest, clip to the int16 range, and cast to int16.

	        A bare ``astype(np.int16)`` truncates toward zero and wraps around on
	        overflow; rounding and clipping first avoids both artifacts.
	        """
	        return np.clip(np.rint(samples), -32768, 32767).astype(np.int16)

	    def _create_crossfade_window(self, length: int) -> Tuple[np.ndarray, np.ndarray]:
	        """
	        Create complementary raised-cosine crossfade windows.

	        Args:
	            length: Length of crossfade in samples.

	        Returns:
	            Tuple of (fade_out_window, fade_in_window) as float32 arrays.
	            ``fade_out`` falls from ~1 to ~0, ``fade_in`` rises from ~0 to ~1,
	            and the two sum to 1 at every sample so a steady signal keeps its
	            level through the crossfade.
	        """
	        if length <= 0:
	            return np.zeros(0, dtype=np.float32), np.zeros(0, dtype=np.float32)

	        # Sample the ramp at bin centers so neither window hits exactly 0 or 1
	        # and the pair is symmetric around the midpoint of the overlap.
	        phase = (np.arange(length, dtype=np.float64) + 0.5) / length
	        fade_in = (0.5 - 0.5 * np.cos(np.pi * phase)).astype(np.float32)
	        fade_out = (1.0 - fade_in).astype(np.float32)

	        return fade_out, fade_in

	    def crossfade_audio_segments(self, audio_segments: List[np.ndarray]) -> np.ndarray:
	        """
	        Crossfade multiple audio segments for smooth concatenation.

	        Each junction overlaps the tail of the audio built so far with the head
	        of the next segment, so every crossfade shortens the output by the
	        crossfade length. Input arrays are not modified.

	        Args:
	            audio_segments: List of audio arrays to concatenate.

	        Returns:
	            Empty int16 array for an empty list; the single segment itself
	            (unchanged, original dtype) for a one-element list; otherwise the
	            crossfaded audio as int16.
	        """
	        if not audio_segments:
	            return np.array([], dtype=np.int16)

	        if len(audio_segments) == 1:
	            return audio_segments[0]

	        logger.debug("Crossfading %d audio segments", len(audio_segments))

	        # astype() copies, so the caller's arrays are never written to.
	        result = np.asarray(audio_segments[0]).astype(np.float32)

	        for segment in audio_segments[1:]:
	            current_segment = np.asarray(segment).astype(np.float32)

	            # Never fade over more than half of either side of the junction.
	            fade_length = min(
	                self.crossfade_samples,
	                len(result) // 2,
	                len(current_segment) // 2,
	            )

	            if fade_length > 0:
	                fade_out, fade_in = self._create_crossfade_window(fade_length)

	                # Overlap-add: outgoing tail fades out while incoming head fades in.
	                overlap = (result[-fade_length:] * fade_out
	                           + current_segment[:fade_length] * fade_in)

	                result = np.concatenate([
	                    result[:-fade_length],
	                    overlap,
	                    current_segment[fade_length:],
	                ])
	            else:
	                # No crossfade possible, simple concatenation
	                result = np.concatenate([result, current_segment])

	        return self._to_int16(result)

	    def _apply_noise_gate(self, audio: np.ndarray, threshold_db: float = -40.0) -> np.ndarray:
	        """
	        Apply a noise gate that mutes passages far quieter than the loudest one.

	        The gate compares a 10 ms sliding RMS against the maximum RMS of the
	        clip, so ``threshold_db`` is relative to the loudest part of the input.

	        Args:
	            audio: Input audio array (1-D).
	            threshold_db: Gate threshold in dB relative to the peak RMS.

	        Returns:
	            Noise-gated audio as int16, same length as the input.
	        """
	        samples = np.asarray(audio, dtype=np.float64)

	        # 10 ms analysis window; at least one sample so the kernel is never empty.
	        window_size = max(1, int(0.01 * self.sample_rate))

	        if len(samples) < window_size:
	            # Too short to estimate a level; pass through unchanged.
	            return self._to_int16(samples)

	        # Reflect-pad so the window is fully populated at both edges.
	        padded = np.pad(samples, window_size // 2, mode='reflect')
	        kernel = np.full(window_size, 1.0 / window_size)

	        # 'valid' yields N samples for an odd window and N + 1 for an even one;
	        # trimming keeps the envelope aligned with the input.
	        mean_square = np.convolve(padded ** 2, kernel, mode='valid')[:len(samples)]
	        rms = np.sqrt(mean_square)

	        peak_rms = rms.max()
	        if not peak_rms > 0:
	            # Silent input: nothing to gate, and dividing by zero would yield NaN.
	            return self._to_int16(samples)

	        threshold_linear = 10 ** (threshold_db / 20)
	        gate_mask = (rms / peak_rms) > threshold_linear

	        # Smooth the gate mask to avoid clicks
	        gate_mask = gaussian_filter1d(gate_mask.astype(np.float64), sigma=2)

	        return self._to_int16(samples * gate_mask)

	    def _normalize_audio(self, audio: np.ndarray, target_peak: float = 0.95) -> np.ndarray:
	        """
	        Normalize audio so its largest absolute sample hits the target peak.

	        Args:
	            audio: Input audio array.
	            target_peak: Target peak level as a fraction of int16 full scale
	                (0.0 to 1.0).

	        Returns:
	            Normalized audio as int16. Empty or silent input is returned
	            unscaled (as int16).
	        """
	        samples = np.asarray(audio, dtype=np.float64)

	        if samples.size == 0:
	            return self._to_int16(samples)

	        current_peak = float(np.max(np.abs(samples)))

	        if not current_peak > 0:
	            # Silent audio has no level to scale.
	            return self._to_int16(samples)

	        scale_factor = (target_peak * 32767) / current_peak

	        # Keep the symmetric +/-32767 limit so a target_peak above 1.0 cannot
	        # overflow.
	        normalized = np.clip(samples * scale_factor, -32767, 32767)

	        return self._to_int16(normalized)

	    def _apply_dynamic_range_compression(self, audio: np.ndarray,
	                                         ratio: float = 4.0,
	                                         threshold_db: float = -12.0) -> np.ndarray:
	        """
	        Apply dynamic range compression to even out volume levels.

	        For every dB the smoothed envelope exceeds ``threshold_db``, the output
	        rises by only ``1 / ratio`` dB, i.e. the applied gain reduction is
	        ``excess_db * (1 - 1 / ratio)``.

	        Args:
	            audio: Input audio array on the int16 scale.
	            ratio: Compression ratio (>= 1; 1 means no compression).
	            threshold_db: Compression threshold in dB relative to full scale.

	        Returns:
	            Compressed audio as int16.

	        Raises:
	            ValueError: If ``ratio`` is below 1.
	        """
	        if ratio < 1.0:
	            raise ValueError(f"ratio must be >= 1, got {ratio}")

	        samples = np.asarray(audio, dtype=np.float64) / 32767.0

	        if samples.size == 0:
	            return self._to_int16(samples)

	        # ~1 ms envelope smoothing; sigma must stay positive for the filter.
	        sigma = max(1, int(0.001 * self.sample_rate))
	        envelope = gaussian_filter1d(np.abs(samples), sigma=sigma)

	        # Floor the envelope so silence does not produce log10(0).
	        envelope_db = 20 * np.log10(np.maximum(envelope, 1e-10))

	        excess_db = np.maximum(envelope_db - threshold_db, 0.0)
	        gain_reduction_db = excess_db * (1.0 - 1.0 / ratio)

	        gain_linear = 10 ** (-gain_reduction_db / 20)

	        return self._to_int16(samples * gain_linear * 32767.0)

	    def process_audio(self, audio: np.ndarray,
	                      apply_compression: bool = False,
	                      compression_ratio: float = 3.0) -> np.ndarray:
	        """
	        Apply full audio processing pipeline.

	        Order: noise gate (if enabled on the instance), optional compression,
	        then peak normalization (if enabled on the instance). This is a
	        best-effort step: on any failure the error is logged and the original
	        audio is returned unchanged.

	        Args:
	            audio: Input audio array.
	            apply_compression: Whether to apply dynamic range compression.
	            compression_ratio: Compression ratio if compression is applied.

	        Returns:
	            Processed audio, or the input itself if it is empty or processing
	            fails.
	        """
	        if len(audio) == 0:
	            return audio

	        start_time = time.perf_counter()
	        processed_audio = audio.copy()

	        try:
	            if self.apply_noise_gate:
	                processed_audio = self._apply_noise_gate(processed_audio)

	            if apply_compression:
	                processed_audio = self._apply_dynamic_range_compression(
	                    processed_audio, ratio=compression_ratio
	                )

	            if self.normalize_audio:
	                processed_audio = self._normalize_audio(processed_audio)
	        except Exception as e:
	            # Deliberate catch-all: post-processing must never break synthesis.
	            logger.exception("Audio processing failed: %s", e)
	            return audio  # Return original audio on failure

	        processing_time = time.perf_counter() - start_time
	        logger.debug("Audio processed in %.3fs", processing_time)

	        return processed_audio

	    def process_and_concatenate(self, audio_segments: List[np.ndarray],
	                                apply_processing: bool = True) -> np.ndarray:
	        """
	        Crossfade the segments together, then optionally post-process the result.

	        Args:
	            audio_segments: List of audio arrays.
	            apply_processing: Whether to run ``process_audio`` on the result.

	        Returns:
	            Processed and concatenated audio (empty int16 array for no segments).
	        """
	        if not audio_segments:
	            return np.array([], dtype=np.int16)

	        concatenated = self.crossfade_audio_segments(audio_segments)

	        if apply_processing:
	            concatenated = self.process_audio(concatenated)

	        return concatenated

	    def add_silence(self, audio: np.ndarray,
	                    start_silence: float = 0.1,
	                    end_silence: float = 0.1) -> np.ndarray:
	        """
	        Add silence padding to audio.

	        Args:
	            audio: Input audio array.
	            start_silence: Silence duration at start in seconds.
	            end_silence: Silence duration at end in seconds.

	        Returns:
	            Audio with zero-valued padding of the input's dtype on both sides.
	            Negative durations add no padding.
	        """
	        audio = np.asarray(audio)

	        # Clamp so a negative duration cannot reach np.zeros() and raise.
	        start_samples = max(0, int(start_silence * self.sample_rate))
	        end_samples = max(0, int(end_silence * self.sample_rate))

	        start_pad = np.zeros(start_samples, dtype=audio.dtype)
	        end_pad = np.zeros(end_samples, dtype=audio.dtype)

	        return np.concatenate([start_pad, audio, end_pad])

	    def get_audio_stats(self, audio: np.ndarray) -> dict:
	        """
	        Get audio statistics for quality analysis.

	        Args:
	            audio: Audio array to analyze.

	        Returns:
	            Dictionary of plain Python numbers (JSON-serializable) with keys
	            ``duration_seconds``, ``sample_count``, ``peak_amplitude``,
	            ``rms_level``, ``dynamic_range_db`` (peak-to-RMS ratio in dB, 0.0
	            for silent audio), ``zero_crossings`` and ``dc_offset``; or
	            ``{"error": "Empty audio"}`` for empty input.
	        """
	        if len(audio) == 0:
	            return {"error": "Empty audio"}

	        samples = np.asarray(audio, dtype=np.float64)

	        peak = float(np.max(np.abs(samples)))
	        rms = float(np.sqrt(np.mean(samples ** 2)))

	        # Silent audio has no meaningful peak-to-RMS ratio; avoid log10(0).
	        if peak > 0 and rms > 0:
	            dynamic_range_db = float(20 * np.log10(peak / rms))
	        else:
	            dynamic_range_db = 0.0

	        return {
	            "duration_seconds": len(audio) / self.sample_rate,
	            "sample_count": len(audio),
	            "peak_amplitude": peak,
	            "rms_level": rms,
	            "dynamic_range_db": dynamic_range_db,
	            # Count sign changes between consecutive samples.
	            "zero_crossings": int(np.count_nonzero(np.diff(np.signbit(samples)))),
	            "dc_offset": float(np.mean(samples)),
	        }