Spaces:

crackuser
/

voiceclone-dev

Sleeping

App Files Files Community

voiceclone-dev / audio_processor.py

crackuser

Create audio_processor.py

0d1b7fe verified 4 months ago

raw

history blame

7.86 kB

	import numpy as np
	import librosa
	import soundfile as sf
	import noisereduce as nr
	from scipy import signal
	from scipy.signal import butter, filtfilt
	import tempfile
	import os
	from typing import Tuple, Optional
	import io

	class AudioProcessor:
	    """Audio clean-up, enhancement and (de)serialisation helpers for voice cloning.

	    Every sample-rate dependent operation (filters, fades, frame sizes,
	    pitch shifting) assumes the signal is sampled at ``self.target_sr``;
	    ``preprocess_audio`` resamples its input to that rate first.
	    """

	    def __init__(self, target_sr: int = 22050):
	        """Create a processor.

	        Args:
	            target_sr: Working sample rate in Hz used by all processing steps.

	        Raises:
	            ValueError: If ``target_sr`` is not positive.
	        """
	        if target_sr <= 0:
	            raise ValueError(f"target_sr must be positive, got {target_sr}")
	        self.target_sr = target_sr

	    def preprocess_audio(self, audio: np.ndarray, sr: int) -> np.ndarray:
	        """Run the full preprocessing chain on a raw recording.

	        The steps are, in order: resample to ``self.target_sr``, RMS
	        normalisation, silence trimming, noise reduction and pre-emphasis.

	        Args:
	            audio: Input samples.
	            sr: Sample rate of ``audio`` in Hz.

	        Returns:
	            The processed signal at ``self.target_sr``.
	        """
	        if sr != self.target_sr:
	            audio = librosa.resample(audio, orig_sr=sr, target_sr=self.target_sr)

	        audio = self.normalize_audio(audio)
	        audio = self.trim_silence(audio)
	        audio = self.reduce_noise(audio)
	        return self.apply_preemphasis(audio)

	    def normalize_audio(self, audio: np.ndarray, target_db: float = -20.0) -> np.ndarray:
	        """Scale ``audio`` so that its RMS level matches ``target_db``.

	        After scaling, the signal is attenuated again if its peak exceeds
	        0.95 so that it cannot clip.

	        Args:
	            audio: Input samples.
	            target_db: Desired RMS level in dB relative to full scale (1.0).

	        Returns:
	            The normalised signal. Empty or all-zero input is returned
	            unchanged.
	        """
	        # np.mean / np.max are undefined on empty arrays (NaN / ValueError).
	        if np.size(audio) == 0:
	            return audio

	        rms = np.sqrt(np.mean(audio ** 2))
	        if rms > 0:
	            # Convert the dB target to a linear amplitude before scaling.
	            target_rms = 10 ** (target_db / 20)
	            audio = audio * (target_rms / rms)

	        # Prevent clipping
	        peak = np.max(np.abs(audio))
	        if peak > 0.95:
	            audio = audio * (0.95 / peak)

	        return audio

	    def trim_silence(self, audio: np.ndarray, threshold_db: float = -40.0) -> np.ndarray:
	        """Trim leading and trailing silence with ``librosa.effects.trim``.

	        Args:
	            audio: Input samples.
	            threshold_db: Silence threshold in dB. It is negated and passed
	                to librosa as ``top_db``, so it is expected to be negative
	                (the default -40 becomes ``top_db=40``).

	        Returns:
	            The trimmed signal.
	        """
	        trimmed_audio, _ = librosa.effects.trim(
	            audio,
	            top_db=-threshold_db,
	            frame_length=2048,
	            hop_length=512,
	        )
	        return trimmed_audio

	    def reduce_noise(self, audio: np.ndarray) -> np.ndarray:
	        """Denoise ``audio`` with ``noisereduce``, falling back to a high-pass.

	        Args:
	            audio: Input samples at ``self.target_sr``.

	        Returns:
	            The denoised signal, or the 80 Hz high-pass filtered signal if
	            the ``noisereduce`` call fails.
	        """
	        try:
	            return nr.reduce_noise(y=audio, sr=self.target_sr)
	        except Exception:
	            # Deliberate best-effort fallback. ``Exception`` (rather than a
	            # bare ``except``) lets KeyboardInterrupt/SystemExit propagate.
	            return self.apply_highpass_filter(audio, cutoff=80)

	    def apply_preemphasis(self, audio: np.ndarray, coeff: float = 0.97) -> np.ndarray:
	        """Apply the FIR pre-emphasis filter ``y[n] = x[n] - coeff * x[n-1]``."""
	        return signal.lfilter([1, -coeff], [1], audio)

	    def apply_deemphasis(self, audio: np.ndarray, coeff: float = 0.97) -> np.ndarray:
	        """Apply the inverse of ``apply_preemphasis``: ``y[n] = x[n] + coeff * y[n-1]``."""
	        return signal.lfilter([1], [1, -coeff], audio)

	    def _apply_butterworth(self, audio: np.ndarray, cutoff: float, btype: str) -> np.ndarray:
	        """Zero-phase 5th-order Butterworth filter shared by the high/low-pass helpers."""
	        audio = np.asarray(audio)
	        if audio.size == 0:
	            # filtfilt cannot process an empty signal; nothing to filter.
	            return audio.copy()

	        nyquist = self.target_sr * 0.5
	        b, a = butter(5, cutoff / nyquist, btype=btype, analog=False)

	        # filtfilt's default padlen is 3 * max(len(a), len(b)) and must be
	        # strictly smaller than the signal; shrink it for very short inputs
	        # instead of raising. Longer inputs get exactly the default.
	        padlen = min(3 * max(len(a), len(b)), audio.shape[-1] - 1)
	        return filtfilt(b, a, audio, padlen=padlen)

	    def apply_highpass_filter(self, audio: np.ndarray, cutoff: float = 80) -> np.ndarray:
	        """Apply a zero-phase 5th-order Butterworth high-pass filter.

	        Args:
	            audio: Input samples at ``self.target_sr``.
	            cutoff: Cut-off frequency in Hz; must lie strictly between 0 and
	                the Nyquist frequency (``butter`` raises ``ValueError``
	                otherwise).

	        Returns:
	            The filtered signal (a new array).
	        """
	        return self._apply_butterworth(audio, cutoff, 'high')

	    def apply_lowpass_filter(self, audio: np.ndarray, cutoff: float = 8000) -> np.ndarray:
	        """Apply a zero-phase 5th-order Butterworth low-pass filter.

	        Args:
	            audio: Input samples at ``self.target_sr``.
	            cutoff: Cut-off frequency in Hz; must lie strictly between 0 and
	                the Nyquist frequency (``butter`` raises ``ValueError``
	                otherwise).

	        Returns:
	            The filtered signal (a new array).
	        """
	        return self._apply_butterworth(audio, cutoff, 'low')

	    def apply_fade(self, audio: np.ndarray, fade_duration: float = 0.01) -> np.ndarray:
	        """Apply a linear fade-in and fade-out.

	        Args:
	            audio: Input samples at ``self.target_sr``. Not modified.
	            fade_duration: Length of each fade in seconds.

	        Returns:
	            A new array with the fades applied. The samples are returned
	            unfaded when the fade length rounds down to zero samples or the
	            signal is not longer than both fades combined.
	        """
	        fade_samples = int(fade_duration * self.target_sr)

	        # Work on a copy so the caller's buffer is never mutated, and promote
	        # integer input to float so the in-place multiply below is valid.
	        source = np.asarray(audio)
	        faded = source.astype(np.result_type(source.dtype, np.float32), copy=True)

	        # fade_samples == 0 must be skipped: faded[-0:] is the whole array and
	        # cannot be multiplied by an empty ramp.
	        if fade_samples > 0 and len(faded) > 2 * fade_samples:
	            faded[:fade_samples] *= np.linspace(0, 1, fade_samples)
	            faded[-fade_samples:] *= np.linspace(1, 0, fade_samples)

	        return faded

	    def enhance_audio(self, audio: np.ndarray) -> np.ndarray:
	        """Run the enhancement chain on ``audio``.

	        The steps are, in order: noise reduction, compression, band-limiting
	        (``apply_eq_boost``), RMS normalisation and a short fade in/out.

	        Args:
	            audio: Input samples at ``self.target_sr``.

	        Returns:
	            The enhanced signal.
	        """
	        enhanced = self.reduce_noise(audio)
	        enhanced = self.apply_compression(enhanced)
	        enhanced = self.apply_eq_boost(enhanced)
	        enhanced = self.normalize_audio(enhanced)
	        return self.apply_fade(enhanced)

	    def apply_compression(self, audio: np.ndarray, threshold: float = 0.5, ratio: float = 4.0) -> np.ndarray:
	        """Apply sample-wise (memoryless) dynamic range compression.

	        Samples whose magnitude exceeds ``threshold`` have the excess above
	        the threshold divided by ``ratio``; the sign is preserved.

	        Args:
	            audio: Input samples. Not modified.
	            threshold: Linear amplitude above which compression starts.
	            ratio: Compression ratio applied to the excess.

	        Returns:
	            A compressed copy of ``audio``.
	        """
	        compressed = audio.copy()
	        loud = np.abs(compressed) > threshold
	        compressed[loud] = np.sign(compressed[loud]) * (
	            threshold + (np.abs(compressed[loud]) - threshold) / ratio
	        )
	        return compressed

	    def apply_eq_boost(self, audio: np.ndarray) -> np.ndarray:
	        """Band-limit ``audio`` to the 85 Hz - 7.5 kHz range.

	        Despite the name, no band is actually boosted: this is a simplified
	        EQ that only removes low-frequency noise (85 Hz high-pass) and harsh
	        highs (7.5 kHz low-pass).

	        Args:
	            audio: Input samples at ``self.target_sr``.

	        Returns:
	            The band-limited signal.
	        """
	        audio = self.apply_highpass_filter(audio, cutoff=85)
	        return self.apply_lowpass_filter(audio, cutoff=7500)

	    def pitch_shift(self, audio: np.ndarray, semitones: float) -> np.ndarray:
	        """Shift the pitch of ``audio`` by ``semitones`` using librosa."""
	        return librosa.effects.pitch_shift(audio, sr=self.target_sr, n_steps=semitones)

	    def time_stretch(self, audio: np.ndarray, rate: float) -> np.ndarray:
	        """Change playback speed by ``rate`` without affecting pitch, using librosa."""
	        return librosa.effects.time_stretch(audio, rate=rate)

	    def detect_voice_activity(self, audio: np.ndarray, frame_duration: float = 0.025) -> np.ndarray:
	        """Energy-based voice activity detection.

	        The signal is cut into frames of ``frame_duration`` seconds with 50%
	        overlap; a frame is flagged as active when its energy exceeds 10% of
	        the mean frame energy.

	        Args:
	            audio: Input samples at ``self.target_sr``.
	            frame_duration: Frame length in seconds.

	        Returns:
	            Boolean array with one entry per frame; empty when ``audio`` is
	            shorter than a single frame.

	        Raises:
	            ValueError: If ``frame_duration`` yields a frame of zero samples.
	        """
	        frame_length = int(frame_duration * self.target_sr)
	        if frame_length < 1:
	            raise ValueError(
	                f"frame_duration too small for sample rate {self.target_sr}: {frame_duration}"
	            )
	        # A one-sample frame would give hop 0, which range() rejects.
	        hop_length = max(1, frame_length // 2)

	        energy = np.array([
	            np.sum(audio[start:start + frame_length] ** 2)
	            for start in range(0, len(audio) - frame_length + 1, hop_length)
	        ])

	        if energy.size == 0:
	            # Avoid taking the mean of an empty array (NaN + RuntimeWarning).
	            return np.zeros(0, dtype=bool)

	        # Simple threshold-based VAD
	        threshold = np.mean(energy) * 0.1
	        return energy > threshold

	    @staticmethod
	    def audio_to_bytes(audio: np.ndarray, sample_rate: int) -> bytes:
	        """Encode ``audio`` as WAV and return the file contents.

	        The data is written to an in-memory buffer, so no temporary file is
	        created (nothing to leak on error, and no open-file conflicts on
	        Windows).

	        Args:
	            audio: Samples to encode.
	            sample_rate: Sample rate in Hz stored in the WAV header.

	        Returns:
	            The encoded WAV file as bytes.
	        """
	        buffer = io.BytesIO()
	        # soundfile cannot infer the container from a file-like object, so
	        # the format must be given explicitly.
	        sf.write(buffer, audio, sample_rate, format='WAV')
	        return buffer.getvalue()

	    @staticmethod
	    def bytes_to_audio(audio_bytes: bytes) -> Tuple[np.ndarray, int]:
	        """Decode an audio file held in memory.

	        The bytes are spooled to a temporary ``.wav`` file and loaded with
	        ``librosa.load`` at the file's native sample rate.

	        Args:
	            audio_bytes: Raw contents of an audio file.

	        Returns:
	            A ``(samples, sample_rate)`` tuple.
	        """
	        tmp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.wav')
	        try:
	            # Close the handle before loading so the file can be reopened by
	            # name on every platform.
	            with tmp_file:
	                tmp_file.write(audio_bytes)
	            return librosa.load(tmp_file.name, sr=None)
	        finally:
	            # Always remove the temp file, even when decoding fails.
	            os.unlink(tmp_file.name)