Spaces:

Tingusto
/

audio-transcriptor

Sleeping

App Files Files Community

audio-transcriptor / pyscript /audio_processing.py

Tingusto

Uploaded initial demo

3cdeba6 verified 10 months ago

raw

history blame contribute delete

9.31 kB

	import os
	import librosa
	import numpy as np
	from tabulate import tabulate
	import soundfile as sf
	import scipy.ndimage
	import itertools
	from tqdm import tqdm
	import torch
	import torchaudio

	class AudioProcessor:
	    """
	    Inspect, convert and enhance a single audio file.

	    Every processing method writes its result to a new file, then points the
	    instance at that file and appends a fresh details table to ``changes``.

	    Attributes
	    ----------
	    path : str
	        Path of the file the instance currently refers to.
	    name : str
	        Base name (without extension) of the file given to the constructor.
	    format : str
	        Extension of the current file, including the leading dot.
	    duration : float
	        Duration of the current file in seconds, as reported by librosa.
	    sample_rate : int
	        Sample rate of the current file in Hz.
	    changes : list of str
	        Rendered details tables, oldest first.
	    optimized_params : tuple or None
	        Result of the last ``optimize_enhancement_parameters`` call.
	    """

	    # Sample rate (Hz) every processing step converts the audio to.
	    TARGET_SAMPLE_RATE = 16000

	    def __init__(self, audio_file):
	        """
	        Read the basic properties of ``audio_file`` and record them.

	        Parameters
	        ----------
	        audio_file : str
	            Path to the audio file to work on.
	        """
	        self.path = audio_file
	        self.name, self.format = os.path.splitext(os.path.basename(audio_file))
	        self.duration = librosa.get_duration(path=audio_file)
	        self.sample_rate = librosa.get_samplerate(audio_file)
	        self.changes = []
	        self.optimized_params = None
	        self.load_details()

	    # File information methods
	    def load_details(self):
	        """
	        Render the current file attributes as a table and store it.

	        Returns
	        -------
	        str
	            The rendered table, which is also appended to ``self.changes``.
	        """
	        data = [
	            ["File Name", self.name],
	            ["File Format", self.format],
	            ["Duration", f"{self.duration} seconds"],
	            ["Sample Rate", f"{self.sample_rate} Hz"],
	        ]
	        table = tabulate(data, headers=["Attribute", "Value"], tablefmt="outline")
	        self.changes.append(table)
	        return table

	    def display_details(self):
	        """Print the most recently recorded details table."""
	        print(self.changes[-1])

	    def display_changes(self):
	        """Print the first and the latest details tables side by side."""
	        self._clean_duplicates_changes()
	        if len(self.changes) == 1:
	            # Nothing changed since construction: show the single table.
	            self.display_details()
	            return

	        before_lines = self.changes[0].split('\n')
	        after_lines = self.changes[-1].split('\n')
	        combined_table = [
	            [before, '===>', after]
	            for before, after in zip(before_lines, after_lines)
	        ]
	        print(tabulate(combined_table, tablefmt="plain"))

	    def _clean_duplicates_changes(self):
	        """Drop every entry of ``changes`` that equals the entry just before it."""
	        self.changes = [
	            change
	            for i, change in enumerate(self.changes)
	            if i == 0 or change != self.changes[i - 1]
	        ]

	    # Audio processing methods
	    def load_as_array(self, sample_rate: int = 16000) -> np.ndarray:
	        """
	        Load an audio file and convert it into a NumPy array.

	        Parameters
	        ----------
	        sample_rate : int, optional
	            The sample rate to which the audio will be resampled (default is 16000 Hz).

	        Returns
	        -------
	        np.ndarray
	            A NumPy array containing the audio data.

	        Raises
	        ------
	        RuntimeError
	            If the file cannot be loaded.
	        """
	        try:
	            audio, sr = librosa.load(self.path, sr=sample_rate)
	            self.sample_rate = sr
	            return audio
	        except Exception as e:
	            raise RuntimeError(f"Failed to load audio file: {e}") from e

	    def resample_wav(self) -> str:
	        """
	        Resample the current file to 16 kHz and save it as WAV.

	        The result is written to ``resampled_files/<name>.wav`` and the
	        instance is updated to refer to that file.

	        Returns
	        -------
	        str
	            The path to the resampled audio file.

	        Raises
	        ------
	        RuntimeError
	            If loading, resampling or writing fails.
	        """
	        output_path = os.path.join('resampled_files', f'{self.name}.wav')
	        try:
	            # sr=None keeps the native rate so the audio is resampled only
	            # once (the librosa default would first convert it to 22050 Hz).
	            audio, sr = librosa.load(self.path, sr=None)
	            if sr != self.TARGET_SAMPLE_RATE:
	                audio = librosa.resample(
	                    y=audio, orig_sr=sr, target_sr=self.TARGET_SAMPLE_RATE
	                )
	            os.makedirs(os.path.dirname(output_path), exist_ok=True)
	            sf.write(output_path, audio, self.TARGET_SAMPLE_RATE)
	            self._update_file_info(output_path)
	            return output_path
	        except Exception as e:
	            raise RuntimeError(f"Failed to resample audio file: {e}") from e

	    def convert_to_wav(self):
	        """
	        Converts an audio file to WAV format.

	        The audio is loaded at 16 kHz and written to
	        ``converted_files/<name>.wav``; the instance is updated to refer to
	        that file.

	        Returns
	        -------
	        str
	            The path to the converted audio file.

	        Raises
	        ------
	        RuntimeError
	            If loading or writing fails.
	        """
	        output_path = os.path.join('converted_files', f'{self.name}.wav')
	        try:
	            os.makedirs(os.path.dirname(output_path), exist_ok=True)
	            audio, _ = librosa.load(self.path, sr=self.TARGET_SAMPLE_RATE)
	            sf.write(output_path, audio, self.TARGET_SAMPLE_RATE)
	            self._update_file_info(output_path)
	            return output_path
	        except Exception as e:
	            raise RuntimeError(f"Failed to convert audio file to WAV: {e}") from e

	    def enhance_audio(self, noise_reduce_strength=0.5, voice_enhance_strength=1.5, volume_boost=1.2):
	        """
	        Enhance audio quality by reducing noise and clarifying voices.

	        The audio is loaded at 16 kHz, processed by ``_enhance_audio_sample``
	        and written to ``enhanced_files/<name>_enhanced.wav``; the instance is
	        updated to refer to that file.

	        Parameters
	        ----------
	        noise_reduce_strength : float
	            Strength of noise reduction (default: 0.5)
	        voice_enhance_strength : float
	            Strength of voice enhancement (default: 1.5)
	        volume_boost : float
	            Volume boost factor (default: 1.2)

	        Returns
	        -------
	        str
	            The path to the enhanced audio file.

	        Raises
	        ------
	        RuntimeError
	            If loading, processing or writing fails.
	        """
	        try:
	            y, sr = librosa.load(self.path, sr=self.TARGET_SAMPLE_RATE)
	            y_enhanced = self._enhance_audio_sample(
	                y, noise_reduce_strength, voice_enhance_strength, volume_boost
	            )

	            output_path = os.path.join('enhanced_files', f'{self.name}_enhanced.wav')
	            os.makedirs(os.path.dirname(output_path), exist_ok=True)
	            sf.write(output_path, y_enhanced, sr)

	            self._update_file_info(output_path)
	            return output_path
	        except Exception as e:
	            raise RuntimeError(f"Failed to enhance audio: {e}") from e

	    def _compute_spectral_contrast(self, y, sr, n_bands=6, fmin=200.0, quantile=0.02, hop_length=512):
	        """
	        Compute spectral contrast using librosa.
	        Higher contrast generally indicates clearer speech separation from background.

	        Returns
	        -------
	        float
	            Mean of the spectral contrast over all bands and frames.
	        """
	        S = np.abs(librosa.stft(y, hop_length=hop_length))
	        contrast = librosa.feature.spectral_contrast(
	            S=S,
	            sr=sr,
	            n_bands=n_bands,
	            fmin=fmin,
	            quantile=quantile,
	            hop_length=hop_length,
	        )
	        return float(np.mean(contrast))

	    def optimize_enhancement_parameters(self, step=0.25, max_iterations=50, sample_duration=30):
	        """
	        Find optimal parameters for audio enhancement using grid search on a sample.

	        Each candidate is scored as ``0.3 * correlation + 0.7 * contrast gain``,
	        where the correlation is between the original and enhanced signals and
	        the contrast gain is the change in mean spectral contrast.

	        Parameters
	        ----------
	        step : float
	            Spacing of the grid along every parameter axis (default: 0.25)
	        max_iterations : int
	            Maximum number of grid points to evaluate (default: 50)
	        sample_duration : float
	            Seconds of audio, from the start of the file, used for the search
	            (default: 30)

	        Returns
	        -------
	        tuple of float or None
	            ``(noise_reduce_strength, voice_enhance_strength, volume_boost)``
	            with the best score, or None if no candidate scored. The value is
	            also stored in ``self.optimized_params``.
	        """
	        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

	        # Search at the same rate enhance_audio() uses, so the chosen
	        # parameters are tuned for the signal they will be applied to.
	        y_orig, sr = librosa.load(
	            self.path, sr=self.TARGET_SAMPLE_RATE, duration=sample_duration
	        )
	        y_orig_tensor = torch.tensor(y_orig, device=device)

	        param_ranges = [
	            np.arange(0.25, 1.5, step),  # noise_reduce_strength
	            np.arange(1.0, 3.0, step),  # voice_enhance_strength
	            np.arange(1.0, 2.0, step),  # volume_boost
	        ]

	        # Size of the grid without materialising every combination.
	        grid_size = int(np.prod([len(values) for values in param_ranges]))
	        total_iterations = min(max_iterations, grid_size)

	        # The reference contrast does not depend on the candidate.
	        contrast_orig = self._compute_spectral_contrast(y_orig, sr)

	        best_score = float('-inf')
	        best_params = None

	        # NOTE: islice keeps only the first max_iterations points in product
	        # order, so the last axis varies fastest and a small budget explores
	        # only the lowest noise_reduce_strength values.
	        candidates = itertools.islice(itertools.product(*param_ranges), max_iterations)
	        for grid_point in tqdm(candidates,
	                               total=total_iterations,
	                               desc="Searching for optimal parameters"):
	            # Plain floats keep the enhanced signal in the input's dtype even
	            # when NumPy treats np.float64 scalars as strongly typed.
	            params = tuple(float(value) for value in grid_point)
	            y_enhanced = self._enhance_audio_sample(y_orig, *params)
	            y_enhanced_tensor = torch.tensor(
	                y_enhanced, dtype=y_orig_tensor.dtype, device=device
	            )

	            # Correlation between original and enhanced audio
	            min_length = min(len(y_orig_tensor), len(y_enhanced_tensor))
	            paired = torch.stack(
	                [y_orig_tensor[:min_length], y_enhanced_tensor[:min_length]]
	            )
	            correlation = torch.corrcoef(paired)[0, 1].item()

	            # Spectral contrast improvement
	            contrast_enhanced = self._compute_spectral_contrast(y_enhanced, sr)
	            contrast_improvement = contrast_enhanced - contrast_orig

	            score = (0.3 * correlation) + (0.7 * contrast_improvement)

	            # A NaN score never compares greater, so such candidates are skipped.
	            if score > best_score:
	                best_score = score
	                best_params = params

	        self.optimized_params = best_params
	        return best_params

	    @staticmethod
	    def _noise_reduction_mask(S_mag, noise_reduce_strength):
	        """
	        Build the soft mask used to attenuate a magnitude spectrogram.

	        A median filter of width 31 along the second axis gives a per-bin
	        baseline; the mask is the clipped relative excess over that baseline,
	        raised to ``noise_reduce_strength``.

	        Parameters
	        ----------
	        S_mag : np.ndarray
	            Two-dimensional magnitude spectrogram.
	        noise_reduce_strength : float
	            Exponent applied to the mask.

	        Returns
	        -------
	        np.ndarray
	            Mask with the same shape as ``S_mag`` and values in [0, 1].
	        """
	        baseline = scipy.ndimage.median_filter(S_mag, size=(1, 31))
	        relative_excess = (S_mag - baseline) / (S_mag + 1e-10)
	        return np.clip(relative_excess, 0, 1) ** noise_reduce_strength

	    def _enhance_audio_sample(self, y, noise_reduce_strength=0.5, voice_enhance_strength=1.5, volume_boost=1.2):
	        """
	        Enhance an audio sample by reducing noise and enhancing voice clarity.

	        Parameters
	        ----------
	        y : np.ndarray
	            Input audio signal
	        noise_reduce_strength : float
	            Strength of noise reduction (default: 0.5)
	        voice_enhance_strength : float
	            Strength of voice enhancement (default: 1.5)
	        volume_boost : float
	            Volume boost factor (default: 1.2)

	        Returns
	        -------
	        np.ndarray
	            Enhanced audio signal
	        """
	        # STFT
	        S = librosa.stft(y, n_fft=2048)

	        # Noise reduction: scaling the complex spectrum by a real mask changes
	        # the magnitude and keeps the phase.
	        mask = self._noise_reduction_mask(np.abs(S), noise_reduce_strength)
	        S_denoised = S * mask

	        # Inverse STFT; length pins the output to the input's sample count.
	        y_denoised = librosa.istft(S_denoised, length=len(y))

	        # Harmonic-percussive separation and enhancement
	        y_harmonic, y_percussive = librosa.effects.hpss(y_denoised)
	        y_enhanced = (y_harmonic * voice_enhance_strength + y_percussive) * volume_boost

	        return librosa.util.normalize(y_enhanced, norm=np.inf, threshold=1.0)

	    # Helper method
	    def _update_file_info(self, new_path):
	        """
	        Update file information after processing.

	        Points the instance at ``new_path``, refreshes sample rate, format and
	        duration, and records a new details table. ``self.name`` is left
	        unchanged.
	        """
	        self.path = new_path
	        self.sample_rate = librosa.get_samplerate(new_path)
	        self.format = os.path.splitext(new_path)[1]
	        self.duration = librosa.get_duration(path=new_path)
	        self.load_details()