SESA_Fast_Separation

Running

App Files Files Community

SESA_Fast_Separation / tests /utils.py

ASesYusuf1

Upload 131 files

01f8b5b verified 10 months ago

raw

history blame contribute delete

6.19 kB

	import os
	import numpy as np
	import librosa
	import librosa.display
	import matplotlib.pyplot as plt
	from PIL import Image
	from io import BytesIO
	import soundfile as sf
	from pathlib import Path
	from skimage.metrics import structural_similarity as ssim


	def generate_waveform_image(audio_path, output_path=None, fig_size=(10, 4)):
	    """Plot the first two channels of an audio file as stacked waveforms.

	    A mono file is duplicated so that both panels are always populated. For
	    input with more than two channels only the first two are drawn.

	    Args:
	        audio_path: Path to the audio file
	        output_path: Path to save the generated image (optional). When it is
	            falsy the image is rendered to an in-memory PNG instead.
	        fig_size: Size of the figure (width, height)

	    Returns:
	        ``output_path`` when it was given, otherwise a ``BytesIO`` rewound to
	        offset 0 that contains the PNG data.
	    """
	    # mono=False keeps the channels separate. The sample rate returned by
	    # librosa is not needed: the waveform is plotted against sample index.
	    samples, _ = librosa.load(audio_path, sr=None, mono=False)

	    # If mono, convert to stereo-like format for consistent plotting
	    if samples.ndim == 1:
	        samples = np.array([samples, samples])

	    fig = plt.figure(figsize=fig_size)
	    try:
	        for channel in range(2):
	            plt.subplot(2, 1, channel + 1)
	            plt.plot(samples[channel])
	            plt.title(f'Channel {channel + 1}')
	            # Fixed Y-axis scale so images of different files are comparable
	            plt.ylim([-1.0, 1.0])

	        plt.tight_layout()

	        if output_path:
	            plt.savefig(output_path)
	            return output_path

	        buf = BytesIO()
	        plt.savefig(buf, format='png')
	        buf.seek(0)
	        return buf
	    finally:
	        # Close the figure on every exit path; otherwise a failure while
	        # plotting or saving would leave it registered with pyplot.
	        plt.close(fig)


	def generate_spectrogram_image(audio_path, output_path=None, fig_size=(10, 8)):
	    """Plot log-frequency spectrograms of the first two channels of an audio file.

	    A mono file is duplicated so that both panels are always populated. For
	    input with more than two channels only the first two are drawn.

	    Args:
	        audio_path: Path to the audio file
	        output_path: Path to save the generated image (optional). When it is
	            falsy the image is rendered to an in-memory PNG instead.
	        fig_size: Size of the figure (width, height)

	    Returns:
	        ``output_path`` when it was given, otherwise a ``BytesIO`` rewound to
	        offset 0 that contains the PNG data.
	    """
	    # mono=False keeps the channels separate; sr is needed for the axes.
	    samples, sr = librosa.load(audio_path, sr=None, mono=False)

	    # If mono, convert to stereo-like format for consistent plotting
	    if samples.ndim == 1:
	        samples = np.array([samples, samples])

	    # Fixed colour-scale limits (in dB) so every image uses the same mapping
	    vmin = -80
	    vmax = 0

	    fig = plt.figure(figsize=fig_size)
	    try:
	        for channel in range(2):
	            # Magnitude STFT converted to dB. ref=np.max normalises each
	            # channel to its own peak, so 0 dB is that channel's maximum.
	            magnitude = np.abs(librosa.stft(samples[channel]))
	            spec_db = librosa.amplitude_to_db(magnitude, ref=np.max)

	            plt.subplot(2, 1, channel + 1)
	            librosa.display.specshow(
	                spec_db,
	                sr=sr,
	                x_axis='time',
	                y_axis='log',
	                vmin=vmin,
	                vmax=vmax
	            )
	            plt.colorbar(format='%+2.0f dB')
	            plt.title(f'Channel {channel + 1} Spectrogram')

	            # Fixed frequency range: from 20 Hz up to the Nyquist frequency
	            plt.ylim([20, sr/2])

	        plt.tight_layout()

	        if output_path:
	            plt.savefig(output_path)
	            return output_path

	        buf = BytesIO()
	        plt.savefig(buf, format='png')
	        buf.seek(0)
	        return buf
	    finally:
	        # Close the figure on every exit path; otherwise a failure in the
	        # STFT, the plotting or the save would leave it registered with pyplot.
	        plt.close(fig)


	def compare_images(image1_path, image2_path, min_similarity_threshold=0.999):
	    """Compare two images using the Structural Similarity Index (SSIM).

	    Both images are converted to RGB. If their sizes differ, the second image
	    is resized to the size of the first before comparison. SSIM is computed
	    separately for the R, G and B channels and the three scores are averaged.

	    Args:
	        image1_path: Path to the first image
	        image2_path: Path to the second image
	        min_similarity_threshold: Minimum similarity required for images to be
	            considered matching
	            - Higher values (closer to 1.0) require images to be more similar
	            - Lower values are more permissive
	            - A value of 0.99 requires an average SSIM of at least 0.99

	    Returns:
	        Tuple of (similarity_score, is_match)
	        - similarity_score: Mean SSIM as a Python float; 1.0 means identical
	          images. SSIM is not bounded below by 0.0, so strongly dissimilar
	          images can score below zero.
	        - is_match: Python bool, True if similarity_score >= min_similarity_threshold
	    """
	    # convert() returns an independent RGB copy, so the source files can be
	    # released as soon as each with-block exits.
	    with Image.open(image1_path) as first:
	        img1 = first.convert('RGB')
	    with Image.open(image2_path) as second:
	        img2 = second.convert('RGB')

	    # Ensure same size for comparison
	    if img1.size != img2.size:
	        img2 = img2.resize(img1.size)

	    arr1 = np.array(img1)
	    arr2 = np.array(img2)

	    # One SSIM score per colour channel (R, G, B); 8-bit data spans 0..255
	    channel_scores = [
	        ssim(arr1[:, :, channel], arr2[:, :, channel], data_range=255)
	        for channel in range(3)
	    ]

	    # Return native Python types so callers get a real bool/float rather
	    # than numpy scalars (e.g. ``is_match is True`` behaves as expected).
	    similarity_score = float(np.mean(channel_scores))
	    is_match = bool(similarity_score >= min_similarity_threshold)

	    return (similarity_score, is_match)


	def generate_reference_images(input_path, output_dir=None, prefix=""):
	    """Generate reference waveform and spectrogram images for an audio file.

	    The images are named ``<prefix><audio name without extension>_waveform.png``
	    and ``<prefix><audio name without extension>_spectrogram.png``.

	    Args:
	        input_path: Path to the audio file
	        output_dir: Directory to save the generated images (optional).
	            Defaults to the directory component of ``input_path``.
	        prefix: Prefix to add to the output image filenames

	    Returns:
	        Tuple of (waveform_path, spectrogram_path)
	    """
	    if output_dir is None:
	        output_dir = os.path.dirname(input_path)

	    # Create the output directory if it doesn't exist. An empty string means
	    # the current working directory (e.g. input_path is a bare filename);
	    # os.makedirs("") raises FileNotFoundError, so it is skipped in that case.
	    if output_dir:
	        os.makedirs(output_dir, exist_ok=True)

	    stem = os.path.splitext(os.path.basename(input_path))[0]

	    # Generate waveform image
	    waveform_path = os.path.join(output_dir, f"{prefix}{stem}_waveform.png")
	    generate_waveform_image(input_path, waveform_path)

	    # Generate spectrogram image
	    spectrogram_path = os.path.join(output_dir, f"{prefix}{stem}_spectrogram.png")
	    generate_spectrogram_image(input_path, spectrogram_path)

	    return (waveform_path, spectrogram_path)