import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import soundfile as sf
import gradio as gr
import io
import os
import base64
def _fig_to_img_html(fig):
    """Serialize a matplotlib figure to an inline base64-encoded <img> tag.

    Closes the figure after rendering so repeated calls do not accumulate
    open figures in matplotlib's global state.
    """
    buffer = io.BytesIO()
    fig.savefig(buffer, format='png', bbox_inches='tight')
    plt.close(fig)
    buffer.seek(0)
    encoded = base64.b64encode(buffer.read()).decode('utf-8')
    return f'<img src="data:image/png;base64,{encoded}" style="max-width:100%;"/>'


def analyze_audio_files(files, folder_path):
    """Analyze audio files and build an HTML report with stats and plots.

    Parameters
    ----------
    files : list[str] | None
        Paths of uploaded audio files (the Gradio ``Files`` component with
        ``type="filepath"`` yields plain path strings).
    folder_path : str | None
        Optional directory; every regular file directly inside it is
        analyzed as well.

    Returns
    -------
    str
        An HTML fragment containing, per file, a stats table plus embedded
        PNG plots (waveform, spectral features, frequency spectrum,
        spectrogram, amplitude histogram). Per-file failures are reported
        inline so one bad file does not abort the whole batch.

    NOTE(review): the HTML markup emitted here was reconstructed — the
    original literal tags were lost from the source. Verify the exact
    layout/styling against the rendered UI.
    """
    output_html = ""
    file_paths = []

    # Collect inputs from both the upload widget and the optional folder.
    if files:
        file_paths.extend(files)
    if folder_path:
        if not os.path.isdir(folder_path):
            return f"<p style='color:red;'>Folder not found: {folder_path}</p>"
        file_paths.extend(
            os.path.join(folder_path, name)
            for name in os.listdir(folder_path)
            if os.path.isfile(os.path.join(folder_path, name))
        )

    for audio_file in file_paths:
        try:
            # Load at the file's native sample rate (sr=None -> no resampling).
            y, sr = librosa.load(audio_file, sr=None)

            # Bit-depth description straight from the container metadata.
            with sf.SoundFile(audio_file) as f:
                bit_depth_info = f.subtype_info

            duration = len(y) / sr

            # Pick n_fft for ~10 Hz frequency resolution, rounded up to the
            # next power of two and clamped to [1024, 32768] to bound cost.
            desired_freq_resolution = 10.0  # Hz
            n_fft = int(sr / desired_freq_resolution)
            n_fft = 2 ** int(np.ceil(np.log2(n_fft)))
            n_fft = min(max(n_fft, 1024), 32768)
            hop_length = n_fft // 4

            # STFT magnitude, dB spectrogram, and time-averaged spectrum.
            S = np.abs(librosa.stft(y, n_fft=n_fft, hop_length=hop_length))
            S_db = librosa.amplitude_to_db(S, ref=np.max)
            S_mean = np.mean(S, axis=1)
            freqs = np.linspace(0, sr / 2, len(S_mean))

            # --- Waveform plot ---
            fig = plt.figure(figsize=(8, 4))
            librosa.display.waveshow(y, sr=sr, alpha=0.5)
            plt.title('Waveform', fontsize=14)
            plt.xlabel('Time (s)', fontsize=12)
            plt.ylabel('Amplitude', fontsize=12)
            plt.tight_layout()
            waveform_html = _fig_to_img_html(fig)

            # --- Spectral features (centroid / bandwidth / rolloff) ---
            spectral_centroids = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
            spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
            spectral_rolloff = librosa.feature.spectral_rolloff(
                y=y, sr=sr, roll_percent=0.85)[0]
            times = librosa.times_like(spectral_centroids)

            fig = plt.figure(figsize=(8, 4))
            plt.semilogy(times, spectral_centroids, label='Spectral Centroid')
            plt.semilogy(times, spectral_bandwidth, label='Spectral Bandwidth')
            plt.semilogy(times, spectral_rolloff,
                         label='Spectral Rolloff', linestyle='--')
            plt.title('Spectral Features', fontsize=14)
            plt.xlabel('Time (s)', fontsize=12)
            plt.ylabel('Hz', fontsize=12)
            plt.legend(loc='upper right')
            plt.tight_layout()
            spectral_features_html = _fig_to_img_html(fig)

            # --- Frequency spectrum (time-averaged) ---
            fig = plt.figure(figsize=(8, 4))
            plt.semilogx(freqs, 20 * np.log10(S_mean + 1e-10))  # avoid log(0)
            plt.xlabel('Frequency (Hz)', fontsize=12)
            plt.ylabel('Amplitude (dB)', fontsize=12)
            plt.title('Frequency Spectrum', fontsize=14)
            plt.grid(True, which='both', ls='--')
            plt.xlim(20, sr / 2)
            plt.tight_layout()
            spectrum_html = _fig_to_img_html(fig)

            # --- Spectrogram ---
            fig = plt.figure(figsize=(8, 4))
            librosa.display.specshow(
                S_db, sr=sr, x_axis='time', y_axis='linear',
                hop_length=hop_length)
            plt.colorbar(format='%+2.0f dB')
            plt.title('Spectrogram', fontsize=14)
            plt.xlabel('Time (s)', fontsize=12)
            plt.ylabel('Frequency (Hz)', fontsize=12)
            plt.tight_layout()
            spectrogram_html = _fig_to_img_html(fig)

            # --- High-frequency content / "real" sample-rate estimate ---
            # Highest spectral bin within 80 dB of the spectrum peak;
            # doubling it (Nyquist theorem) estimates the true sample rate
            # of the recorded material (e.g. detects upsampled files).
            threshold_db = -80  # dB relative to the spectrum maximum
            max_amplitude_db = 20 * np.log10(np.max(S_mean + 1e-10))
            threshold_amplitude = 10 ** ((max_amplitude_db + threshold_db) / 20)
            significant_indices = np.where(S_mean >= threshold_amplitude)[0]
            if len(significant_indices) > 0:
                highest_freq = freqs[significant_indices[-1]]
                estimated_sample_rate = highest_freq * 2  # Nyquist theorem
                significant_freq_text = f"{highest_freq:.2f} Hz"
                estimated_sample_rate_text = f"{estimated_sample_rate / 1000:.2f} kHz"
            else:
                significant_freq_text = "No significant frequency content detected."
                estimated_sample_rate_text = "N/A"

            # --- Effective bit depth from dynamic range (~6.02 dB/bit) ---
            # The 0.1th percentile of |y| approximates the noise floor;
            # the small epsilon avoids division by zero on silent files.
            signal_rms = np.sqrt(np.mean(y ** 2))
            noise_floor = np.percentile(np.abs(y), 0.1)
            dynamic_range_db = 20 * np.log10(signal_rms / (noise_floor + 1e-10))
            estimated_bit_depth = int(np.ceil(dynamic_range_db / 6.02))

            # --- Amplitude histogram ---
            fig = plt.figure(figsize=(8, 4))
            plt.hist(y, bins=1000, alpha=0.7, color='blue',
                     edgecolor='black', log=True)
            plt.xlabel('Amplitude', fontsize=12)
            plt.ylabel('Count (log scale)', fontsize=12)
            plt.title('Histogram of Sample Amplitudes', fontsize=14)
            plt.grid(True)
            plt.tight_layout()
            histogram_html = _fig_to_img_html(fig)

            # --- Assemble the per-file report section ---
            output_text = f"""
            <h2>{os.path.basename(audio_file)}</h2>
            <table>
              <tr><td>File Bit Depth:</td><td>{bit_depth_info}</td></tr>
              <tr><td>Sample Rate:</td><td>{sr} Hz</td></tr>
              <tr><td>Duration:</td><td>{duration:.2f} seconds</td></tr>
              <tr><td>Using n_fft =</td><td>{n_fft}</td></tr>
              <tr><td>Significant frequency content up to:</td><td>{significant_freq_text}</td></tr>
              <tr><td>Estimated Real Sample Rate:</td><td>{estimated_sample_rate_text}</td></tr>
              <tr><td>Estimated Dynamic Range:</td><td>{dynamic_range_db:.2f} dB</td></tr>
              <tr><td>Estimated Effective Bit Depth:</td><td>{estimated_bit_depth} bits PCM</td></tr>
            </table>
            """
            output_html += f"""
            {output_text}
            <h3>Waveform</h3>
            {waveform_html}
            <h3>Spectral Features</h3>
            {spectral_features_html}
            <h3>Frequency Spectrum</h3>
            {spectrum_html}
            <h3>Spectrogram</h3>
            {spectrogram_html}
            <h3>Histogram of Sample Amplitudes</h3>
            {histogram_html}
            """
        except Exception as e:
            # Report the failure inline and keep processing remaining files.
            output_html += (
                f"<p><strong>File:</strong> {os.path.basename(audio_file)}<br>"
                f"<strong>Error:</strong> {e}</p>"
            )

    # Return the aggregated HTML report for all files.
    return output_html
# Gradio UI: file/folder inputs, an Analyze button, and an HTML output
# panel that receives the report produced by analyze_audio_files.
with gr.Blocks() as demo:
    gr.Markdown("Wave Wizard")
    gr.Markdown(
        "Upload one or more audio files, or specify a folder containing audio files.")
    with gr.Row():
        file_input = gr.Files(label="Upload Audio Files",
                              type="filepath", file_count="multiple")
        folder_input = gr.Textbox(label="Folder Path (optional)",
                                  placeholder="Enter folder path containing audio files")
    analyze_button = gr.Button("Analyze")
    output_display = gr.HTML()
    # analyze_audio_files already has the (files, folder_path) -> html
    # signature this click handler needs, so no wrapper function is used.
    analyze_button.click(analyze_audio_files,
                         inputs=[file_input, folder_input],
                         outputs=output_display)

demo.launch()