Spaces:
Sleeping
Sleeping
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,122 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import librosa
|
| 2 |
+
import numpy as np
|
| 3 |
+
import matplotlib.pyplot as plt
|
| 4 |
+
import cv2
|
| 5 |
+
import io
|
| 6 |
+
import tempfile
|
| 7 |
+
from PIL import Image
|
| 8 |
+
import gradio as gr
|
| 9 |
+
from gradio_imageslider import ImageSlider
|
| 10 |
+
|
| 11 |
+
def generate_mel_spectrogram(audio_path, sr=22050, n_mels=128, fmin=0, fmax=7000):
    """Load an audio file and compute its mel spectrogram on a dB scale.

    Args:
        audio_path: Path of the audio file, passed to ``librosa.load``.
        sr: Sampling rate requested from ``librosa.load``.
        n_mels: Number of mel bands passed to the mel filter bank.
        fmin: Lower frequency bound passed to the mel filter bank.
        fmax: Upper frequency bound passed to the mel filter bank.

    Returns:
        A tuple ``(S_dB, y, sr)``: the mel power spectrogram converted to dB
        relative to its own maximum, the loaded waveform, and the sampling
        rate reported by ``librosa.load``.
    """
    waveform, sample_rate = librosa.load(audio_path, sr=sr)

    mel_power = librosa.feature.melspectrogram(
        y=waveform,
        sr=sample_rate,
        n_mels=n_mels,
        fmin=fmin,
        fmax=fmax,
    )
    # ref=np.max puts the loudest bin at 0 dB; everything else is negative.
    mel_db = librosa.power_to_db(mel_power, ref=np.max)

    return mel_db, waveform, sample_rate
|
| 20 |
+
|
| 21 |
+
def detect_zero_db(spectrogram, threshold=-10, atol=17):
    """Return a boolean mask of bins whose level lies near ``threshold``.

    Despite the function name, the default band is NOT centred on 0 dB: with
    ``threshold=-10`` and ``atol=17`` the mask is True for values between
    roughly -27 dB and +7 dB. For a spectrogram whose peak is 0 dB this means
    "at most about 27 dB below the peak".

    Args:
        spectrogram: Array of dB values (any shape).
        threshold: Centre of the accepted band, in dB.
        atol: Absolute half-width of the accepted band, in dB.

    Returns:
        Boolean array with the same shape as ``spectrogram``.
    """
    # np.isclose also applies its default relative tolerance (1e-5 * |threshold|),
    # which is negligible next to ``atol`` but kept for identical default results.
    return np.isclose(spectrogram, threshold, atol=atol)
|
| 27 |
+
|
| 28 |
+
def plot_spectrogram(spectrogram, file_path, sr=22050, fmin=0, fmax=7000):
    """Render a mel spectrogram without axes and save it as a PNG.

    Args:
        spectrogram: 2-D array of dB values to display.
        file_path: Destination path of the PNG image.
        sr: Sampling rate used to label the time/frequency axes; should match
            the rate the spectrogram was computed with.
        fmin: Lowest frequency of the mel scale used for the y axis.
        fmax: Highest frequency of the mel scale used for the y axis.
    """
    # Some librosa releases do not expose ``librosa.display`` after a plain
    # ``import librosa``; importing the submodule explicitly is always safe.
    import librosa.display

    fig = plt.figure(figsize=(6, 6))
    try:
        plt.axis('off')
        librosa.display.specshow(
            spectrogram, sr=sr, x_axis='time', y_axis='mel', fmin=fmin, fmax=fmax
        )
        fig.savefig(file_path, format='png', bbox_inches='tight', pad_inches=0)
    finally:
        # Always release the figure, even if drawing or saving failed, so a
        # long-running server does not accumulate open figures.
        plt.close(fig)
|
| 35 |
+
|
| 36 |
+
def plot_edge_spectrogram(edges, file_path):
    """Render a mask as a grayscale image without axes and save it as a PNG.

    Args:
        edges: 2-D array (e.g. a boolean mask) shown with the first row at
            the bottom (``origin='lower'``).
        file_path: Destination path of the PNG image.
    """
    fig = plt.figure(figsize=(6, 6))
    try:
        plt.axis('off')
        plt.imshow(edges, cmap='gray', aspect='auto', origin='lower')
        fig.savefig(file_path, format='png', bbox_inches='tight', pad_inches=0)
    finally:
        # Close the figure even when drawing/saving raises, so figures do not
        # pile up in a long-running process.
        plt.close(fig)
|
| 43 |
+
|
| 44 |
+
def plot_frequency(times, frequencies, label, color, file_path):
    """Plot one frequency track over time and save it as a PNG.

    Args:
        times: x values, labelled as seconds on the plot.
        frequencies: y values, labelled as Hz on the plot; NaN entries are
            simply not drawn by matplotlib.
        label: Track name, used for the legend and the title.
        color: Matplotlib colour of the line.
        file_path: Destination path of the PNG image.
    """
    fig = plt.figure(figsize=(12, 6))
    try:
        plt.plot(times, frequencies, label=label, color=color, linewidth=2)
        plt.title(f'{label} Frequency')
        plt.xlabel('Time (s)')
        plt.ylabel('Frequency (Hz)')
        plt.legend()

        fig.savefig(file_path, format='png', bbox_inches='tight', pad_inches=0)
    finally:
        # Close the figure even when plotting/saving raises, so figures do
        # not pile up in a long-running process.
        plt.close(fig)
|
| 55 |
+
|
| 56 |
+
def _new_temp_png():
    """Create an empty temporary ``.png`` file, close it, and return its path."""
    # The handle is closed before anything is written to the path: writing to
    # a file that is still open elsewhere fails on some platforms (Windows).
    with tempfile.NamedTemporaryFile(suffix=".png", delete=False) as handle:
        return handle.name


def _estimate_formants(y, sr, voiced_flag, hop_length, frame_length, lpc_order=5):
    """Estimate the two lowest LPC resonances (F1, F2) for each voiced frame.

    Frame ``i`` is taken as the ``frame_length`` samples centred on sample
    ``i * hop_length``, i.e. the same framing the pitch tracker uses.

    Returns:
        Array of shape ``(len(voiced_flag), 2)``; rows stay NaN for unvoiced
        frames and for frames where no estimate could be made.
    """
    # NOTE(review): an LPC order of 5 yields at most two resonances; a common
    # rule of thumb for formant analysis is about 2 + sr / 1000 - confirm.
    formants = np.full((len(voiced_flag), 2), np.nan)
    half_window = frame_length // 2

    for i, voiced in enumerate(voiced_flag):
        if not voiced:
            continue

        centre = i * hop_length
        frame = y[max(0, centre - half_window):centre + half_window]
        if len(frame) <= lpc_order:
            # Too few samples to fit the model (e.g. at the very end).
            continue

        try:
            coefficients = librosa.lpc(frame, order=lpc_order)
        except FloatingPointError:
            # librosa reports ill-conditioned (e.g. silent) frames this way;
            # leave the frame as NaN instead of aborting the whole analysis.
            continue

        roots = np.roots(coefficients)
        # Keep one root of each complex-conjugate pair. Purely real roots sit
        # at 0 Hz or Nyquist and are not resonances, so they are dropped.
        roots = roots[np.imag(roots) > 0]
        angles = np.arctan2(np.imag(roots), np.real(roots))
        frequencies = np.sort(angles * (sr / (2 * np.pi)))

        if len(frequencies) >= 2:
            formants[i, 0] = frequencies[0]  # F1
            formants[i, 1] = frequencies[1]  # F2

    return formants


def process_audio(audio_file):
    """Analyse an audio file and render its plots to temporary PNG files.

    Produces the mel spectrogram, the thresholded mask of that spectrogram,
    the pYIN F0 track, and LPC-based F1/F2 tracks.

    Args:
        audio_file: Path of the audio file to analyse.

    Returns:
        ``([mel_png, mask_png], f0_png, f1_png, f2_png)`` - file paths. The
        files are created with ``delete=False`` and are not removed here;
        the caller owns them.
    """
    import os

    mel_spectrogram, y, sr = generate_mel_spectrogram(audio_file)
    edges = detect_zero_db(mel_spectrogram)

    # Framing shared by pitch tracking, the time axis and formant analysis.
    frame_length = 2048
    hop_length = 512

    image_paths = [_new_temp_png() for _ in range(5)]
    mel_spectrogram_img, edge_spectrogram_img, f0_img, f1_img, f2_img = image_paths

    try:
        plot_spectrogram(mel_spectrogram, mel_spectrogram_img)
        plot_edge_spectrogram(edges, edge_spectrogram_img)

        # Pass sr explicitly so the pitch tracker never silently assumes a
        # rate different from the one the audio was loaded with.
        f0, voiced_flag, _ = librosa.pyin(
            y,
            fmin=librosa.note_to_hz('C2'),
            fmax=librosa.note_to_hz('C7'),
            sr=sr,
            frame_length=frame_length,
            hop_length=hop_length,
        )
        times = librosa.times_like(f0, sr=sr, hop_length=hop_length)
        plot_frequency(times, f0, 'F0', 'cyan', f0_img)

        formants = _estimate_formants(y, sr, voiced_flag, hop_length, frame_length)
        plot_frequency(times, formants[:, 0], 'F1', 'magenta', f1_img)
        plot_frequency(times, formants[:, 1], 'F2', 'yellow', f2_img)
    except Exception:
        # Nothing will be returned, so nobody could ever clean these up.
        for path in image_paths:
            try:
                os.remove(path)
            except OSError:
                pass  # best effort: the original error is what matters
        raise

    return [mel_spectrogram_img, edge_spectrogram_img], f0_img, f1_img, f2_img
|
| 110 |
+
|
| 111 |
+
# Gradio UI: one audio upload feeding an image comparison slider and three
# frequency plots. Every image component exchanges file paths.
with gr.Blocks() as demo:
    with gr.Group():
        audio_input = gr.Audio(
            label="Upload an audio file in WAV format",
            type="filepath",
        )
        img_slider = ImageSlider(
            label="Before and After Edge Detection",
            type="filepath",
            slider_color="pink",
        )
        f0_plot = gr.Image(label="F0 Frequency Plot", type="filepath")
        f1_plot = gr.Image(label="F1 Frequency Plot", type="filepath")
        f2_plot = gr.Image(label="F2 Frequency Plot", type="filepath")

    # Run the analysis as soon as a file has been uploaded.
    result_components = [img_slider, f0_plot, f1_plot, f2_plot]
    audio_input.upload(process_audio, inputs=audio_input, outputs=result_components)

if __name__ == "__main__":
    demo.launch()
|