Spaces:
Running on T4
Running on T4
Update app.py
Browse files
app.py
CHANGED
|
@@ -60,4 +60,77 @@ def wav_bytes_from_spectrogram_image(image: Image.Image) -> T.Tuple[io.BytesIO,
|
|
| 60 |
|
| 61 |
return wav_bytes
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
# Build a Gradio interface around wav_bytes_from_spectrogram_image (one image
# input, one audio output) and launch it.
gr.Interface(fn=wav_bytes_from_spectrogram_image, inputs=[gr.Image()], outputs=[gr.Audio()]).launch()
|
|
|
|
| 60 |
|
| 61 |
return wav_bytes
|
| 62 |
|
| 63 |
+
def spectrogram_from_image(
    image: Image.Image, max_volume: float = 50, power_for_image: float = 0.25
) -> np.ndarray:
    """
    Compute a spectrogram magnitude array from a spectrogram image.

    The pixel data is converted to float32, flipped vertically, reduced to a
    single channel, inverted (255 - value, so a pixel value of 0 maps to the
    largest magnitude), rescaled so that 255 maps to max_volume, and finally
    raised to the power 1 / power_for_image.

    Args:
        image: Spectrogram image (anything np.array() can convert). For
            multi-channel images only the first channel is used; single-channel
            (2-D) images are used as-is.
        max_volume: Value that an inverted pixel of 255 is rescaled to before
            the power curve is reversed.
        power_for_image: Exponent whose reciprocal is applied to the rescaled
            data. Must be non-zero.

    Returns:
        A 2-D array with the same height and width as the image.

    TODO(hayk): Add image_from_spectrogram and call this out as the reverse.
    """
    # Convert to a numpy array of floats
    data = np.array(image).astype(np.float32)

    # Flip Y and take a single channel. Single-channel images convert to a 2-D
    # array that has no channel axis to index, so only flip those.
    if data.ndim == 2:
        data = data[::-1, :]
    else:
        data = data[::-1, :, 0]

    # Invert
    data = 255 - data

    # Rescale to max volume
    data = data * max_volume / 255

    # Reverse the power curve
    data = np.power(data, 1 / power_for_image)

    return data
|
| 86 |
+
|
| 87 |
+
def waveform_from_spectrogram(
    Sxx: np.ndarray,
    n_fft: int,
    hop_length: int,
    win_length: int,
    num_samples: int,
    sample_rate: int,
    mel_scale: bool = True,
    n_mels: int = 512,
    max_mel_iters: int = 200,
    num_griffin_lim_iters: int = 32,
    device: str = "cuda:0",
) -> np.ndarray:
    """
    Reconstruct a waveform from a spectrogram.

    This is an approximate inverse of spectrogram_from_waveform, using the Griffin-Lim algorithm
    to approximate the phase.

    Args:
        Sxx: Spectrogram magnitudes. When mel_scale is True it is first passed
            through an inverse mel transform (presumably a mel-scaled
            spectrogram with n_mels bins - confirm against the caller).
        n_fft: FFT size; also fixes the number of linear-frequency bins
            (n_fft // 2 + 1) produced by the inverse mel transform.
        hop_length: Hop length handed to Griffin-Lim.
        win_length: Window length handed to Griffin-Lim.
        num_samples: Currently unused by this function; kept so existing
            callers keep working.
        sample_rate: Sample rate handed to the inverse mel transform.
        mel_scale: Whether to apply the inverse mel transform before Griffin-Lim.
        n_mels: Number of mel bins for the inverse mel transform.
        max_mel_iters: Iteration cap (max_iter) for the inverse mel transform.
        num_griffin_lim_iters: Number of Griffin-Lim iterations.
        device: Torch device to run on. If a CUDA device is requested but CUDA
            is not available, the computation falls back to the CPU.

    Returns:
        The reconstructed waveform as a numpy array.
    """
    import warnings

    # The default device assumes a GPU host; degrade gracefully instead of
    # failing on machines without CUDA.
    if str(device).startswith("cuda") and not torch.cuda.is_available():
        warnings.warn(
            f"CUDA is not available; running on CPU instead of {device!r}.",
            RuntimeWarning,
            stacklevel=2,
        )
        device = "cpu"

    Sxx_torch = torch.from_numpy(Sxx).to(device)

    # TODO(hayk): Make this a class that caches the two things

    if mel_scale:
        mel_inv_scaler = torchaudio.transforms.InverseMelScale(
            n_mels=n_mels,
            sample_rate=sample_rate,
            f_min=0,
            f_max=10000,
            n_stft=n_fft // 2 + 1,
            norm=None,
            mel_scale="htk",
            max_iter=max_mel_iters,
        ).to(device)

        Sxx_torch = mel_inv_scaler(Sxx_torch)

    # power=1.0: the input is treated as a magnitude (not power) spectrogram.
    griffin_lim = torchaudio.transforms.GriffinLim(
        n_fft=n_fft,
        win_length=win_length,
        hop_length=hop_length,
        power=1.0,
        n_iter=num_griffin_lim_iters,
    ).to(device)

    waveform = griffin_lim(Sxx_torch).cpu().numpy()

    return waveform
|
| 134 |
+
|
| 135 |
+
|
| 136 |
# Build a Gradio interface around wav_bytes_from_spectrogram_image (one image
# input, one audio output) and launch it.
gr.Interface(fn=wav_bytes_from_spectrogram_image, inputs=[gr.Image()], outputs=[gr.Audio()]).launch()
|