ayf3 committed on
Commit
58fd207
·
verified ·
1 Parent(s): b48d117

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +19 -5
app.py CHANGED
@@ -227,13 +227,27 @@ def compute_mel(y, sr=SAMPLE_RATE):
227
  return mel
228
 
229
 
230
- def mel_to_audio_griffinlim(mel, sr=SAMPLE_RATE, n_iter=60):
231
- inverse_mel = torchaudio.transforms.InverseMelScale(
232
- n_stft=1024 // 2 + 1, n_mels=N_MELS,
233
- sample_rate=sr, f_min=0, f_max=float(sr // 2), mel_scale="htk",
 
234
  )
 
 
 
 
 
 
 
 
 
 
 
235
  mel_power = torch.exp(mel)
236
- spec = inverse_mel(mel_power)
 
 
237
  gl = torchaudio.transforms.GriffinLim(n_fft=1024, hop_length=256, n_iter=n_iter)
238
  audio = gl(spec)
239
  return audio.detach().cpu().numpy() if np is not None else audio.detach().cpu().tolist()
 
227
  return mel
228
 
229
 
230
def _get_mel_fb_pinv(sr=SAMPLE_RATE, n_mels=N_MELS):
    """Return the matrix that maps mel bins back to linear-frequency bins.

    ``melscale_filterbanks`` yields the forward filterbank ``fb`` with shape
    ``(n_freqs, n_mels)``, and a mel spectrogram is formed as ``fb.T @ spec``.
    The inverse mapping is therefore the pseudo-inverse of ``fb.T``, which has
    shape ``(n_freqs, n_mels)`` and can left-multiply a ``(n_mels, frames)``
    mel spectrogram.

    This helper does not cache anything itself; ``mel_to_audio_griffinlim``
    memoizes its result in ``_FB_PINV_CACHE``.

    Args:
        sr: Sample rate used to lay out the mel filters (f_max is ``sr // 2``).
        n_mels: Number of mel bins.

    Returns:
        A tensor of shape ``(513, n_mels)``.
    """
    fb = torchaudio.functional.melscale_filterbanks(
        # 513 = n_fft // 2 + 1 for the n_fft=1024 used by GriffinLim below.
        n_freqs=513, f_min=0, f_max=float(sr // 2),
        n_mels=n_mels, sample_rate=sr, norm=None, mel_scale="htk",
    )
    # pinv(fb) would be (n_mels, 513) and could not be multiplied with a
    # (n_mels, frames) input; invert the forward operator fb.T instead.
    return torch.linalg.pinv(fb.T)  # (513, n_mels)


# Pseudo-inverse filterbanks keyed by (sample_rate, n_mels); computing the
# pseudo-inverse is comparatively expensive, so it is done once per key.
_FB_PINV_CACHE = {}


def mel_to_audio_griffinlim(mel, sr=SAMPLE_RATE, n_iter=60):
    """Reconstruct a waveform from a mel spectrogram via Griffin-Lim.

    Args:
        mel: Tensor whose first dimension is the mel-bin axis. It is
            exponentiated before inversion, so it is presumably a natural-log
            mel power spectrogram -- confirm against ``compute_mel``.
        sr: Sample rate used to build the mel filterbank.
        n_iter: Number of Griffin-Lim iterations.

    Returns:
        The reconstructed audio as a NumPy array when ``np`` is available,
        otherwise as a (nested) Python list.
    """
    n_mels = mel.shape[0]
    key = (sr, n_mels)
    if key not in _FB_PINV_CACHE:
        _FB_PINV_CACHE[key] = _get_mel_fb_pinv(sr=sr, n_mels=n_mels)
    fb_pinv = _FB_PINV_CACHE[key]

    mel_power = torch.exp(mel)
    spec = fb_pinv @ mel_power  # (513, n_mels) @ (n_mels, frames)
    # The pseudo-inverse can produce small negative values; a power
    # spectrogram must be non-negative before phase reconstruction.
    spec = torch.clamp(spec, min=0)

    gl = torchaudio.transforms.GriffinLim(n_fft=1024, hop_length=256, n_iter=n_iter)
    audio = gl(spec)
    return audio.detach().cpu().numpy() if np is not None else audio.detach().cpu().tolist()