bnaghib committed on
Commit
d0ba962
·
verified ·
1 Parent(s): 263b1b4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -5
app.py CHANGED
@@ -16,16 +16,18 @@ vocoder = HIFIGAN.from_hparams(
16
  )
17
 
18
  def generate_wav(text):
19
- # Generate mel spectrogram (no pace argument)
20
  mel_output, mel_length, alignment = taco.encode_text(text)
21
 
22
- # Slow down speech by stretching mel spectrogram
 
23
  mel_output = torch.nn.functional.interpolate(
24
- mel_output.transpose(1, 2),
25
- scale_factor=1.25, # 1.1 = slightly slower, 1.25 = calm, 1.35 = very slow
26
  mode="linear",
27
  align_corners=False
28
- ).transpose(1, 2)
 
29
 
30
  # Smooth mel for more natural prosody
31
  mel_output = mel_output * 0.9
 
16
  )
17
 
18
  def generate_wav(text):
19
+ # Generate mel spectrogram
20
  mel_output, mel_length, alignment = taco.encode_text(text)
21
 
22
+ # Slow down speech by stretching ONLY the time dimension
23
+ mel_output = mel_output.permute(0, 2, 1) # [1, 80, T]
24
  mel_output = torch.nn.functional.interpolate(
25
+ mel_output,
26
+ scale_factor=1.25, # 1.1 = slightly slower, 1.25 = calm
27
  mode="linear",
28
  align_corners=False
29
+ )
30
+ mel_output = mel_output.permute(0, 2, 1) # back to [1, T, 80]
31
 
32
  # Smooth mel for more natural prosody
33
  mel_output = mel_output * 0.9