fffiloni committed
Commit f6f0443 · 1 parent: 723e43d

Update app.py

Files changed (1)
  1. app.py +8 -8
app.py CHANGED
@@ -1,7 +1,7 @@
 import gradio as gr
 import torch
 from transformers import BarkModel
-#from optimum.bettertransformer import BetterTransformer
+from optimum.bettertransformer import BetterTransformer
 
 model = BarkModel.from_pretrained("suno/bark-small", torch_dtype=torch.float16)
 device = "cuda:0" if torch.cuda.is_available() else "cpu"
@@ -11,15 +11,15 @@ from transformers import AutoProcessor
 processor = AutoProcessor.from_pretrained("suno/bark-small")
 
 # Use BetterTransformer for flash attention
-#model = BetterTransformer.transform(model, keep_original_model=False)
+model = BetterTransformer.transform(model, keep_original_model=False)
 
 # Enable CPU offload
-#model.enable_cpu_offload()
+model.enable_cpu_offload()
 
 import numpy as np
-#from scipy.io import wavfile
+from scipy.io.wavfile import write as write_wav
 #from pydub import AudioSegment
-import soundfile as sf
+#import soundfile as sf
 
 def infer(text_prompt):
     text_prompt = text_prompt
@@ -28,14 +28,14 @@ def infer(text_prompt):
     with torch.inference_mode():
         speech_output = model.generate(**inputs, do_sample = True, fine_temperature = 0.4, coarse_temperature = 0.8)
     #audio_out = speech_output[0].cpu().numpy()
-    audio_out = speech_output[0]
+    audio_array = speech_output[0].cpu().numpy().squeeze()
 
     # Assuming audio_out contains audio data and the sampling rate
     sampling_rate = model.generation_config.sample_rate
+    write_wav("output.wav", sampling_rate, audio_array)
     #wavfile.write("output.wav", sampling_rate, audio_out)
 
-    # Save the audio data as an audio file using soundfile library
-    sf.write("output.wav", audio_out, sampling_rate)
+
 
 
 
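For reference, the scipy save path this commit adopts can be exercised on its own. The sketch below is a self-contained stand-in: a synthetic sine wave replaces Bark's generated audio, and the float32 cast is an assumption added here (it is not in the commit), chosen because float32 data yields a standard IEEE-float WAV.

# Minimal sketch of the write_wav call the commit introduces.
# The sine wave is a hypothetical stand-in for Bark's output;
# the float32 cast is an assumption, not part of the commit.
import numpy as np
from scipy.io.wavfile import write as write_wav

sampling_rate = 24_000  # Bark generates 24 kHz audio
t = np.linspace(0.0, 1.0, sampling_rate, endpoint=False)
audio_array = 0.5 * np.sin(2 * np.pi * 440.0 * t)  # one second of a 440 Hz tone

write_wav("output.wav", sampling_rate, audio_array.astype(np.float32))

In the Space itself, audio_array instead comes from speech_output[0].cpu().numpy().squeeze() and sampling_rate from model.generation_config.sample_rate, as the diff above shows.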