1inkusFace commited on
Commit
faca83a
·
verified ·
1 Parent(s): 9c6d7bb

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +10 -37
app.py CHANGED
@@ -1,4 +1,3 @@
1
-
2
  import spaces
3
  import os
4
  import uuid
@@ -14,7 +13,6 @@ os.environ["SAFETENSORS_FAST_GPU"] = "1"
14
  os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
15
 
16
  import torch
17
-
18
  torch.backends.cuda.matmul.allow_tf32 = False
19
  torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
20
  torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
@@ -29,44 +27,28 @@ import torchaudio
29
  from einops import rearrange
30
  import gradio as gr
31
 
32
- # Importing the model-related functions
33
  from stable_audio_tools import get_pretrained_model
34
  from stable_audio_tools.inference.generation import generate_diffusion_cond
35
 
36
  model, model_config = get_pretrained_model("ford442/stable-audio-open-1.0")
37
-
38
  device = "cuda" if torch.cuda.is_available() else "cpu"
39
  print(f"Using device: {device}")
40
-
41
  model.to(device,torch.bfloat16)
42
 
43
- # Function to set up, generate, and process the audio
44
- @spaces.GPU(duration=60) # Allocate GPU only when this function is called
45
  def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
46
  print(f"Prompt received: {prompt}")
47
  print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")
48
-
49
- # Fetch the Hugging Face token from the environment variable
50
- hf_token = os.getenv('HF_TOKEN')
51
- print(f"Hugging Face token: {hf_token}")
52
-
53
- # Use pre-loaded model and configuration
54
  sample_rate = model_config["sample_rate"]
55
  sample_size = model_config["sample_size"]
56
-
57
  print(f"Sample rate: {sample_rate}, Sample size: {sample_size}")
58
-
59
  print("Model moved to device.")
60
-
61
- # Set up text and timing conditioning
62
  conditioning = [{
63
  "prompt": prompt,
64
  "seconds_start": 0,
65
  "seconds_total": seconds_total
66
  }]
67
  print(f"Conditioning: {conditioning}")
68
-
69
- # Generate stereo audio
70
  print("Generating audio...")
71
  output = generate_diffusion_cond(
72
  model,
@@ -80,27 +62,22 @@ def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
80
  device=device
81
  )
82
  print("Audio generated.")
83
-
84
- # Rearrange audio batch to a single sequence
85
  output = rearrange(output, "b d n -> d (b n)")
86
- print("Audio rearranged.")
87
-
88
  # Peak normalize, clip, convert to int16
89
  output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
90
- print("Audio normalized and converted.")
91
-
92
- # Generate a unique filename for the output
93
  unique_filename = f"output_{uuid.uuid4().hex}.wav"
94
  print(f"Saving audio to file: {unique_filename}")
95
-
96
- # Save to file
97
- torchaudio.save(unique_filename, output, sample_rate)
 
 
 
 
 
98
  print(f"Audio saved: {unique_filename}")
99
-
100
- # Return the path to the generated audio file
101
  return unique_filename
102
 
103
- # Setting up the Gradio Interface
104
  interface = gr.Interface(
105
  fn=generate_audio,
106
  inputs=[
@@ -115,14 +92,12 @@ interface = gr.Interface(
115
  examples=[
116
  [
117
  "Create a serene soundscape of a quiet beach at sunset.", # Text prompt
118
-
119
- 45, # Duration in Seconds
120
  100, # Number of Diffusion Steps
121
  10, # CFG Scale
122
  ],
123
  [
124
  "Generate an energetic and bustling city street scene with distant traffic and close conversations.", # Text prompt
125
-
126
  30, # Duration in Seconds
127
  120, # Number of Diffusion Steps
128
  5, # CFG Scale
@@ -146,7 +121,6 @@ interface = gr.Interface(
146
  25, # Duration in Seconds
147
  90, # Number of Diffusion Steps
148
  6, # CFG Scale
149
-
150
  ],
151
  ["Rock beat played in a treated studio, session drumming on an acoustic kit.",
152
  30, # Duration in Seconds
@@ -156,5 +130,4 @@ interface = gr.Interface(
156
  ]
157
  ])
158
 
159
- # Launch the Interface
160
  interface.launch()
 
 
1
  import spaces
2
  import os
3
  import uuid
 
13
  os.putenv('HF_HUB_ENABLE_HF_TRANSFER','1')
14
 
15
  import torch
 
16
  torch.backends.cuda.matmul.allow_tf32 = False
17
  torch.backends.cuda.matmul.allow_bf16_reduced_precision_reduction = False
18
  torch.backends.cuda.matmul.allow_fp16_reduced_precision_reduction = False
 
27
  from einops import rearrange
28
  import gradio as gr
29
 
 
30
  from stable_audio_tools import get_pretrained_model
31
  from stable_audio_tools.inference.generation import generate_diffusion_cond
32
 
33
  model, model_config = get_pretrained_model("ford442/stable-audio-open-1.0")
 
34
  device = "cuda" if torch.cuda.is_available() else "cpu"
35
  print(f"Using device: {device}")
 
36
  model.to(device,torch.bfloat16)
37
 
38
+ @spaces.GPU(duration=60)
 
39
  def generate_audio(prompt, seconds_total=30, steps=100, cfg_scale=7):
40
  print(f"Prompt received: {prompt}")
41
  print(f"Settings: Duration={seconds_total}s, Steps={steps}, CFG Scale={cfg_scale}")
 
 
 
 
 
 
42
  sample_rate = model_config["sample_rate"]
43
  sample_size = model_config["sample_size"]
 
44
  print(f"Sample rate: {sample_rate}, Sample size: {sample_size}")
 
45
  print("Model moved to device.")
 
 
46
  conditioning = [{
47
  "prompt": prompt,
48
  "seconds_start": 0,
49
  "seconds_total": seconds_total
50
  }]
51
  print(f"Conditioning: {conditioning}")
 
 
52
  print("Generating audio...")
53
  output = generate_diffusion_cond(
54
  model,
 
62
  device=device
63
  )
64
  print("Audio generated.")
 
 
65
  output = rearrange(output, "b d n -> d (b n)")
 
 
66
  # Peak normalize, clip, convert to int16
67
  output = output.to(torch.float32).div(torch.max(torch.abs(output))).clamp(-1, 1).mul(32767).to(torch.int16).cpu()
 
 
 
68
  unique_filename = f"output_{uuid.uuid4().hex}.wav"
69
  print(f"Saving audio to file: {unique_filename}")
70
# Save the generated int16 stereo tensor to disk as WAV.
# Fix: the filename generated above ends in ".wav", but this call requested
# format="mp3" with encoding="MP3" and bits_per_sample=320 — all invalid:
#   * torchaudio's `encoding` accepts PCM_S / PCM_U / PCM_F / ULAW / ALAW,
#     not "MP3";
#   * `bits_per_sample` must be a sample width (8/16/24/32/64); 320 is an
#     MP3 *bitrate* in kbps, not a sample width;
#   * writing MP3 bytes into a ".wav"-named file would mislead consumers.
# The tensor was already converted to int16 above, so a plain WAV save
# (as in the pre-change code) is correct and matches the extension.
torchaudio.save(unique_filename, output, sample_rate)
78
  print(f"Audio saved: {unique_filename}")
 
 
79
  return unique_filename
80
 
 
81
  interface = gr.Interface(
82
  fn=generate_audio,
83
  inputs=[
 
92
  examples=[
93
  [
94
  "Create a serene soundscape of a quiet beach at sunset.", # Text prompt
95
+ 45, # Duration in Seconds
 
96
  100, # Number of Diffusion Steps
97
  10, # CFG Scale
98
  ],
99
  [
100
  "Generate an energetic and bustling city street scene with distant traffic and close conversations.", # Text prompt
 
101
  30, # Duration in Seconds
102
  120, # Number of Diffusion Steps
103
  5, # CFG Scale
 
121
  25, # Duration in Seconds
122
  90, # Number of Diffusion Steps
123
  6, # CFG Scale
 
124
  ],
125
  ["Rock beat played in a treated studio, session drumming on an acoustic kit.",
126
  30, # Duration in Seconds
 
130
  ]
131
  ])
132
 
 
133
  interface.launch()