YuvalShaffir committed on
Commit
37f416d
·
verified ·
1 Parent(s): 872b362

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +54 -48
app.py CHANGED
@@ -26,7 +26,7 @@ def load_model():
26
 
27
 
28
  @spaces.GPU(duration=120)
29
- def inference(audio_path, prompt ="drums beats with snares"):
30
  # Fetch the Hugging Face token from the environment variable
31
  hf_token = os.getenv('HF_TOKEN')
32
  print(f"Hugging Face token: {hf_token}")
@@ -56,47 +56,47 @@ def inference(audio_path, prompt ="drums beats with snares"):
56
  for i in range(len(diffusion_steps)):
57
  steps = diffusion_steps[i]
58
  print(f"number of steps: {steps}")
59
- for j in range(len(float_values)):
60
- noise_level = float_values[j]
61
- print(f"Noise level is: {noise_level}")
62
- audio, sr = torchaudio.load(audio_path)
63
- output = generate_diffusion_cond(
64
- model,
65
- steps=steps,
66
- cfg_scale=7,
67
- conditioning=conditioning,
68
- sample_size=our_sample_size,
69
- sigma_min=0.3,
70
- sigma_max=500,
71
- sampler_type="dpmpp-3m-sde",
72
- device=device,
73
- init_audio=(sr, audio),
74
- init_noise_level=noise_level,
75
- # use_init = True,
76
- )
77
-
78
- # Rearrange audio batch to a single sequence
79
- output = rearrange(output, "b d n -> d (b n)")
80
- print("rearranged the output into a single sequence")
81
-
82
- # Peak normalize, clip, convert to int16, and save to file
83
- output = (
84
- output.to(torch.float32)
85
- .div(torch.max(torch.abs(output)))
86
- .clamp(-1, 1)
87
- .mul(32767)
88
- .to(torch.int16)
89
- .cpu()
90
- )
91
- print("Normalized the output, clip and convert to int16")
92
-
93
- # Generate a unique filename for the output
94
- unique_filename = f"output_{uuid.uuid4().hex}.mp3"
95
- print(f"Saving audio to file: {unique_filename}")
96
- torchaudio.save(unique_filename, output, sample_rate)
97
- print(f"saved to filename {unique_filename}")
98
-
99
- return unique_filename
100
 
101
 
102
 
@@ -106,7 +106,8 @@ interface = gr.Interface(
106
  inputs=[
107
  # gr.UploadButton(label="Audio without drums",file_types=['mp3']),
108
  gr.Audio(type="filepath", label="Audio without drums"),
109
- gr.Textbox(label="Text prompt", placeholder="Enter your text prompt here")
 
110
  ],
111
  outputs=gr.Audio(type="filepath", label="Generated Audio"),
112
  title="Stable Audio Generator",
@@ -114,23 +115,28 @@ interface = gr.Interface(
114
  examples=[
115
  [
116
  "the_chosen_ones/085838/no_drums.mp3", # Audio without drums
117
- "A techno song with fast, outer space-themed drum beats." # Text prompt
 
118
  ],
119
  [
120
  "the_chosen_ones/103522/no_drums.mp3", # Audio without drums
121
- "A slow country melody accompanied by drum beats." # Text prompt
 
122
  ],
123
  [
124
  "the_chosen_ones/103800/no_drums.mp3", # Audio without drums
125
- "A rap song featuring slow, groovy drums with intermittent snares." # Text prompt
 
126
  ],
127
  [
128
  "the_chosen_ones/103808/no_drums.mp3", # Audio without drums
129
- "Smooth, slow piano grooves paired with intense, rapid drum rhythms." # Text prompt
 
130
  ],
131
  [
132
  "the_chosen_ones/134796/no_drums.mp3", # Audio without drums
133
- "A rap track with rapid drum beats and snares." # Text prompt
 
134
  ]
135
  ],
136
  cache_examples=True
 
26
 
27
 
28
  @spaces.GPU(duration=120)
29
+ def inference(audio_path, prompt ="drums beats with snares", noise_level = 2.7):
30
  # Fetch the Hugging Face token from the environment variable
31
  hf_token = os.getenv('HF_TOKEN')
32
  print(f"Hugging Face token: {hf_token}")
 
56
  for i in range(len(diffusion_steps)):
57
  steps = diffusion_steps[i]
58
  print(f"number of steps: {steps}")
59
+ # for j in range(len(float_values)):
60
+ # noise_level = float_values[j]
61
+ print(f"Noise level is: {noise_level}")
62
+ audio, sr = torchaudio.load(audio_path)
63
+ output = generate_diffusion_cond(
64
+ model,
65
+ steps=steps,
66
+ cfg_scale=7,
67
+ conditioning=conditioning,
68
+ sample_size=our_sample_size,
69
+ sigma_min=0.3,
70
+ sigma_max=500,
71
+ sampler_type="dpmpp-3m-sde",
72
+ device=device,
73
+ init_audio=(sr, audio),
74
+ init_noise_level=noise_level,
75
+ # use_init = True,
76
+ )
77
+
78
+ # Rearrange audio batch to a single sequence
79
+ output = rearrange(output, "b d n -> d (b n)")
80
+ print("rearranged the output into a single sequence")
81
+
82
+ # Peak normalize, clip, convert to int16, and save to file
83
+ output = (
84
+ output.to(torch.float32)
85
+ .div(torch.max(torch.abs(output)))
86
+ .clamp(-1, 1)
87
+ .mul(32767)
88
+ .to(torch.int16)
89
+ .cpu()
90
+ )
91
+ print("Normalized the output, clip and convert to int16")
92
+
93
+ # Generate a unique filename for the output
94
+ unique_filename = f"output_{uuid.uuid4().hex}.mp3"
95
+ print(f"Saving audio to file: {unique_filename}")
96
+ torchaudio.save(unique_filename, output, sample_rate)
97
+ print(f"saved to filename {unique_filename}")
98
+
99
+ return unique_filename
100
 
101
 
102
 
 
106
  inputs=[
107
  # gr.UploadButton(label="Audio without drums",file_types=['mp3']),
108
  gr.Audio(type="filepath", label="Audio without drums"),
109
+ gr.Textbox(label="Text prompt", placeholder="Enter your text prompt here"),
110
+ gr.Slider(2.5, 3.5, step=0.1, value=2.7, label="Noise Level", info="Choose between 2.5 and 3.5"),
111
  ],
112
  outputs=gr.Audio(type="filepath", label="Generated Audio"),
113
  title="Stable Audio Generator",
 
115
  examples=[
116
  [
117
  "the_chosen_ones/085838/no_drums.mp3", # Audio without drums
118
+ "A techno song with fast, outer space-themed drum beats.", # Text prompt
119
+ 2.7 # Noise Level
120
  ],
121
  [
122
  "the_chosen_ones/103522/no_drums.mp3", # Audio without drums
123
+ "A slow country melody accompanied by drum beats.", # Text prompt
124
+ 2.7 # Noise Level
125
  ],
126
  [
127
  "the_chosen_ones/103800/no_drums.mp3", # Audio without drums
128
+ "A rap song featuring slow, groovy drums with intermittent snares.", # Text prompt
129
+ 2.7 # Noise Level
130
  ],
131
  [
132
  "the_chosen_ones/103808/no_drums.mp3", # Audio without drums
133
+ "Smooth, slow piano grooves paired with intense, rapid drum rhythms.", # Text prompt
134
+ 2.7 # Noise Level
135
  ],
136
  [
137
  "the_chosen_ones/134796/no_drums.mp3", # Audio without drums
138
+ "A rap track with rapid drum beats and snares.", # Text prompt
139
+ 2.7 # Noise Level
140
  ]
141
  ],
142
  cache_examples=True