Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
|
@@ -26,7 +26,7 @@ def load_model():
|
|
| 26 |
|
| 27 |
|
| 28 |
@spaces.GPU(duration=120)
|
| 29 |
-
def inference(audio_path, prompt ="drums beats with snares"):
|
| 30 |
# Fetch the Hugging Face token from the environment variable
|
| 31 |
hf_token = os.getenv('HF_TOKEN')
|
| 32 |
print(f"Hugging Face token: {hf_token}")
|
|
@@ -56,47 +56,47 @@ def inference(audio_path, prompt ="drums beats with snares"):
|
|
| 56 |
for i in range(len(diffusion_steps)):
|
| 57 |
steps = diffusion_steps[i]
|
| 58 |
print(f"number of steps: {steps}")
|
| 59 |
-
for j in range(len(float_values)):
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
|
| 101 |
|
| 102 |
|
|
@@ -106,7 +106,8 @@ interface = gr.Interface(
|
|
| 106 |
inputs=[
|
| 107 |
# gr.UploadButton(label="Audio without drums",file_types=['mp3']),
|
| 108 |
gr.Audio(type="filepath", label="Audio without drums"),
|
| 109 |
-
gr.Textbox(label="Text prompt", placeholder="Enter your text prompt here")
|
|
|
|
| 110 |
],
|
| 111 |
outputs=gr.Audio(type="filepath", label="Generated Audio"),
|
| 112 |
title="Stable Audio Generator",
|
|
@@ -114,23 +115,28 @@ interface = gr.Interface(
|
|
| 114 |
examples=[
|
| 115 |
[
|
| 116 |
"the_chosen_ones/085838/no_drums.mp3", # Audio without drums
|
| 117 |
-
"A techno song with fast, outer space-themed drum beats." # Text prompt
|
|
|
|
| 118 |
],
|
| 119 |
[
|
| 120 |
"the_chosen_ones/103522/no_drums.mp3", # Audio without drums
|
| 121 |
-
"A slow country melody accompanied by drum beats." # Text prompt
|
|
|
|
| 122 |
],
|
| 123 |
[
|
| 124 |
"the_chosen_ones/103800/no_drums.mp3", # Audio without drums
|
| 125 |
-
"A rap song featuring slow, groovy drums with intermittent snares." # Text prompt
|
|
|
|
| 126 |
],
|
| 127 |
[
|
| 128 |
"the_chosen_ones/103808/no_drums.mp3", # Audio without drums
|
| 129 |
-
"Smooth, slow piano grooves paired with intense, rapid drum rhythms." # Text prompt
|
|
|
|
| 130 |
],
|
| 131 |
[
|
| 132 |
"the_chosen_ones/134796/no_drums.mp3", # Audio without drums
|
| 133 |
-
"A rap track with rapid drum beats and snares." # Text prompt
|
|
|
|
| 134 |
]
|
| 135 |
],
|
| 136 |
cache_examples=True
|
|
|
|
| 26 |
|
| 27 |
|
| 28 |
@spaces.GPU(duration=120)
|
| 29 |
+
def inference(audio_path, prompt ="drums beats with snares", noise_level = 2.7):
|
| 30 |
# Fetch the Hugging Face token from the environment variable
|
| 31 |
hf_token = os.getenv('HF_TOKEN')
|
| 32 |
print(f"Hugging Face token: {hf_token}")
|
|
|
|
| 56 |
for i in range(len(diffusion_steps)):
|
| 57 |
steps = diffusion_steps[i]
|
| 58 |
print(f"number of steps: {steps}")
|
| 59 |
+
# for j in range(len(float_values)):
|
| 60 |
+
# noise_level = float_values[j]
|
| 61 |
+
print(f"Noise level is: {noise_level}")
|
| 62 |
+
audio, sr = torchaudio.load(audio_path)
|
| 63 |
+
output = generate_diffusion_cond(
|
| 64 |
+
model,
|
| 65 |
+
steps=steps,
|
| 66 |
+
cfg_scale=7,
|
| 67 |
+
conditioning=conditioning,
|
| 68 |
+
sample_size=our_sample_size,
|
| 69 |
+
sigma_min=0.3,
|
| 70 |
+
sigma_max=500,
|
| 71 |
+
sampler_type="dpmpp-3m-sde",
|
| 72 |
+
device=device,
|
| 73 |
+
init_audio=(sr, audio),
|
| 74 |
+
init_noise_level=noise_level,
|
| 75 |
+
# use_init = True,
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
# Rearrange audio batch to a single sequence
|
| 79 |
+
output = rearrange(output, "b d n -> d (b n)")
|
| 80 |
+
print("rearranged the output into a single sequence")
|
| 81 |
+
|
| 82 |
+
# Peak normalize, clip, convert to int16, and save to file
|
| 83 |
+
output = (
|
| 84 |
+
output.to(torch.float32)
|
| 85 |
+
.div(torch.max(torch.abs(output)))
|
| 86 |
+
.clamp(-1, 1)
|
| 87 |
+
.mul(32767)
|
| 88 |
+
.to(torch.int16)
|
| 89 |
+
.cpu()
|
| 90 |
+
)
|
| 91 |
+
print("Normalized the output, clip and convert to int16")
|
| 92 |
+
|
| 93 |
+
# Generate a unique filename for the output
|
| 94 |
+
unique_filename = f"output_{uuid.uuid4().hex}.mp3"
|
| 95 |
+
print(f"Saving audio to file: {unique_filename}")
|
| 96 |
+
torchaudio.save(unique_filename, output, sample_rate)
|
| 97 |
+
print(f"saved to filename {unique_filename}")
|
| 98 |
+
|
| 99 |
+
return unique_filename
|
| 100 |
|
| 101 |
|
| 102 |
|
|
|
|
| 106 |
inputs=[
|
| 107 |
# gr.UploadButton(label="Audio without drums",file_types=['mp3']),
|
| 108 |
gr.Audio(type="filepath", label="Audio without drums"),
|
| 109 |
+
gr.Textbox(label="Text prompt", placeholder="Enter your text prompt here"),
|
| 110 |
+
gr.Slider(2.5, 3.5, step=0.1, value=2.7, label="Noise Level", info="Choose between 2.5 and 3.5"),
|
| 111 |
],
|
| 112 |
outputs=gr.Audio(type="filepath", label="Generated Audio"),
|
| 113 |
title="Stable Audio Generator",
|
|
|
|
| 115 |
examples=[
|
| 116 |
[
|
| 117 |
"the_chosen_ones/085838/no_drums.mp3", # Audio without drums
|
| 118 |
+
"A techno song with fast, outer space-themed drum beats.", # Text prompt
|
| 119 |
+
2.7 # Noise Level
|
| 120 |
],
|
| 121 |
[
|
| 122 |
"the_chosen_ones/103522/no_drums.mp3", # Audio without drums
|
| 123 |
+
"A slow country melody accompanied by drum beats.", # Text prompt
|
| 124 |
+
2.7 # Noise Level
|
| 125 |
],
|
| 126 |
[
|
| 127 |
"the_chosen_ones/103800/no_drums.mp3", # Audio without drums
|
| 128 |
+
"A rap song featuring slow, groovy drums with intermittent snares.", # Text prompt
|
| 129 |
+
2.7 # Noise Level
|
| 130 |
],
|
| 131 |
[
|
| 132 |
"the_chosen_ones/103808/no_drums.mp3", # Audio without drums
|
| 133 |
+
"Smooth, slow piano grooves paired with intense, rapid drum rhythms.", # Text prompt
|
| 134 |
+
2.7 # Noise Level
|
| 135 |
],
|
| 136 |
[
|
| 137 |
"the_chosen_ones/134796/no_drums.mp3", # Audio without drums
|
| 138 |
+
"A rap track with rapid drum beats and snares.", # Text prompt
|
| 139 |
+
2.7 # Noise Level
|
| 140 |
]
|
| 141 |
],
|
| 142 |
cache_examples=True
|