Spaces:

YuvalShaffir
/

final_proj_demo

Runtime error

App Files Community

YuvalShaffir committed on Jul 9, 2024

Commit

37f416d

verified ·

1 Parent(s): 872b362

Update app.py

Browse files

Files changed (1) hide show

app.py +54 -48

app.py CHANGED Viewed

@@ -26,7 +26,7 @@ def load_model():
 @spaces.GPU(duration=120)
-def inference(audio_path, prompt ="drums beats with snares"):
     # Fetch the Hugging Face token from the environment variable
     hf_token = os.getenv('HF_TOKEN')
     print(f"Hugging Face token: {hf_token}")
@@ -56,47 +56,47 @@ def inference(audio_path, prompt ="drums beats with snares"):
         for i in range(len(diffusion_steps)):
             steps = diffusion_steps[i]
             print(f"number of steps: {steps}")
-            for j in range(len(float_values)):
-                noise_level = float_values[j]
-                print(f"Noise level is: {noise_level}")
-                audio, sr = torchaudio.load(audio_path)
-                output = generate_diffusion_cond(
-                    model,
-                    steps=steps,
-                    cfg_scale=7,
-                    conditioning=conditioning,
-                    sample_size=our_sample_size,
-                    sigma_min=0.3,
-                    sigma_max=500,
-                    sampler_type="dpmpp-3m-sde",
-                    device=device,
-                    init_audio=(sr, audio),
-                    init_noise_level=noise_level,
-                    # use_init = True,
-                )
-                # Rearrange audio batch to a single sequence
-                output = rearrange(output, "b d n -> d (b n)")
-                print("rearranged the output into a single sequence")
-                # Peak normalize, clip, convert to int16, and save to file
-                output = (
-                    output.to(torch.float32)
-                    .div(torch.max(torch.abs(output)))
-                    .clamp(-1, 1)
-                    .mul(32767)
-                    .to(torch.int16)
-                    .cpu()
-                )
-                print("Normalized the output, clip and convert to int16")
-                 # Generate a unique filename for the output
-                unique_filename = f"output_{uuid.uuid4().hex}.mp3"
-                print(f"Saving audio to file: {unique_filename}")
-                torchaudio.save(unique_filename, output, sample_rate)
-                print(f"saved to filename {unique_filename}")
-                return unique_filename
@@ -106,7 +106,8 @@ interface = gr.Interface(
     inputs=[
         # gr.UploadButton(label="Audio without drums",file_types=['mp3']),
         gr.Audio(type="filepath", label="Audio without drums"),
-        gr.Textbox(label="Text prompt", placeholder="Enter your text prompt here")
     ],
     outputs=gr.Audio(type="filepath", label="Generated Audio"),
     title="Stable Audio Generator",
@@ -114,23 +115,28 @@ interface = gr.Interface(
     examples=[
         [
             "the_chosen_ones/085838/no_drums.mp3",  # Audio without drums
-            "A techno song with fast, outer space-themed drum beats."  # Text prompt
         ],
         [
             "the_chosen_ones/103522/no_drums.mp3",  # Audio without drums
-            "A slow country melody accompanied by drum beats."  # Text prompt
         ],
         [
             "the_chosen_ones/103800/no_drums.mp3",  # Audio without drums
-            "A rap song featuring slow, groovy drums with intermittent snares."  # Text prompt
         ],
         [
             "the_chosen_ones/103808/no_drums.mp3",  # Audio without drums
-            "Smooth, slow piano grooves paired with intense, rapid drum rhythms."  # Text prompt
         ],
         [
             "the_chosen_ones/134796/no_drums.mp3",  # Audio without drums
-            "A rap track with rapid drum beats and snares."  # Text prompt
         ]
     ],
     cache_examples=True

 @spaces.GPU(duration=120)
+def inference(audio_path, prompt ="drums beats with snares", noise_level = 2.7):
     # Fetch the Hugging Face token from the environment variable
     hf_token = os.getenv('HF_TOKEN')
     print(f"Hugging Face token: {hf_token}")
         for i in range(len(diffusion_steps)):
             steps = diffusion_steps[i]
             print(f"number of steps: {steps}")
+            # for j in range(len(float_values)):
+            # noise_level = float_values[j]
+            print(f"Noise level is: {noise_level}")
+            audio, sr = torchaudio.load(audio_path)
+            output = generate_diffusion_cond(
+                model,
+                steps=steps,
+                cfg_scale=7,
+                conditioning=conditioning,
+                sample_size=our_sample_size,
+                sigma_min=0.3,
+                sigma_max=500,
+                sampler_type="dpmpp-3m-sde",
+                device=device,
+                init_audio=(sr, audio),
+                init_noise_level=noise_level,
+                # use_init = True,
+            )
+            # Rearrange audio batch to a single sequence
+            output = rearrange(output, "b d n -> d (b n)")
+            print("rearranged the output into a single sequence")
+            # Peak normalize, clip, convert to int16, and save to file
+            output = (
+                output.to(torch.float32)
+                .div(torch.max(torch.abs(output)))
+                .clamp(-1, 1)
+                .mul(32767)
+                .to(torch.int16)
+                .cpu()
+            )
+            print("Normalized the output, clip and convert to int16")
+             # Generate a unique filename for the output
+            unique_filename = f"output_{uuid.uuid4().hex}.mp3"
+            print(f"Saving audio to file: {unique_filename}")
+            torchaudio.save(unique_filename, output, sample_rate)
+            print(f"saved to filename {unique_filename}")
+            return unique_filename
     inputs=[
         # gr.UploadButton(label="Audio without drums",file_types=['mp3']),
         gr.Audio(type="filepath", label="Audio without drums"),
+        gr.Textbox(label="Text prompt", placeholder="Enter your text prompt here"),
+        gr.Slider(2.5, 3.5, step=0.1, value=2.7, label="Noise Level", info="Choose between 2.5 and 3.5"),
     ],
     outputs=gr.Audio(type="filepath", label="Generated Audio"),
     title="Stable Audio Generator",
     examples=[
         [
             "the_chosen_ones/085838/no_drums.mp3",  # Audio without drums
+            "A techno song with fast, outer space-themed drum beats.",  # Text prompt
+            2.7 # Noise Level
         ],
         [
             "the_chosen_ones/103522/no_drums.mp3",  # Audio without drums
+            "A slow country melody accompanied by drum beats.",  # Text prompt
+            2.7 # Noise Level
         ],
         [
             "the_chosen_ones/103800/no_drums.mp3",  # Audio without drums
+            "A rap song featuring slow, groovy drums with intermittent snares.",  # Text prompt
+            2.7 # Noise Level
         ],
         [
             "the_chosen_ones/103808/no_drums.mp3",  # Audio without drums
+            "Smooth, slow piano grooves paired with intense, rapid drum rhythms.",  # Text prompt
+            2.7 # Noise Level
         ],
         [
             "the_chosen_ones/134796/no_drums.mp3",  # Audio without drums
+            "A rap track with rapid drum beats and snares.",  # Text prompt
+            2.7 # Noise Level
         ]
     ],
     cache_examples=True