Spaces:

Audiofool
/

WeaveWave

Sleeping

App Files Files Community

Audiofool committed on Mar 30, 2025

Commit

5f28c4a

1 Parent(s): f800d5f

update app.py

Browse files

Files changed (1) hide show

app.py +92 -155

app.py CHANGED Viewed

@@ -9,29 +9,17 @@ import base64
 from pathlib import Path
 from tempfile import NamedTemporaryFile
-from einops import rearrange
-import torch
 import gradio as gr
 import requests
-from audiocraft.data.audio_utils import convert_audio
-from audiocraft.data.audio import audio_write
-from audiocraft.models.encodec import InterleaveStereoCompressionModel
-from audiocraft.models import MusicGen, MultiBandDiffusion
 from theme_wave import theme, css
 # --- Configuration (Main App) ---
-MLLM_API_URL = (
-    "http://localhost:8000"
-)
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 # --- Global Variables (Main App) ---
-MODEL = None
-MBD = None
 INTERRUPTING = False
-USE_DIFFUSION = False  # Keep this for now, even if unused, for easier switching
 # --- Utility Functions (Main App) ---
@@ -72,29 +60,8 @@ def make_waveform(*args, **kwargs):
         return gr.make_waveform(*args, **kwargs)
-# --- Model Loading (Main App) ---
-def load_musicgen_model(version="facebook/musicgen-stereo-melody-large"):
-    global MODEL
-    print(f"Loading MusicGen model: {version}")
-    if MODEL is None or MODEL.name != version:
-        if MODEL is not None:
-            del MODEL
-        torch.cuda.empty_cache()
-        MODEL = MusicGen.get_pretrained(version, device=DEVICE)
-def load_diffusion_model():
-    global MBD
-    if MBD is None:
-        print("Loading diffusion model")
-        MBD = MultiBandDiffusion.get_mbd_musicgen(device=DEVICE)
 # --- API Client Functions ---
 def get_mllm_description(media_path: str, user_prompt: str) -> str:
     """Gets the music description from the MLLM API."""
@@ -122,7 +89,7 @@ def get_mllm_description(media_path: str, user_prompt: str) -> str:
                 f"{MLLM_API_URL}/describe_text/", json={"user_prompt": user_prompt}
             )
-        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx).
         return response.json()["description"]
     except requests.exceptions.RequestException as e:
@@ -131,9 +98,73 @@ def get_mllm_description(media_path: str, user_prompt: str) -> str:
         raise gr.Error(f"An unexpected error occurred: {e}")
-# --- Music Generation ---
 def predict_full(
     model_version,
     media_type,
@@ -149,9 +180,9 @@ def predict_full(
     decoder,
     progress=gr.Progress(),
 ):
-    global INTERRUPTING, USE_DIFFUSION
     INTERRUPTING = False
-    USE_DIFFUSION = decoder == "MultiBand_Diffusion"
     if media_type == "Image":
         media = image_input if image_input else None
@@ -160,124 +191,37 @@ def predict_full(
     else:
         media = None
-    # 1. Get Music Description (using the API client).
     progress(progress=None, desc="Generating music description...")
     if media:
         try:
             music_description = get_mllm_description(media, text_prompt)
         except Exception as e:
-            raise gr.Error(str(e))  # Re-raise for Gradio to handle.
     else:
         music_description = text_prompt
-    # 2. Load MusicGen Model (locally).
-    progress(progress=None, desc="Loading MusicGen model...")
-    load_musicgen_model(model_version)
-    # 3. Set Generation Parameters (locally).
-    MODEL.set_generation_params(
-        duration=duration,
-        top_k=topk,
-        top_p=topp,
-        temperature=temperature,
-        cfg_coef=cfg_coef,
-    )
-    # 4. Melody Preprocessing (locally).
-    progress(progress=None, desc="Processing melody...")
-    melody_tensor = None  # Use a different variable name
-    if melody:
-        try:
-            sr, melody_tensor = (
-                melody[0],
-                torch.from_numpy(melody[1]).to(MODEL.device).float().t(),
-            )
-            if melody_tensor.dim() == 1:
-                melody_tensor = melody_tensor[None]
-            melody_tensor = melody_tensor[..., : int(sr * duration)]
-            melody_tensor = convert_audio(
-                melody_tensor, sr, MODEL.sample_rate, MODEL.audio_channels
-            )
-        except Exception as e:
-            raise gr.Error(f"Error processing melody: {e}")
-    # 5. Music Generation (locally).
-    progress(progress=None, desc="Generating music...")
-    if USE_DIFFUSION:
-        load_diffusion_model()
     try:
-        if melody_tensor is not None:  # Use the new variable
-            output = MODEL.generate_with_chroma(
-                descriptions=[music_description],
-                melody_wavs=[melody_tensor],
-                melody_sample_rate=MODEL.sample_rate,
-                progress=True,
-                return_tokens=USE_DIFFUSION,
-            )
-        else:
-            output = MODEL.generate(
-                descriptions=[music_description],
-                progress=True,
-                return_tokens=USE_DIFFUSION,
-            )
-    except RuntimeError as e:
-        raise gr.Error("Error while generating: " + str(e))
-    if USE_DIFFUSION:
-        progress(progress=None, desc="Running MultiBandDiffusion...")
-        tokens = output[1]
-        if isinstance(MODEL.compression_model, InterleaveStereoCompressionModel):
-            left, right = MODEL.compression_model.get_left_right_codes(tokens)
-            tokens = torch.cat([left, right])
-        outputs_diffusion = MBD.tokens_to_wav(tokens)
-        if isinstance(MODEL.compression_model, InterleaveStereoCompressionModel):
-            assert outputs_diffusion.shape[1] == 1  # output is mono
-            outputs_diffusion = rearrange(
-                outputs_diffusion, "(s b) c t -> b (s c) t", s=2
-            )
-        output_audio = torch.cat([output[0], outputs_diffusion], dim=0)
-    else:
-        output_audio = output[0]
-    output_audio = output_audio.detach().cpu().float()
-    # 6. Save and Return (locally).
-    progress(progress=None, desc="Saving and returning...")
-    output_audio_paths = []
-    for i, audio in enumerate(output_audio):
-        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
-            audio_write(
-                file.name,
-                audio,
-                MODEL.sample_rate,
-                strategy="loudness",
-                loudness_headroom_db=16,
-                loudness_compressor=True,
-                add_suffix=False,
-            )
-            output_audio_paths.append(file.name)
-            file_cleaner.add(file.name)
-    if USE_DIFFUSION:
-        # Return both audios, but make sure to return the correct one first
-        result = (
-            output_audio_paths[0],  # Original
-            output_audio_paths[1],  # MBD
         )
-    else:
-        result = (
-            output_audio_paths[0],
-            None,
-        )  # Only original audio and description
-    del melody_tensor, output, output_audio
-    if torch.cuda.is_available():
-        torch.cuda.empty_cache()
-    return result
 Wave = theme()
@@ -349,9 +293,7 @@ def create_ui(launch_kwargs=None):
                 )
                 with gr.Row():
                     submit_button = gr.Button("Generate Music", variant="primary")
-                    interrupt_button = gr.Button(
-                        "Interrupt", variant="stop"
-                    )  # Keep as gr.Button
         with gr.Row():
             model_version = gr.Dropdown(
                 [
@@ -384,8 +326,6 @@ def create_ui(launch_kwargs=None):
                 interactive=True,
             )
-        # with gr.Row():
-        #     description_output = gr.Textbox(label="MLLM Generated Description")
         with gr.Row():
             output_audio = gr.Audio(label="Generated Music", type="filepath")
             output_audio_mbd = gr.Audio(
@@ -408,12 +348,9 @@ def create_ui(launch_kwargs=None):
                 cfg_coef,
                 decoder,
             ],
-            # outputs=[output_audio, description_output, output_audio_mbd],
             outputs=[output_audio, output_audio_mbd],
         )
         interrupt_button.click(interrupt_handler, [], [])
-        if INTERRUPTING:
-            raise gr.Error("Interrupted.")
         gr.Examples(
             examples=[
@@ -495,7 +432,7 @@ if __name__ == "__main__":
     )
     parser.add_argument(
         "--server_port", type=int, default=0, help="Port to run the server on"
-    )  # Add server_port argument.
     parser.add_argument("--inbrowser", action="store_true", help="Open in browser")
     parser.add_argument("--share", action="store_true", help="Share the Gradio UI")
@@ -513,4 +450,4 @@ if __name__ == "__main__":
         launch_kwargs["share"] = args.share
     logging.basicConfig(level=logging.INFO, stream=sys.stderr)
-    create_ui(launch_kwargs)

 from pathlib import Path
 from tempfile import NamedTemporaryFile
 import gradio as gr
 import requests
 from theme_wave import theme, css
 # --- Configuration (Main App) ---
+MLLM_API_URL = "http://localhost:8000"
+MUSICGEN_API_URL = "https://your-musicgen-api-endpoint.com"  # Replace with actual MusicGen API endpoint
 # --- Global Variables (Main App) ---
 INTERRUPTING = False
 # --- Utility Functions (Main App) ---
         return gr.make_waveform(*args, **kwargs)
 # --- API Client Functions ---
 def get_mllm_description(media_path: str, user_prompt: str) -> str:
     """Gets the music description from the MLLM API."""
                 f"{MLLM_API_URL}/describe_text/", json={"user_prompt": user_prompt}
             )
+        response.raise_for_status()
         return response.json()["description"]
     except requests.exceptions.RequestException as e:
         raise gr.Error(f"An unexpected error occurred: {e}")
+def generate_music_from_api(
+    description: str,
+    melody=None,
+    duration: int = 10,
+    model_version: str = "facebook/musicgen-stereo-melody-large",
+    topk: int = 250,
+    topp: float = 0,
+    temperature: float = 1.0,
+    cfg_coef: float = 3.0,
+    use_diffusion: bool = False,
+):
+    """Generates music using the MusicGen API."""
+    # Prepare the API request payload
+    payload = {
+        "description": description,
+        "duration": duration,
+        "model_version": model_version,
+        "topk": topk,
+        "topp": topp,
+        "temperature": temperature,
+        "cfg_coef": cfg_coef,
+        "use_diffusion": use_diffusion
+    }
+    # Handle melody if provided
+    if melody is not None:
+        sr, melody_data = melody
+        # Convert melody to base64 for API transmission
+        melody_bytes = melody_data.tobytes() if hasattr(melody_data, 'tobytes') else melody_data.tostring()
+        encoded_melody = base64.b64encode(melody_bytes).decode("utf-8")
+        payload["melody"] = encoded_melody
+        payload["melody_sample_rate"] = sr
+    try:
+        response = requests.post(f"{MUSICGEN_API_URL}/generate", json=payload)
+        response.raise_for_status()
+        result = response.json()
+        # Assuming API returns base64 encoded audio files
+        audio_data = base64.b64decode(result["audio"])
+        diffusion_audio_data = base64.b64decode(result.get("diffusion_audio", "")) if use_diffusion else None
+        # Save to temporary files
+        output_paths = []
+        with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
+            file.write(audio_data)
+            output_paths.append(file.name)
+            file_cleaner.add(file.name)
+        if diffusion_audio_data:
+            with NamedTemporaryFile("wb", suffix=".wav", delete=False) as file:
+                file.write(diffusion_audio_data)
+                output_paths.append(file.name)
+                file_cleaner.add(file.name)
+        return output_paths[0], output_paths[1] if len(output_paths) > 1 else None
+    except requests.exceptions.RequestException as e:
+        raise gr.Error(f"Error communicating with MusicGen API: {e}")
+    except Exception as e:
+        raise gr.Error(f"An unexpected error occurred: {e}")
+# --- Music Generation ---
 def predict_full(
     model_version,
     media_type,
     decoder,
     progress=gr.Progress(),
 ):
+    global INTERRUPTING
     INTERRUPTING = False
+    use_diffusion = decoder == "MultiBand_Diffusion"
     if media_type == "Image":
         media = image_input if image_input else None
     else:
         media = None
+    # 1. Get Music Description (using the MLLM API)
     progress(progress=None, desc="Generating music description...")
     if media:
         try:
             music_description = get_mllm_description(media, text_prompt)
         except Exception as e:
+            raise gr.Error(str(e))
     else:
         music_description = text_prompt
+    # 2. Generate music using MusicGen API
+    progress(progress=None, desc="Generating music via API...")
     try:
+        output_audio_path, output_audio_mbd_path = generate_music_from_api(
+            description=music_description,
+            melody=melody,
+            duration=duration,
+            model_version=model_version,
+            topk=topk,
+            topp=topp,
+            temperature=temperature,
+            cfg_coef=cfg_coef,
+            use_diffusion=use_diffusion
         )
+    except Exception as e:
+        raise gr.Error(f"Error generating music: {e}")
+    if INTERRUPTING:
+        raise gr.Error("Generation interrupted.")
+    return output_audio_path, output_audio_mbd_path
 Wave = theme()
                 )
                 with gr.Row():
                     submit_button = gr.Button("Generate Music", variant="primary")
+                    interrupt_button = gr.Button("Interrupt", variant="stop")
         with gr.Row():
             model_version = gr.Dropdown(
                 [
                 interactive=True,
             )
         with gr.Row():
             output_audio = gr.Audio(label="Generated Music", type="filepath")
             output_audio_mbd = gr.Audio(
                 cfg_coef,
                 decoder,
             ],
             outputs=[output_audio, output_audio_mbd],
         )
         interrupt_button.click(interrupt_handler, [], [])
         gr.Examples(
             examples=[
     )
     parser.add_argument(
         "--server_port", type=int, default=0, help="Port to run the server on"
+    )
     parser.add_argument("--inbrowser", action="store_true", help="Open in browser")
     parser.add_argument("--share", action="store_true", help="Share the Gradio UI")
         launch_kwargs["share"] = args.share
     logging.basicConfig(level=logging.INFO, stream=sys.stderr)
+    create_ui(launch_kwargs)