ghostai1
/

GHOSTSONAFB

English

python

Model card Files Files and versions

xet

Community

ghostai1 committed on May 11, 2025

Commit

798e897

verified ·

1 Parent(s): e433b2c

Update app.py

Browse files

Files changed (1) hide show

app.py +32 -34

app.py CHANGED Viewed

@@ -10,7 +10,6 @@ import gradio as gr
 from pydub import AudioSegment
 from audiocraft.models import MusicGen
 from torch.cuda.amp import autocast
-from pydub.effects import reverb
 # Set PYTORCH_CUDA_ALLOC_CONF to manage memory fragmentation
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
@@ -82,25 +81,28 @@ def apply_chorus(segment):
     delayed = delayed.set_frame_rate(segment.frame_rate)
     return segment.overlay(delayed, position=20)
 def apply_eq(segment):
     # Adjusted EQ for clarity in midrange
     segment = segment.low_pass_filter(8000)
     segment = segment.high_pass_filter(80)
-    # Boost midrange frequencies (500 Hz to 2 kHz) for clarity
     segment = segment.equalizer(frequency=1000, gain=2, q=1.0)
     return segment
-def apply_reverb(segment):
-    # Add subtle reverb for depth
-    return reverb(segment, reverb_time=1500, wet_level=0.2)
 def apply_limiter(segment, max_db=-3.0):
     if segment.dBFS > max_db:
         segment = segment - (segment.dBFS - max_db)
     return segment
 def apply_final_gain(segment, target_db=-12.0):
-    # Adjust final gain to a safe loudness level
     gain_adjustment = target_db - segment.dBFS
     return segment + gain_adjustment
@@ -113,25 +115,21 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
         start_time = time.time()
         total_duration = min(max(total_duration, 10), 90)
-        chunk_duration = 15
-        num_chunks = max(2, (total_duration + chunk_duration - 1) // chunk_duration)
-        chunk_duration = total_duration / num_chunks
-        overlap_duration = min(1.0, crossfade_duration / 1000.0)
-        generation_duration = chunk_duration + overlap_duration
         audio_chunks = []
         sample_rate = musicgen_model.sample_rate
         for i in range(num_chunks):
-            chunk_prompt = instrumental_prompt  # Use the same prompt for all chunks
             print(f"Generating chunk {i+1}/{num_chunks} on GPU (prompt: {chunk_prompt})...")
             musicgen_model.set_generation_params(
-                duration=generation_duration,
                 use_sampling=True,
-                top_k=top_k,
-                top_p=top_p,
-                temperature=temperature,
                 cfg_coef=cfg_scale
             )
@@ -155,13 +153,16 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
             if audio_chunk.shape[0] != 2:
                 raise ValueError(f"Expected stereo audio with shape (2, samples), got shape {audio_chunk.shape}")
-            temp_wav_path = f"temp_chunk_{i}.wav"
-            chunk_path = f"chunk_{i}.mp3"
-            torchaudio.save(temp_wav_path, audio_chunk, sample_rate, bits_per_sample=24)
-            segment = AudioSegment.from_wav(temp_wav_path)
-            segment.export(chunk_path, format="mp3", bitrate="320k")
-            os.remove(temp_wav_path)
-            audio_chunks.append(chunk_path)
             torch.cuda.empty_cache()
             gc.collect()
@@ -169,9 +170,9 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
             print_resource_usage(f"After Chunk {i+1} Generation")
         print("Combining audio chunks...")
-        final_segment = AudioSegment.from_mp3(audio_chunks[0])
         for i in range(1, len(audio_chunks)):
-            next_segment = AudioSegment.from_mp3(audio_chunks[i])
             next_segment = next_segment + 1
             final_segment = final_segment.append(next_segment, crossfade=crossfade_duration)
@@ -194,9 +195,6 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
         )
         print(f"Saved final audio to {mp3_path}")
-        for chunk_path in audio_chunks:
-            os.remove(chunk_path)
         print_resource_usage("After Final Generation")
         print(f"Total Generation Time: {time.time() - start_time:.2f} seconds")
@@ -208,7 +206,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
         gc.collect()
 def clear_inputs():
-    return "", 3.0, 300, 0.95, 1.0, 45, 750
 # 7) CUSTOM CSS (Unchanged)
 css = """
@@ -379,7 +377,7 @@ with gr.Blocks(css=css) as demo:
             label="Top-K Sampling",
             minimum=10,
             maximum=500,
-            value=300,
             step=10,
             info="Limits sampling to the top k most likely tokens. Higher values increase diversity."
         )
@@ -387,7 +385,7 @@ with gr.Blocks(css=css) as demo:
             label="Top-P Sampling (Nucleus Sampling)",
             minimum=0.0,
             maximum=1.0,
-            value=0.95,
             step=0.1,
             info="Keeps tokens with cumulative probability above p. Higher values increase diversity."
         )
@@ -395,7 +393,7 @@ with gr.Blocks(css=css) as demo:
             label="Temperature",
             minimum=0.1,
             maximum=2.0,
-            value=1.0,
             step=0.1,
             info="Controls randomness. Higher values make output more diverse but less predictable."
         )

 from pydub import AudioSegment
 from audiocraft.models import MusicGen
 from torch.cuda.amp import autocast
 # Set PYTORCH_CUDA_ALLOC_CONF to manage memory fragmentation
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:32"
     delayed = delayed.set_frame_rate(segment.frame_rate)
     return segment.overlay(delayed, position=20)
+def apply_reverb(segment):
+    # Simulate reverb by overlaying multiple delayed copies with decreasing amplitude
+    reverb_segment = segment
+    for delay_ms, gain_db in [(50, -10), (100, -15), (150, -20)]:
+        delayed = segment - gain_db
+        delayed = delayed.set_frame_rate(segment.frame_rate)
+        reverb_segment = reverb_segment.overlay(delayed, position=delay_ms)
+    return reverb_segment
 def apply_eq(segment):
     """Shape the tone of `segment`: band-limit it, then lift the midrange.

     Applies, in order, `low_pass_filter(8000)`, `high_pass_filter(80)` and
     an `equalizer` call centred on 1000 with gain=2 and q=1.0, and returns
     the result of the last call.
     """
     # NOTE(review): `equalizer` is invoked as a method of the segment --
     # confirm that the audio class in use actually provides it.
     band_limited = segment.low_pass_filter(8000).high_pass_filter(80)
     return band_limited.equalizer(frequency=1000, gain=2, q=1.0)
 def apply_limiter(segment, max_db=-3.0):
     """Pull `segment` down to `max_db` when its level exceeds that ceiling.

     The level is read from `segment.dBFS`. Audio whose level is not above
     `max_db` is returned unchanged; otherwise the overshoot (in dB) is
     subtracted from the segment.
     """
     level = segment.dBFS
     if level > max_db:
         overshoot = level - max_db
         return segment - overshoot
     return segment
 def apply_final_gain(segment, target_db=-12.0):
     """Shift the level of `segment` so that its dBFS matches `target_db`.

     Args:
         segment: Audio to adjust. It must expose `dBFS` and support `+`
             with a dB value (presumably a pydub AudioSegment -- confirm
             against the caller).
         target_db: Desired level in dBFS. Defaults to -12.0.

     Returns:
         The gain-adjusted segment, or `segment` itself when it is silent.
     """
     current_db = segment.dBFS
     # A silent segment reports -inf dBFS; the gain needed to reach the
     # target would then be +inf, which cannot be applied meaningfully, so
     # silence is handed back untouched.
     if current_db == float("-inf"):
         return segment
     return segment + (target_db - current_db)
         start_time = time.time()
         total_duration = min(max(total_duration, 10), 90)
+        chunk_duration = total_duration  # Single chunk to minimize overhead
+        num_chunks = 1  # Single chunk generation
         audio_chunks = []
         sample_rate = musicgen_model.sample_rate
         for i in range(num_chunks):
+            chunk_prompt = instrumental_prompt
             print(f"Generating chunk {i+1}/{num_chunks} on GPU (prompt: {chunk_prompt})...")
             musicgen_model.set_generation_params(
+                duration=chunk_duration,
                 use_sampling=True,
+                top_k=250,  # Reduced for faster generation
+                top_p=0.9,  # Adjusted for balance
+                temperature=0.9,  # Slightly reduced for consistency
                 cfg_coef=cfg_scale
             )
             if audio_chunk.shape[0] != 2:
                 raise ValueError(f"Expected stereo audio with shape (2, samples), got shape {audio_chunk.shape}")
+            # Process in memory using pydub without intermediate file I/O
+            audio_array = audio_chunk.numpy()
+            audio_array = (audio_array * 32767).astype(np.int16)  # Convert to 16-bit PCM
+            segment = AudioSegment(
+                audio_array.tobytes(),
+                frame_rate=sample_rate,
+                sample_width=2,  # 16-bit
+                channels=2
+            )
+            audio_chunks.append(segment)
             torch.cuda.empty_cache()
             gc.collect()
             print_resource_usage(f"After Chunk {i+1} Generation")
         print("Combining audio chunks...")
+        final_segment = audio_chunks[0]
         for i in range(1, len(audio_chunks)):
+            next_segment = audio_chunks[i]
             next_segment = next_segment + 1
             final_segment = final_segment.append(next_segment, crossfade=crossfade_duration)
         )
         print(f"Saved final audio to {mp3_path}")
         print_resource_usage("After Final Generation")
         print(f"Total Generation Time: {time.time() - start_time:.2f} seconds")
         gc.collect()
 def clear_inputs():
     # Returns a fixed 7-tuple of reset values. The 250 / 0.9 / 0.9 entries
     # match the default `value=` of the Top-K, Top-P and Temperature
     # sliders in this file.
     # NOTE(review): the remaining entries are presumably the prompt text,
     # cfg scale, total duration and crossfade duration, in the order of
     # the UI outputs this function is wired to -- confirm at the call site.
+    return "", 3.0, 250, 0.9, 0.9, 45, 750
 # 7) CUSTOM CSS (Unchanged)
 css = """
             label="Top-K Sampling",
             minimum=10,
             maximum=500,
+            value=250,
             step=10,
             info="Limits sampling to the top k most likely tokens. Higher values increase diversity."
         )
             label="Top-P Sampling (Nucleus Sampling)",
             minimum=0.0,
             maximum=1.0,
+            value=0.9,
             step=0.1,
             info="Keeps tokens with cumulative probability above p. Higher values increase diversity."
         )
             label="Temperature",
             minimum=0.1,
             maximum=2.0,
+            value=0.9,
             step=0.1,
             info="Controls randomness. Higher values make output more diverse but less predictable."
         )