ghostai1 committed on
Commit
3bcd320
·
verified ·
1 Parent(s): 57ea3c5

Update app.py

Browse files

Roll back — I messed up ><

Files changed (1) hide show
  1. app.py +46 -48
app.py CHANGED
@@ -47,7 +47,7 @@ def print_resource_usage(stage: str):
47
  print(f"GPU Memory Reserved: {torch.cuda.memory_reserved() / (1024**3):.2f} GB")
48
  print("---------------")
49
 
50
- # 4) GENRE PROMPT FUNCTIONS (Unchanged)
51
  def set_rock_prompt():
52
  return "Hard rock with dynamic electric guitars, heavy steady drums, deep groovy bass, subtle organ layers, and a hint of Red Hot Chili Peppers' funky rock energy, maintaining a cohesive structure throughout"
53
 
@@ -77,34 +77,23 @@ def set_deep_house_prompt():
77
 
78
  # 5) AUDIO PROCESSING FUNCTIONS
79
  def apply_chorus(segment):
80
- delayed = segment - 6
 
81
  delayed = delayed.set_frame_rate(segment.frame_rate)
82
  return segment.overlay(delayed, position=20)
83
 
84
- def apply_reverb(segment):
85
- # Simulate reverb by overlaying multiple delayed copies with decreasing amplitude
86
- reverb_segment = segment
87
- for delay_ms, gain_db in [(50, -10), (100, -15), (150, -20)]:
88
- delayed = segment - gain_db
89
- delayed = delayed.set_frame_rate(segment.frame_rate)
90
- reverb_segment = reverb_segment.overlay(delayed, position=delay_ms)
91
- return reverb_segment
92
-
93
  def apply_eq(segment):
94
- # Use only low-pass and high-pass filters, as equalizer is not available in pydub
95
- segment = segment.low_pass_filter(8000)
96
- segment = segment.high_pass_filter(80)
97
  return segment
98
 
99
  def apply_limiter(segment, max_db=-3.0):
 
100
  if segment.dBFS > max_db:
101
  segment = segment - (segment.dBFS - max_db)
102
  return segment
103
 
104
- def apply_final_gain(segment, target_db=-12.0):
105
- gain_adjustment = target_db - segment.dBFS
106
- return segment + gain_adjustment
107
-
108
  # 6) GENERATION & I/O FUNCTIONS
109
  def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, crossfade_duration: int):
110
  global musicgen_model
@@ -113,22 +102,30 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
113
  try:
114
  start_time = time.time()
115
 
 
116
  total_duration = min(max(total_duration, 10), 90)
117
- chunk_duration = total_duration # Single chunk to minimize overhead
118
- num_chunks = 1 # Single chunk generation
 
 
 
 
 
119
 
120
  audio_chunks = []
121
  sample_rate = musicgen_model.sample_rate
122
 
 
123
  for i in range(num_chunks):
124
- chunk_prompt = instrumental_prompt
 
125
  print(f"Generating chunk {i+1}/{num_chunks} on GPU (prompt: {chunk_prompt})...")
126
  musicgen_model.set_generation_params(
127
- duration=chunk_duration,
128
  use_sampling=True,
129
- top_k=250,
130
- top_p=0.9,
131
- temperature=0.9,
132
  cfg_coef=cfg_scale
133
  )
134
 
@@ -152,38 +149,36 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
152
  if audio_chunk.shape[0] != 2:
153
  raise ValueError(f"Expected stereo audio with shape (2, samples), got shape {audio_chunk.shape}")
154
 
155
- # Process in memory using pydub without intermediate file I/O
156
- audio_array = audio_chunk.numpy()
157
- audio_array = (audio_array * 32767).astype(np.int16) # Convert to 16-bit PCM
158
- segment = AudioSegment(
159
- audio_array.tobytes(),
160
- frame_rate=sample_rate,
161
- sample_width=2, # 16-bit
162
- channels=2
163
- )
164
- audio_chunks.append(segment)
165
 
166
  torch.cuda.empty_cache()
167
  gc.collect()
168
  time.sleep(0.5)
169
  print_resource_usage(f"After Chunk {i+1} Generation")
170
 
 
171
  print("Combining audio chunks...")
172
- final_segment = audio_chunks[0]
173
  for i in range(1, len(audio_chunks)):
174
- next_segment = audio_chunks[i]
175
- next_segment = next_segment + 1
176
  final_segment = final_segment.append(next_segment, crossfade=crossfade_duration)
177
 
 
178
  final_segment = final_segment[:total_duration * 1000]
179
 
 
180
  print("Post-processing final track...")
181
  final_segment = apply_eq(final_segment)
182
  final_segment = apply_chorus(final_segment)
183
- final_segment = apply_reverb(final_segment)
184
- final_segment = apply_limiter(final_segment, max_db=-3.0)
185
- final_segment = final_segment.normalize(headroom=-6.0)
186
- final_segment = apply_final_gain(final_segment, target_db=-12.0)
187
 
188
  mp3_path = "output_cleaned.mp3"
189
  final_segment.export(
@@ -194,6 +189,9 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
194
  )
195
  print(f"Saved final audio to {mp3_path}")
196
 
 
 
 
197
  print_resource_usage("After Final Generation")
198
  print(f"Total Generation Time: {time.time() - start_time:.2f} seconds")
199
 
@@ -205,7 +203,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
205
  gc.collect()
206
 
207
  def clear_inputs():
208
- return "", 3.0, 250, 0.9, 0.9, 45, 750
209
 
210
  # 7) CUSTOM CSS (Unchanged)
211
  css = """
@@ -376,7 +374,7 @@ with gr.Blocks(css=css) as demo:
376
  label="Top-K Sampling",
377
  minimum=10,
378
  maximum=500,
379
- value=250,
380
  step=10,
381
  info="Limits sampling to the top k most likely tokens. Higher values increase diversity."
382
  )
@@ -384,7 +382,7 @@ with gr.Blocks(css=css) as demo:
384
  label="Top-P Sampling (Nucleus Sampling)",
385
  minimum=0.0,
386
  maximum=1.0,
387
- value=0.9,
388
  step=0.1,
389
  info="Keeps tokens with cumulative probability above p. Higher values increase diversity."
390
  )
@@ -392,7 +390,7 @@ with gr.Blocks(css=css) as demo:
392
  label="Temperature",
393
  minimum=0.1,
394
  maximum=2.0,
395
- value=0.9,
396
  step=0.1,
397
  info="Controls randomness. Higher values make output more diverse but less predictable."
398
  )
@@ -400,7 +398,7 @@ with gr.Blocks(css=css) as demo:
400
  label="Total Duration (seconds)",
401
  minimum=10,
402
  maximum=90,
403
- value=45,
404
  step=1,
405
  info="Total duration of the track (10 to 90 seconds)."
406
  )
@@ -408,7 +406,7 @@ with gr.Blocks(css=css) as demo:
408
  label="Crossfade Duration (ms)",
409
  minimum=100,
410
  maximum=2000,
411
- value=750,
412
  step=100,
413
  info="Crossfade duration between chunks for smoother transitions."
414
  )
 
47
  print(f"GPU Memory Reserved: {torch.cuda.memory_reserved() / (1024**3):.2f} GB")
48
  print("---------------")
49
 
50
+ # 4) GENRE PROMPT FUNCTIONS (Updated for consistency, more instruments, and popular styles)
51
  def set_rock_prompt():
52
  return "Hard rock with dynamic electric guitars, heavy steady drums, deep groovy bass, subtle organ layers, and a hint of Red Hot Chili Peppers' funky rock energy, maintaining a cohesive structure throughout"
53
 
 
77
 
78
  # 5) AUDIO PROCESSING FUNCTIONS
79
  def apply_chorus(segment):
80
+ # Enhanced chorus effect for richer sound
81
+ delayed = segment - 6 # Reduced gain to -6 dB for a subtler effect
82
  delayed = delayed.set_frame_rate(segment.frame_rate)
83
  return segment.overlay(delayed, position=20)
84
 
 
 
 
 
 
 
 
 
 
85
  def apply_eq(segment):
86
+ # Adjusted EQ for a more balanced sound
87
+ segment = segment.low_pass_filter(8000) # Raised cutoff to 8kHz for brighter highs
88
+ segment = segment.high_pass_filter(80) # Lowered cutoff to 80Hz for deeper bass
89
  return segment
90
 
91
  def apply_limiter(segment, max_db=-3.0):
92
+ # Apply limiter with a higher threshold to preserve dynamics
93
  if segment.dBFS > max_db:
94
  segment = segment - (segment.dBFS - max_db)
95
  return segment
96
 
 
 
 
 
97
  # 6) GENERATION & I/O FUNCTIONS
98
  def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, crossfade_duration: int):
99
  global musicgen_model
 
102
  try:
103
  start_time = time.time()
104
 
105
+ # Ensure total duration is within reasonable bounds (up to 90 seconds for longer tracks)
106
  total_duration = min(max(total_duration, 10), 90)
107
+ chunk_duration = 15
108
+ num_chunks = max(2, (total_duration + chunk_duration - 1) // chunk_duration)
109
+ chunk_duration = total_duration / num_chunks
110
+
111
+ # Generate slightly longer chunks for overlap
112
+ overlap_duration = min(1.0, crossfade_duration / 1000.0)
113
+ generation_duration = chunk_duration + overlap_duration
114
 
115
  audio_chunks = []
116
  sample_rate = musicgen_model.sample_rate
117
 
118
+ # Generate audio in chunks with a consistent prompt
119
  for i in range(num_chunks):
120
+ chunk_prompt = instrumental_prompt # Use the same prompt for all chunks
121
+
122
  print(f"Generating chunk {i+1}/{num_chunks} on GPU (prompt: {chunk_prompt})...")
123
  musicgen_model.set_generation_params(
124
+ duration=generation_duration,
125
  use_sampling=True,
126
+ top_k=top_k,
127
+ top_p=top_p,
128
+ temperature=temperature,
129
  cfg_coef=cfg_scale
130
  )
131
 
 
149
  if audio_chunk.shape[0] != 2:
150
  raise ValueError(f"Expected stereo audio with shape (2, samples), got shape {audio_chunk.shape}")
151
 
152
+ temp_wav_path = f"temp_chunk_{i}.wav"
153
+ chunk_path = f"chunk_{i}.mp3"
154
+ torchaudio.save(temp_wav_path, audio_chunk, sample_rate, bits_per_sample=24)
155
+ segment = AudioSegment.from_wav(temp_wav_path)
156
+ segment.export(chunk_path, format="mp3", bitrate="320k")
157
+ os.remove(temp_wav_path)
158
+ audio_chunks.append(chunk_path)
 
 
 
159
 
160
  torch.cuda.empty_cache()
161
  gc.collect()
162
  time.sleep(0.5)
163
  print_resource_usage(f"After Chunk {i+1} Generation")
164
 
165
+ # Combine chunks with crossfade
166
  print("Combining audio chunks...")
167
+ final_segment = AudioSegment.from_mp3(audio_chunks[0])
168
  for i in range(1, len(audio_chunks)):
169
+ next_segment = AudioSegment.from_mp3(audio_chunks[i])
170
+ next_segment = next_segment + 1 # Reduced gain boost to +1 dB
171
  final_segment = final_segment.append(next_segment, crossfade=crossfade_duration)
172
 
173
+ # Trim to exact total duration
174
  final_segment = final_segment[:total_duration * 1000]
175
 
176
+ # Post-process with improved dynamics
177
  print("Post-processing final track...")
178
  final_segment = apply_eq(final_segment)
179
  final_segment = apply_chorus(final_segment)
180
+ final_segment = apply_limiter(final_segment, max_db=-3.0) # Apply limiter only once with higher threshold
181
+ final_segment = final_segment.normalize(headroom=-6.0) # Increased headroom to -6 dB
 
 
182
 
183
  mp3_path = "output_cleaned.mp3"
184
  final_segment.export(
 
189
  )
190
  print(f"Saved final audio to {mp3_path}")
191
 
192
+ for chunk_path in audio_chunks:
193
+ os.remove(chunk_path)
194
+
195
  print_resource_usage("After Final Generation")
196
  print(f"Total Generation Time: {time.time() - start_time:.2f} seconds")
197
 
 
203
  gc.collect()
204
 
205
  def clear_inputs():
206
+ return "", 3.0, 300, 0.95, 1.0, 30, 500
207
 
208
  # 7) CUSTOM CSS (Unchanged)
209
  css = """
 
374
  label="Top-K Sampling",
375
  minimum=10,
376
  maximum=500,
377
+ value=300,
378
  step=10,
379
  info="Limits sampling to the top k most likely tokens. Higher values increase diversity."
380
  )
 
382
  label="Top-P Sampling (Nucleus Sampling)",
383
  minimum=0.0,
384
  maximum=1.0,
385
+ value=0.95,
386
  step=0.1,
387
  info="Keeps tokens with cumulative probability above p. Higher values increase diversity."
388
  )
 
390
  label="Temperature",
391
  minimum=0.1,
392
  maximum=2.0,
393
+ value=1.0,
394
  step=0.1,
395
  info="Controls randomness. Higher values make output more diverse but less predictable."
396
  )
 
398
  label="Total Duration (seconds)",
399
  minimum=10,
400
  maximum=90,
401
+ value=30,
402
  step=1,
403
  info="Total duration of the track (10 to 90 seconds)."
404
  )
 
406
  label="Crossfade Duration (ms)",
407
  minimum=100,
408
  maximum=2000,
409
+ value=500,
410
  step=100,
411
  info="Crossfade duration between chunks for smoother transitions."
412
  )