ghostai1 commited on
Commit
7a43119
·
verified ·
1 Parent(s): e351dd1

Update stablecuda12build1.py

Browse files

I love math! It's so much fun — optimization, CBR, floats — oh boy, sure do love it, yuppers

Files changed (1) hide show
  1. stablecuda12build1.py +54 -8
stablecuda12build1.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import torch
3
  import torchaudio
@@ -124,9 +125,22 @@ def check_disk_space(path="."):
124
  return False
125
 
126
  # Audio processing functions (CPU-based)
 
 
 
 
 
 
 
 
 
 
 
 
127
  def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
128
  logger.debug(f"Balancing stereo for segment with sample rate {sample_rate}")
129
  try:
 
130
  samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
131
  if audio_segment.channels == 2:
132
  stereo_samples = samples.reshape(-1, 2)
@@ -135,7 +149,7 @@ def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
135
  stereo_samples = stereo_samples * mask
136
  left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
137
  right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
138
- left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0 else 0
139
  right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
140
  if left_rms > 0 and right_rms > 0:
141
  avg_rms = (left_rms + right_rms) / 2
@@ -150,7 +164,7 @@ def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
150
  )
151
  logger.debug("Stereo balancing completed")
152
  return balanced_segment
153
- logger.debug("Segment is not stereo, returning unchanged")
154
  return audio_segment
155
  except Exception as e:
156
  logger.error(f"Failed to balance stereo: {e}")
@@ -171,6 +185,7 @@ def calculate_rms(segment):
171
  def rms_normalize(segment, target_rms_db=-23.0, peak_limit_db=-3.0, sample_rate=16000):
172
  logger.debug(f"Normalizing RMS for segment with target {target_rms_db} dBFS")
173
  try:
 
174
  target_rms = 10 ** (target_rms_db / 20) * (2**23 if segment.sample_width == 3 else 32767)
175
  current_rms = calculate_rms(segment)
176
  if current_rms > 0:
@@ -187,6 +202,7 @@ def rms_normalize(segment, target_rms_db=-23.0, peak_limit_db=-3.0, sample_rate=
187
  def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
188
  logger.debug(f"Applying hard limit at {limit_db} dBFS")
189
  try:
 
190
  limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
191
  samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
192
  samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
@@ -194,7 +210,7 @@ def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
194
  samples.tobytes(),
195
  frame_rate=sample_rate,
196
  sample_width=audio_segment.sample_width,
197
- channels=audio_segment.channels
198
  )
199
  logger.debug("Hard limit applied")
200
  return limited_segment
@@ -206,6 +222,7 @@ def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
206
  def apply_eq(segment, sample_rate=16000):
207
  logger.debug(f"Applying EQ with sample rate {sample_rate}")
208
  try:
 
209
  segment = segment.high_pass_filter(20)
210
  segment = segment.low_pass_filter(20000)
211
  logger.debug("EQ applied")
@@ -218,6 +235,7 @@ def apply_eq(segment, sample_rate=16000):
218
  def apply_fade(segment, fade_in_duration=500, fade_out_duration=500):
219
  logger.debug(f"Applying fade: in={fade_in_duration}ms, out={fade_out_duration}ms")
220
  try:
 
221
  segment = segment.fade_in(fade_in_duration)
222
  segment = segment.fade_out(fade_out_duration)
223
  logger.debug("Fade applied")
@@ -535,6 +553,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
535
  max_duration = min(max_steps_int / 50, 30) # Convert steps to seconds, cap at 30s
536
  total_duration = min(max(total_duration, 30), 120) # Clamp between 30s and 120s
537
  processing_sample_rate = 16000 # Fixed for processing
 
538
  audio_segments = []
539
  overlap_duration = 0.2 # 200ms for continuation and crossfade
540
  remaining_duration = total_duration
@@ -598,6 +617,9 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
598
  if prev_sr != processing_sample_rate:
599
  logger.debug(f"Resampling from {prev_sr} to {processing_sample_rate}")
600
  prev_audio = torchaudio.transforms.Resample(prev_sr, processing_sample_rate)(prev_audio)
 
 
 
601
  prev_audio = prev_audio.to(device)
602
  os.remove(temp_wav_path)
603
  logger.debug(f"Deleted temporary file {temp_wav_path}")
@@ -622,7 +644,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
622
  logger.debug("Converting mono to stereo on CPU")
623
  audio_np = np.stack([audio_np, audio_np], axis=0)
624
  elif audio_np.ndim == 2 and audio_np.shape[0] != 2:
625
- logger.debug("Adjusting to stereo on CPU")
626
  audio_np = np.concatenate([audio_np, audio_np], axis=0)[:2]
627
  if audio_np.shape[0] != 2:
628
  logger.error(f"Expected stereo audio with shape (2, samples), got shape {audio_np.shape}")
@@ -666,6 +688,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
666
  continue
667
 
668
  try:
 
669
  segment = segment - 15
670
  if segment.frame_rate != processing_sample_rate:
671
  logger.debug(f"Setting segment sample rate to {processing_sample_rate}")
@@ -696,26 +719,48 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
696
  logger.info("Combining audio chunks...")
697
  try:
698
  final_segment = audio_segments[0][:min(max_duration, total_duration) * 1000]
 
699
  overlap_ms = int(overlap_duration * 1000)
700
 
701
  for i in range(1, len(audio_segments)):
702
  current_segment = audio_segments[i]
703
  current_segment = current_segment[:min(max_duration, total_duration - (i * max_duration)) * 1000]
 
704
 
705
  if overlap_ms > 0 and len(current_segment) > overlap_ms:
706
  logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
707
  prev_overlap = final_segment[-overlap_ms:]
708
  curr_overlap = current_segment[:overlap_ms]
709
- num_samples = len(np.array(prev_overlap.get_array_of_samples(), dtype=np.float32)) // 2
 
 
 
 
 
 
 
 
 
 
 
 
710
  blended_samples = np.zeros((num_samples, 2), dtype=np.float32)
711
- prev_samples = np.array(prev_overlap.get_array_of_samples(), dtype=np.float32).reshape(-1, 2)
712
- curr_samples = np.array(curr_overlap.get_array_of_samples(), dtype=np.float32).reshape(-1, 2)
713
  hann_window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(num_samples) / num_samples))
714
  fade_out = hann_window[::-1]
715
  fade_in = hann_window
716
  blended_samples = (prev_samples * fade_out[:, None] + curr_samples * fade_in[:, None])
 
 
 
 
 
 
 
 
717
  blended_segment = AudioSegment(
718
- blended_samples.astype(np.int32 if sample_width == 3 else np.int16).tobytes(),
719
  frame_rate=processing_sample_rate,
720
  sample_width=sample_width,
721
  channels=2
@@ -739,6 +784,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
739
  logger.info("⚠️ WARNING: Audio is set to safe levels (~ -23 dBFS RMS, -3 dBFS peak). Start playback at LOW volume (10-20%) and adjust gradually.")
740
  logger.info("VERIFY: Open the file in Audacity to check for static. RMS should be ~ -23 dBFS, peaks ≤ -3 dBFS. Report any static or issues.")
741
  try:
 
742
  logger.debug(f"Exporting final audio to {mp3_path} with bitrate {bitrate}, sample rate {output_sample_rate_int} Hz, bit depth {bit_depth_int}-bit")
743
  final_segment.export(
744
  mp3_path,
 
1
+
2
  import os
3
  import torch
4
  import torchaudio
 
125
  return False
126
 
127
  # Audio processing functions (CPU-based)
128
def ensure_stereo(audio_segment, sample_rate=16000, sample_width=2):
    """Return *audio_segment* with exactly two channels.

    Already-stereo input is passed through untouched; anything else is
    converted with ``set_channels(2)``.  ``sample_rate`` and ``sample_width``
    are accepted for call-site symmetry with the other audio helpers but are
    not used by this function.  On any exception the error is logged and the
    original segment is returned unmodified (best-effort behavior).
    """
    try:
        if audio_segment.channels == 2:
            return audio_segment
        logger.debug(f"Converting to stereo: {audio_segment.channels} channels detected")
        return audio_segment.set_channels(2)
    except Exception as e:
        logger.error(f"Failed to ensure stereo: {e}")
        logger.error(traceback.format_exc())
        return audio_segment
139
+
140
  def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
141
  logger.debug(f"Balancing stereo for segment with sample rate {sample_rate}")
142
  try:
143
+ audio_segment = ensure_stereo(audio_segment, sample_rate, audio_segment.sample_width)
144
  samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
145
  if audio_segment.channels == 2:
146
  stereo_samples = samples.reshape(-1, 2)
 
149
  stereo_samples = stereo_samples * mask
150
  left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
151
  right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
152
+ left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0 else 0
153
  right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
154
  if left_rms > 0 and right_rms > 0:
155
  avg_rms = (left_rms + right_rms) / 2
 
164
  )
165
  logger.debug("Stereo balancing completed")
166
  return balanced_segment
167
+ logger.error("Failed to ensure stereo channels")
168
  return audio_segment
169
  except Exception as e:
170
  logger.error(f"Failed to balance stereo: {e}")
 
185
  def rms_normalize(segment, target_rms_db=-23.0, peak_limit_db=-3.0, sample_rate=16000):
186
  logger.debug(f"Normalizing RMS for segment with target {target_rms_db} dBFS")
187
  try:
188
+ segment = ensure_stereo(segment, sample_rate, segment.sample_width)
189
  target_rms = 10 ** (target_rms_db / 20) * (2**23 if segment.sample_width == 3 else 32767)
190
  current_rms = calculate_rms(segment)
191
  if current_rms > 0:
 
202
  def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
203
  logger.debug(f"Applying hard limit at {limit_db} dBFS")
204
  try:
205
+ audio_segment = ensure_stereo(audio_segment, sample_rate, audio_segment.sample_width)
206
  limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
207
  samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
208
  samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
 
210
  samples.tobytes(),
211
  frame_rate=sample_rate,
212
  sample_width=audio_segment.sample_width,
213
+ channels=2
214
  )
215
  logger.debug("Hard limit applied")
216
  return limited_segment
 
222
  def apply_eq(segment, sample_rate=16000):
223
  logger.debug(f"Applying EQ with sample rate {sample_rate}")
224
  try:
225
+ segment = ensure_stereo(segment, sample_rate, segment.sample_width)
226
  segment = segment.high_pass_filter(20)
227
  segment = segment.low_pass_filter(20000)
228
  logger.debug("EQ applied")
 
235
  def apply_fade(segment, fade_in_duration=500, fade_out_duration=500):
236
  logger.debug(f"Applying fade: in={fade_in_duration}ms, out={fade_out_duration}ms")
237
  try:
238
+ segment = ensure_stereo(segment, segment.frame_rate, segment.sample_width)
239
  segment = segment.fade_in(fade_in_duration)
240
  segment = segment.fade_out(fade_out_duration)
241
  logger.debug("Fade applied")
 
553
  max_duration = min(max_steps_int / 50, 30) # Convert steps to seconds, cap at 30s
554
  total_duration = min(max(total_duration, 30), 120) # Clamp between 30s and 120s
555
  processing_sample_rate = 16000 # Fixed for processing
556
+ channels = 2 # Enforce stereo
557
  audio_segments = []
558
  overlap_duration = 0.2 # 200ms for continuation and crossfade
559
  remaining_duration = total_duration
 
617
  if prev_sr != processing_sample_rate:
618
  logger.debug(f"Resampling from {prev_sr} to {processing_sample_rate}")
619
  prev_audio = torchaudio.transforms.Resample(prev_sr, processing_sample_rate)(prev_audio)
620
+ if prev_audio.shape[0] != 2:
621
+ logger.debug(f"Converting to stereo: {prev_audio.shape[0]} channels detected")
622
+ prev_audio = prev_audio.repeat(2, 1)[:, :prev_audio.shape[1]]
623
  prev_audio = prev_audio.to(device)
624
  os.remove(temp_wav_path)
625
  logger.debug(f"Deleted temporary file {temp_wav_path}")
 
644
  logger.debug("Converting mono to stereo on CPU")
645
  audio_np = np.stack([audio_np, audio_np], axis=0)
646
  elif audio_np.ndim == 2 and audio_np.shape[0] != 2:
647
+ logger.debug(f"Adjusting to stereo on CPU: {audio_np.shape[0]} channels detected")
648
  audio_np = np.concatenate([audio_np, audio_np], axis=0)[:2]
649
  if audio_np.shape[0] != 2:
650
  logger.error(f"Expected stereo audio with shape (2, samples), got shape {audio_np.shape}")
 
688
  continue
689
 
690
  try:
691
+ segment = ensure_stereo(segment, processing_sample_rate, sample_width)
692
  segment = segment - 15
693
  if segment.frame_rate != processing_sample_rate:
694
  logger.debug(f"Setting segment sample rate to {processing_sample_rate}")
 
719
  logger.info("Combining audio chunks...")
720
  try:
721
  final_segment = audio_segments[0][:min(max_duration, total_duration) * 1000]
722
+ final_segment = ensure_stereo(final_segment, processing_sample_rate, sample_width)
723
  overlap_ms = int(overlap_duration * 1000)
724
 
725
  for i in range(1, len(audio_segments)):
726
  current_segment = audio_segments[i]
727
  current_segment = current_segment[:min(max_duration, total_duration - (i * max_duration)) * 1000]
728
+ current_segment = ensure_stereo(current_segment, processing_sample_rate, sample_width)
729
 
730
  if overlap_ms > 0 and len(current_segment) > overlap_ms:
731
  logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
732
  prev_overlap = final_segment[-overlap_ms:]
733
  curr_overlap = current_segment[:overlap_ms]
734
+ # Ensure stereo and consistent sample length
735
+ prev_overlap = ensure_stereo(prev_overlap, processing_sample_rate, sample_width)
736
+ curr_overlap = ensure_stereo(curr_overlap, processing_sample_rate, sample_width)
737
+ # Calculate samples using torchaudio for precision
738
+ prev_audio, _ = torchaudio.load(io.BytesIO(prev_overlap.raw_data))
739
+ curr_audio, _ = torchaudio.load(io.BytesIO(curr_overlap.raw_data))
740
+ num_samples = min(prev_audio.shape[1], curr_audio.shape[1])
741
+ # Ensure num_samples is even for stereo
742
+ num_samples = num_samples - (num_samples % 2)
743
+ if num_samples <= 0:
744
+ logger.warning(f"Skipping crossfade for chunk {i+1} due to insufficient samples")
745
+ final_segment += current_segment
746
+ continue
747
  blended_samples = np.zeros((num_samples, 2), dtype=np.float32)
748
+ prev_samples = prev_audio[:, :num_samples].numpy().T
749
+ curr_samples = curr_audio[:, :num_samples].numpy().T
750
  hann_window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(num_samples) / num_samples))
751
  fade_out = hann_window[::-1]
752
  fade_in = hann_window
753
  blended_samples = (prev_samples * fade_out[:, None] + curr_samples * fade_in[:, None])
754
+ # Ensure byte length is multiple of sample_width * channels
755
+ blended_samples = blended_samples.astype(np.int32 if sample_width == 3 else np.int16)
756
+ byte_data = blended_samples.tobytes()
757
+ byte_length = len(byte_data)
758
+ expected_length = byte_length - (byte_length % (sample_width * channels))
759
+ if byte_length != expected_length:
760
+ logger.debug(f"Truncating blended samples from {byte_length} to {expected_length} bytes")
761
+ byte_data = byte_data[:expected_length]
762
  blended_segment = AudioSegment(
763
+ byte_data,
764
  frame_rate=processing_sample_rate,
765
  sample_width=sample_width,
766
  channels=2
 
784
  logger.info("⚠️ WARNING: Audio is set to safe levels (~ -23 dBFS RMS, -3 dBFS peak). Start playback at LOW volume (10-20%) and adjust gradually.")
785
  logger.info("VERIFY: Open the file in Audacity to check for static. RMS should be ~ -23 dBFS, peaks ≤ -3 dBFS. Report any static or issues.")
786
  try:
787
+ clean_memory() # Pre-export cleanup
788
  logger.debug(f"Exporting final audio to {mp3_path} with bitrate {bitrate}, sample rate {output_sample_rate_int} Hz, bit depth {bit_depth_int}-bit")
789
  final_segment.export(
790
  mp3_path,