Update stablecuda12build1.py
Browse files
i love math! it's so much fun: optimization, CBR, floats. oh boy, sure do love it, yuppers
- stablecuda12build1.py +54 -8
stablecuda12build1.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import os
|
| 2 |
import torch
|
| 3 |
import torchaudio
|
|
@@ -124,9 +125,22 @@ def check_disk_space(path="."):
|
|
| 124 |
return False
|
| 125 |
|
| 126 |
# Audio processing functions (CPU-based)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
|
| 128 |
logger.debug(f"Balancing stereo for segment with sample rate {sample_rate}")
|
| 129 |
try:
|
|
|
|
| 130 |
samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
|
| 131 |
if audio_segment.channels == 2:
|
| 132 |
stereo_samples = samples.reshape(-1, 2)
|
|
@@ -135,7 +149,7 @@ def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
|
|
| 135 |
stereo_samples = stereo_samples * mask
|
| 136 |
left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
|
| 137 |
right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
|
| 138 |
-
left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0 else 0
|
| 139 |
right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
|
| 140 |
if left_rms > 0 and right_rms > 0:
|
| 141 |
avg_rms = (left_rms + right_rms) / 2
|
|
@@ -150,7 +164,7 @@ def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
|
|
| 150 |
)
|
| 151 |
logger.debug("Stereo balancing completed")
|
| 152 |
return balanced_segment
|
| 153 |
-
logger.
|
| 154 |
return audio_segment
|
| 155 |
except Exception as e:
|
| 156 |
logger.error(f"Failed to balance stereo: {e}")
|
|
@@ -171,6 +185,7 @@ def calculate_rms(segment):
|
|
| 171 |
def rms_normalize(segment, target_rms_db=-23.0, peak_limit_db=-3.0, sample_rate=16000):
|
| 172 |
logger.debug(f"Normalizing RMS for segment with target {target_rms_db} dBFS")
|
| 173 |
try:
|
|
|
|
| 174 |
target_rms = 10 ** (target_rms_db / 20) * (2**23 if segment.sample_width == 3 else 32767)
|
| 175 |
current_rms = calculate_rms(segment)
|
| 176 |
if current_rms > 0:
|
|
@@ -187,6 +202,7 @@ def rms_normalize(segment, target_rms_db=-23.0, peak_limit_db=-3.0, sample_rate=
|
|
| 187 |
def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
|
| 188 |
logger.debug(f"Applying hard limit at {limit_db} dBFS")
|
| 189 |
try:
|
|
|
|
| 190 |
limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
|
| 191 |
samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
|
| 192 |
samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
|
|
@@ -194,7 +210,7 @@ def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
|
|
| 194 |
samples.tobytes(),
|
| 195 |
frame_rate=sample_rate,
|
| 196 |
sample_width=audio_segment.sample_width,
|
| 197 |
-
channels=
|
| 198 |
)
|
| 199 |
logger.debug("Hard limit applied")
|
| 200 |
return limited_segment
|
|
@@ -206,6 +222,7 @@ def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
|
|
| 206 |
def apply_eq(segment, sample_rate=16000):
|
| 207 |
logger.debug(f"Applying EQ with sample rate {sample_rate}")
|
| 208 |
try:
|
|
|
|
| 209 |
segment = segment.high_pass_filter(20)
|
| 210 |
segment = segment.low_pass_filter(20000)
|
| 211 |
logger.debug("EQ applied")
|
|
@@ -218,6 +235,7 @@ def apply_eq(segment, sample_rate=16000):
|
|
| 218 |
def apply_fade(segment, fade_in_duration=500, fade_out_duration=500):
|
| 219 |
logger.debug(f"Applying fade: in={fade_in_duration}ms, out={fade_out_duration}ms")
|
| 220 |
try:
|
|
|
|
| 221 |
segment = segment.fade_in(fade_in_duration)
|
| 222 |
segment = segment.fade_out(fade_out_duration)
|
| 223 |
logger.debug("Fade applied")
|
|
@@ -535,6 +553,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 535 |
max_duration = min(max_steps_int / 50, 30) # Convert steps to seconds, cap at 30s
|
| 536 |
total_duration = min(max(total_duration, 30), 120) # Clamp between 30s and 120s
|
| 537 |
processing_sample_rate = 16000 # Fixed for processing
|
|
|
|
| 538 |
audio_segments = []
|
| 539 |
overlap_duration = 0.2 # 200ms for continuation and crossfade
|
| 540 |
remaining_duration = total_duration
|
|
@@ -598,6 +617,9 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 598 |
if prev_sr != processing_sample_rate:
|
| 599 |
logger.debug(f"Resampling from {prev_sr} to {processing_sample_rate}")
|
| 600 |
prev_audio = torchaudio.transforms.Resample(prev_sr, processing_sample_rate)(prev_audio)
|
|
|
|
|
|
|
|
|
|
| 601 |
prev_audio = prev_audio.to(device)
|
| 602 |
os.remove(temp_wav_path)
|
| 603 |
logger.debug(f"Deleted temporary file {temp_wav_path}")
|
|
@@ -622,7 +644,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 622 |
logger.debug("Converting mono to stereo on CPU")
|
| 623 |
audio_np = np.stack([audio_np, audio_np], axis=0)
|
| 624 |
elif audio_np.ndim == 2 and audio_np.shape[0] != 2:
|
| 625 |
-
logger.debug("Adjusting to stereo on CPU")
|
| 626 |
audio_np = np.concatenate([audio_np, audio_np], axis=0)[:2]
|
| 627 |
if audio_np.shape[0] != 2:
|
| 628 |
logger.error(f"Expected stereo audio with shape (2, samples), got shape {audio_np.shape}")
|
|
@@ -666,6 +688,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 666 |
continue
|
| 667 |
|
| 668 |
try:
|
|
|
|
| 669 |
segment = segment - 15
|
| 670 |
if segment.frame_rate != processing_sample_rate:
|
| 671 |
logger.debug(f"Setting segment sample rate to {processing_sample_rate}")
|
|
@@ -696,26 +719,48 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 696 |
logger.info("Combining audio chunks...")
|
| 697 |
try:
|
| 698 |
final_segment = audio_segments[0][:min(max_duration, total_duration) * 1000]
|
|
|
|
| 699 |
overlap_ms = int(overlap_duration * 1000)
|
| 700 |
|
| 701 |
for i in range(1, len(audio_segments)):
|
| 702 |
current_segment = audio_segments[i]
|
| 703 |
current_segment = current_segment[:min(max_duration, total_duration - (i * max_duration)) * 1000]
|
|
|
|
| 704 |
|
| 705 |
if overlap_ms > 0 and len(current_segment) > overlap_ms:
|
| 706 |
logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
|
| 707 |
prev_overlap = final_segment[-overlap_ms:]
|
| 708 |
curr_overlap = current_segment[:overlap_ms]
|
| 709 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 710 |
blended_samples = np.zeros((num_samples, 2), dtype=np.float32)
|
| 711 |
-
prev_samples =
|
| 712 |
-
curr_samples =
|
| 713 |
hann_window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(num_samples) / num_samples))
|
| 714 |
fade_out = hann_window[::-1]
|
| 715 |
fade_in = hann_window
|
| 716 |
blended_samples = (prev_samples * fade_out[:, None] + curr_samples * fade_in[:, None])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 717 |
blended_segment = AudioSegment(
|
| 718 |
-
|
| 719 |
frame_rate=processing_sample_rate,
|
| 720 |
sample_width=sample_width,
|
| 721 |
channels=2
|
|
@@ -739,6 +784,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 739 |
logger.info("⚠️ WARNING: Audio is set to safe levels (~ -23 dBFS RMS, -3 dBFS peak). Start playback at LOW volume (10-20%) and adjust gradually.")
|
| 740 |
logger.info("VERIFY: Open the file in Audacity to check for static. RMS should be ~ -23 dBFS, peaks ≤ -3 dBFS. Report any static or issues.")
|
| 741 |
try:
|
|
|
|
| 742 |
logger.debug(f"Exporting final audio to {mp3_path} with bitrate {bitrate}, sample rate {output_sample_rate_int} Hz, bit depth {bit_depth_int}-bit")
|
| 743 |
final_segment.export(
|
| 744 |
mp3_path,
|
|
|
|
| 1 |
+
|
| 2 |
import os
|
| 3 |
import torch
|
| 4 |
import torchaudio
|
|
|
|
| 125 |
return False
|
| 126 |
|
| 127 |
# Audio processing functions (CPU-based)
|
| 128 |
+
def ensure_stereo(audio_segment, sample_rate=16000, sample_width=2):
    """Return ``audio_segment`` with exactly two channels.

    A segment that already reports two channels is handed back untouched;
    any other channel count is converted with ``set_channels(2)``.

    Args:
        audio_segment: Object exposing ``channels`` and ``set_channels``
            (presumably a pydub ``AudioSegment`` -- confirm against callers).
        sample_rate: Accepted for signature parity with the other audio
            helpers; not used by this function.
        sample_width: Accepted for signature parity; not used by this
            function.

    Returns:
        The two-channel segment, or the segment as it was passed in when
        the conversion raises (the error and traceback are logged and the
        failure is otherwise swallowed).
    """
    try:
        channel_count = audio_segment.channels
        # Guard clause: nothing to do when the segment is already stereo.
        if channel_count == 2:
            return audio_segment
        logger.debug(f"Converting to stereo: {channel_count} channels detected")
        return audio_segment.set_channels(2)
    except Exception as err:
        # Best-effort helper: log and fall back to the unconverted segment.
        logger.error(f"Failed to ensure stereo: {err}")
        logger.error(traceback.format_exc())
        return audio_segment
|
| 139 |
+
|
| 140 |
def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
|
| 141 |
logger.debug(f"Balancing stereo for segment with sample rate {sample_rate}")
|
| 142 |
try:
|
| 143 |
+
audio_segment = ensure_stereo(audio_segment, sample_rate, audio_segment.sample_width)
|
| 144 |
samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
|
| 145 |
if audio_segment.channels == 2:
|
| 146 |
stereo_samples = samples.reshape(-1, 2)
|
|
|
|
| 149 |
stereo_samples = stereo_samples * mask
|
| 150 |
left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
|
| 151 |
right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
|
| 152 |
+
left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0 else 0
|
| 153 |
right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
|
| 154 |
if left_rms > 0 and right_rms > 0:
|
| 155 |
avg_rms = (left_rms + right_rms) / 2
|
|
|
|
| 164 |
)
|
| 165 |
logger.debug("Stereo balancing completed")
|
| 166 |
return balanced_segment
|
| 167 |
+
logger.error("Failed to ensure stereo channels")
|
| 168 |
return audio_segment
|
| 169 |
except Exception as e:
|
| 170 |
logger.error(f"Failed to balance stereo: {e}")
|
|
|
|
| 185 |
def rms_normalize(segment, target_rms_db=-23.0, peak_limit_db=-3.0, sample_rate=16000):
|
| 186 |
logger.debug(f"Normalizing RMS for segment with target {target_rms_db} dBFS")
|
| 187 |
try:
|
| 188 |
+
segment = ensure_stereo(segment, sample_rate, segment.sample_width)
|
| 189 |
target_rms = 10 ** (target_rms_db / 20) * (2**23 if segment.sample_width == 3 else 32767)
|
| 190 |
current_rms = calculate_rms(segment)
|
| 191 |
if current_rms > 0:
|
|
|
|
| 202 |
def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
|
| 203 |
logger.debug(f"Applying hard limit at {limit_db} dBFS")
|
| 204 |
try:
|
| 205 |
+
audio_segment = ensure_stereo(audio_segment, sample_rate, audio_segment.sample_width)
|
| 206 |
limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
|
| 207 |
samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
|
| 208 |
samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
|
|
|
|
| 210 |
samples.tobytes(),
|
| 211 |
frame_rate=sample_rate,
|
| 212 |
sample_width=audio_segment.sample_width,
|
| 213 |
+
channels=2
|
| 214 |
)
|
| 215 |
logger.debug("Hard limit applied")
|
| 216 |
return limited_segment
|
|
|
|
| 222 |
def apply_eq(segment, sample_rate=16000):
|
| 223 |
logger.debug(f"Applying EQ with sample rate {sample_rate}")
|
| 224 |
try:
|
| 225 |
+
segment = ensure_stereo(segment, sample_rate, segment.sample_width)
|
| 226 |
segment = segment.high_pass_filter(20)
|
| 227 |
segment = segment.low_pass_filter(20000)
|
| 228 |
logger.debug("EQ applied")
|
|
|
|
| 235 |
def apply_fade(segment, fade_in_duration=500, fade_out_duration=500):
|
| 236 |
logger.debug(f"Applying fade: in={fade_in_duration}ms, out={fade_out_duration}ms")
|
| 237 |
try:
|
| 238 |
+
segment = ensure_stereo(segment, segment.frame_rate, segment.sample_width)
|
| 239 |
segment = segment.fade_in(fade_in_duration)
|
| 240 |
segment = segment.fade_out(fade_out_duration)
|
| 241 |
logger.debug("Fade applied")
|
|
|
|
| 553 |
max_duration = min(max_steps_int / 50, 30) # Convert steps to seconds, cap at 30s
|
| 554 |
total_duration = min(max(total_duration, 30), 120) # Clamp between 30s and 120s
|
| 555 |
processing_sample_rate = 16000 # Fixed for processing
|
| 556 |
+
channels = 2 # Enforce stereo
|
| 557 |
audio_segments = []
|
| 558 |
overlap_duration = 0.2 # 200ms for continuation and crossfade
|
| 559 |
remaining_duration = total_duration
|
|
|
|
| 617 |
if prev_sr != processing_sample_rate:
|
| 618 |
logger.debug(f"Resampling from {prev_sr} to {processing_sample_rate}")
|
| 619 |
prev_audio = torchaudio.transforms.Resample(prev_sr, processing_sample_rate)(prev_audio)
|
| 620 |
+
if prev_audio.shape[0] != 2:
|
| 621 |
+
logger.debug(f"Converting to stereo: {prev_audio.shape[0]} channels detected")
|
| 622 |
+
prev_audio = prev_audio.repeat(2, 1)[:, :prev_audio.shape[1]]
|
| 623 |
prev_audio = prev_audio.to(device)
|
| 624 |
os.remove(temp_wav_path)
|
| 625 |
logger.debug(f"Deleted temporary file {temp_wav_path}")
|
|
|
|
| 644 |
logger.debug("Converting mono to stereo on CPU")
|
| 645 |
audio_np = np.stack([audio_np, audio_np], axis=0)
|
| 646 |
elif audio_np.ndim == 2 and audio_np.shape[0] != 2:
|
| 647 |
+
logger.debug(f"Adjusting to stereo on CPU: {audio_np.shape[0]} channels detected")
|
| 648 |
audio_np = np.concatenate([audio_np, audio_np], axis=0)[:2]
|
| 649 |
if audio_np.shape[0] != 2:
|
| 650 |
logger.error(f"Expected stereo audio with shape (2, samples), got shape {audio_np.shape}")
|
|
|
|
| 688 |
continue
|
| 689 |
|
| 690 |
try:
|
| 691 |
+
segment = ensure_stereo(segment, processing_sample_rate, sample_width)
|
| 692 |
segment = segment - 15
|
| 693 |
if segment.frame_rate != processing_sample_rate:
|
| 694 |
logger.debug(f"Setting segment sample rate to {processing_sample_rate}")
|
|
|
|
| 719 |
logger.info("Combining audio chunks...")
|
| 720 |
try:
|
| 721 |
final_segment = audio_segments[0][:min(max_duration, total_duration) * 1000]
|
| 722 |
+
final_segment = ensure_stereo(final_segment, processing_sample_rate, sample_width)
|
| 723 |
overlap_ms = int(overlap_duration * 1000)
|
| 724 |
|
| 725 |
for i in range(1, len(audio_segments)):
|
| 726 |
current_segment = audio_segments[i]
|
| 727 |
current_segment = current_segment[:min(max_duration, total_duration - (i * max_duration)) * 1000]
|
| 728 |
+
current_segment = ensure_stereo(current_segment, processing_sample_rate, sample_width)
|
| 729 |
|
| 730 |
if overlap_ms > 0 and len(current_segment) > overlap_ms:
|
| 731 |
logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
|
| 732 |
prev_overlap = final_segment[-overlap_ms:]
|
| 733 |
curr_overlap = current_segment[:overlap_ms]
|
| 734 |
+
# Ensure stereo and consistent sample length
|
| 735 |
+
prev_overlap = ensure_stereo(prev_overlap, processing_sample_rate, sample_width)
|
| 736 |
+
curr_overlap = ensure_stereo(curr_overlap, processing_sample_rate, sample_width)
|
| 737 |
+
# Calculate samples using torchaudio for precision
|
| 738 |
+
prev_audio, _ = torchaudio.load(io.BytesIO(prev_overlap.raw_data))
|
| 739 |
+
curr_audio, _ = torchaudio.load(io.BytesIO(curr_overlap.raw_data))
|
| 740 |
+
num_samples = min(prev_audio.shape[1], curr_audio.shape[1])
|
| 741 |
+
# Ensure num_samples is even for stereo
|
| 742 |
+
num_samples = num_samples - (num_samples % 2)
|
| 743 |
+
if num_samples <= 0:
|
| 744 |
+
logger.warning(f"Skipping crossfade for chunk {i+1} due to insufficient samples")
|
| 745 |
+
final_segment += current_segment
|
| 746 |
+
continue
|
| 747 |
blended_samples = np.zeros((num_samples, 2), dtype=np.float32)
|
| 748 |
+
prev_samples = prev_audio[:, :num_samples].numpy().T
|
| 749 |
+
curr_samples = curr_audio[:, :num_samples].numpy().T
|
| 750 |
hann_window = 0.5 * (1 - np.cos(2 * np.pi * np.arange(num_samples) / num_samples))
|
| 751 |
fade_out = hann_window[::-1]
|
| 752 |
fade_in = hann_window
|
| 753 |
blended_samples = (prev_samples * fade_out[:, None] + curr_samples * fade_in[:, None])
|
| 754 |
+
# Ensure byte length is multiple of sample_width * channels
|
| 755 |
+
blended_samples = blended_samples.astype(np.int32 if sample_width == 3 else np.int16)
|
| 756 |
+
byte_data = blended_samples.tobytes()
|
| 757 |
+
byte_length = len(byte_data)
|
| 758 |
+
expected_length = byte_length - (byte_length % (sample_width * channels))
|
| 759 |
+
if byte_length != expected_length:
|
| 760 |
+
logger.debug(f"Truncating blended samples from {byte_length} to {expected_length} bytes")
|
| 761 |
+
byte_data = byte_data[:expected_length]
|
| 762 |
blended_segment = AudioSegment(
|
| 763 |
+
byte_data,
|
| 764 |
frame_rate=processing_sample_rate,
|
| 765 |
sample_width=sample_width,
|
| 766 |
channels=2
|
|
|
|
| 784 |
logger.info("⚠️ WARNING: Audio is set to safe levels (~ -23 dBFS RMS, -3 dBFS peak). Start playback at LOW volume (10-20%) and adjust gradually.")
|
| 785 |
logger.info("VERIFY: Open the file in Audacity to check for static. RMS should be ~ -23 dBFS, peaks ≤ -3 dBFS. Report any static or issues.")
|
| 786 |
try:
|
| 787 |
+
clean_memory() # Pre-export cleanup
|
| 788 |
logger.debug(f"Exporting final audio to {mp3_path} with bitrate {bitrate}, sample rate {output_sample_rate_int} Hz, bit depth {bit_depth_int}-bit")
|
| 789 |
final_segment.export(
|
| 790 |
mp3_path,
|