Spaces:
Running
on
Zero
Running
on
Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -37,18 +37,6 @@ def resample_audio(input_audio, original_sr, target_sr=16000):
|
|
| 37 |
# If sample rate is already 16000, no resampling is needed
|
| 38 |
return input_audio
|
| 39 |
|
| 40 |
-
def convert_wav_to_16khz(input_path, output_path):
|
| 41 |
-
with wave.open(input_path, "rb") as wav_in:
|
| 42 |
-
params = wav_in.getparams()
|
| 43 |
-
channels, sampwidth, framerate, nframes = params[:4]
|
| 44 |
-
|
| 45 |
-
# Read and convert audio data
|
| 46 |
-
audio_data = np.frombuffer(wav_in.readframes(nframes), dtype=np.int16)
|
| 47 |
-
new_framerate = 16000
|
| 48 |
-
|
| 49 |
-
# Save as a new WAV file
|
| 50 |
-
write(output_path, new_framerate, audio_data)
|
| 51 |
-
return output_path
|
| 52 |
|
| 53 |
def save_spectrogram_image(spectrogram, filename):
|
| 54 |
"""Save a spectrogram as an image."""
|
|
@@ -58,58 +46,6 @@ def save_spectrogram_image(spectrogram, filename):
|
|
| 58 |
plt.savefig(filename, bbox_inches='tight', pad_inches=0)
|
| 59 |
plt.close()
|
| 60 |
|
| 61 |
-
def debug_spectrogram(audio, spec, label="Current File"):
|
| 62 |
-
print(f"==== [{label}] ====")
|
| 63 |
-
print(f"🔹 Raw Audio Min/Max: {audio.min()}, {audio.max()}")
|
| 64 |
-
print(f"🔹 Spectrogram Min/Max Before Normalization: {spec.min()}, {spec.max()}")
|
| 65 |
-
print(f"🔹 Spectrogram Mean Before Normalization: {spec.mean()}")
|
| 66 |
-
|
| 67 |
-
normalized_spec = normalize_spectrogram(spec)
|
| 68 |
-
|
| 69 |
-
print(f"🔹 Spectrogram Min/Max After Normalization: {normalized_spec.min()}, {normalized_spec.max()}")
|
| 70 |
-
print(f"🔹 Spectrogram Mean After Normalization: {normalized_spec.mean()}")
|
| 71 |
-
|
| 72 |
-
return normalized_spec
|
| 73 |
-
|
| 74 |
-
def extract_pitch(y, sr, hop_length=512):
|
| 75 |
-
# Use librosa's pYIN method to estimate the pitch (fundamental frequency)
|
| 76 |
-
f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C1'), fmax=librosa.note_to_hz('C8'))
|
| 77 |
-
|
| 78 |
-
# Calculate the mean pitch (only for voiced segments)
|
| 79 |
-
f0_mean = np.mean(f0[voiced_flag]) if len(f0[voiced_flag]) > 0 else None
|
| 80 |
-
return f0_mean
|
| 81 |
-
|
| 82 |
-
def compare_pitch(original_audio, processed_audio, sr=16000):
|
| 83 |
-
# Extract pitch from the original and processed audio
|
| 84 |
-
pitch_original = extract_pitch(original_audio, sr)
|
| 85 |
-
pitch_processed = extract_pitch(processed_audio, sr)
|
| 86 |
-
|
| 87 |
-
if pitch_original is not None and pitch_processed is not None:
|
| 88 |
-
pitch_diff = pitch_original - pitch_processed
|
| 89 |
-
print(f"Original Pitch: {pitch_original} Hz")
|
| 90 |
-
print(f"Processed Pitch: {pitch_processed} Hz")
|
| 91 |
-
print(f"Pitch Difference: {pitch_diff} Hz")
|
| 92 |
-
else:
|
| 93 |
-
print("Could not extract pitch from one of the signals.")
|
| 94 |
-
|
| 95 |
-
def adjust_spectrogram_mean(spec, target_mean=-5.0):
|
| 96 |
-
# Calculate the current mean of the spectrogram
|
| 97 |
-
current_mean = spec.mean().item()
|
| 98 |
-
|
| 99 |
-
# If the current mean is below the target mean, shift the values up
|
| 100 |
-
if current_mean < target_mean:
|
| 101 |
-
shift_value = target_mean - current_mean
|
| 102 |
-
print(f"Current mean: {current_mean}. Shifting by: {shift_value}")
|
| 103 |
-
|
| 104 |
-
# Shift the entire spectrogram by the calculated shift value
|
| 105 |
-
adjusted_spec = spec + shift_value
|
| 106 |
-
|
| 107 |
-
# Ensure that the adjusted values are still valid (in the expected range)
|
| 108 |
-
adjusted_spec = torch.clamp(adjusted_spec, min=0.0) # Optional: prevent negative values if needed
|
| 109 |
-
return adjusted_spec
|
| 110 |
-
else:
|
| 111 |
-
print(f"Current mean: {current_mean}. No adjustment needed.")
|
| 112 |
-
return spec
|
| 113 |
|
| 114 |
def infer(prompt, progress=gr.Progress(track_tqdm=True)):
|
| 115 |
pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
|
|
@@ -151,16 +87,6 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
|
|
| 151 |
audio, sampling_rate = load_wav(audio_path)
|
| 152 |
audio, spec = get_mel_spectrogram_from_audio(audio)
|
| 153 |
|
| 154 |
-
# Check if the spectrogram mean before normalization is too low
|
| 155 |
-
spec_mean_before = spec.mean().item()
|
| 156 |
-
|
| 157 |
-
# Apply fix only if the spectrogram mean is too low
|
| 158 |
-
if spec_mean_before < -5.0:
|
| 159 |
-
print(f"⚠️ Spectrogram too low (Mean: {spec_mean_before}).")
|
| 160 |
-
else:
|
| 161 |
-
print(f"✅ Spectrogram looks normal (Mean: {spec_mean_before}). No boost needed.")
|
| 162 |
-
|
| 163 |
-
|
| 164 |
# Normalize the spectrogram
|
| 165 |
norm_spec = normalize_spectrogram(spec)
|
| 166 |
|
|
@@ -233,17 +159,16 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
|
|
| 233 |
output_spec_image_path = "output_spectrogram.png"
|
| 234 |
concat_image.save(output_spec_image_path)
|
| 235 |
|
| 236 |
-
# ——
|
| 237 |
-
original_audio, sr = librosa.load(audio_path, sr=None)
|
| 238 |
-
processed_audio, sr = librosa.load("output.wav", sr=None)
|
| 239 |
-
|
| 240 |
-
compare_pitch(original_audio, processed_audio)
|
| 241 |
-
|
| 242 |
return "output.wav", input_spec_image_path, output_spec_image_path
|
| 243 |
|
| 244 |
def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
|
| 245 |
|
| 246 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
|
| 248 |
pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
|
| 249 |
dtype = torch.float16
|
|
@@ -263,14 +188,9 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
|
|
| 263 |
seed = 42
|
| 264 |
|
| 265 |
# Loading
|
| 266 |
-
audio, sampling_rate = load_wav(audio_path)
|
| 267 |
-
print(f"Raw audio min/max: {audio.min()}, {audio.max()}")
|
| 268 |
-
|
| 269 |
audio, spec = get_mel_spectrogram_from_audio(audio)
|
| 270 |
-
print(f"Spectrogram min/max before normalization: {spec.min()}, {spec.max()}")
|
| 271 |
-
|
| 272 |
norm_spec = normalize_spectrogram(spec)
|
| 273 |
-
print(f"Spectrogram min/max after normalization: {norm_spec.min()}, {norm_spec.max()}")
|
| 274 |
|
| 275 |
norm_spec = pad_spec(norm_spec, 1024)
|
| 276 |
norm_spec = normalize(norm_spec) # normalize to [-1, 1], because the pipeline does not normalize torch.Tensor input
|
|
|
|
| 37 |
# If sample rate is already 16000, no resampling is needed
|
| 38 |
return input_audio
|
| 39 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
|
| 41 |
def save_spectrogram_image(spectrogram, filename):
|
| 42 |
"""Save a spectrogram as an image."""
|
|
|
|
| 46 |
plt.savefig(filename, bbox_inches='tight', pad_inches=0)
|
| 47 |
plt.close()
|
| 48 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
def infer(prompt, progress=gr.Progress(track_tqdm=True)):
|
| 51 |
pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
|
|
|
|
| 87 |
audio, sampling_rate = load_wav(audio_path)
|
| 88 |
audio, spec = get_mel_spectrogram_from_audio(audio)
|
| 89 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
# Normalize the spectrogram
|
| 91 |
norm_spec = normalize_spectrogram(spec)
|
| 92 |
|
|
|
|
| 159 |
output_spec_image_path = "output_spectrogram.png"
|
| 160 |
concat_image.save(output_spec_image_path)
|
| 161 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 162 |
return "output.wav", input_spec_image_path, output_spec_image_path
|
| 163 |
|
| 164 |
def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
|
| 165 |
|
| 166 |
+
# Load your audio file
|
| 167 |
+
input_audio, original_sr = librosa.load(audio_path, sr=None) # Load with original sampling rate
|
| 168 |
+
resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)
|
| 169 |
+
# Save the resampled audio to a new file
|
| 170 |
+
sf.write('resampled_audio.wav', resampled_audio, 16000)
|
| 171 |
+
audio_path = 'resampled_audio.wav'
|
| 172 |
|
| 173 |
pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
|
| 174 |
dtype = torch.float16
|
|
|
|
| 188 |
seed = 42
|
| 189 |
|
| 190 |
# Loading
|
| 191 |
+
audio, sampling_rate = load_wav(audio_path)
|
|
|
|
|
|
|
| 192 |
audio, spec = get_mel_spectrogram_from_audio(audio)
|
|
|
|
|
|
|
| 193 |
norm_spec = normalize_spectrogram(spec)
|
|
|
|
| 194 |
|
| 195 |
norm_spec = pad_spec(norm_spec, 1024)
|
| 196 |
norm_spec = normalize(norm_spec) # normalize to [-1, 1], because the pipeline does not normalize torch.Tensor input
|