Spaces:

fffiloni
/

auffusion

Running on Zero

App Files Community

fffiloni committed on Feb 5, 2025

Commit

c01e8f8

verified ·

1 Parent(s): cf62874

Update app.py

Browse files

Files changed (1) hide show

app.py +7 -87

app.py CHANGED Viewed

@@ -37,18 +37,6 @@ def resample_audio(input_audio, original_sr, target_sr=16000):
         # If sample rate is already 16000, no resampling is needed
         return input_audio
-def convert_wav_to_16khz(input_path, output_path):
-    with wave.open(input_path, "rb") as wav_in:
-        params = wav_in.getparams()
-        channels, sampwidth, framerate, nframes = params[:4]
-        # Read and convert audio data
-        audio_data = np.frombuffer(wav_in.readframes(nframes), dtype=np.int16)
-        new_framerate = 16000
-        # Save as a new WAV file
-        write(output_path, new_framerate, audio_data)
-        return output_path
 def save_spectrogram_image(spectrogram, filename):
     """Save a spectrogram as an image."""
@@ -58,58 +46,6 @@ def save_spectrogram_image(spectrogram, filename):
     plt.savefig(filename, bbox_inches='tight', pad_inches=0)
     plt.close()
-def debug_spectrogram(audio, spec, label="Current File"):
-    print(f"==== [{label}] ====")
-    print(f"🔹 Raw Audio Min/Max: {audio.min()}, {audio.max()}")
-    print(f"🔹 Spectrogram Min/Max Before Normalization: {spec.min()}, {spec.max()}")
-    print(f"🔹 Spectrogram Mean Before Normalization: {spec.mean()}")
-    normalized_spec = normalize_spectrogram(spec)
-    print(f"🔹 Spectrogram Min/Max After Normalization: {normalized_spec.min()}, {normalized_spec.max()}")
-    print(f"🔹 Spectrogram Mean After Normalization: {normalized_spec.mean()}")
-    return normalized_spec
-def extract_pitch(y, sr, hop_length=512):
-    # Use librosa's yin method to estimate the pitch (fundamental frequency)
-    f0, voiced_flag, voiced_probs = librosa.pyin(y, fmin=librosa.note_to_hz('C1'), fmax=librosa.note_to_hz('C8'))
-    # Calculate the mean pitch (only for voiced segments)
-    f0_mean = np.mean(f0[voiced_flag]) if len(f0[voiced_flag]) > 0 else None
-    return f0_mean
-def compare_pitch(original_audio, processed_audio, sr=16000):
-    # Extract pitch from the original and processed audio
-    pitch_original = extract_pitch(original_audio, sr)
-    pitch_processed = extract_pitch(processed_audio, sr)
-    if pitch_original is not None and pitch_processed is not None:
-        pitch_diff = pitch_original - pitch_processed
-        print(f"Original Pitch: {pitch_original} Hz")
-        print(f"Processed Pitch: {pitch_processed} Hz")
-        print(f"Pitch Difference: {pitch_diff} Hz")
-    else:
-        print("Could not extract pitch from one of the signals.")
-def adjust_spectrogram_mean(spec, target_mean=-5.0):
-    # Calculate the current mean of the spectrogram
-    current_mean = spec.mean().item()
-    # If the current mean is below the target mean, shift the values up
-    if current_mean < target_mean:
-        shift_value = target_mean - current_mean
-        print(f"Current mean: {current_mean}. Shifting by: {shift_value}")
-        # Shift the entire spectrogram by the calculated shift value
-        adjusted_spec = spec + shift_value
-        # Ensure that the adjusted values are still valid (in the expected range)
-        adjusted_spec = torch.clamp(adjusted_spec, min=0.0)  # Optional: prevent negative values if needed
-        return adjusted_spec
-    else:
-        print(f"Current mean: {current_mean}. No adjustment needed.")
-        return spec
 def infer(prompt, progress=gr.Progress(track_tqdm=True)):
     pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
@@ -151,16 +87,6 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
     audio, sampling_rate = load_wav(audio_path)
     audio, spec = get_mel_spectrogram_from_audio(audio)
-    # Check if the spectrogram mean before normalization is too low
-    spec_mean_before = spec.mean().item()
-    # Apply fix only if the spectrogram mean is too low
-    if spec_mean_before < -5.0:
-        print(f"⚠️ Spectrogram too low (Mean: {spec_mean_before}).")
-    else:
-        print(f"✅ Spectrogram looks normal (Mean: {spec_mean_before}). No boost needed.")
     # Normalize the spectrogram
     norm_spec = normalize_spectrogram(spec)
@@ -233,17 +159,16 @@ def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(tra
     output_spec_image_path = "output_spectrogram.png"
     concat_image.save(output_spec_image_path)
-    # ——
-    original_audio, sr = librosa.load(audio_path, sr=None)
-    processed_audio, sr = librosa.load("output.wav", sr=None)
-    compare_pitch(original_audio, processed_audio)
     return "output.wav", input_spec_image_path, output_spec_image_path
 def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
-    audio_path = convert_wav_to_16khz(audio_path, "output_16khz.wav")
     pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
     dtype = torch.float16
@@ -263,14 +188,9 @@ def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.
     seed = 42
     # Loading
-    audio, sampling_rate = load_wav(audio_path)
-    print(f"Raw audio min/max: {audio.min()}, {audio.max()}")
     audio, spec = get_mel_spectrogram_from_audio(audio)
-    print(f"Spectrogram min/max before normalization: {spec.min()}, {spec.max()}")
     norm_spec = normalize_spectrogram(spec)
-    print(f"Spectrogram min/max after normalization: {norm_spec.min()}, {norm_spec.max()}")
     norm_spec = pad_spec(norm_spec, 1024)
     norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input

         # If sample rate is already 16000, no resampling is needed
         return input_audio
 def save_spectrogram_image(spectrogram, filename):
     """Save a spectrogram as an image."""
     plt.savefig(filename, bbox_inches='tight', pad_inches=0)
     plt.close()
 def infer(prompt, progress=gr.Progress(track_tqdm=True)):
     pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
     audio, sampling_rate = load_wav(audio_path)
     audio, spec = get_mel_spectrogram_from_audio(audio)
     # Normalize the spectrogram
     norm_spec = normalize_spectrogram(spec)
     output_spec_image_path = "output_spectrogram.png"
     concat_image.save(output_spec_image_path)
     return "output.wav", input_spec_image_path, output_spec_image_path
 def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
+    # Load your audio file
+    input_audio, original_sr = librosa.load(audio_path, sr=None)  # Load with original sampling rate
+    resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)
+    # Save the resampled audio to a new file
+    sf.write('resampled_audio.wav', resampled_audio, 16000)
+    audio_path = 'resampled_audio.wav'
     pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
     dtype = torch.float16
     seed = 42
     # Loading
+    audio, sampling_rate = load_wav(audio_path)
     audio, spec = get_mel_spectrogram_from_audio(audio)
     norm_spec = normalize_spectrogram(spec)
     norm_spec = pad_spec(norm_spec, 1024)
     norm_spec = normalize(norm_spec) # normalize to [-1, 1], because pipeline do not normalize for torch.Tensor input