Spaces:

humair025
/

neucodec

Sleeping

humair025 committed on Nov 6

Commit

fa7ee39

verified ·

1 Parent(s): 12d2fec

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import subprocess
 import sys
-# Install neucodec if not already installed
 try:
     import neucodec
 except ImportError:
@@ -14,28 +14,32 @@ import torch
 import torchaudio
 from torchaudio import transforms as T
 from neucodec import DistillNeuCodec
 # Load model on CPU
 model = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec")
 model.eval()  # CPU only
 def reconstruct_audio(audio_file):
-    # Load uploaded audio
-    y, sr = torchaudio.load(audio_file)
-    # Resample if needed
-    if sr != 16_000:
-        y = T.Resample(sr, 16_000)(y)
-    y = y[None, ...]  # Add batch dim (B, 1, T)
-    # Encode and decode on CPU
     with torch.no_grad():
         fsq_codes = model.encode_code(y)
         recon = model.decode_code(fsq_codes)
-    # Save to temporary file
     recon_path = "reconstructed.wav"
-    torchaudio.save(recon_path, recon[0], 24_000)
     return recon_path
@@ -44,7 +48,7 @@ iface = gr.Interface(
     fn=reconstruct_audio,
     inputs=gr.Audio(type="filepath", label="Upload Audio"),
     outputs=gr.Audio(type="filepath", label="Reconstructed Audio"),
-    title="Audio Reconstruction with DistillNeuCodec (CPU)",
     description="Upload any audio file, and this app will reconstruct it using DistillNeuCodec at 24kHz on CPU."
 )

 import subprocess
 import sys
+# Auto-install neucodec if missing
 try:
     import neucodec
 except ImportError:
 import torchaudio
 from torchaudio import transforms as T
 from neucodec import DistillNeuCodec
+import librosa
+import soundfile as sf
+import numpy as np
 # Load model on CPU
 model = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec")
 model.eval()  # CPU only
 def reconstruct_audio(audio_file):
     """Round-trip an audio file through DistillNeuCodec and save the result.

     The file is loaded as mono with librosa, resampled to 16 kHz when its
     native rate differs, encoded to codes, decoded again, and written out
     as a WAV file at 24 kHz.

     Args:
         audio_file: Path of the audio file to reconstruct.

     Returns:
         Path of a newly created WAV file holding the reconstructed audio.

     Raises:
         ValueError: If ``audio_file`` is empty or ``None`` (nothing to load).
     """
     import os
     import tempfile
     # Fail early with a clear message instead of an opaque loader error.
     if not audio_file:
         raise ValueError("No audio file provided.")
     encoder_sr = 16000  # sample rate the waveform is resampled to before encoding
     output_sr = 24000  # sample rate the decoded waveform is written at
     # Load audio with librosa (avoids torchcodec issues); sr=None keeps the
     # file's native rate so resampling only happens when it is needed.
     waveform, native_sr = librosa.load(audio_file, sr=None, mono=True)
     if native_sr != encoder_sr:
         waveform = librosa.resample(
             waveform, orig_sr=native_sr, target_sr=encoder_sr
         )
     # Add batch and channel dimensions: (T,) -> (1, 1, T).
     batch = torch.from_numpy(waveform).unsqueeze(0).unsqueeze(0)
     # Encode & decode without tracking gradients (inference only).
     with torch.no_grad():
         fsq_codes = model.encode_code(batch)
         recon = model.decode_code(fsq_codes)
     recon = recon.squeeze().cpu().numpy()
     # Write to a unique file per call so overlapping requests cannot
     # overwrite each other's output (a fixed file name would be shared).
     fd, recon_path = tempfile.mkstemp(prefix="reconstructed_", suffix=".wav")
     os.close(fd)  # soundfile reopens the path itself
     sf.write(recon_path, recon, output_sr)
     return recon_path
     fn=reconstruct_audio,
     inputs=gr.Audio(type="filepath", label="Upload Audio"),
     outputs=gr.Audio(type="filepath", label="Reconstructed Audio"),
+    title="Audio Reconstruction with DistillNeuCodec (CPU + Librosa)",
     description="Upload any audio file, and this app will reconstruct it using DistillNeuCodec at 24kHz on CPU."
 )