"""Gradio demo: round-trip an uploaded audio file through DistillNeuCodec on CPU."""

import importlib
import subprocess
import sys
import tempfile
import time
from typing import Optional, Tuple

# Auto-install neucodec if missing
try:
    import neucodec  # noqa: F401  (availability probe only)
except ImportError:
    print("Installing neucodec...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "neucodec"])
    # A package installed while the interpreter is running may be invisible to
    # the cached import finders; invalidate them so the import below succeeds.
    importlib.invalidate_caches()

# Other imports
import gradio as gr
import torch
import torchaudio
from torchaudio import transforms as T
from neucodec import DistillNeuCodec
import librosa
import soundfile as sf
import numpy as np

# Sample rate the audio is resampled to before encoding.
ENCODER_SAMPLE_RATE = 16000
# Sample rate used when writing the decoded audio.
OUTPUT_SAMPLE_RATE = 24000

# Load model on CPU
model = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec")
model.eval()  # CPU only


def reconstruct_audio(audio_file: Optional[str]) -> Tuple[str, str]:
    """Encode an audio file to codec tokens and decode it back to a WAV file.

    Args:
        audio_file: Path of the uploaded audio file, as supplied by the
            ``gr.Audio(type="filepath")`` input. ``None`` when the user
            submitted without uploading anything.

    Returns:
        A ``(path, info)`` pair: the path of the reconstructed WAV file and a
        short human-readable summary (token tensor shape, processing time).

    Raises:
        gr.Error: If no file was provided or the file contains no samples.
    """
    if not audio_file:
        raise gr.Error("Please upload an audio file first.")

    # Start timer (monotonic clock, so system clock changes cannot skew it)
    start_time = time.perf_counter()

    # Load audio with librosa, keeping the original sample rate
    y, sr = librosa.load(audio_file, sr=None, mono=True)
    if y.size == 0:
        raise gr.Error("The uploaded audio file contains no samples.")

    orig_sr = sr
    orig_len = len(y)

    # Resample to 16kHz if needed for model encoding
    if sr != ENCODER_SAMPLE_RATE:
        y = librosa.resample(y, orig_sr=sr, target_sr=ENCODER_SAMPLE_RATE)
        sr = ENCODER_SAMPLE_RATE

    # Convert to tensor (1, 1, T)
    y_tensor = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)

    # Encode & decode (inference only, so gradients are not needed)
    with torch.no_grad():
        fsq_codes = model.encode_code(y_tensor)
        recon = model.decode_code(fsq_codes)

    recon = recon.squeeze().cpu().numpy()

    # Save reconstructed audio to a unique file per call: a single fixed
    # filename would let concurrent requests overwrite each other's output.
    # The file is intentionally not deleted here because Gradio reads it
    # after this function returns.
    with tempfile.NamedTemporaryFile(
        prefix="reconstructed_", suffix=".wav", delete=False
    ) as tmp:
        recon_path = tmp.name
    sf.write(recon_path, recon, OUTPUT_SAMPLE_RATE)

    # End timer
    elapsed_time = time.perf_counter() - start_time

    # Metadata
    metadata = {
        "original_sr": orig_sr,
        "original_length_samples": orig_len,
        "resampled_sr": sr,
        "reconstructed_sr": OUTPUT_SAMPLE_RATE,
        "num_tokens": fsq_codes.shape,
        "processing_time_sec": round(elapsed_time, 3),
        "input_file": audio_file,
        "output_file": recon_path,
    }

    # Print info
    print("\n=== Audio Reconstruction Info ===")
    for k, v in metadata.items():
        print(f"{k}: {v}")

    # Return both reconstructed file and metadata for Gradio
    return (
        recon_path,
        f"Tokens: {fsq_codes.shape}, Processing time: {elapsed_time:.3f}s",
    )


# Gradio interface
iface = gr.Interface(
    fn=reconstruct_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=[
        gr.Audio(type="filepath", label="Reconstructed Audio"),
        gr.Textbox(label="Info"),
    ],
    title="Audio Reconstruction with DistillNeuCodec (CPU + Librosa)",
    description="Upload any audio file, and this app will reconstruct it using DistillNeuCodec at 24kHz on CPU. Metadata and token info are also displayed.",
)

if __name__ == "__main__":
    iface.launch()