File size: 2,564 Bytes
315325b fc1def1 315325b fa7ee39 315325b fc1def1 fa7ee39 315325b 12d2fec 315325b fc1def1 315325b fc1def1 12d2fec fc1def1 315325b fc1def1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 |
import subprocess
import sys
import time
# Auto-install neucodec if missing.
# The package is installed with the same interpreter that runs this script
# (sys.executable), so it lands in the environment the later imports use.
try:
    import neucodec
except ImportError:
    import importlib

    print("Installing neucodec...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "neucodec"])
    # The package was installed after the interpreter started; refresh the
    # import system's finder caches so the `from neucodec import ...` below
    # can see the newly created package directory.
    importlib.invalidate_caches()
# Other imports
import gradio as gr
import torch
import torchaudio
from torchaudio import transforms as T
from neucodec import DistillNeuCodec
import librosa
import soundfile as sf
import numpy as np
# Identifier of the pretrained checkpoint passed to from_pretrained.
_MODEL_REPO = "neuphonic/distill-neucodec"

# Load the codec once at import time so every request reuses the same
# instance. No device move is performed here, so the model stays wherever
# from_pretrained places it (the original comment says CPU).
model = DistillNeuCodec.from_pretrained(_MODEL_REPO)
model.eval()  # switch to evaluation mode for inference
# Sample rates used around the codec: input audio is resampled to 16 kHz
# before encoding, and the decoded waveform is written out as 24 kHz.
_ENCODE_SR = 16000
_DECODE_SR = 24000


def reconstruct_audio(audio_file):
    """Round-trip an audio file through the codec and save the result.

    The file is loaded as mono at its native sample rate, resampled to
    16 kHz when necessary, encoded to FSQ codes, decoded back to a waveform
    and written to a new WAV file at 24 kHz. A summary of the run is printed
    to stdout.

    Args:
        audio_file: Path of the input audio file. Gradio passes ``None``
            when the user submits without uploading anything.

    Returns:
        A ``(path, info)`` tuple: the path of the reconstructed WAV file and
        a short text with the code tensor shape and the processing time.

    Raises:
        gr.Error: If no file was provided or the file contains no samples.
    """
    import tempfile  # local import: only this function needs it

    # Fail with a readable message instead of an obscure loader error.
    if not audio_file:
        raise gr.Error("Please upload an audio file first.")

    # Start timer
    start_time = time.time()

    # Load audio with librosa, keeping the original sample rate
    y, sr = librosa.load(audio_file, sr=None, mono=True)
    orig_sr = sr
    orig_len = len(y)
    if orig_len == 0:
        raise gr.Error("The uploaded audio file contains no samples.")

    # Resample to 16 kHz if needed for model encoding
    if sr != _ENCODE_SR:
        y = librosa.resample(y, orig_sr=sr, target_sr=_ENCODE_SR)
        sr = _ENCODE_SR

    # Convert to tensor: (T,) -> (1, 1, T)
    y_tensor = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)

    # Encode & decode without tracking gradients
    with torch.no_grad():
        fsq_codes = model.encode_code(y_tensor)
        recon = model.decode_code(fsq_codes)
    recon = recon.squeeze().cpu().numpy()

    # Save to a unique file per call: a single fixed file name would let
    # concurrent requests overwrite each other's output. The file is kept
    # (delete=False) because the returned path is read after this returns.
    with tempfile.NamedTemporaryFile(
        prefix="reconstructed_", suffix=".wav", delete=False
    ) as tmp:
        recon_path = tmp.name
    sf.write(recon_path, recon, _DECODE_SR)

    # End timer
    elapsed_time = time.time() - start_time

    # Metadata
    metadata = {
        "original_sr": orig_sr,
        "original_length_samples": orig_len,
        "resampled_sr": sr,
        "reconstructed_sr": _DECODE_SR,
        "num_tokens": fsq_codes.shape,
        "processing_time_sec": round(elapsed_time, 3),
        "input_file": audio_file,
        "output_file": recon_path,
    }

    # Print info
    print("\n=== Audio Reconstruction Info ===")
    for k, v in metadata.items():
        print(f"{k}: {v}")

    # Return both reconstructed file and a short summary for Gradio
    return recon_path, f"Tokens: {fsq_codes.shape}, Processing time: {elapsed_time:.3f}s"
# Gradio interface: one audio upload in, reconstructed audio plus an info
# text box out. Components are built first, then wired to the handler.
_audio_input = gr.Audio(type="filepath", label="Upload Audio")
_result_components = [
    gr.Audio(type="filepath", label="Reconstructed Audio"),
    gr.Textbox(label="Info"),
]

iface = gr.Interface(
    fn=reconstruct_audio,
    inputs=_audio_input,
    outputs=_result_components,
    title="Audio Reconstruction with DistillNeuCodec (CPU + Librosa)",
    description="Upload any audio file, and this app will reconstruct it using DistillNeuCodec at 24kHz on CPU. Metadata and token info are also displayed.",
)

# Start the web UI only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()
|