|
|
import subprocess |
|
|
import sys |
|
|
import time |
|
|
|
|
|
|
|
|
# Make sure the neucodec package is importable, installing it on first run.
try:
    import neucodec
except ImportError:
    import importlib

    print("Installing neucodec...")
    subprocess.check_call([sys.executable, "-m", "pip", "install", "neucodec"])
    # The package did not exist when the interpreter started, so the import
    # system's cached directory listings must be dropped before the
    # `from neucodec import ...` statement later in this file can find it.
    importlib.invalidate_caches()
|
|
|
|
|
|
|
|
import gradio as gr |
|
|
import torch |
|
|
import torchaudio |
|
|
from torchaudio import transforms as T |
|
|
from neucodec import DistillNeuCodec |
|
|
import librosa |
|
|
import soundfile as sf |
|
|
import numpy as np |
|
|
|
|
|
|
|
|
# Identifier of the pretrained checkpoint handed to `from_pretrained`.
_CHECKPOINT = "neuphonic/distill-neucodec"

# Load the codec once at import time so that every call to the handler
# reuses the same instance, then switch it to evaluation mode.
model = DistillNeuCodec.from_pretrained(_CHECKPOINT)
model.eval()
|
|
|
|
|
def reconstruct_audio(audio_file):
    """Round-trip an audio file through the codec and save the result.

    The file is loaded as mono, resampled to 16 kHz when necessary, encoded
    to FSQ codes, decoded again, and written out as a 24 kHz WAV file. A
    summary of the run is printed to stdout.

    Args:
        audio_file: Path of the audio file to reconstruct. ``None`` or an
            empty path (for example when the form is submitted without an
            upload) is reported as a message instead of raising.

    Returns:
        A ``(path, info)`` tuple: the path of the reconstructed WAV file
        (``None`` when nothing could be processed) and a summary string.
    """
    import tempfile  # local import keeps this block self-contained

    encoder_sr = 16000  # rate the waveform is resampled to before encoding
    decoder_sr = 24000  # rate the decoded waveform is written at

    # Submitting without an upload passes no path; fail softly with a message.
    if not audio_file:
        return None, "No audio file provided."

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()

    # Load at the file's native rate so the original metadata can be reported.
    y, sr = librosa.load(audio_file, sr=None, mono=True)
    orig_sr = sr
    orig_len = len(y)

    if orig_len == 0:
        return None, "Input audio is empty."

    if sr != encoder_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=encoder_sr)
        sr = encoder_sr

    # Mono waveform (T,) -> (1, 1, T) by adding two leading axes.
    y_tensor = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)

    with torch.no_grad():
        fsq_codes = model.encode_code(y_tensor)
        recon = model.decode_code(fsq_codes)

    recon = recon.squeeze().cpu().numpy()

    # Write to a unique file: a fixed name would let concurrent requests
    # overwrite each other's output.
    with tempfile.NamedTemporaryFile(
        prefix="reconstructed_", suffix=".wav", delete=False
    ) as tmp:
        recon_path = tmp.name
    sf.write(recon_path, recon, decoder_sr)

    elapsed_time = time.perf_counter() - start_time

    metadata = {
        "original_sr": orig_sr,
        "original_length_samples": orig_len,
        "resampled_sr": sr,
        "reconstructed_sr": decoder_sr,
        "num_tokens": fsq_codes.shape,
        "processing_time_sec": round(elapsed_time, 3),
        "input_file": audio_file,
        "output_file": recon_path,
    }

    print("\n=== Audio Reconstruction Info ===")
    for k, v in metadata.items():
        print(f"{k}: {v}")

    return recon_path, f"Tokens: {fsq_codes.shape}, Processing time: {elapsed_time:.3f}s"
|
|
|
|
|
|
|
|
# Build the UI components first (input, then outputs) and hand them to the
# Interface, which wires them to the reconstruction handler.
_audio_input = gr.Audio(type="filepath", label="Upload Audio")
_result_outputs = [
    gr.Audio(type="filepath", label="Reconstructed Audio"),
    gr.Textbox(label="Info"),
]

iface = gr.Interface(
    fn=reconstruct_audio,
    inputs=_audio_input,
    outputs=_result_outputs,
    title="Audio Reconstruction with DistillNeuCodec (CPU + Librosa)",
    description=(
        "Upload any audio file, and this app will reconstruct it using "
        "DistillNeuCodec at 24kHz on CPU. Metadata and token info are also "
        "displayed."
    ),
)


# Start the web app only when the file is run as a script.
if __name__ == "__main__":
    iface.launch()
|
|
|