File size: 2,564 Bytes
315325b
 
fc1def1
315325b
fa7ee39
315325b
 
 
 
 
 
 
 
 
fc1def1
 
 
fa7ee39
 
 
315325b
12d2fec
315325b
fc1def1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315325b
fc1def1
12d2fec
fc1def1
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
315325b
 
fc1def1
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import importlib
import subprocess
import sys
import tempfile
import time

# Auto-install neucodec if missing.
# The bare ``import neucodec`` is only a probe for availability; the name
# actually used by the app is imported further down the file.
try:
    import neucodec
except ImportError:
    print("Installing neucodec...")
    # Use the running interpreter's own pip so the package lands in the
    # environment this process imports from. check_call raises
    # CalledProcessError if the installation fails.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "neucodec"])
    # The package was created after the interpreter started, so the import
    # system's cached directory listings may not include it yet. Drop those
    # caches so the later import of neucodec can find it.
    importlib.invalidate_caches()

# Other imports
import gradio as gr
import torch
import torchaudio
from torchaudio import transforms as T
from neucodec import DistillNeuCodec
import librosa
import soundfile as sf
import numpy as np

# Load the pretrained DistillNeuCodec checkpoint once at import time so every
# request handled by this module reuses the same model instance.
# NOTE(review): no explicit device placement happens here; the model stays on
# whatever device from_pretrained() puts it on (presumably CPU - confirm).
model = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec")
model.eval()  # switch to evaluation mode; the model is only used for inference below

def reconstruct_audio(audio_file):
    """Round-trip an audio file through the codec model and report timing.

    The input is loaded as mono at its native sample rate, resampled to
    16 kHz when necessary, encoded to FSQ codes and decoded again. The
    decoded waveform is written to a fresh WAV file at 24 kHz.

    Args:
        audio_file: Path of the audio file to process. ``None`` or an empty
            path (e.g. the form was submitted without an upload) is reported
            back as a message instead of raising.

    Returns:
        A ``(path, info)`` tuple. ``path`` is the reconstructed WAV file, or
        ``None`` when there was nothing to process. ``info`` is a one-line
        summary (code tensor shape and processing time) or an explanation of
        why nothing was produced.
    """
    encoder_sr = 16000  # rate the audio is resampled to before encoding
    decoder_sr = 24000  # rate the reconstructed waveform is written at

    # Guard: without an input path there is nothing to load.
    if not audio_file:
        return None, "No audio file provided."

    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments while the request is running.
    start_time = time.perf_counter()

    # Load with librosa, keeping the original sample rate for the metadata.
    y, orig_sr = librosa.load(audio_file, sr=None, mono=True)
    orig_len = len(y)
    if orig_len == 0:
        return None, "Audio file contains no samples."

    # Resample to the encoder rate if needed.
    sr = orig_sr
    if sr != encoder_sr:
        y = librosa.resample(y, orig_sr=sr, target_sr=encoder_sr)
        sr = encoder_sr

    # Mono waveform (T,) -> tensor of shape (1, 1, T).
    y_tensor = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)

    # Encode & decode without tracking gradients (inference only).
    with torch.no_grad():
        fsq_codes = model.encode_code(y_tensor)
        recon = model.decode_code(fsq_codes)

    recon = recon.squeeze().cpu().numpy()

    # Write to a unique file per call: a fixed filename would let concurrent
    # requests overwrite each other's output. The handle is closed before
    # soundfile reopens the path so this also works where open files are
    # locked. The file is intentionally kept (delete=False) because the
    # caller needs the path after this function returns.
    with tempfile.NamedTemporaryFile(
        prefix="reconstructed_", suffix=".wav", delete=False
    ) as tmp:
        recon_path = tmp.name
    sf.write(recon_path, recon, decoder_sr)

    elapsed_time = time.perf_counter() - start_time

    # Metadata printed to the server log for debugging.
    metadata = {
        "original_sr": orig_sr,
        "original_length_samples": orig_len,
        "resampled_sr": sr,
        "reconstructed_sr": decoder_sr,
        "num_tokens": fsq_codes.shape,
        "processing_time_sec": round(elapsed_time, 3),
        "input_file": audio_file,
        "output_file": recon_path,
    }

    print("\n=== Audio Reconstruction Info ===")
    for k, v in metadata.items():
        print(f"{k}: {v}")

    # Return both the reconstructed file and a short summary for Gradio.
    return recon_path, f"Tokens: {fsq_codes.shape}, Processing time: {elapsed_time:.3f}s"

# Gradio UI: one audio upload goes in; the reconstructed audio and a short
# text summary come out, both produced by reconstruct_audio.
iface = gr.Interface(
    title="Audio Reconstruction with DistillNeuCodec (CPU + Librosa)",
    description="Upload any audio file, and this app will reconstruct it using DistillNeuCodec at 24kHz on CPU. Metadata and token info are also displayed.",
    fn=reconstruct_audio,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=[
        gr.Audio(type="filepath", label="Reconstructed Audio"),
        gr.Textbox(label="Info"),
    ],
)

# Start the web app only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    iface.launch()