File size: 2,826 Bytes
c82ec83 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 | """
nano-codec inference: reconstruct audio through the codec.
Usage:
python inference.py --input input.wav --output reconstructed.wav
Downloads model weights from HuggingFace on first run.
"""
import argparse
import torch
import soundfile as sf
import torchaudio
import yaml
from huggingface_hub import hf_hub_download
from model import RVQCodec
# HuggingFace Hub repository id from which load_model() downloads
# "model.pt" and "config.yaml".
REPO_ID = "taresh18/nano-codec"
def load_model(device="cpu"):
    """Fetch the published checkpoint and config, then build the codec.

    Downloads ``model.pt`` and ``config.yaml`` from the ``REPO_ID``
    repository with ``hf_hub_download``, constructs an ``RVQCodec`` from the
    config values, loads the weights into it and puts it in eval mode.

    Args:
        device: Device the weights are mapped to and the model is moved to.

    Returns:
        A ``(model, cfg)`` tuple: the ready-to-use codec and the parsed
        YAML config.
    """
    weights_file = hf_hub_download(REPO_ID, "model.pt")
    config_file = hf_hub_download(REPO_ID, "config.yaml")

    with open(config_file) as handle:
        cfg = yaml.safe_load(handle)

    # 'codebook_dim' is the only optional config key; it falls back to 8.
    codec_kwargs = {
        "in_ch": 1,
        "latent_ch": cfg['latent_dim'],
        "K": cfg['codebook_size'],
        "num_rvq_levels": cfg['num_rvq_levels'],
        "codebook_dim": cfg.get('codebook_dim', 8),
    }
    codec = RVQCodec(**codec_kwargs)

    # weights_only=True restricts unpickling to tensors/plain containers.
    weights = torch.load(weights_file, map_location=device, weights_only=True)
    codec.load_state_dict(weights)

    codec = codec.to(device)
    codec.eval()
    return codec, cfg
def reconstruct(model, audio_path, output_path, sample_rate=16000, chunk_size=16384, device="cpu"):
    """Run an audio file through the codec and write the reconstruction.

    The input is read as float32, averaged over axis 1 when it has more than
    one dimension, resampled to ``sample_rate`` if its rate differs,
    peak-normalized, zero-padded to a multiple of ``chunk_size``, passed
    through ``model`` one chunk at a time, trimmed back to the unpadded
    length and written to ``output_path``.

    Args:
        model: Callable taking a ``(1, 1, chunk_size)`` tensor and returning
            a 4-tuple whose first element is the reconstruction.
        audio_path: Path of the audio file to read.
        output_path: Path the reconstructed audio is written to.
        sample_rate: Rate the audio is resampled to and written at.
        chunk_size: Number of samples fed to the model per forward pass.
        device: Device each chunk is moved to before the forward pass.

    Raises:
        ValueError: If the input file contains no samples.
    """
    audio, sr = sf.read(audio_path, dtype='float32')
    if audio.size == 0:
        # Without this guard the peak normalization below fails on an empty
        # tensor with an opaque torch reduction error.
        raise ValueError(f"input audio contains no samples: {audio_path}")

    if audio.ndim > 1:
        # Down-mix to mono by averaging over axis 1.
        audio = audio.mean(axis=1)

    waveform = torch.from_numpy(audio).unsqueeze(0)  # (1, samples)
    if sr != sample_rate:
        waveform = torchaudio.functional.resample(waveform, sr, sample_rate)

    # Peak-normalize; the clamp avoids dividing by zero on all-silent input.
    waveform = waveform / waveform.abs().max().clamp(min=1e-8)

    # Remember the true length so the zero padding can be trimmed afterwards.
    total_samples = waveform.shape[1]
    pad_len = (chunk_size - total_samples % chunk_size) % chunk_size
    if pad_len > 0:
        waveform = torch.nn.functional.pad(waveform, (0, pad_len))

    recon_chunks = []
    with torch.no_grad():
        for start in range(0, waveform.shape[1], chunk_size):
            chunk = waveform[:, start:start + chunk_size].unsqueeze(0).to(device)
            recon, _, _, _ = model(chunk)
            # Keep at most chunk_size samples so chunks line up when joined.
            recon = recon[..., :chunk_size]
            recon_chunks.append(recon.cpu())

    recon_full = torch.cat(recon_chunks, dim=-1)
    # Drop the leading (presumably batch) dim and the padding samples.
    recon_full = recon_full[0, :, :total_samples]

    sf.write(output_path, recon_full[0].float().numpy(), sample_rate)
    print(f"saved: {output_path} ({total_samples / sample_rate:.2f}s)")
def main():
    """Command-line entry point: parse arguments, load the codec, reconstruct.

    ``--input`` is required; ``--output`` defaults to ``reconstructed.wav``;
    ``--device`` defaults to ``cuda`` when available, otherwise ``cpu``.
    The sample rate and chunk size come from the downloaded model config.
    """
    if torch.cuda.is_available():
        default_device = "cuda"
    else:
        default_device = "cpu"

    cli = argparse.ArgumentParser(description="nano-codec inference")
    cli.add_argument("--input", required=True, help="input wav file")
    cli.add_argument("--output", default="reconstructed.wav", help="output wav file")
    cli.add_argument("--device", default=default_device)
    opts = cli.parse_args()

    codec, cfg = load_model(device=opts.device)

    target_rate = cfg['sample_rate']
    samples_per_chunk = cfg['chunk_size']
    reconstruct(
        codec,
        opts.input,
        opts.output,
        sample_rate=target_rate,
        chunk_size=samples_per_chunk,
        device=opts.device,
    )


if __name__ == "__main__":
    main()
|