Spaces:

humair025
/

neucodec

Sleeping

App Files Files Community

neucodec / app.py

humair025

Update app.py

fc1def1 verified 28 days ago

raw

history blame contribute delete

2.56 kB

	import subprocess
	import sys
	import time

	# Auto-install neucodec if missing, so the app can start in an environment
	# where the package was not pre-installed.
	try:
	    import neucodec  # noqa: F401 -- imported only to probe availability
	except ImportError:
	    import importlib
	    print("Installing neucodec...")
	    # Install with the interpreter running this script so the package lands
	    # in the same environment; check_call raises CalledProcessError on failure.
	    subprocess.check_call([sys.executable, "-m", "pip", "install", "neucodec"])
	    # The package appeared after the interpreter started: drop the import
	    # system's cached directory listings so the later
	    # `from neucodec import ...` can find it.
	    importlib.invalidate_caches()

	# Other imports
	import gradio as gr
	import torch
	import torchaudio
	from torchaudio import transforms as T
	from neucodec import DistillNeuCodec
	import librosa
	import soundfile as sf
	import numpy as np

	# Load the pretrained "neuphonic/distill-neucodec" checkpoint once, at import
	# time, so reconstruct_audio() reuses this single module-level instance.
	# No device is selected explicitly here.
	# NOTE(review): presumably the model therefore stays on CPU -- confirm against
	# neucodec's defaults.
	model = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec")
	model.eval() # switch to evaluation mode; the model is only used for inference

	# Sample rates used by reconstruct_audio(): the input is resampled to 16 kHz
	# before encoding, and the decoded waveform is written out as 24 kHz audio.
	_ENCODE_SR = 16000
	_DECODE_SR = 24000


	def reconstruct_audio(audio_file):
	    """Round-trip an audio file through the codec and save the result.

	    The file is loaded as mono, resampled to 16 kHz when needed, encoded to
	    codes with ``model.encode_code`` and decoded back with
	    ``model.decode_code``. The decoded waveform is written to a fresh
	    temporary WAV file at 24 kHz.

	    Args:
	        audio_file: Path of the audio file to reconstruct, as supplied by the
	            Gradio ``Audio`` input (``type="filepath"``). ``None`` or empty
	            when nothing was uploaded.

	    Returns:
	        A ``(recon_path, info)`` tuple: the path of the reconstructed WAV
	        file and a one-line summary holding the code tensor shape and the
	        processing time in seconds.

	    Raises:
	        gr.Error: If no file was provided or the file contains no samples.
	    """
	    import tempfile

	    # Gradio passes None when the user submits without uploading anything;
	    # fail with a readable message instead of a traceback from librosa.
	    if not audio_file:
	        raise gr.Error("Please upload an audio file first.")

	    # perf_counter is monotonic, so the measured duration cannot be skewed
	    # by system clock adjustments.
	    start_time = time.perf_counter()

	    # sr=None keeps the file's native sample rate so it can be reported.
	    y, orig_sr = librosa.load(audio_file, sr=None, mono=True)
	    orig_len = len(y)
	    if orig_len == 0:
	        raise gr.Error("The uploaded audio file contains no samples.")

	    # Resample to the encoder's input rate only when the file differs.
	    sr = orig_sr
	    if sr != _ENCODE_SR:
	        y = librosa.resample(y, orig_sr=sr, target_sr=_ENCODE_SR)
	        sr = _ENCODE_SR

	    # Add two leading singleton axes: (T,) -> (1, 1, T).
	    y_tensor = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)

	    # Inference only: skip autograd bookkeeping.
	    with torch.no_grad():
	        fsq_codes = model.encode_code(y_tensor)
	        recon = model.decode_code(fsq_codes)

	    recon = recon.squeeze().cpu().numpy()

	    # Write to a unique file per call. A fixed filename would let concurrent
	    # or successive requests overwrite each other's output.
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	        recon_path = tmp.name
	    sf.write(recon_path, recon, _DECODE_SR)

	    elapsed_time = time.perf_counter() - start_time

	    metadata = {
	        "original_sr": orig_sr,
	        "original_length_samples": orig_len,
	        "resampled_sr": sr,
	        "reconstructed_sr": _DECODE_SR,
	        "num_tokens": fsq_codes.shape,
	        "processing_time_sec": round(elapsed_time, 3),
	        "input_file": audio_file,
	        "output_file": recon_path,
	    }

	    # Log the details server-side; the UI only gets the short summary below.
	    print("\n=== Audio Reconstruction Info ===")
	    for k, v in metadata.items():
	        print(f"{k}: {v}")

	    # Return both the reconstructed file and a summary string for Gradio.
	    return recon_path, f"Tokens: {fsq_codes.shape}, Processing time: {elapsed_time:.3f}s"

	# Gradio interface: one audio upload in, the reconstructed audio plus a
	# text summary out. Components are built first, then wired to the handler.
	_audio_in = gr.Audio(type="filepath", label="Upload Audio")
	_result_components = [
	    gr.Audio(type="filepath", label="Reconstructed Audio"),
	    gr.Textbox(label="Info"),
	]
	iface = gr.Interface(
	    fn=reconstruct_audio,
	    inputs=_audio_in,
	    outputs=_result_components,
	    title="Audio Reconstruction with DistillNeuCodec (CPU + Librosa)",
	    description=(
	        "Upload any audio file, and this app will reconstruct it using "
	        "DistillNeuCodec at 24kHz on CPU. "
	        "Metadata and token info are also displayed."
	    ),
	)

	# Start the web server only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()