|
|
import gradio as gr |
|
|
import torch |
|
|
import numpy as np |
|
|
import torchaudio |
|
|
import tempfile |
|
|
import os |
|
|
|
|
|
|
|
|
# Startup banner; note model weights are fetched at import time further below.
print("Setting up LinaCodec for CPU...")
|
|
|
|
|
|
|
|
from linacodec.tokenizer import LinaCodecModel |
|
|
from huggingface_hub import hf_hub_download |
|
|
import torch.nn as nn |
|
|
|
|
|
class CPULinaCodec:
    """Device-aware wrapper around LinaCodec.

    Despite the name it prefers CUDA when available and falls back to CPU.
    Exposes encode / decode / convert_voice over file paths.
    """

    def __init__(self):
        """Download config + weights from the Hugging Face Hub and load the
        model in eval mode on the selected device."""
        print("Loading LinaCodec model on CPU...")

        repo_id = "YatharthS/LinaCodec"
        config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml")
        weights_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")

        # Prefer the GPU when present; every tensor below is moved here.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        self.model = LinaCodecModel.from_pretrained(
            config_path=config_path,
            weights_path=weights_path
        ).eval()
        self.model = self.model.to(self.device)

        # Decoder output rate; inputs are resampled to 24 kHz before encoding.
        self.sample_rate = 48000
        print(f"Model loaded successfully on {self.device}!")

    def _load_mono_24k(self, audio_path):
        """Load *audio_path*, move it to ``self.device``, resample to 24 kHz
        if needed and downmix multi-channel audio to mono.

        Returns a (1, samples) float tensor ready for ``model.encode``.
        (Shared by encode() and convert_voice(), which previously duplicated
        this preprocessing.)
        """
        wav, sr = torchaudio.load(audio_path)
        wav = wav.to(self.device)
        # Model expects 24 kHz input — TODO confirm against LinaCodec config.
        if sr != 24000:
            wav = torchaudio.transforms.Resample(sr, 24000).to(self.device)(wav)
        # Average channels down to mono.
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        return wav

    def encode(self, audio_path):
        """Encode an audio file into (codes, embedding).

        ``codes`` are the discrete speech tokens; ``embedding`` is the global
        (speaker/timbre) embedding produced by the model.
        """
        wav = self._load_mono_24k(audio_path)
        with torch.no_grad():
            codes, embedding = self.model.encode(wav.unsqueeze(0))
        return codes, embedding

    def decode(self, codes, embedding):
        """Decode tokens and a global embedding back to a waveform tensor."""
        with torch.no_grad():
            wav = self.model.decode(codes, embedding)
        return wav.squeeze(0)

    def convert_voice(self, source_path, reference_path):
        """Voice conversion: content tokens from *source_path* decoded with
        the global embedding of *reference_path* (timbre transfer)."""
        source_wav = self._load_mono_24k(source_path)
        ref_wav = self._load_mono_24k(reference_path)

        with torch.no_grad():
            # Content from the source, timbre from the reference.
            source_codes, _ = self.model.encode(source_wav.unsqueeze(0))
            _, ref_embedding = self.model.encode(ref_wav.unsqueeze(0))
            converted_wav = self.model.decode(source_codes, ref_embedding)

        return converted_wav.squeeze(0)
|
|
|
|
|
|
|
|
# Global singleton created at import time (downloads weights on first run).
lina_tokenizer = CPULinaCodec()
|
|
|
|
|
def encode_decode_audio(audio_input):
    """Encode and decode audio to demonstrate compression.

    Args:
        audio_input: Gradio ``type="numpy"`` value — an ``(sr, ndarray)``
            tuple, or None when nothing was uploaded.

    Returns:
        ``((48000, ndarray), info_str)`` on success, ``(None, error_str)``
        otherwise. Errors are reported in the UI rather than raised.
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file."

        sr, audio_data = audio_input

        # Gradio may hand back integer PCM; normalize to float32 in [-1, 1].
        if audio_data.dtype == np.int16:
            audio_data = audio_data.astype(np.float32) / 32768.0
        elif audio_data.dtype == np.int32:
            audio_data = audio_data.astype(np.float32) / 2147483648.0

        # (samples,) -> (1, samples); (samples, channels) -> (channels, samples)
        if audio_data.ndim == 1:
            audio_tensor = torch.FloatTensor(audio_data).unsqueeze(0)
        else:
            audio_tensor = torch.FloatTensor(audio_data.T)

        # The codec API takes file paths, so round-trip through a temp WAV.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            temp_path = tmp.name
        try:
            torchaudio.save(temp_path, audio_tensor, sr)
            speech_tokens, global_embedding = lina_tokenizer.encode(temp_path)
            decoded_audio = lina_tokenizer.decode(speech_tokens, global_embedding)
        finally:
            # Clean up even if encode/decode raises (previously leaked).
            os.unlink(temp_path)

        decoded_audio = decoded_audio.cpu().squeeze().numpy()

        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        # NOTE(review): the original literal contained a mojibake-garbled
        # checkmark (its NEL byte split the f-string); restored to a clean glyph.
        info = (
            "✅ Success!\n"
            f"Device: {device_info}\n"
            f"Original sample rate: {sr} Hz\n"
            "Output sample rate: 48000 Hz\n"
            f"Speech tokens shape: {speech_tokens.shape}\n"
            f"Global embedding shape: {global_embedding.shape}"
        )

        return (48000, decoded_audio), info

    except Exception as e:
        return None, f"❌ Error: {str(e)}"
|
|
|
|
|
def voice_conversion(source_audio, reference_audio):
    """Convert voice using source content and reference timbre.

    Args:
        source_audio: Gradio ``(sr, ndarray)`` tuple — the speech content.
        reference_audio: Gradio ``(sr, ndarray)`` tuple — the target timbre.

    Returns:
        ``((48000, ndarray), info_str)`` on success, ``(None, error_str)``
        otherwise. Errors are reported in the UI rather than raised.
    """
    try:
        if source_audio is None or reference_audio is None:
            return None, "Please upload both source and reference audio files."

        def _to_temp_wav(audio, suffix):
            """Normalize a Gradio (sr, ndarray) tuple and write it to a
            temp WAV file; return (path, sample_rate). Caller must unlink."""
            rate, data = audio
            # Integer PCM -> float32 in [-1, 1].
            if data.dtype == np.int16:
                data = data.astype(np.float32) / 32768.0
            elif data.dtype == np.int32:
                data = data.astype(np.float32) / 2147483648.0
            # (samples,) -> (1, samples); (samples, ch) -> (ch, samples)
            if data.ndim == 1:
                tensor = torch.FloatTensor(data).unsqueeze(0)
            else:
                tensor = torch.FloatTensor(data.T)
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                path = tmp.name
            torchaudio.save(path, tensor, rate)
            return path, rate

        source_path = ref_path = None
        try:
            source_path, sr_source = _to_temp_wav(source_audio, '_source.wav')
            ref_path, sr_ref = _to_temp_wav(reference_audio, '_ref.wav')
            converted_audio = lina_tokenizer.convert_voice(source_path, ref_path)
        finally:
            # Clean up both temp files even on failure (previously leaked).
            for path in (source_path, ref_path):
                if path and os.path.exists(path):
                    os.unlink(path)

        converted_audio = converted_audio.cpu().squeeze().numpy()

        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        # NOTE(review): restored the mojibake-garbled checkmark glyph here.
        info = (
            "✅ Voice conversion successful!\n"
            f"Device: {device_info}\n"
            f"Source sample rate: {sr_source} Hz\n"
            f"Reference sample rate: {sr_ref} Hz\n"
            "Output sample rate: 48000 Hz\n"
            "Content taken from source, timbre/style from reference"
        )

        return (48000, converted_audio), info

    except Exception as e:
        return None, f"❌ Error: {str(e)}"
|
|
|
|
|
|
|
|
# Build the Gradio UI. NOTE(review): emoji throughout this section were
# mojibake-garbled in the original (UTF-8 read as a single-byte codepage);
# restored to plausible glyphs — confirm against the original intent.
with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎵 LinaCodec Audio Tool

    **LinaCodec** is a neural audio codec for high-quality speech compression and voice conversion.

    ### Features:
    - 🔄 **Encode & Decode**: Compress and reconstruct audio at 48kHz
    - 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
    - 💻 **CPU Compatible**: Works on both CPU and GPU
    """)

    with gr.Tabs():

        # --- Tab 1: round-trip compression demo -------------------------
        with gr.Tab("🔄 Encode & Decode"):
            gr.Markdown("""
            Upload an audio file to encode it into speech tokens and then decode it back.
            This demonstrates the codec's compression and reconstruction capabilities.
            """)

            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        label="Upload Audio",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    encode_btn = gr.Button("🔄 Encode & Decode", variant="primary")

                with gr.Column():
                    audio_output = gr.Audio(label="Decoded Audio")
                    info_output = gr.Textbox(label="Info", lines=6)

            encode_btn.click(
                fn=encode_decode_audio,
                inputs=[audio_input],
                outputs=[audio_output, info_output]
            )

            # Placeholder example gallery (no bundled audio files).
            gr.Examples(
                examples=[],
                inputs=[audio_input],
                label="Examples (upload your own audio)"
            )

        # --- Tab 2: timbre-transfer demo --------------------------------
        with gr.Tab("🎭 Voice Conversion"):
            gr.Markdown("""
            Convert voice by taking content from **source audio** and timbre/style from **reference audio**.

            - **Source**: The speech content you want to keep
            - **Reference**: The voice style/timbre you want to apply
            """)

            with gr.Row():
                with gr.Column():
                    source_input = gr.Audio(
                        label="Source Audio (Content)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    reference_input = gr.Audio(
                        label="Reference Audio (Timbre/Style)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    convert_btn = gr.Button("✨ Convert Voice", variant="primary")

                with gr.Column():
                    converted_output = gr.Audio(label="Converted Audio")
                    convert_info = gr.Textbox(label="Info", lines=6)

            convert_btn.click(
                fn=voice_conversion,
                inputs=[source_input, reference_input],
                outputs=[converted_output, convert_info]
            )

    gr.Markdown("""
    ---
    ### 📖 About LinaCodec

    LinaCodec is a neural audio codec designed for high-quality speech compression and voice conversion.
    It encodes audio into discrete tokens and a global embedding, enabling efficient storage and manipulation of speech.

    **Model**: [YatharthS/LinaCodec](https://huggingface.co/YatharthS/LinaCodec)

    ### ⚙️ Technical Details
    - Output sample rate: 48 kHz
    - Supports various input formats
    - Neural compression with high reconstruction quality
    - Works on both CPU and GPU (GPU recommended for faster processing)
    """)
|
|
|
|
|
|
|
|
# Launch the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()