# LinaCodec / app.py
# Uploaded by humair025 — commit 9dcc975 (verified)
import gradio as gr
import torch
import numpy as np
import torchaudio
import tempfile
import os
# Patch LinaCodec to work on CPU
print("Setting up LinaCodec for CPU...")
# Import and patch before initializing
from linacodec.tokenizer import LinaCodecModel
from huggingface_hub import hf_hub_download
import torch.nn as nn
class CPULinaCodec:
    """Device-agnostic wrapper around LinaCodecModel.

    Downloads the model config and weights from the Hugging Face Hub on
    construction, places the model on CUDA when available (CPU otherwise),
    and exposes file-path-based encode / decode / voice-conversion helpers.
    """

    # Sample rate the model consumes; all inputs are resampled to this.
    _INPUT_SR = 24000

    def __init__(self):
        print("Loading LinaCodec model on CPU...")
        # Fetch model files from the Hub (cached locally after the first run).
        repo_id = "YatharthS/LinaCodec"
        config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml")
        weights_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")
        # Prefer CUDA when present; otherwise fall back to CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")
        self.model = LinaCodecModel.from_pretrained(
            config_path=config_path,
            weights_path=weights_path
        ).eval()
        # Move to the selected device.
        self.model = self.model.to(self.device)
        # Sample rate reported for decoded audio (used by the UI).
        self.sample_rate = 48000
        print(f"Model loaded successfully on {self.device}!")

    def _load_audio(self, audio_path):
        """Load *audio_path* as a mono tensor at the model input rate.

        Returns a (1, samples) float tensor on ``self.device``, resampled to
        24 kHz if needed and down-mixed to mono by channel averaging.
        """
        wav, sr = torchaudio.load(audio_path)
        wav = wav.to(self.device)
        if sr != self._INPUT_SR:
            resampler = torchaudio.transforms.Resample(sr, self._INPUT_SR).to(self.device)
            wav = resampler(wav)
        # Down-mix multi-channel audio to mono.
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        return wav

    def encode(self, audio_path):
        """Encode an audio file into (speech tokens, global embedding)."""
        wav = self._load_audio(audio_path)
        with torch.no_grad():
            codes, embedding = self.model.encode(wav.unsqueeze(0))
        return codes, embedding

    def decode(self, codes, embedding):
        """Decode tokens and a global embedding back into a waveform tensor."""
        with torch.no_grad():
            wav = self.model.decode(codes, embedding)
        return wav.squeeze(0)

    def convert_voice(self, source_path, reference_path):
        """Return audio with content from *source_path* and timbre from *reference_path*."""
        source_wav = self._load_audio(source_path)
        ref_wav = self._load_audio(reference_path)
        with torch.no_grad():
            # Content tokens come from the source clip...
            source_codes, _ = self.model.encode(source_wav.unsqueeze(0))
            # ...while the speaker/timbre embedding comes from the reference clip.
            _, ref_embedding = self.model.encode(ref_wav.unsqueeze(0))
            # Recombine: source content rendered with the reference voice.
            converted_wav = self.model.decode(source_codes, ref_embedding)
        return converted_wav.squeeze(0)
# Initialize the CPU-compatible model
# Module-level singleton shared by both Gradio callbacks below; loading
# happens once at import/startup time.
lina_tokenizer = CPULinaCodec()
def encode_decode_audio(audio_input):
    """Round-trip an audio clip through LinaCodec (encode then decode).

    Args:
        audio_input: Gradio ``type="numpy"`` audio — a ``(sample_rate,
            np.ndarray)`` tuple, or ``None`` when nothing was uploaded.

    Returns:
        ``((48000, decoded_ndarray), info_str)`` on success, or
        ``(None, message)`` on missing input or any error.
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file."
        # audio_input is a tuple (sample_rate, audio_data)
        sr, audio_data = audio_input
        # Normalize integer PCM to float32 in [-1, 1).
        if audio_data.dtype == np.int16:
            audio_data = audio_data.astype(np.float32) / 32768.0
        elif audio_data.dtype == np.int32:
            audio_data = audio_data.astype(np.float32) / 2147483648.0
        # torchaudio expects (channels, samples); Gradio provides
        # (samples,) for mono or (samples, channels) for multi-channel.
        if len(audio_data.shape) == 1:
            audio_tensor = torch.FloatTensor(audio_data).unsqueeze(0)
        else:
            audio_tensor = torch.FloatTensor(audio_data.T)
        # Write to a temp wav because the codec wrapper takes file paths.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            temp_path = tmp.name
        try:
            torchaudio.save(temp_path, audio_tensor, sr)
            # Encode to tokens + embedding, then reconstruct.
            speech_tokens, global_embedding = lina_tokenizer.encode(temp_path)
            decoded_audio = lina_tokenizer.decode(speech_tokens, global_embedding)
        finally:
            # Remove the temp file even if encode/decode raises,
            # so failed requests don't leak files in /tmp.
            os.unlink(temp_path)
        # Convert to numpy for Gradio playback.
        decoded_audio = decoded_audio.cpu().squeeze().numpy()
        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        info = f"βœ… Success!\n"
        info += f"Device: {device_info}\n"
        info += f"Original sample rate: {sr} Hz\n"
        info += f"Output sample rate: 48000 Hz\n"
        info += f"Speech tokens shape: {speech_tokens.shape}\n"
        info += f"Global embedding shape: {global_embedding.shape}"
        return (48000, decoded_audio), info
    except Exception as e:
        # Surface the error in the UI instead of crashing the app.
        return None, f"❌ Error: {str(e)}"
def voice_conversion(source_audio, reference_audio):
    """Convert voice: content from *source_audio*, timbre from *reference_audio*.

    Args:
        source_audio: Gradio ``(sample_rate, np.ndarray)`` tuple or ``None``.
        reference_audio: Gradio ``(sample_rate, np.ndarray)`` tuple or ``None``.

    Returns:
        ``((48000, converted_ndarray), info_str)`` on success, or
        ``(None, message)`` on missing input or any error.
    """

    def _save_temp(audio_tuple, suffix):
        # Normalize a Gradio (sr, ndarray) tuple and write it to a temp
        # wav file; returns the file path (caller is responsible for cleanup).
        sr, data = audio_tuple
        if data.dtype == np.int16:
            data = data.astype(np.float32) / 32768.0
        elif data.dtype == np.int32:
            data = data.astype(np.float32) / 2147483648.0
        # torchaudio expects (channels, samples).
        if len(data.shape) == 1:
            tensor = torch.FloatTensor(data).unsqueeze(0)
        else:
            tensor = torch.FloatTensor(data.T)
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
            path = tmp.name
        torchaudio.save(path, tensor, sr)
        return path

    try:
        if source_audio is None or reference_audio is None:
            return None, "Please upload both source and reference audio files."
        sr_source = source_audio[0]
        sr_ref = reference_audio[0]
        temp_paths = []
        try:
            temp_paths.append(_save_temp(source_audio, '_source.wav'))
            temp_paths.append(_save_temp(reference_audio, '_ref.wav'))
            converted_audio = lina_tokenizer.convert_voice(temp_paths[0], temp_paths[1])
        finally:
            # Remove temp files even when conversion raises, so failed
            # requests don't leak files in /tmp.
            for path in temp_paths:
                os.unlink(path)
        # Convert to numpy for Gradio playback.
        converted_audio = converted_audio.cpu().squeeze().numpy()
        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        info = f"βœ… Voice conversion successful!\n"
        info += f"Device: {device_info}\n"
        info += f"Source sample rate: {sr_source} Hz\n"
        info += f"Reference sample rate: {sr_ref} Hz\n"
        info += f"Output sample rate: 48000 Hz\n"
        info += f"Content taken from source, timbre/style from reference"
        return (48000, converted_audio), info
    except Exception as e:
        # Surface the error in the UI instead of crashing the app.
        return None, f"❌ Error: {str(e)}"
# Create Gradio interface
# Two tabs: a codec round-trip demo and a voice-conversion demo, each wired
# to the callbacks defined above.
with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
# 🎡 LinaCodec Audio Tool
**LinaCodec** is a neural audio codec for high-quality speech compression and voice conversion.
### Features:
- πŸ”„ **Encode & Decode**: Compress and reconstruct audio at 48kHz
- 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
- πŸ’» **CPU Compatible**: Works on both CPU and GPU
""")
    with gr.Tabs():
        # Tab 1: Encode/Decode
        with gr.Tab("πŸ”„ Encode & Decode"):
            gr.Markdown("""
Upload an audio file to encode it into speech tokens and then decode it back.
This demonstrates the codec's compression and reconstruction capabilities.
""")
            with gr.Row():
                with gr.Column():
                    # Left column: input audio + trigger button.
                    audio_input = gr.Audio(
                        label="Upload Audio",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    encode_btn = gr.Button("πŸš€ Encode & Decode", variant="primary")
                with gr.Column():
                    # Right column: reconstructed audio + status text.
                    audio_output = gr.Audio(label="Decoded Audio")
                    info_output = gr.Textbox(label="Info", lines=6)
            # Wire the button to the encode/decode round-trip callback.
            encode_btn.click(
                fn=encode_decode_audio,
                inputs=[audio_input],
                outputs=[audio_output, info_output]
            )
            # No bundled example clips; users upload their own.
            gr.Examples(
                examples=[],
                inputs=[audio_input],
                label="Examples (upload your own audio)"
            )
        # Tab 2: Voice Conversion
        with gr.Tab("🎭 Voice Conversion"):
            gr.Markdown("""
Convert voice by taking content from **source audio** and timbre/style from **reference audio**.
- **Source**: The speech content you want to keep
- **Reference**: The voice style/timbre you want to apply
""")
            with gr.Row():
                with gr.Column():
                    # Left column: the two inputs + trigger button.
                    source_input = gr.Audio(
                        label="Source Audio (Content)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    reference_input = gr.Audio(
                        label="Reference Audio (Timbre/Style)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    convert_btn = gr.Button("✨ Convert Voice", variant="primary")
                with gr.Column():
                    # Right column: converted audio + status text.
                    converted_output = gr.Audio(label="Converted Audio")
                    convert_info = gr.Textbox(label="Info", lines=6)
            # Wire the button to the voice-conversion callback.
            convert_btn.click(
                fn=voice_conversion,
                inputs=[source_input, reference_input],
                outputs=[converted_output, convert_info]
            )
    # Footer shown below both tabs.
    gr.Markdown("""
---
### πŸ“š About LinaCodec
LinaCodec is a neural audio codec designed for high-quality speech compression and voice conversion.
It encodes audio into discrete tokens and a global embedding, enabling efficient storage and manipulation of speech.
**Model**: [YatharthS/LinaCodec](https://huggingface.co/YatharthS/LinaCodec)
### βš™οΈ Technical Details
- Output sample rate: 48 kHz
- Supports various input formats
- Neural compression with high reconstruction quality
- Works on both CPU and GPU (GPU recommended for faster processing)
""")
# Launch the app
if __name__ == "__main__":
    # Start the Gradio server only when run as a script (not on import).
    demo.launch()