|
|
import gradio as gr |
|
|
import torch |
|
|
import numpy as np |
|
|
import torchaudio |
|
|
import tempfile |
|
|
import os |
|
|
|
|
|
|
|
|
# Startup banner; note model weights are fetched at import time further below.
print("Setting up LinaCodec for CPU...")
|
|
|
|
|
|
|
|
from linacodec.tokenizer import LinaCodecModel |
|
|
from huggingface_hub import hf_hub_download |
|
|
import torch.nn as nn |
|
|
|
|
|
class CPULinaCodec:
    """Device-aware wrapper around LinaCodec.

    Despite the name it prefers CUDA when available and falls back to CPU.
    Exposes encode / decode / convert_voice over file paths.
    """

    def __init__(self):
        """Download config + weights from the Hugging Face Hub and load the
        model in eval mode on the selected device."""
        print("Loading LinaCodec model on CPU...")

        repo_id = "YatharthS/LinaCodec"
        config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml")
        weights_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")

        # Prefer the GPU when present; every tensor below is moved here.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        self.model = LinaCodecModel.from_pretrained(
            config_path=config_path,
            weights_path=weights_path
        ).eval()
        self.model = self.model.to(self.device)

        # Decoder output rate; inputs are resampled to 24 kHz before encoding.
        self.sample_rate = 48000
        print(f"Model loaded successfully on {self.device}!")

    def _load_mono_24k(self, audio_path):
        """Load *audio_path*, move it to ``self.device``, resample to 24 kHz
        if needed and downmix multi-channel audio to mono.

        Returns a (1, samples) float tensor ready for ``model.encode``.
        (Shared by encode() and convert_voice(), which previously duplicated
        this preprocessing.)
        """
        wav, sr = torchaudio.load(audio_path)
        wav = wav.to(self.device)
        # Model expects 24 kHz input — TODO confirm against LinaCodec config.
        if sr != 24000:
            wav = torchaudio.transforms.Resample(sr, 24000).to(self.device)(wav)
        # Average channels down to mono.
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        return wav

    def encode(self, audio_path):
        """Encode an audio file into (codes, embedding).

        ``codes`` are the discrete speech tokens; ``embedding`` is the global
        (speaker/timbre) embedding produced by the model.
        """
        wav = self._load_mono_24k(audio_path)
        with torch.no_grad():
            codes, embedding = self.model.encode(wav.unsqueeze(0))
        return codes, embedding

    def decode(self, codes, embedding):
        """Decode tokens and a global embedding back to a waveform tensor."""
        with torch.no_grad():
            wav = self.model.decode(codes, embedding)
        return wav.squeeze(0)

    def convert_voice(self, source_path, reference_path):
        """Voice conversion: content tokens from *source_path* decoded with
        the global embedding of *reference_path* (timbre transfer)."""
        source_wav = self._load_mono_24k(source_path)
        ref_wav = self._load_mono_24k(reference_path)

        with torch.no_grad():
            # Content from the source, timbre from the reference.
            source_codes, _ = self.model.encode(source_wav.unsqueeze(0))
            _, ref_embedding = self.model.encode(ref_wav.unsqueeze(0))
            converted_wav = self.model.decode(source_codes, ref_embedding)

        return converted_wav.squeeze(0)
|
|
|
|
|
|
|
|
# Global singleton created at import time (downloads weights on first run).
lina_tokenizer = CPULinaCodec()
|
|
|
|
|
def encode_decode_audio(audio_input):
    """Encode and decode audio to demonstrate compression.

    Args:
        audio_input: Gradio ``type="numpy"`` value — an ``(sr, ndarray)``
            tuple, or None when nothing was uploaded.

    Returns:
        ``((48000, ndarray), info_str)`` on success, ``(None, error_str)``
        otherwise. Errors are reported in the UI rather than raised.
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file."

        sr, audio_data = audio_input

        # Gradio may hand back integer PCM; normalize to float32 in [-1, 1].
        if audio_data.dtype == np.int16:
            audio_data = audio_data.astype(np.float32) / 32768.0
        elif audio_data.dtype == np.int32:
            audio_data = audio_data.astype(np.float32) / 2147483648.0

        # (samples,) -> (1, samples); (samples, channels) -> (channels, samples)
        if audio_data.ndim == 1:
            audio_tensor = torch.FloatTensor(audio_data).unsqueeze(0)
        else:
            audio_tensor = torch.FloatTensor(audio_data.T)

        # The codec API takes file paths, so round-trip through a temp WAV.
        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
            temp_path = tmp.name
        try:
            torchaudio.save(temp_path, audio_tensor, sr)
            speech_tokens, global_embedding = lina_tokenizer.encode(temp_path)
            decoded_audio = lina_tokenizer.decode(speech_tokens, global_embedding)
        finally:
            # Clean up even if encode/decode raises (previously leaked).
            os.unlink(temp_path)

        decoded_audio = decoded_audio.cpu().squeeze().numpy()

        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        # NOTE(review): the original literal contained a mojibake-garbled
        # checkmark (its NEL byte split the f-string); restored to a clean glyph.
        info = (
            "✅ Success!\n"
            f"Device: {device_info}\n"
            f"Original sample rate: {sr} Hz\n"
            "Output sample rate: 48000 Hz\n"
            f"Speech tokens shape: {speech_tokens.shape}\n"
            f"Global embedding shape: {global_embedding.shape}"
        )

        return (48000, decoded_audio), info

    except Exception as e:
        return None, f"❌ Error: {str(e)}"
|
|
|
|
|
def voice_conversion(source_audio, reference_audio):
    """Convert voice using source content and reference timbre.

    Args:
        source_audio: Gradio ``(sr, ndarray)`` tuple — the speech content.
        reference_audio: Gradio ``(sr, ndarray)`` tuple — the target timbre.

    Returns:
        ``((48000, ndarray), info_str)`` on success, ``(None, error_str)``
        otherwise. Errors are reported in the UI rather than raised.
    """
    try:
        if source_audio is None or reference_audio is None:
            return None, "Please upload both source and reference audio files."

        def _to_temp_wav(audio, suffix):
            """Normalize a Gradio (sr, ndarray) tuple and write it to a
            temp WAV file; return (path, sample_rate). Caller must unlink."""
            rate, data = audio
            # Integer PCM -> float32 in [-1, 1].
            if data.dtype == np.int16:
                data = data.astype(np.float32) / 32768.0
            elif data.dtype == np.int32:
                data = data.astype(np.float32) / 2147483648.0
            # (samples,) -> (1, samples); (samples, ch) -> (ch, samples)
            if data.ndim == 1:
                tensor = torch.FloatTensor(data).unsqueeze(0)
            else:
                tensor = torch.FloatTensor(data.T)
            with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
                path = tmp.name
            torchaudio.save(path, tensor, rate)
            return path, rate

        source_path = ref_path = None
        try:
            source_path, sr_source = _to_temp_wav(source_audio, '_source.wav')
            ref_path, sr_ref = _to_temp_wav(reference_audio, '_ref.wav')
            converted_audio = lina_tokenizer.convert_voice(source_path, ref_path)
        finally:
            # Clean up both temp files even on failure (previously leaked).
            for path in (source_path, ref_path):
                if path and os.path.exists(path):
                    os.unlink(path)

        converted_audio = converted_audio.cpu().squeeze().numpy()

        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        # NOTE(review): restored the mojibake-garbled checkmark glyph here.
        info = (
            "✅ Voice conversion successful!\n"
            f"Device: {device_info}\n"
            f"Source sample rate: {sr_source} Hz\n"
            f"Reference sample rate: {sr_ref} Hz\n"
            "Output sample rate: 48000 Hz\n"
            "Content taken from source, timbre/style from reference"
        )

        return (48000, converted_audio), info

    except Exception as e:
        return None, f"❌ Error: {str(e)}"
|
|
|
|
|
|
|
|
# Build the Gradio UI. NOTE(review): emoji throughout this section were
# mojibake-garbled in the original (UTF-8 read as a single-byte codepage);
# restored to plausible glyphs — confirm against the original intent.
with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎵 LinaCodec Audio Tool

    **LinaCodec** is a neural audio codec for high-quality speech compression and voice conversion.

    ### Features:
    - 🔄 **Encode & Decode**: Compress and reconstruct audio at 48kHz
    - 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
    - 💻 **CPU Compatible**: Works on both CPU and GPU
    """)

    with gr.Tabs():

        # --- Tab 1: round-trip compression demo -------------------------
        with gr.Tab("🔄 Encode & Decode"):
            gr.Markdown("""
            Upload an audio file to encode it into speech tokens and then decode it back.
            This demonstrates the codec's compression and reconstruction capabilities.
            """)

            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        label="Upload Audio",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    encode_btn = gr.Button("🔄 Encode & Decode", variant="primary")

                with gr.Column():
                    audio_output = gr.Audio(label="Decoded Audio")
                    info_output = gr.Textbox(label="Info", lines=6)

            encode_btn.click(
                fn=encode_decode_audio,
                inputs=[audio_input],
                outputs=[audio_output, info_output]
            )

            # Placeholder example gallery (no bundled audio files).
            gr.Examples(
                examples=[],
                inputs=[audio_input],
                label="Examples (upload your own audio)"
            )

        # --- Tab 2: timbre-transfer demo --------------------------------
        with gr.Tab("🎭 Voice Conversion"):
            gr.Markdown("""
            Convert voice by taking content from **source audio** and timbre/style from **reference audio**.

            - **Source**: The speech content you want to keep
            - **Reference**: The voice style/timbre you want to apply
            """)

            with gr.Row():
                with gr.Column():
                    source_input = gr.Audio(
                        label="Source Audio (Content)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    reference_input = gr.Audio(
                        label="Reference Audio (Timbre/Style)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    convert_btn = gr.Button("✨ Convert Voice", variant="primary")

                with gr.Column():
                    converted_output = gr.Audio(label="Converted Audio")
                    convert_info = gr.Textbox(label="Info", lines=6)

            convert_btn.click(
                fn=voice_conversion,
                inputs=[source_input, reference_input],
                outputs=[converted_output, convert_info]
            )

    gr.Markdown("""
    ---
    ### 📖 About LinaCodec

    LinaCodec is a neural audio codec designed for high-quality speech compression and voice conversion.
    It encodes audio into discrete tokens and a global embedding, enabling efficient storage and manipulation of speech.

    **Model**: [YatharthS/LinaCodec](https://huggingface.co/YatharthS/LinaCodec)

    ### ⚙️ Technical Details
    - Output sample rate: 48 kHz
    - Supports various input formats
    - Neural compression with high reconstruction quality
    - Works on both CPU and GPU (GPU recommended for faster processing)
    """)
|
|
|
|
|
|
|
|
# Launch the Gradio server only when run as a script, not on import.
if __name__ == "__main__":
    demo.launch()