"""Gradio demo app for LinaCodec: encode/decode round-trips and voice conversion.

Loads the LinaCodec model once at import time (downloads weights from the
Hugging Face Hub) and exposes two tabs: a compression round-trip demo and a
voice-conversion demo (content from a source clip, timbre from a reference).
"""

import gradio as gr
import torch
import numpy as np
import torchaudio
import tempfile
import os

# Patch LinaCodec to work on CPU
print("Setting up LinaCodec for CPU...")

# Import and patch before initializing
from linacodec.tokenizer import LinaCodecModel
from huggingface_hub import hf_hub_download
import torch.nn as nn

# Sample rate the model consumes on input. The decoder emits audio at
# 48 kHz (see OUTPUT_SAMPLE_RATE) — presumably LinaCodec upsamples
# 24 kHz input to 48 kHz output; TODO confirm against the model card.
MODEL_INPUT_SR = 24000
OUTPUT_SAMPLE_RATE = 48000


class CPULinaCodec:
    """CPU-compatible wrapper for LinaCodec.

    Despite the name, this wrapper uses CUDA when available and falls back
    to CPU otherwise (the name is kept for backward compatibility).
    """

    def __init__(self):
        """Download the model files from the Hub and load them onto the best device."""
        print("Loading LinaCodec model on CPU...")

        # Download model files
        repo_id = "YatharthS/LinaCodec"
        config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml")
        weights_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")

        # Load model on CPU instead of CUDA
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        self.model = LinaCodecModel.from_pretrained(
            config_path=config_path,
            weights_path=weights_path,
        ).eval()

        # Move to appropriate device
        self.model = self.model.to(self.device)
        self.sample_rate = OUTPUT_SAMPLE_RATE
        print(f"Model loaded successfully on {self.device}!")

    def _load_mono_model_rate(self, audio_path):
        """Load *audio_path*, resample to the model input rate, and mix down to mono.

        Returns a (1, samples) tensor on ``self.device``. Resampling happens
        before the mono mixdown, matching the original processing order.
        """
        wav, sr = torchaudio.load(audio_path)
        wav = wav.to(self.device)
        if sr != MODEL_INPUT_SR:
            resampler = torchaudio.transforms.Resample(sr, MODEL_INPUT_SR).to(self.device)
            wav = resampler(wav)
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)
        return wav

    def encode(self, audio_path):
        """Encode an audio file into (codes, global embedding)."""
        wav = self._load_mono_model_rate(audio_path)
        with torch.no_grad():
            codes, embedding = self.model.encode(wav.unsqueeze(0))
        return codes, embedding

    def decode(self, codes, embedding):
        """Decode tokens and a global embedding back to a waveform tensor."""
        with torch.no_grad():
            wav = self.model.decode(codes, embedding)
        return wav.squeeze(0)

    def convert_voice(self, source_path, reference_path):
        """Convert voice: content (codes) from source, timbre (embedding) from reference."""
        source_wav = self._load_mono_model_rate(source_path)
        ref_wav = self._load_mono_model_rate(reference_path)

        with torch.no_grad():
            # Encode source for content
            source_codes, _ = self.model.encode(source_wav.unsqueeze(0))
            # Encode reference for timbre
            _, ref_embedding = self.model.encode(ref_wav.unsqueeze(0))
            # Decode with source codes but reference embedding
            converted_wav = self.model.decode(source_codes, ref_embedding)

        return converted_wav.squeeze(0)


# Initialize the CPU-compatible model
lina_tokenizer = CPULinaCodec()


def _to_float_tensor(audio_data):
    """Convert a raw Gradio numpy waveform to a float32 (channels, samples) tensor.

    Integer PCM (int16/int32) is normalized to [-1, 1); float input is passed
    through unchanged. 1-D input is treated as mono.
    """
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0
    elif audio_data.dtype == np.int32:
        audio_data = audio_data.astype(np.float32) / 2147483648.0
    if len(audio_data.shape) == 1:
        return torch.FloatTensor(audio_data).unsqueeze(0)
    # Gradio gives (samples, channels); torchaudio wants (channels, samples).
    return torch.FloatTensor(audio_data.T)


def _save_temp_wav(audio_input, suffix='.wav'):
    """Write a Gradio ``(sample_rate, numpy_array)`` tuple to a temp WAV file.

    Returns ``(path, sample_rate)``. The caller owns the file and must unlink it.
    """
    sr, audio_data = audio_input
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        temp_path = tmp.name
    torchaudio.save(temp_path, _to_float_tensor(audio_data), sr)
    return temp_path, sr


def encode_decode_audio(audio_input):
    """Encode and decode audio to demonstrate compression.

    Args:
        audio_input: Gradio ``(sample_rate, numpy_array)`` tuple, or None.

    Returns:
        ``((48000, numpy_waveform), info_text)`` on success,
        ``(None, error_text)`` on failure.
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file."

        temp_path, sr = _save_temp_wav(audio_input)
        try:
            # Encode, then decode the tokens straight back.
            speech_tokens, global_embedding = lina_tokenizer.encode(temp_path)
            decoded_audio = lina_tokenizer.decode(speech_tokens, global_embedding)
        finally:
            # Always clean up, even when encode/decode raises (the original
            # leaked the temp file on error).
            os.unlink(temp_path)

        # Convert to numpy for Gradio
        decoded_audio = decoded_audio.cpu().squeeze().numpy()

        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        info = f"✅ Success!\n"
        info += f"Device: {device_info}\n"
        info += f"Original sample rate: {sr} Hz\n"
        info += f"Output sample rate: 48000 Hz\n"
        info += f"Speech tokens shape: {speech_tokens.shape}\n"
        info += f"Global embedding shape: {global_embedding.shape}"

        return (OUTPUT_SAMPLE_RATE, decoded_audio), info

    except Exception as e:
        # Boundary handler: surface the error in the UI rather than crashing.
        return None, f"❌ Error: {str(e)}"


def voice_conversion(source_audio, reference_audio):
    """Convert voice using source content and reference timbre.

    Args:
        source_audio: Gradio audio tuple supplying the speech content.
        reference_audio: Gradio audio tuple supplying the voice timbre/style.

    Returns:
        ``((48000, numpy_waveform), info_text)`` on success,
        ``(None, error_text)`` on failure.
    """
    try:
        if source_audio is None or reference_audio is None:
            return None, "Please upload both source and reference audio files."

        source_path, sr_source = _save_temp_wav(source_audio, suffix='_source.wav')
        try:
            ref_path, sr_ref = _save_temp_wav(reference_audio, suffix='_ref.wav')
            try:
                converted_audio = lina_tokenizer.convert_voice(source_path, ref_path)
            finally:
                os.unlink(ref_path)
        finally:
            # Nested finally blocks guarantee both temp files are removed
            # regardless of where a failure occurs.
            os.unlink(source_path)

        # Convert to numpy
        converted_audio = converted_audio.cpu().squeeze().numpy()

        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        info = f"✅ Voice conversion successful!\n"
        info += f"Device: {device_info}\n"
        info += f"Source sample rate: {sr_source} Hz\n"
        info += f"Reference sample rate: {sr_ref} Hz\n"
        info += f"Output sample rate: 48000 Hz\n"
        info += f"Content taken from source, timbre/style from reference"

        return (OUTPUT_SAMPLE_RATE, converted_audio), info

    except Exception as e:
        # Boundary handler: surface the error in the UI rather than crashing.
        return None, f"❌ Error: {str(e)}"


# Create Gradio interface
with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🎵 LinaCodec Audio Tool

    **LinaCodec** is a neural audio codec for high-quality speech compression and voice conversion.

    ### Features:
    - 🔄 **Encode & Decode**: Compress and reconstruct audio at 48kHz
    - 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
    - 💻 **CPU Compatible**: Works on both CPU and GPU
    """)

    with gr.Tabs():
        # Tab 1: Encode/Decode
        with gr.Tab("🔄 Encode & Decode"):
            gr.Markdown("""
            Upload an audio file to encode it into speech tokens and then decode it back.
            This demonstrates the codec's compression and reconstruction capabilities.
            """)

            with gr.Row():
                with gr.Column():
                    audio_input = gr.Audio(
                        label="Upload Audio",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    encode_btn = gr.Button("🚀 Encode & Decode", variant="primary")

                with gr.Column():
                    audio_output = gr.Audio(label="Decoded Audio")
                    info_output = gr.Textbox(label="Info", lines=6)

            encode_btn.click(
                fn=encode_decode_audio,
                inputs=[audio_input],
                outputs=[audio_output, info_output]
            )

            gr.Examples(
                examples=[],
                inputs=[audio_input],
                label="Examples (upload your own audio)"
            )

        # Tab 2: Voice Conversion
        with gr.Tab("🎭 Voice Conversion"):
            gr.Markdown("""
            Convert voice by taking content from **source audio** and timbre/style from **reference audio**.
            - **Source**: The speech content you want to keep
            - **Reference**: The voice style/timbre you want to apply
            """)

            with gr.Row():
                with gr.Column():
                    source_input = gr.Audio(
                        label="Source Audio (Content)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    reference_input = gr.Audio(
                        label="Reference Audio (Timbre/Style)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    convert_btn = gr.Button("✨ Convert Voice", variant="primary")

                with gr.Column():
                    converted_output = gr.Audio(label="Converted Audio")
                    convert_info = gr.Textbox(label="Info", lines=6)

            convert_btn.click(
                fn=voice_conversion,
                inputs=[source_input, reference_input],
                outputs=[converted_output, convert_info]
            )

    gr.Markdown("""
    ---
    ### 📚 About LinaCodec

    LinaCodec is a neural audio codec designed for high-quality speech compression and voice conversion.
    It encodes audio into discrete tokens and a global embedding, enabling efficient storage and manipulation of speech.

    **Model**: [YatharthS/LinaCodec](https://huggingface.co/YatharthS/LinaCodec)

    ### ⚙️ Technical Details
    - Output sample rate: 48 kHz
    - Supports various input formats
    - Neural compression with high reconstruction quality
    - Works on both CPU and GPU (GPU recommended for faster processing)
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()