Spaces:

humair025
/

LinaCodec

Runtime error

File size: 11,935 Bytes

import gradio as gr
import torch
import numpy as np
import torchaudio
import tempfile
import os

# Start-up banner. No monkey-patching happens in this file: device handling is
# done by the CPULinaCodec wrapper defined below, which picks CUDA or CPU itself.
print("Setting up LinaCodec for CPU...")

# Third-party pieces the wrapper relies on: the codec model class and the
# Hugging Face Hub helper used to fetch the config and weights.
# NOTE(review): `nn` is not referenced anywhere in the visible part of this file.
from linacodec.tokenizer import LinaCodecModel
from huggingface_hub import hf_hub_download
import torch.nn as nn

class CPULinaCodec:
    """Device-agnostic wrapper around ``LinaCodecModel``.

    Despite the class name, the model is placed on CUDA whenever
    ``torch.cuda.is_available()`` is true and on the CPU otherwise.  Every
    input file is loaded, resampled to ``INPUT_SAMPLE_RATE`` and downmixed to
    mono before it reaches the model.

    NOTE(review): the ``from_pretrained(config_path=..., weights_path=...)``
    call is kept exactly as written; confirm the signature against the
    installed ``linacodec`` package.
    """

    # Sample rate (Hz) that waveforms are resampled to before encoding.
    INPUT_SAMPLE_RATE = 24000

    def __init__(self):
        """Download the model files from the Hub and load the model."""
        print("Loading LinaCodec model on CPU...")

        # Download model files (cached by huggingface_hub after the first run).
        repo_id = "YatharthS/LinaCodec"
        config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml")
        weights_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")

        # Prefer the GPU when one is present, otherwise fall back to the CPU.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

        self.model = LinaCodecModel.from_pretrained(
            config_path=config_path,
            weights_path=weights_path,
        ).eval()
        self.model = self.model.to(self.device)

        # Sample rate (Hz) reported for decoded audio by the callers of this class.
        self.sample_rate = 48000
        print(f"Model loaded successfully on {self.device}!")

    def _load_audio(self, audio_path):
        """Load ``audio_path`` as a mono waveform at ``INPUT_SAMPLE_RATE``.

        Args:
            audio_path: Path of an audio file readable by ``torchaudio.load``.

        Returns:
            A tensor on ``self.device`` whose first dimension has size 1
            (the channels are averaged when the file has more than one).
        """
        wav, sr = torchaudio.load(audio_path)
        wav = wav.to(self.device)

        # Resample only when the file is not already at the model's input rate.
        if sr != self.INPUT_SAMPLE_RATE:
            resampler = torchaudio.transforms.Resample(
                sr, self.INPUT_SAMPLE_RATE
            ).to(self.device)
            wav = resampler(wav)

        # Downmix multi-channel audio by averaging over the channel dimension.
        if wav.shape[0] > 1:
            wav = wav.mean(dim=0, keepdim=True)

        return wav

    def encode(self, audio_path):
        """Encode an audio file into tokens and a global embedding.

        Args:
            audio_path: Path of the audio file to encode.

        Returns:
            The ``(codes, embedding)`` pair returned by ``self.model.encode``.
        """
        wav = self._load_audio(audio_path)

        # The model receives the waveform with an extra leading batch dimension.
        with torch.no_grad():
            codes, embedding = self.model.encode(wav.unsqueeze(0))

        return codes, embedding

    def decode(self, codes, embedding):
        """Decode tokens and an embedding back to audio.

        Args:
            codes: Tokens as produced by ``encode``.
            embedding: Global embedding as produced by ``encode``.

        Returns:
            The output of ``self.model.decode`` with dimension 0 squeezed.
        """
        with torch.no_grad():
            wav = self.model.decode(codes, embedding)

        return wav.squeeze(0)

    def convert_voice(self, source_path, reference_path):
        """Re-synthesize the source content with the reference's embedding.

        Args:
            source_path: Audio file whose tokens (content) are kept.
            reference_path: Audio file whose global embedding (timbre) is used.

        Returns:
            The output of ``self.model.decode`` with dimension 0 squeezed.
        """
        source_wav = self._load_audio(source_path)
        ref_wav = self._load_audio(reference_path)

        with torch.no_grad():
            # Tokens come from the source, the embedding from the reference.
            source_codes, _ = self.model.encode(source_wav.unsqueeze(0))
            _, ref_embedding = self.model.encode(ref_wav.unsqueeze(0))

            # Decode the source tokens under the reference embedding.
            converted_wav = self.model.decode(source_codes, ref_embedding)

        return converted_wav.squeeze(0)

# Build the codec wrapper once at import time (this downloads the model files
# and loads the weights); the Gradio callbacks below all share this instance.
lina_tokenizer = CPULinaCodec()

def _audio_to_tensor(audio_data):
    """Convert a numpy waveform into a float32 tensor laid out for ``torchaudio.save``.

    Args:
        audio_data: Numpy array from a ``gr.Audio(type="numpy")`` component.
            1-D input is treated as mono; 2-D input is transposed
            (presumably from ``(samples, channels)`` — confirm against Gradio).

    Returns:
        A float32 tensor; 1-D input gains a leading dimension of size 1.

    Raises:
        ValueError: If the array contains no samples.
    """
    if audio_data.size == 0:
        raise ValueError("Audio input is empty.")

    # Integer PCM is rescaled by its full-scale value; other dtypes are only cast.
    if audio_data.dtype == np.int16:
        audio_data = audio_data.astype(np.float32) / 32768.0
    elif audio_data.dtype == np.int32:
        audio_data = audio_data.astype(np.float32) / 2147483648.0

    if audio_data.ndim == 1:
        audio_data = audio_data[np.newaxis, :]
    else:
        audio_data = audio_data.T

    # torch.tensor copies, so the transposed (non-contiguous) view is fine.
    return torch.tensor(audio_data, dtype=torch.float32)


def _save_wav(directory, filename, audio):
    """Write a ``(sample_rate, samples)`` pair to ``directory/filename`` as WAV.

    Args:
        directory: Existing directory to write into.
        filename: Name of the WAV file to create.
        audio: ``(sample_rate, numpy_array)`` tuple as delivered by Gradio.

    Returns:
        ``(path, sample_rate)`` for the file that was written.
    """
    sample_rate, samples = audio
    path = os.path.join(directory, filename)
    torchaudio.save(path, _audio_to_tensor(samples), sample_rate)
    return path, sample_rate


def encode_decode_audio(audio_input):
    """Round-trip audio through the codec to demonstrate compression.

    Args:
        audio_input: ``(sample_rate, numpy_array)`` from the Gradio audio
            component, or ``None`` when nothing was uploaded.

    Returns:
        ``((sample_rate, numpy_array), info_text)`` on success, or
        ``(None, message)`` when input is missing or any step fails.
    """
    try:
        if audio_input is None:
            return None, "Please upload an audio file."

        # The temporary directory is removed on exit even if saving, encoding
        # or decoding raises, so no intermediate WAV is left behind.
        with tempfile.TemporaryDirectory() as workdir:
            wav_path, sr = _save_wav(workdir, "input.wav", audio_input)
            speech_tokens, global_embedding = lina_tokenizer.encode(wav_path)
            decoded_audio = lina_tokenizer.decode(speech_tokens, global_embedding)

        # Convert to numpy for Gradio.
        decoded_audio = decoded_audio.cpu().squeeze().numpy()
        output_sr = lina_tokenizer.sample_rate

        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        info = "\n".join([
            "✅ Success!",
            f"Device: {device_info}",
            f"Original sample rate: {sr} Hz",
            f"Output sample rate: {output_sr} Hz",
            f"Speech tokens shape: {speech_tokens.shape}",
            f"Global embedding shape: {global_embedding.shape}",
        ])

        return (output_sr, decoded_audio), info

    except Exception as e:
        # UI boundary: report the failure in the info box instead of crashing.
        return None, f"❌ Error: {str(e)}"


def voice_conversion(source_audio, reference_audio):
    """Convert voice using the source's content and the reference's timbre.

    Args:
        source_audio: ``(sample_rate, numpy_array)`` supplying the content,
            or ``None``.
        reference_audio: ``(sample_rate, numpy_array)`` supplying the
            timbre/style, or ``None``.

    Returns:
        ``((sample_rate, numpy_array), info_text)`` on success, or
        ``(None, message)`` when an input is missing or any step fails.
    """
    try:
        if source_audio is None or reference_audio is None:
            return None, "Please upload both source and reference audio files."

        # Both intermediate WAVs live in one temporary directory that is
        # removed on exit, including when the conversion raises.
        with tempfile.TemporaryDirectory() as workdir:
            source_path, sr_source = _save_wav(workdir, "source.wav", source_audio)
            ref_path, sr_ref = _save_wav(workdir, "reference.wav", reference_audio)
            converted_audio = lina_tokenizer.convert_voice(source_path, ref_path)

        # Convert to numpy for Gradio.
        converted_audio = converted_audio.cpu().squeeze().numpy()
        output_sr = lina_tokenizer.sample_rate

        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
        info = "\n".join([
            "✅ Voice conversion successful!",
            f"Device: {device_info}",
            f"Source sample rate: {sr_source} Hz",
            f"Reference sample rate: {sr_ref} Hz",
            f"Output sample rate: {output_sr} Hz",
            "Content taken from source, timbre/style from reference",
        ])

        return (output_sr, converted_audio), info

    except Exception as e:
        # UI boundary: report the failure in the info box instead of crashing.
        return None, f"❌ Error: {str(e)}"

# Build the Gradio UI: a header, two tabs (encode/decode round trip and voice
# conversion) and a footer. Components are rendered in the order they are
# created inside each layout context, so statement order here is the layout.
with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
    # Page header.
    gr.Markdown("""
    # 🎵 LinaCodec Audio Tool
    
    **LinaCodec** is a neural audio codec for high-quality speech compression and voice conversion.
    
    ### Features:
    - 🔄 **Encode & Decode**: Compress and reconstruct audio at 48kHz
    - 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
    - 💻 **CPU Compatible**: Works on both CPU and GPU
    """)
    
    with gr.Tabs():
        # Tab 1: Encode/Decode — one audio input, decoded audio plus info text out.
        with gr.Tab("🔄 Encode & Decode"):
            gr.Markdown("""
            Upload an audio file to encode it into speech tokens and then decode it back.
            This demonstrates the codec's compression and reconstruction capabilities.
            """)
            
            with gr.Row():
                # Left column: input and trigger button.
                with gr.Column():
                    # type="numpy" is what makes encode_decode_audio receive a
                    # (sample_rate, ndarray) tuple, which it unpacks.
                    audio_input = gr.Audio(
                        label="Upload Audio",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    encode_btn = gr.Button("🚀 Encode & Decode", variant="primary")
                
                # Right column: results.
                with gr.Column():
                    audio_output = gr.Audio(label="Decoded Audio")
                    info_output = gr.Textbox(label="Info", lines=6)
            
            # The callback returns (audio, info), matching the two outputs in order.
            encode_btn.click(
                fn=encode_decode_audio,
                inputs=[audio_input],
                outputs=[audio_output, info_output]
            )
            
            # NOTE(review): the examples list is empty, so no sample clips are
            # offered — consider adding files or removing this widget.
            gr.Examples(
                examples=[],
                inputs=[audio_input],
                label="Examples (upload your own audio)"
            )
        
        # Tab 2: Voice Conversion — two audio inputs, converted audio plus info text out.
        with gr.Tab("🎭 Voice Conversion"):
            gr.Markdown("""
            Convert voice by taking content from **source audio** and timbre/style from **reference audio**.
            
            - **Source**: The speech content you want to keep
            - **Reference**: The voice style/timbre you want to apply
            """)
            
            with gr.Row():
                # Left column: both inputs and the trigger button.
                with gr.Column():
                    source_input = gr.Audio(
                        label="Source Audio (Content)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    reference_input = gr.Audio(
                        label="Reference Audio (Timbre/Style)",
                        type="numpy",
                        sources=["upload", "microphone"]
                    )
                    convert_btn = gr.Button("✨ Convert Voice", variant="primary")
                
                # Right column: results.
                with gr.Column():
                    converted_output = gr.Audio(label="Converted Audio")
                    convert_info = gr.Textbox(label="Info", lines=6)
            
            # Inputs are passed positionally: source first, reference second.
            convert_btn.click(
                fn=voice_conversion,
                inputs=[source_input, reference_input],
                outputs=[converted_output, convert_info]
            )
    
    # Page footer with background information and a link to the model card.
    gr.Markdown("""
    ---
    ### 📚 About LinaCodec
    
    LinaCodec is a neural audio codec designed for high-quality speech compression and voice conversion.
    It encodes audio into discrete tokens and a global embedding, enabling efficient storage and manipulation of speech.
    
    **Model**: [YatharthS/LinaCodec](https://huggingface.co/YatharthS/LinaCodec)
    
    ### ⚙️ Technical Details
    - Output sample rate: 48 kHz
    - Supports various input formats
    - Neural compression with high reconstruction quality
    - Works on both CPU and GPU (GPU recommended for faster processing)
    """)

# Start the Gradio server only when this file is executed as a script.
# Importing the module still builds `demo` and loads the model above; it just
# does not launch the server.
if __name__ == "__main__":
    demo.launch()