Spaces:

humair025
/

LinaCodec

Runtime error

App Files Files Community

humair025 committed 22 days ago

Commit

c9c6cc6

verified ·

1 Parent(s): ca7a45e

Update app.py

Browse files

Files changed (1) hide show

app.py +108 -5

app.py CHANGED Viewed

@@ -1,15 +1,114 @@
 import gradio as gr
 import torch
 import numpy as np
-from linacodec.codec import LinaCodec
 import torchaudio
 import tempfile
 import os
-# Initialize the model
-print("Loading LinaCodec model...")
-lina_tokenizer = LinaCodec()
-print("Model loaded successfully!")
 def encode_decode_audio(audio_input):
     """Encode and decode audio to demonstrate compression."""
@@ -51,7 +150,9 @@ def encode_decode_audio(audio_input):
         # Convert to numpy for Gradio
         decoded_audio = decoded_audio.cpu().squeeze().numpy()
         info = f"✅ Success!\n"
         info += f"Original sample rate: {sr} Hz\n"
         info += f"Output sample rate: 48000 Hz\n"
         info += f"Speech tokens shape: {speech_tokens.shape}\n"
@@ -133,6 +234,7 @@ with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
     ### Features:
     - 🔄 **Encode & Decode**: Compress and reconstruct audio at 48kHz
     - 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
     """)
     with gr.Tabs():
@@ -214,6 +316,7 @@ with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
     - Output sample rate: 48 kHz
     - Supports various input formats
     - Neural compression with high reconstruction quality
     """)
 # Launch the app

 import gradio as gr
 import torch
 import numpy as np
 import torchaudio
 import tempfile
 import os
+# Wrap LinaCodec so it also runs on CPU-only hosts (CUDA is used when available)
+print("Setting up LinaCodec for CPU...")
+# Import and patch before initializing
+from linacodec.tokenizer import LinaCodecModel
+from huggingface_hub import hf_hub_download
+import torch.nn as nn
+class CPULinaCodec:
+    """CPU-compatible wrapper for LinaCodec"""
+    def __init__(self):
+        print("Loading LinaCodec model on CPU...")
+        # Download model files
+        repo_id = "YatharthS/LinaCodec"
+        config_path = hf_hub_download(repo_id=repo_id, filename="config.yaml")
+        weights_path = hf_hub_download(repo_id=repo_id, filename="model.safetensors")
+        # Select the device: CUDA when available, otherwise fall back to CPU
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {self.device}")
+        self.model = LinaCodecModel.from_pretrained(
+            config_path=config_path,
+            weights_path=weights_path
+        ).eval()
+        # Move to appropriate device
+        self.model = self.model.to(self.device)
+        self.sample_rate = 48000
+        print(f"Model loaded successfully on {self.device}!")
+    def encode(self, audio_path):
+        """Encode audio file to tokens and embeddings"""
+        import torchaudio
+        # Load audio
+        wav, sr = torchaudio.load(audio_path)
+        wav = wav.to(self.device)
+        # Resample if needed
+        if sr != 24000:
+            resampler = torchaudio.transforms.Resample(sr, 24000).to(self.device)
+            wav = resampler(wav)
+        # Ensure mono
+        if wav.shape[0] > 1:
+            wav = wav.mean(dim=0, keepdim=True)
+        # Encode
+        with torch.no_grad():
+            codes, embedding = self.model.encode(wav.unsqueeze(0))
+        return codes, embedding
+    def decode(self, codes, embedding):
+        """Decode tokens and embeddings back to audio"""
+        with torch.no_grad():
+            wav = self.model.decode(codes, embedding)
+        return wav.squeeze(0)
+    def convert_voice(self, source_path, reference_path):
+        """Convert voice using source content and reference timbre"""
+        import torchaudio
+        # Load source audio
+        source_wav, source_sr = torchaudio.load(source_path)
+        source_wav = source_wav.to(self.device)
+        if source_sr != 24000:
+            resampler = torchaudio.transforms.Resample(source_sr, 24000).to(self.device)
+            source_wav = resampler(source_wav)
+        if source_wav.shape[0] > 1:
+            source_wav = source_wav.mean(dim=0, keepdim=True)
+        # Load reference audio
+        ref_wav, ref_sr = torchaudio.load(reference_path)
+        ref_wav = ref_wav.to(self.device)
+        if ref_sr != 24000:
+            resampler = torchaudio.transforms.Resample(ref_sr, 24000).to(self.device)
+            ref_wav = resampler(ref_wav)
+        if ref_wav.shape[0] > 1:
+            ref_wav = ref_wav.mean(dim=0, keepdim=True)
+        # Encode source for content
+        with torch.no_grad():
+            source_codes, _ = self.model.encode(source_wav.unsqueeze(0))
+            # Encode reference for timbre
+            _, ref_embedding = self.model.encode(ref_wav.unsqueeze(0))
+            # Decode with source codes but reference embedding
+            converted_wav = self.model.decode(source_codes, ref_embedding)
+        return converted_wav.squeeze(0)
+# Initialize the device-agnostic model wrapper (GPU if available, otherwise CPU)
+lina_tokenizer = CPULinaCodec()
 def encode_decode_audio(audio_input):
     """Encode and decode audio to demonstrate compression."""
         # Convert to numpy for Gradio
         decoded_audio = decoded_audio.cpu().squeeze().numpy()
+        device_info = "GPU (CUDA)" if torch.cuda.is_available() else "CPU"
         info = f"✅ Success!\n"
+        info += f"Device: {device_info}\n"
         info += f"Original sample rate: {sr} Hz\n"
         info += f"Output sample rate: 48000 Hz\n"
         info += f"Speech tokens shape: {speech_tokens.shape}\n"
     ### Features:
     - 🔄 **Encode & Decode**: Compress and reconstruct audio at 48kHz
     - 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
+    - 💻 **CPU Compatible**: Works on both CPU and GPU
     """)
     with gr.Tabs():
     - Output sample rate: 48 kHz
     - Supports various input formats
     - Neural compression with high reconstruction quality
+    - Works on both CPU and GPU (GPU recommended for faster processing)
     """)
 # Launch the app