Spaces:

humair025
/

LinaCodec

Runtime error

App Files Files Community

humair025 committed 13 days ago

Commit

87e044e

verified ·

1 Parent(s): 91fcfb1

Create app.py

Browse files

Files changed (1) hide show

app.py +221 -0

app.py ADDED Viewed

	@@ -0,0 +1,221 @@

+import gradio as gr
+import torch
+import numpy as np
+from linacodec.codec import LinaCodec
+import torchaudio
+import tempfile
+import os
+# Initialize the model once at import time; the single module-level
+# ``lina_tokenizer`` instance is shared by both request handlers below
+# (encode_decode_audio and voice_conversion).
+print("Loading LinaCodec model...")
+lina_tokenizer = LinaCodec()
+print("Model loaded successfully!")
+def encode_decode_audio(audio_input):
+    """Encode and decode audio to demonstrate compression."""
+    try:
+        if audio_input is None:
+            return None, "Please upload an audio file."
+        # audio_input is a tuple (sample_rate, audio_data)
+        sr, audio_data = audio_input
+        # Save temporary file
+        with tempfile.NamedTemporaryFile(delete=False, suffix='.wav') as tmp:
+            temp_path = tmp.name
+        # Convert to tensor and save
+        if audio_data.dtype == np.int16:
+            audio_data = audio_data.astype(np.float32) / 32768.0
+        elif audio_data.dtype == np.int32:
+            audio_data = audio_data.astype(np.float32) / 2147483648.0
+        # Handle mono/stereo
+        if len(audio_data.shape) == 1:
+            audio_tensor = torch.FloatTensor(audio_data).unsqueeze(0)
+        else:
+            audio_tensor = torch.FloatTensor(audio_data.T)
+        # Save as wav
+        torchaudio.save(temp_path, audio_tensor, sr)
+        # Encode
+        speech_tokens, global_embedding = lina_tokenizer.encode(temp_path)
+        # Decode
+        decoded_audio = lina_tokenizer.decode(speech_tokens, global_embedding)
+        # Clean up
+        os.unlink(temp_path)
+        # Convert to numpy for Gradio
+        decoded_audio = decoded_audio.cpu().squeeze().numpy()
+        info = f"✅ Success!\n"
+        info += f"Original sample rate: {sr} Hz\n"
+        info += f"Output sample rate: 48000 Hz\n"
+        info += f"Speech tokens shape: {speech_tokens.shape}\n"
+        info += f"Global embedding shape: {global_embedding.shape}"
+        return (48000, decoded_audio), info
+    except Exception as e:
+        return None, f"❌ Error: {str(e)}"
+def voice_conversion(source_audio, reference_audio):
+    """Convert voice using source content and reference timbre."""
+    try:
+        if source_audio is None or reference_audio is None:
+            return None, "Please upload both source and reference audio files."
+        # Save source audio
+        sr_source, audio_source = source_audio
+        with tempfile.NamedTemporaryFile(delete=False, suffix='_source.wav') as tmp:
+            source_path = tmp.name
+        if audio_source.dtype == np.int16:
+            audio_source = audio_source.astype(np.float32) / 32768.0
+        elif audio_source.dtype == np.int32:
+            audio_source = audio_source.astype(np.float32) / 2147483648.0
+        if len(audio_source.shape) == 1:
+            audio_tensor = torch.FloatTensor(audio_source).unsqueeze(0)
+        else:
+            audio_tensor = torch.FloatTensor(audio_source.T)
+        torchaudio.save(source_path, audio_tensor, sr_source)
+        # Save reference audio
+        sr_ref, audio_ref = reference_audio
+        with tempfile.NamedTemporaryFile(delete=False, suffix='_ref.wav') as tmp:
+            ref_path = tmp.name
+        if audio_ref.dtype == np.int16:
+            audio_ref = audio_ref.astype(np.float32) / 32768.0
+        elif audio_ref.dtype == np.int32:
+            audio_ref = audio_ref.astype(np.float32) / 2147483648.0
+        if len(audio_ref.shape) == 1:
+            audio_tensor = torch.FloatTensor(audio_ref).unsqueeze(0)
+        else:
+            audio_tensor = torch.FloatTensor(audio_ref.T)
+        torchaudio.save(ref_path, audio_tensor, sr_ref)
+        # Convert voice
+        converted_audio = lina_tokenizer.convert_voice(source_path, ref_path)
+        # Clean up
+        os.unlink(source_path)
+        os.unlink(ref_path)
+        # Convert to numpy
+        converted_audio = converted_audio.cpu().squeeze().numpy()
+        info = f"✅ Voice conversion successful!\n"
+        info += f"Source sample rate: {sr_source} Hz\n"
+        info += f"Reference sample rate: {sr_ref} Hz\n"
+        info += f"Output sample rate: 48000 Hz\n"
+        info += f"Content taken from source, timbre/style from reference"
+        return (48000, converted_audio), info
+    except Exception as e:
+        return None, f"❌ Error: {str(e)}"
+# Create Gradio interface: a two-tab Blocks app bound to ``demo``.
+# Components are laid out in the order they are constructed inside the
+# nested ``with`` contexts, so statement order here defines the page layout.
+with gr.Blocks(title="LinaCodec Audio Tool", theme=gr.themes.Soft()) as demo:
+    # Page header.
+    gr.Markdown("""
+    # 🎵 LinaCodec Audio Tool
+    **LinaCodec** is a neural audio codec for high-quality speech compression and voice conversion.
+    ### Features:
+    - 🔄 **Encode & Decode**: Compress and reconstruct audio at 48kHz
+    - 🎭 **Voice Conversion**: Transfer timbre/style from one speaker to another
+    """)
+    with gr.Tabs():
+        # Tab 1: Encode/Decode round trip on a single clip.
+        with gr.Tab("🔄 Encode & Decode"):
+            gr.Markdown("""
+            Upload an audio file to encode it into speech tokens and then decode it back.
+            This demonstrates the codec's compression and reconstruction capabilities.
+            """)
+            with gr.Row():
+                # Left column: input clip and trigger button.
+                with gr.Column():
+                    # type="numpy" hands the handler a (sample_rate, samples) tuple.
+                    audio_input = gr.Audio(
+                        label="Upload Audio",
+                        type="numpy",
+                        sources=["upload", "microphone"]
+                    )
+                    encode_btn = gr.Button("🚀 Encode & Decode", variant="primary")
+                # Right column: decoded audio and textual status.
+                with gr.Column():
+                    audio_output = gr.Audio(label="Decoded Audio")
+                    info_output = gr.Textbox(label="Info", lines=6)
+            # The handler returns (audio, info_text), matching the two outputs in order.
+            encode_btn.click(
+                fn=encode_decode_audio,
+                inputs=[audio_input],
+                outputs=[audio_output, info_output]
+            )
+            # The examples list is empty: no sample clips are bundled, so users
+            # must supply their own audio.
+            gr.Examples(
+                examples=[],
+                inputs=[audio_input],
+                label="Examples (upload your own audio)"
+            )
+        # Tab 2: Voice Conversion from a source clip and a reference clip.
+        with gr.Tab("🎭 Voice Conversion"):
+            gr.Markdown("""
+            Convert voice by taking content from **source audio** and timbre/style from **reference audio**.
+            - **Source**: The speech content you want to keep
+            - **Reference**: The voice style/timbre you want to apply
+            """)
+            with gr.Row():
+                # Left column: both input clips and the trigger button.
+                with gr.Column():
+                    source_input = gr.Audio(
+                        label="Source Audio (Content)",
+                        type="numpy",
+                        sources=["upload", "microphone"]
+                    )
+                    reference_input = gr.Audio(
+                        label="Reference Audio (Timbre/Style)",
+                        type="numpy",
+                        sources=["upload", "microphone"]
+                    )
+                    convert_btn = gr.Button("✨ Convert Voice", variant="primary")
+                # Right column: converted audio and textual status.
+                with gr.Column():
+                    converted_output = gr.Audio(label="Converted Audio")
+                    convert_info = gr.Textbox(label="Info", lines=6)
+            # Inputs are passed positionally as (source_audio, reference_audio);
+            # the handler returns (audio, info_text) for the two outputs.
+            convert_btn.click(
+                fn=voice_conversion,
+                inputs=[source_input, reference_input],
+                outputs=[converted_output, convert_info]
+            )
+    # Page footer with background information and a link to the model card.
+    gr.Markdown("""
+    ---
+    ### 📚 About LinaCodec
+    LinaCodec is a neural audio codec designed for high-quality speech compression and voice conversion.
+    It encodes audio into discrete tokens and a global embedding, enabling efficient storage and manipulation of speech.
+    **Model**: [YatharthS/LinaCodec](https://huggingface.co/YatharthS/LinaCodec)
+    ### ⚙️ Technical Details
+    - Output sample rate: 48 kHz
+    - Supports various input formats
+    - Neural compression with high reconstruction quality
+    """)
+# Launch the app only when this file is executed as a script, not when it
+# is imported as a module.
+if __name__ == "__main__":
+    demo.launch()