Spaces:

MihaiPopa-1
/

FocalCodec-Demo

Sleeping

App Files Files Community

MihaiPopa-1 committed 11 days ago

Commit

6fb8d93

verified ·

1 Parent(s): 85393f4

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -37

app.py CHANGED Viewed

@@ -1,81 +1,89 @@
 import torch
 import torchaudio
-from focal_codec.focal_codec import FocalCodec
 import gradio as gr
-import os # Need this for file path management
-import tempfile # A good way to manage temporary files in Gradio Spaces
-# Define the model ID for the 0.16 kbps codec
-MODEL_ID = "lucadellalib/focalcodec_12_5hz"
-# Load the model globally when the app starts
 try:
-    model = FocalCodec.from_pretrained(MODEL_ID)
     if torch.cuda.is_available():
-        model.cuda()
 except Exception as e:
-    print(f"Error loading model: {e}")
-    model = None
 def encode_decode_focal(audio_input):
     """
     Processes input audio through the 160 bps FocalCodec, saves the tokens,
     and returns both the decoded WAV and the path to the FC file for download.
     """
-    if model is None:
         return (16000, None), None
     sr, wav_numpy = audio_input
-    # Convert numpy to torch tensor and ensure float32, mono channel
-    wav = torch.tensor(wav_numpy, dtype=torch.float32).unsqueeze(0)
-    if wav.shape > 1: # Convert stereo to mono by taking the first channel
-        wav = wav[:, 0].unsqueeze(0)
-    # Resample to 16kHz if necessary (FocalCodec requires 16k input)
-    if sr != 16000:
-        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=16000)
-        wav = resampler(wav)
     if torch.cuda.is_available():
-        wav = wav.cuda()
     # --- Process (Encode and Decode) ---
     with torch.no_grad():
-        # Encode returns codes and bandwidth
-        codes, bandwidth = model.encode(wav)
-        # Decode returns the reconstructed waveform
-        decoded_wav = model.decode(codes)
     # --- Save the compressed tokens to a temporary .fc file ---
-    # Use tempfile to ensure safe file management in a shared environment
     temp_dir = tempfile.mkdtemp()
     fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
-    torch.save(codes, fc_file_path)
-    print(f"Codes saved to {fc_file_path}")
     # Move audio back to CPU for Gradio output and formatting
-    decoded_wav_output = decoded_wav.cpu().numpy().squeeze()
-    # Return both the audio tuple and the file path string
-    return (16000, decoded_wav_output), fc_file_path
-# --- Gradio Interface ---
 with gr.Blocks() as iface:
-    gr.Markdown(f"## FocalCodec at 160 bps ({MODEL_ID.split('/')[-1]})")
-    gr.Markdown("Test the lowest bitrate neural speech codec! This model is optimized ONLY for speech. Upload your audio or record your voice.")
     with gr.Row():
         audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="Input Audio (Speech Only Recommended)")
         with gr.Column():
             audio_output = gr.Audio(type="numpy", label="Decoded Output Audio (160 bps)")
-            # The gr.File component handles the download functionality
             file_output = gr.File(label="Download Compressed Tokens (*.fc file)", file_count="single", file_types=[".fc"])
-    # Map the function to the components
-    # We use a button explicitly to manage the output flow better than gr.Interface
     process_button = gr.Button("Process Audio", variant="primary")
     process_button.click(
         fn=encode_decode_focal,

 import torch
 import torchaudio
 import gradio as gr
+import os
+import tempfile
+import numpy as np
+# Define the model ID for the 0.16 kbps codec config
+MODEL_CONFIG = "lucadellalib/focalcodec_12_5hz"
+# Load the model globally using torch.hub
 try:
+    # torch.hub handles cloning the repo internally
+    codec = torch.hub.load(
+        repo_or_dir="lucadellalib/focalcodec",
+        model="focalcodec",
+        config=MODEL_CONFIG,
+        force_reload=False # Use cached version after first load
+    )
+    codec.eval().requires_grad_(False) # Set to evaluation mode
     if torch.cuda.is_available():
+        codec.cuda()
 except Exception as e:
+    print(f"Error loading model via torch.hub: {e}")
+    codec = None
 def encode_decode_focal(audio_input):
     """
     Processes input audio through the 160 bps FocalCodec, saves the tokens,
     and returns both the decoded WAV and the path to the FC file for download.
     """
+    if codec is None:
         return (16000, None), None
     sr, wav_numpy = audio_input
+    # Convert numpy to torch tensor and ensure float32
+    sig = torch.tensor(wav_numpy, dtype=torch.float32).unsqueeze(0)
+    # Resample input audio to the sample rate required by the codec (16kHz)
+    if sr != codec.sample_rate_input:
+        resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=codec.sample_rate_input)
+        sig = resampler(sig)
+    # Ensure mono channel if needed
+    if sig.shape[0] > 1:
+        sig = sig[0, :].unsqueeze(0)
     if torch.cuda.is_available():
+        sig = sig.cuda()
     # --- Process (Encode and Decode) ---
     with torch.no_grad():
+        # 1. Encode signal to discrete tokens (the compressed data)
+        toks = codec.sig_to_toks(sig)
+        # 2. Decode tokens back into a waveform
+        rec_sig = codec.toks_to_sig(toks)
     # --- Save the compressed tokens to a temporary .fc file ---
     temp_dir = tempfile.mkdtemp()
     fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
+    # Save the tokens tensor
+    torch.save(toks, fc_file_path)
+    print(f"Tokens saved to {fc_file_path}")
     # Move audio back to CPU for Gradio output and formatting
+    # Note: the reconstructed signal is at the codec's output rate (codec.sample_rate_output), which is what is returned below
+    decoded_wav_output = rec_sig.cpu().numpy().squeeze()
+    return (codec.sample_rate_output, decoded_wav_output), fc_file_path
+# --- Gradio Interface (Use the same Blocks interface as before) ---
 with gr.Blocks() as iface:
+    gr.Markdown(f"## FocalCodec at 160 bps ({MODEL_CONFIG.split('/')[-1]})")
+    gr.Markdown("Test the lowest bitrate neural speech codec! Optimized ONLY for speech. Upload your audio or record your voice.")
     with gr.Row():
         audio_input = gr.Audio(sources=["microphone", "upload"], type="numpy", label="Input Audio (Speech Only Recommended)")
         with gr.Column():
             audio_output = gr.Audio(type="numpy", label="Decoded Output Audio (160 bps)")
             file_output = gr.File(label="Download Compressed Tokens (*.fc file)", file_count="single", file_types=[".fc"])
     process_button = gr.Button("Process Audio", variant="primary")
     process_button.click(
         fn=encode_decode_focal,