Spaces:

MihaiPopa-1
/

FocalCodec-Demo

Sleeping

App Files Files Community

MihaiPopa-1 committed 11 days ago

Commit

91d64e3

verified ·

1 Parent(s): 3e9538b

Update app.py

Browse files

Files changed (1) hide show

app.py +279 -50

app.py CHANGED Viewed

@@ -4,6 +4,7 @@ import gradio as gr
 import os
 import tempfile
 import numpy as np
 # Define the model ID for the 0.16 kbps codec config
 MODEL_CONFIG = "lucadellalib/focalcodec_12_5hz"
@@ -17,7 +18,7 @@ try:
         model="focalcodec",
         config=MODEL_CONFIG,
         force_reload=False,
-        trust_repo=True  # Add this if needed
     )
     codec.eval()
     for param in codec.parameters():
@@ -47,6 +48,116 @@ except Exception as e:
         print(f"ERROR with alternative method: {e2}")
         codec = None
 def encode_decode_focal(audio_input):
     """
     Processes input audio through the 160 bps FocalCodec, saves the tokens,
@@ -61,63 +172,75 @@ def encode_decode_focal(audio_input):
     try:
         sr, wav_numpy = audio_input
         # Handle stereo to mono conversion
         if len(wav_numpy.shape) > 1:
-            if wav_numpy.shape[1] == 2:
                 wav_numpy = wav_numpy.mean(axis=1)
-            elif wav_numpy.shape[0] == 2:
                 wav_numpy = wav_numpy.mean(axis=0)
         # Ensure float32 and normalize
         wav_numpy = wav_numpy.astype(np.float32)
         if wav_numpy.max() > 1.0 or wav_numpy.min() < -1.0:
-            wav_numpy = wav_numpy / 32768.0
         # Convert to torch tensor [1, samples]
         sig = torch.from_numpy(wav_numpy).unsqueeze(0)
-        # Resample to 16kHz
         if sr != codec.sample_rate_input:
             resampler = torchaudio.transforms.Resample(
                 orig_freq=sr,
                 new_freq=codec.sample_rate_input
             )
             sig = resampler(sig)
         if torch.cuda.is_available():
             sig = sig.cuda()
         # --- Encode and Decode ---
         with torch.no_grad():
             toks = codec.sig_to_toks(sig)
-            rec_sig = codec.toks_to_sig(toks)
-            # Get binary codes for true compression
-            codes = codec.toks_to_codes(toks)
-       # --- Save the compressed tokens to a temporary .fc file ---
         temp_dir = tempfile.mkdtemp()
         fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
-        # Save as raw binary data (just the token values)
-        toks_cpu = toks.cpu().numpy().astype(np.int16)  # Convert to numpy
-        with open(fc_file_path, 'wb') as f:
-            f.write(toks_cpu.tobytes())  # Write raw bytes
-        file_size_bytes = os.path.getsize(fc_file_path)
         duration_sec = sig.shape[-1] / codec.sample_rate_input
-        expected_size = (160 * duration_sec) / 8  # 160 bits/sec → bytes
-        actual_bitrate = (file_size_bytes * 8) / duration_sec
-        print(f"Tokens saved to {fc_file_path}")
-        print(f"File size: {file_size_bytes} bytes (expected: ~{expected_size:.0f} bytes)")
-        # Move audio back to CPU
         decoded_wav_output = rec_sig.cpu().numpy().squeeze()
         if len(decoded_wav_output.shape) == 0:
             decoded_wav_output = decoded_wav_output.reshape(1)
-        status_msg = f"✅ Duration: {duration_sec:.1f}s | File: {file_size_bytes} bytes | Bitrate: {actual_bitrate:.0f} bps"
         return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
@@ -128,40 +251,146 @@ def encode_decode_focal(audio_input):
         traceback.print_exc()
         return None, None, error_msg
 # --- Gradio Interface ---
-with gr.Blocks() as iface:
-    gr.Markdown(f"## FocalCodec at 160 bps ({MODEL_CONFIG.split('/')[-1]})")
-    gr.Markdown("Test the lowest bitrate neural speech codec! **Optimized for speech only.** Upload audio or record your voice.")
-    with gr.Row():
-        audio_input = gr.Audio(
-            sources=["microphone", "upload"],
-            type="numpy",
-            label="Input Audio (Speech - any format/sample rate)"
-        )
-        with gr.Column():
-            audio_output = gr.Audio(
                 type="numpy",
-                label="Decoded Output Audio (16kHz, 160 bps)"
             )
-            file_output = gr.File(
-                label="Download Compressed Tokens (*.fc file)",
-                file_count="single"
-            )
-            status_output = gr.Textbox(label="Status", lines=2)
-    process_button = gr.Button("Process Audio", variant="primary")
-    process_button.click(
-        fn=encode_decode_focal,
-        inputs=[audio_input],
-        outputs=[audio_output, file_output, status_output]
-    )
-    gr.Markdown("### Notes:")
-    gr.Markdown("- Input audio will be automatically resampled to 16kHz")
-    gr.Markdown("- Stereo audio will be converted to mono")
-    gr.Markdown("- The .fc file contains the compressed tokens (160 bits per second)")
 if __name__ == "__main__":
     iface.launch()

 import os
 import tempfile
 import numpy as np
+import struct
 # Define the model ID for the 0.16 kbps codec config
 MODEL_CONFIG = "lucadellalib/focalcodec_12_5hz"
         model="focalcodec",
         config=MODEL_CONFIG,
         force_reload=False,
+        trust_repo=True
     )
     codec.eval()
     for param in codec.parameters():
         print(f"ERROR with alternative method: {e2}")
         codec = None
# --- SAVE function (encoding) ---
def save_compressed_tokens(toks, fc_file_path, codec):
    """Save tokens in the most compressed format with metadata for decoding.

    File layout (every integer is little-endian):

    * 4 bytes - magic ``b'FC01'``
    * 1 byte  - width code: 0 = 1 bit, 1 = 4 bits, 2 = 8 bits, 3 = 16 bits
    * 4 bytes - ``toks.shape[0]``
    * 4 bytes - ``toks.shape[1]``
    * 4 bytes - number of real tokens stored (padding is NOT counted)
    * payload - tokens packed at the chosen width, row-major order

    Args:
        toks: 2-D integer ``torch.Tensor`` of non-negative token ids.
        fc_file_path: Destination path; an existing file is overwritten.
        codec: Not used by this function; kept so existing callers still work.

    Returns:
        Tuple ``(file_size, bits_per_token)``: bytes written to disk and the
        bit width selected for each token.

    Raises:
        ValueError: If ``toks`` is not 2-D or contains ids outside 0..65535,
            which the format cannot represent.
    """
    if len(toks.shape) != 2:
        raise ValueError(
            f"Expected a 2-D token tensor, got shape {tuple(toks.shape)}"
        )

    toks_cpu = toks.cpu()
    # min()/max() raise on an empty tensor, so treat "no tokens" as range 0..0.
    has_tokens = toks_cpu.numel() > 0
    min_tok = toks_cpu.min().item() if has_tokens else 0
    max_tok = toks_cpu.max().item() if has_tokens else 0
    print("\n=== Saving Tokens ===")
    print(f"Shape: {toks.shape}")
    print(f"Range: {min_tok} to {max_tok}")

    # Refuse values the unsigned packing below would silently wrap around.
    if min_tok < 0 or max_tok > 0xFFFF:
        raise ValueError(
            f"Token ids must be within 0..65535, got {min_tok}..{max_tok}"
        )

    # Determine bit width
    if max_tok <= 1:
        bits_per_token, dtype_code = 1, 0
    elif max_tok <= 15:
        bits_per_token, dtype_code = 4, 1
    elif max_tok <= 255:
        bits_per_token, dtype_code = 8, 2
    else:
        bits_per_token, dtype_code = 16, 3

    toks_np = toks_cpu.numpy().reshape(-1)
    # Count the tokens BEFORE any padding: the loader reshapes to
    # (shape[0], shape[1]) and must not see the padding nibble as a token.
    total_tokens = int(toks_np.size)

    # Pack data
    if bits_per_token == 1:
        packed = np.packbits(toks_np.astype(np.uint8))
    elif bits_per_token == 4:
        nibbles = toks_np.astype(np.uint8)
        if nibbles.size % 2:
            # Two tokens share a byte; pad the last low nibble with zero.
            nibbles = np.append(nibbles, np.uint8(0))
        packed = ((nibbles[::2] << 4) | nibbles[1::2]).astype(np.uint8)
    elif bits_per_token == 8:
        packed = toks_np.astype(np.uint8)
    else:  # 16-bit
        # Explicit little-endian unsigned so the payload matches the '<'
        # header on every platform and ids above 32767 stay positive.
        packed = toks_np.astype('<u2')

    header = struct.pack(
        '<4sBIII',
        b'FC01',  # Magic number: FocalCodec container version 0.1
        dtype_code,
        toks.shape[0],
        toks.shape[1],
        total_tokens,
    )
    with open(fc_file_path, 'wb') as f:
        f.write(header)
        f.write(packed.tobytes())

    file_size = os.path.getsize(fc_file_path)
    print(f"Saved {file_size} bytes ({bits_per_token} bits/token)")
    print("====================\n")
    return file_size, bits_per_token
# --- LOAD function (decoding) ---
def load_compressed_tokens(fc_file_path):
    """Load and unpack tokens from a .fc file.

    The file starts with a 17-byte little-endian header: magic ``b'FC01'``,
    a 1-byte width code (0 = 1 bit, 1 = 4 bits, 2 = 8 bits, 3 = 16 bits),
    then batch size, sequence length and stored-token count as 32-bit
    unsigned integers. The packed token payload follows.

    Args:
        fc_file_path: Path of the .fc file to read.

    Returns:
        ``torch.Tensor`` of dtype ``int64`` with shape
        ``(batch_size, seq_length)`` as recorded in the header.

    Raises:
        ValueError: If the magic/header is missing, the width code is
            unknown, or the file holds fewer tokens than the header's shape
            requires.
    """
    header = struct.Struct('<4sBIII')
    with open(fc_file_path, 'rb') as f:
        raw = f.read()

    # Verify length and magic number before trusting any header field.
    if len(raw) < header.size or raw[:4] != b'FC01':
        raise ValueError("Invalid .fc file format!")
    _, dtype_code, batch_size, seq_length, total_tokens = header.unpack_from(raw)
    payload = raw[header.size:]
    packed_data = np.frombuffer(payload, dtype=np.uint8)

    print("\n=== Loading Tokens ===")
    print(f"Dtype code: {dtype_code}")
    print(f"Shape: ({batch_size}, {seq_length})")

    # The shape decides how many tokens are needed. A stored count larger
    # than that is tolerated (extra entries are padding, e.g. the trailing
    # nibble of an odd-length 4-bit stream); a smaller one cannot be decoded.
    expected = batch_size * seq_length
    if total_tokens < expected:
        raise ValueError(
            f"Invalid .fc file: header declares {total_tokens} tokens but "
            f"shape ({batch_size}, {seq_length}) needs {expected}"
        )

    # Unpack based on dtype
    if dtype_code == 0:  # 1-bit
        unpacked = np.unpackbits(packed_data)
    elif dtype_code == 1:  # 4-bit: high nibble first, then low nibble
        unpacked = np.stack(
            ((packed_data >> 4) & 0x0F, packed_data & 0x0F), axis=1
        ).reshape(-1)
    elif dtype_code == 2:  # 8-bit
        unpacked = packed_data
    elif dtype_code == 3:  # 16-bit, little-endian unsigned
        even_len = len(payload) - (len(payload) % 2)
        unpacked = np.frombuffer(payload[:even_len], dtype='<u2')
    else:
        raise ValueError(f"Invalid .fc file: unknown dtype code {dtype_code}")

    if unpacked.size < expected:
        raise ValueError(
            f"Invalid .fc file: payload holds {unpacked.size} tokens, "
            f"expected {expected}"
        )

    # Reshape to original shape
    toks = torch.from_numpy(
        unpacked[:expected].astype(np.int64)
    ).reshape(batch_size, seq_length)
    print(f"Loaded tokens: {toks.shape}")
    print("======================\n")
    return toks
 def encode_decode_focal(audio_input):
     """
     Processes input audio through the 160 bps FocalCodec, saves the tokens,
     try:
         sr, wav_numpy = audio_input
+        print(f"Input audio: sample_rate={sr}, shape={wav_numpy.shape}, dtype={wav_numpy.dtype}")
         # Handle stereo to mono conversion
         if len(wav_numpy.shape) > 1:
+            if wav_numpy.shape[1] == 2:  # Stereo
                 wav_numpy = wav_numpy.mean(axis=1)
+                print("Converted stereo to mono")
+            elif wav_numpy.shape[0] == 2:  # Channels first
                 wav_numpy = wav_numpy.mean(axis=0)
+                print("Converted stereo to mono (channels first)")
         # Ensure float32 and normalize
         wav_numpy = wav_numpy.astype(np.float32)
         if wav_numpy.max() > 1.0 or wav_numpy.min() < -1.0:
+            wav_numpy = wav_numpy / 32768.0  # Normalize int16 to float
         # Convert to torch tensor [1, samples]
         sig = torch.from_numpy(wav_numpy).unsqueeze(0)
+        print(f"Tensor shape before resample: {sig.shape}")
+        # Resample to 16kHz (required by FocalCodec)
         if sr != codec.sample_rate_input:
+            print(f"Resampling from {sr}Hz to {codec.sample_rate_input}Hz...")
             resampler = torchaudio.transforms.Resample(
                 orig_freq=sr,
                 new_freq=codec.sample_rate_input
             )
             sig = resampler(sig)
+        print(f"Tensor shape after resample: {sig.shape}")
+        # Move to GPU if available
         if torch.cuda.is_available():
             sig = sig.cuda()
         # --- Encode and Decode ---
         with torch.no_grad():
+            print("Encoding to tokens...")
             toks = codec.sig_to_toks(sig)
+            print(f"Tokens shape: {toks.shape}")
+            print(f"Token range: {toks.min().item()} to {toks.max().item()}")
+            print("Decoding tokens to audio...")
+            rec_sig = codec.toks_to_sig(toks)
+            print(f"Reconstructed signal shape: {rec_sig.shape}")
+        # --- Save the compressed tokens ---
         temp_dir = tempfile.mkdtemp()
         fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
+        file_size, bits_per_token = save_compressed_tokens(toks, fc_file_path, codec)
+        # Calculate stats
         duration_sec = sig.shape[-1] / codec.sample_rate_input
+        actual_bitrate = (file_size * 8) / duration_sec
+        print(f"Duration: {duration_sec:.2f}s")
+        print(f"File size: {file_size} bytes")
+        print(f"Actual bitrate: {actual_bitrate:.1f} bps")
+        # Move audio back to CPU for Gradio output
         decoded_wav_output = rec_sig.cpu().numpy().squeeze()
+        # Ensure proper shape for Gradio
         if len(decoded_wav_output.shape) == 0:
             decoded_wav_output = decoded_wav_output.reshape(1)
+        status_msg = f"✅ Duration: {duration_sec:.1f}s | File: {file_size} bytes | Bitrate: {actual_bitrate:.0f} bps ({bits_per_token} bits/token)"
         return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
         traceback.print_exc()
         return None, None, error_msg
def decode_from_fc_file(fc_file):
    """Decode audio from an uploaded .fc file.

    Args:
        fc_file: The uploaded file as delivered by the UI: either a path
            (``str`` / ``os.PathLike``) or an object whose ``.name``
            attribute is the path. ``None`` means nothing was uploaded.

    Returns:
        Tuple ``(audio, status)``. On success ``audio`` is
        ``(codec.sample_rate_output, samples)`` and ``status`` a summary
        line; on any failure ``audio`` is ``None`` and ``status`` carries
        the error message.
    """
    if codec is None:
        return None, "❌ Model not loaded"
    if fc_file is None:
        return None, "❌ Please upload a .fc file"
    try:
        # Accept both a plain path and a tempfile-like wrapper with `.name`.
        if isinstance(fc_file, (str, os.PathLike)):
            fc_path = fc_file
        else:
            fc_path = fc_file.name

        # Load tokens from file
        toks = load_compressed_tokens(fc_path)
        if torch.cuda.is_available():
            toks = toks.cuda()

        # Decode to audio
        with torch.no_grad():
            rec_sig = codec.toks_to_sig(toks)
        decoded_wav = rec_sig.cpu().numpy().squeeze()
        # squeeze() collapses a single-sample result to 0-d; restore one axis
        # so the shape lookup below cannot fail.
        if decoded_wav.ndim == 0:
            decoded_wav = decoded_wav.reshape(1)

        # Samples lie along the last axis (axis 0 is the batch when more
        # than one row was decoded).
        duration_sec = decoded_wav.shape[-1] / codec.sample_rate_output
        if duration_sec <= 0:
            # Report this explicitly instead of a bare "division by zero".
            raise ValueError("Decoded audio is empty")

        file_size = os.path.getsize(fc_path)
        bitrate = (file_size * 8) / duration_sec
        status = f"✅ Decoded successfully! Duration: {duration_sec:.1f}s | Bitrate: {bitrate:.0f} bps"
        return (codec.sample_rate_output, decoded_wav), status
    except Exception as e:
        # UI boundary: log the traceback and show the message to the user.
        import traceback
        traceback.print_exc()
        return None, f"❌ Error: {str(e)}"
 # --- Gradio Interface ---
+with gr.Blocks(title="FocalCodec 160 bps") as iface:
+    gr.Markdown("# 🎙️ FocalCodec at 160 bps")
+    gr.Markdown(f"**Neural speech codec at insanely low bitrate!** Using `{MODEL_CONFIG}`")
+    gr.Markdown("⚠️ **Optimized for speech only** - not suitable for music")
+    with gr.Tab("🎤 Encode Audio"):
+        gr.Markdown("### Compress audio to 160 bps tokens")
+        with gr.Row():
+            audio_input = gr.Audio(
+                sources=["microphone", "upload"],
                 type="numpy",
+                label="Input Audio (any format/sample rate)"
             )
+            with gr.Column():
+                audio_output = gr.Audio(
+                    type="numpy",
+                    label="Decoded Output (16kHz)"
+                )
+                file_output = gr.File(
+                    label="Download Compressed .fc File"
+                )
+                status_output = gr.Textbox(label="Status", lines=2)
+        encode_btn = gr.Button("🔄 Encode & Decode", variant="primary", size="lg")
+        encode_btn.click(
+            fn=encode_decode_focal,
+            inputs=[audio_input],
+            outputs=[audio_output, file_output, status_output]
+        )
+        gr.Markdown("### How it works:")
+        gr.Markdown("- Automatically resamples to 16kHz")
+        gr.Markdown("- Converts stereo to mono")
+        gr.Markdown("- Encodes to discrete tokens (~160 bps)")
+        gr.Markdown("- Decodes tokens back to audio")
+    with gr.Tab("📂 Decode from .fc File"):
+        gr.Markdown("### Decode previously compressed audio")
+        with gr.Row():
+            fc_input = gr.File(
+                label="Upload .fc File",
+                file_types=[".fc"]
+            )
+            with gr.Column():
+                decoded_output = gr.Audio(
+                    type="numpy",
+                    label="Decoded Audio"
+                )
+                decode_status = gr.Textbox(label="Status", lines=2)
+        decode_btn = gr.Button("🔊 Decode Audio", variant="primary", size="lg")
+        decode_btn.click(
+            fn=decode_from_fc_file,
+            inputs=[fc_input],
+            outputs=[decoded_output, decode_status]
+        )
+    with gr.Tab("ℹ️ About"):
+        gr.Markdown("""
+        ## FocalCodec - Ultra Low Bitrate Neural Audio Codec
+        ### Compression Ratios:
+        - **Uncompressed PCM** (16kHz mono): 256 kbps
+        - **MP3** (standard): ~128 kbps
+        - **Opus** (voice): ~16 kbps
+        - **FocalCodec**: **0.16 kbps** (160 bps) 🔥
+        ### That's 1600x compression!
+        For a 1-hour podcast:
+        - Uncompressed: ~115 MB
+        - FocalCodec: **~72 KB**
+        ### Use Cases:
+        - 📞 Ultra-low bandwidth voice calls
+        - 🤖 AI-generated podcasts
+        - 🌍 Low-bandwidth regions
+        - 📻 Emergency communications
+        ### Trade-offs:
+        - ✅ Extremely efficient compression
+        - ✅ Speech remains intelligible
+        - ❌ Voice characteristics may change
+        - ❌ Not suitable for music
+        - ❌ Some pronunciation artifacts
+        ### Technical Details:
+        - Model: `lucadellalib/focalcodec_12_5hz`
+        - Sample Rate: 16 kHz
+        - Token Rate: 12.5 Hz
+        - Bits per Token: Auto-detected (1/4/8/16 bit)
+        - Target Bitrate: 160 bps
+        ---
+        🔗 [GitHub Repository](https://github.com/lucadellalib/focalcodec)
+        """)
 if __name__ == "__main__":
     iface.launch()