Spaces:

MihaiPopa-1
/

FocalCodec-Demo

Sleeping

App Files Files Community

MihaiPopa-1 committed 11 days ago

Commit

01e25a6

verified ·

1 Parent(s): 91d64e3

Update app.py

Browse files

Files changed (1) hide show

app.py +274 -148

app.py CHANGED Viewed

@@ -49,113 +49,157 @@ except Exception as e:
         codec = None
-# --- SAVE function (encoding) ---
-def save_compressed_tokens(toks, fc_file_path, codec):
-    """Save tokens in the most compressed format with metadata for decoding"""
-    toks_cpu = toks.cpu()
-    min_tok = toks_cpu.min().item()
-    max_tok = toks_cpu.max().item()
-    print(f"\n=== Saving Tokens ===")
-    print(f"Shape: {toks.shape}")
-    print(f"Range: {min_tok} to {max_tok}")
-    # Determine bit width
-    if max_tok <= 1:
-        bits_per_token = 1
-        dtype_code = 0
-    elif max_tok <= 15:
-        bits_per_token = 4
-        dtype_code = 1
-    elif max_tok <= 255:
-        bits_per_token = 8
-        dtype_code = 2
     else:
-        bits_per_token = 16
-        dtype_code = 3
-    # Convert to numpy
-    toks_np = toks_cpu.numpy().flatten()
-    # Pack data
-    if bits_per_token == 1:
-        packed = np.packbits(toks_np.astype(np.uint8))
-    elif bits_per_token == 4:
-        if len(toks_np) % 2:
-            toks_np = np.append(toks_np, 0)
-        packed = ((toks_np[::2] << 4) | toks_np[1::2]).astype(np.uint8)
-    elif bits_per_token == 8:
-        packed = toks_np.astype(np.uint8)
-    else:  # 16-bit
-        packed = toks_np.astype(np.int16)
-    # Write file with header
     with open(fc_file_path, 'wb') as f:
-        # Magic number (to verify it's our format)
-        f.write(b'FC01')  # FocalCodec version 0.1
         # Metadata
-        f.write(struct.pack('<B', dtype_code))  # Data type (1 byte)
-        f.write(struct.pack('<I', toks.shape[0]))  # Batch size
-        f.write(struct.pack('<I', toks.shape[1]))  # Sequence length
-        f.write(struct.pack('<I', len(toks_np)))  # Total tokens
-        # Packed token data
-        f.write(packed.tobytes())
     file_size = os.path.getsize(fc_file_path)
-    print(f"Saved {file_size} bytes ({bits_per_token} bits/token)")
-    print(f"====================\n")
-    return file_size, bits_per_token
-# --- LOAD function (decoding) ---
-def load_compressed_tokens(fc_file_path):
-    """Load and unpack tokens from .fc file"""
     with open(fc_file_path, 'rb') as f:
-        # Verify magic number
         magic = f.read(4)
         if magic != b'FC01':
-            raise ValueError("Invalid .fc file format!")
         # Read metadata
-        dtype_code = struct.unpack('<B', f.read(1))[0]
         batch_size = struct.unpack('<I', f.read(4))[0]
-        seq_length = struct.unpack('<I', f.read(4))[0]
-        total_tokens = struct.unpack('<I', f.read(4))[0]
         # Read packed data
         packed_data = np.frombuffer(f.read(), dtype=np.uint8)
-    print(f"\n=== Loading Tokens ===")
-    print(f"Dtype code: {dtype_code}")
-    print(f"Shape: ({batch_size}, {seq_length})")
-    # Unpack based on dtype
-    if dtype_code == 0:  # 1-bit
-        unpacked = np.unpackbits(packed_data)[:total_tokens]
-    elif dtype_code == 1:  # 4-bit
-        high = (packed_data >> 4) & 0x0F
-        low = packed_data & 0x0F
-        unpacked = np.empty(len(packed_data) * 2, dtype=np.uint8)
-        unpacked[::2] = high
-        unpacked[1::2] = low
-        unpacked = unpacked[:total_tokens]
-    elif dtype_code == 2:  # 8-bit
-        unpacked = packed_data[:total_tokens]
-    else:  # 16-bit
-        unpacked = np.frombuffer(packed_data.tobytes(), dtype=np.int16)[:total_tokens]
-    # Reshape to original shape
-    toks = torch.from_numpy(unpacked.astype(np.int64)).reshape(batch_size, seq_length)
-    print(f"Loaded tokens: {toks.shape}")
-    print(f"======================\n")
-    return toks
 def encode_decode_focal(audio_input):
@@ -172,28 +216,28 @@ def encode_decode_focal(audio_input):
     try:
         sr, wav_numpy = audio_input
-        print(f"Input audio: sample_rate={sr}, shape={wav_numpy.shape}, dtype={wav_numpy.dtype}")
         # Handle stereo to mono conversion
         if len(wav_numpy.shape) > 1:
-            if wav_numpy.shape[1] == 2:  # Stereo
                 wav_numpy = wav_numpy.mean(axis=1)
                 print("Converted stereo to mono")
-            elif wav_numpy.shape[0] == 2:  # Channels first
                 wav_numpy = wav_numpy.mean(axis=0)
                 print("Converted stereo to mono (channels first)")
         # Ensure float32 and normalize
         wav_numpy = wav_numpy.astype(np.float32)
         if wav_numpy.max() > 1.0 or wav_numpy.min() < -1.0:
-            wav_numpy = wav_numpy / 32768.0  # Normalize int16 to float
-        # Convert to torch tensor [1, samples]
         sig = torch.from_numpy(wav_numpy).unsqueeze(0)
-        print(f"Tensor shape before resample: {sig.shape}")
-        # Resample to 16kHz (required by FocalCodec)
         if sr != codec.sample_rate_input:
             print(f"Resampling from {sr}Hz to {codec.sample_rate_input}Hz...")
             resampler = torchaudio.transforms.Resample(
@@ -202,50 +246,68 @@ def encode_decode_focal(audio_input):
             )
             sig = resampler(sig)
-        print(f"Tensor shape after resample: {sig.shape}")
-        # Move to GPU if available
         if torch.cuda.is_available():
             sig = sig.cuda()
         # --- Encode and Decode ---
         with torch.no_grad():
-            print("Encoding to tokens...")
             toks = codec.sig_to_toks(sig)
             print(f"Tokens shape: {toks.shape}")
             print(f"Token range: {toks.min().item()} to {toks.max().item()}")
-            print("Decoding tokens to audio...")
             rec_sig = codec.toks_to_sig(toks)
             print(f"Reconstructed signal shape: {rec_sig.shape}")
-        # --- Save the compressed tokens ---
         temp_dir = tempfile.mkdtemp()
         fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
-        file_size, bits_per_token = save_compressed_tokens(toks, fc_file_path, codec)
-        # Calculate stats
-        duration_sec = sig.shape[-1] / codec.sample_rate_input
-        actual_bitrate = (file_size * 8) / duration_sec
-        print(f"Duration: {duration_sec:.2f}s")
-        print(f"File size: {file_size} bytes")
-        print(f"Actual bitrate: {actual_bitrate:.1f} bps")
-        # Move audio back to CPU for Gradio output
         decoded_wav_output = rec_sig.cpu().numpy().squeeze()
-        # Ensure proper shape for Gradio
         if len(decoded_wav_output.shape) == 0:
             decoded_wav_output = decoded_wav_output.reshape(1)
-        status_msg = f"✅ Duration: {duration_sec:.1f}s | File: {file_size} bytes | Bitrate: {actual_bitrate:.0f} bps ({bits_per_token} bits/token)"
         return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
     except Exception as e:
-        error_msg = f"❌ Processing error: {str(e)}"
         print(error_msg)
         import traceback
         traceback.print_exc()
@@ -262,24 +324,35 @@ def decode_from_fc_file(fc_file):
         return None, "❌ Please upload a .fc file"
     try:
-        # Load tokens from file
-        toks = load_compressed_tokens(fc_file.name)
         if torch.cuda.is_available():
             toks = toks.cuda()
         # Decode to audio
         with torch.no_grad():
             rec_sig = codec.toks_to_sig(toks)
         decoded_wav = rec_sig.cpu().numpy().squeeze()
-        # Calculate duration
         duration_sec = decoded_wav.shape[0] / codec.sample_rate_output
         file_size = os.path.getsize(fc_file.name)
-        bitrate = (file_size * 8) / duration_sec
-        status = f"✅ Decoded successfully! Duration: {duration_sec:.1f}s | Bitrate: {bitrate:.0f} bps"
         return (codec.sample_rate_output, decoded_wav), status
@@ -290,13 +363,13 @@ def decode_from_fc_file(fc_file):
 # --- Gradio Interface ---
-with gr.Blocks(title="FocalCodec 160 bps") as iface:
     gr.Markdown("# 🎙️ FocalCodec at 160 bps")
     gr.Markdown(f"**Neural speech codec at insanely low bitrate!** Using `{MODEL_CONFIG}`")
-    gr.Markdown("⚠️ **Optimized for speech only** - not suitable for music")
     with gr.Tab("🎤 Encode Audio"):
-        gr.Markdown("### Compress audio to 160 bps tokens")
         with gr.Row():
             audio_input = gr.Audio(
@@ -308,12 +381,12 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
             with gr.Column():
                 audio_output = gr.Audio(
                     type="numpy",
-                    label="Decoded Output (16kHz)"
                 )
                 file_output = gr.File(
-                    label="Download Compressed .fc File"
                 )
-                status_output = gr.Textbox(label="Status", lines=2)
         encode_btn = gr.Button("🔄 Encode & Decode", variant="primary", size="lg")
         encode_btn.click(
@@ -323,10 +396,12 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
         )
         gr.Markdown("### How it works:")
-        gr.Markdown("- Automatically resamples to 16kHz")
-        gr.Markdown("- Converts stereo to mono")
-        gr.Markdown("- Encodes to discrete tokens (~160 bps)")
-        gr.Markdown("- Decodes tokens back to audio")
     with gr.Tab("📂 Decode from .fc File"):
         gr.Markdown("### Decode previously compressed audio")
@@ -340,9 +415,9 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
             with gr.Column():
                 decoded_output = gr.Audio(
                     type="numpy",
-                    label="Decoded Audio"
                 )
-                decode_status = gr.Textbox(label="Status", lines=2)
         decode_btn = gr.Button("🔊 Decode Audio", variant="primary", size="lg")
         decode_btn.click(
@@ -350,47 +425,98 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
             inputs=[fc_input],
             outputs=[decoded_output, decode_status]
         )
     with gr.Tab("ℹ️ About"):
         gr.Markdown("""
         ## FocalCodec - Ultra Low Bitrate Neural Audio Codec
-        ### Compression Ratios:
-        - **Uncompressed PCM** (16kHz mono): 256 kbps
-        - **MP3** (standard): ~128 kbps
-        - **Opus** (voice): ~16 kbps
-        - **FocalCodec**: **0.16 kbps** (160 bps) 🔥
-        ### That's 1600x compression!
-        For a 1-hour podcast:
-        - Uncompressed: ~115 MB
-        - FocalCodec: **~72 KB**
-        ### Use Cases:
-        - 📞 Ultra-low bandwidth voice calls
-        - 🤖 AI-generated podcasts
-        - 🌍 Low-bandwidth regions
-        - 📻 Emergency communications
-        ### Trade-offs:
-        - ✅ Extremely efficient compression
-        - ✅ Speech remains intelligible
         - ❌ Voice characteristics may change
-        - ❌ Not suitable for music
-        - ❌ Some pronunciation artifacts
-        ### Technical Details:
-        - Model: `lucadellalib/focalcodec_12_5hz`
-        - Sample Rate: 16 kHz
-        - Token Rate: 12.5 Hz
-        - Bits per Token: Auto-detected (1/4/8/16 bit)
-        - Target Bitrate: 160 bps
         ---
-        🔗 [GitHub Repository](https://github.com/lucadellalib/focalcodec)
         """)
 if __name__ == "__main__":
     iface.launch()

         codec = None
+def save_compressed_codes_optimal(toks, codes, fc_file_path, codec):
+    """Save codes with optimal bit packing to achieve true 160 bps"""
+    codes_cpu = codes.cpu().numpy()
+    toks_cpu = toks.cpu().numpy()
+    print(f"\n=== Optimal Compression ===")
+    print(f"Codes shape: {codes.shape}")
+    print(f"Codes dtype: {codes.dtype}")
+    # Determine actual bits needed based on token range
+    max_token = int(toks_cpu.max())
+    if max_token <= 1:
+        bits_needed = 1
+    elif max_token <= 3:
+        bits_needed = 2
+    elif max_token <= 7:
+        bits_needed = 3
+    elif max_token <= 15:
+        bits_needed = 4
+    elif max_token <= 31:
+        bits_needed = 5
+    elif max_token <= 63:
+        bits_needed = 6
+    elif max_token <= 127:
+        bits_needed = 7
+    elif max_token <= 255:
+        bits_needed = 8
+    elif max_token <= 511:
+        bits_needed = 9
+    elif max_token <= 1023:
+        bits_needed = 10
+    elif max_token <= 2047:
+        bits_needed = 11
+    elif max_token <= 4095:
+        bits_needed = 12
+    elif max_token <= 8191:
+        bits_needed = 13
+    elif max_token <= 16383:
+        bits_needed = 14
+    elif max_token <= 32767:
+        bits_needed = 15
     else:
+        bits_needed = 16
+    print(f"Token range: 0 to {max_token}")
+    print(f"Bits needed per token: {bits_needed}")
+    # If codes are already binary (batch, time, bits), use them directly
+    if len(codes.shape) == 3 and codes.dtype in [torch.bool, torch.uint8]:
+        print(f"Using binary codes directly: {codes.shape[2]} bits per token")
+        # Pack the binary codes
+        codes_flat = codes_cpu.flatten()
+        packed_bits = np.packbits(codes_flat)
+        bits_per_token = codes.shape[2]
+        num_tokens = codes.shape[1]
+    else:
+        # Pack tokens manually using exact bit width
+        print(f"Packing tokens with {bits_needed} bits each")
+        toks_flat = toks_cpu.flatten().astype(np.uint32)
+        num_tokens = len(toks_flat)
+        # Convert to binary string and pack
+        total_bits = num_tokens * bits_needed
+        # Create bit array
+        bit_array = []
+        for tok in toks_flat:
+            # Convert to binary with exact bit width
+            bits = format(int(tok), f'0{bits_needed}b')
+            bit_array.extend([int(b) for b in bits])
+        # Pad to byte boundary
+        while len(bit_array) % 8 != 0:
+            bit_array.append(0)
+        # Pack into bytes
+        packed_bits = np.packbits(np.array(bit_array, dtype=np.uint8))
+        bits_per_token = bits_needed
+    # Write to file
     with open(fc_file_path, 'wb') as f:
+        # Magic number
+        f.write(b'FC01')
         # Metadata
+        f.write(struct.pack('<I', toks.shape[0]))  # batch size
+        f.write(struct.pack('<I', num_tokens))      # number of tokens
+        f.write(struct.pack('<B', bits_per_token))  # bits per token
+        # Packed data
+        f.write(packed_bits.tobytes())
     file_size = os.path.getsize(fc_file_path)
+    header_size = 4 + 4 + 4 + 1  # magic + 2 ints + 1 byte
+    data_size = file_size - header_size
+    print(f"File size: {file_size} bytes (header: {header_size}B, data: {data_size}B)")
+    print(f"===========================\n")
+    return file_size, bits_per_token, data_size
+def load_compressed_codes_optimal(fc_file_path):
+    """Load optimally packed codes"""
     with open(fc_file_path, 'rb') as f:
+        # Verify magic
         magic = f.read(4)
         if magic != b'FC01':
+            raise ValueError("Invalid .fc file!")
         # Read metadata
         batch_size = struct.unpack('<I', f.read(4))[0]
+        num_tokens = struct.unpack('<I', f.read(4))[0]
+        bits_per_token = struct.unpack('<B', f.read(1))[0]
         # Read packed data
         packed_data = np.frombuffer(f.read(), dtype=np.uint8)
+    print(f"\n=== Loading Optimal Codes ===")
+    print(f"Batch: {batch_size}, Tokens: {num_tokens}, Bits/token: {bits_per_token}")
+    # Unpack bits
+    unpacked_bits = np.unpackbits(packed_data)
+    # Extract exact number of bits needed
+    total_bits = num_tokens * bits_per_token
+    token_bits = unpacked_bits[:total_bits]
+    # Reconstruct tokens
+    tokens = []
+    for i in range(num_tokens):
+        start = i * bits_per_token
+        end = start + bits_per_token
+        token_bits_slice = token_bits[start:end]
+        # Convert binary to integer
+        token_value = 0
+        for bit in token_bits_slice:
+            token_value = (token_value << 1) | bit
+        tokens.append(token_value)
+    tokens_array = np.array(tokens, dtype=np.int64).reshape(batch_size, -1)
+    tokens_tensor = torch.from_numpy(tokens_array)
+    print(f"Loaded tokens: {tokens_tensor.shape}")
+    print(f"==============================\n")
+    return tokens_tensor
 def encode_decode_focal(audio_input):
     try:
         sr, wav_numpy = audio_input
+        print(f"\n{'='*50}")
+        print(f"Processing new audio...")
+        print(f"Input audio: sample_rate={sr}, shape={wav_numpy.shape}")
         # Handle stereo to mono conversion
         if len(wav_numpy.shape) > 1:
+            if wav_numpy.shape[1] == 2:
                 wav_numpy = wav_numpy.mean(axis=1)
                 print("Converted stereo to mono")
+            elif wav_numpy.shape[0] == 2:
                 wav_numpy = wav_numpy.mean(axis=0)
                 print("Converted stereo to mono (channels first)")
         # Ensure float32 and normalize
         wav_numpy = wav_numpy.astype(np.float32)
         if wav_numpy.max() > 1.0 or wav_numpy.min() < -1.0:
+            wav_numpy = wav_numpy / 32768.0
+        # Convert to torch tensor
         sig = torch.from_numpy(wav_numpy).unsqueeze(0)
+        # Resample to 16kHz
         if sr != codec.sample_rate_input:
             print(f"Resampling from {sr}Hz to {codec.sample_rate_input}Hz...")
             resampler = torchaudio.transforms.Resample(
             )
             sig = resampler(sig)
+        print(f"Signal shape: {sig.shape}")
         if torch.cuda.is_available():
             sig = sig.cuda()
         # --- Encode and Decode ---
         with torch.no_grad():
+            print("\n--- Encoding ---")
             toks = codec.sig_to_toks(sig)
+            duration_sec = sig.shape[-1] / codec.sample_rate_input
+            token_rate = toks.shape[1] / duration_sec
             print(f"Tokens shape: {toks.shape}")
             print(f"Token range: {toks.min().item()} to {toks.max().item()}")
+            print(f"Duration: {duration_sec:.2f}s")
+            print(f"Token rate: {token_rate:.2f} tokens/sec")
+            # Get binary codes
+            codes = codec.toks_to_codes(toks)
+            print(f"Codes shape: {codes.shape}")
+            print(f"Codes dtype: {codes.dtype}")
+            if len(codes.shape) == 3:
+                print(f"Bits per token (from codes): {codes.shape[2]}")
+            print("\n--- Decoding ---")
             rec_sig = codec.toks_to_sig(toks)
             print(f"Reconstructed signal shape: {rec_sig.shape}")
+        # --- Save with optimal bit packing ---
         temp_dir = tempfile.mkdtemp()
         fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
+        file_size, bits_per_token, data_size = save_compressed_codes_optimal(
+            toks, codes, fc_file_path, codec
+        )
+        # Calculate bitrates
+        total_bitrate = (file_size * 8) / duration_sec
+        data_bitrate = (data_size * 8) / duration_sec
+        theoretical_bitrate = token_rate * bits_per_token
+        print(f"--- Results ---")
+        print(f"Total bitrate: {total_bitrate:.1f} bps (with header)")
+        print(f"Data bitrate: {data_bitrate:.1f} bps (data only)")
+        print(f"Theoretical: {theoretical_bitrate:.1f} bps")
+        print(f"Target: 160 bps")
+        print(f"Efficiency: {(160/data_bitrate)*100:.1f}% of target")
+        print(f"{'='*50}\n")
+        # Prepare output
         decoded_wav_output = rec_sig.cpu().numpy().squeeze()
         if len(decoded_wav_output.shape) == 0:
             decoded_wav_output = decoded_wav_output.reshape(1)
+        status_msg = f"✅ {duration_sec:.1f}s | {file_size}B | {data_bitrate:.0f} bps | {bits_per_token} bits/tok | target: 160 bps"
         return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
     except Exception as e:
+        error_msg = f"❌ Error: {str(e)}"
         print(error_msg)
         import traceback
         traceback.print_exc()
         return None, "❌ Please upload a .fc file"
     try:
+        print(f"\n{'='*50}")
+        print(f"Decoding from file: {fc_file.name}")
+        # Load tokens
+        toks = load_compressed_codes_optimal(fc_file.name)
         if torch.cuda.is_available():
             toks = toks.cuda()
         # Decode to audio
         with torch.no_grad():
+            print("Decoding tokens to audio...")
             rec_sig = codec.toks_to_sig(toks)
+            print(f"Reconstructed signal shape: {rec_sig.shape}")
         decoded_wav = rec_sig.cpu().numpy().squeeze()
+        # Calculate stats
         duration_sec = decoded_wav.shape[0] / codec.sample_rate_output
         file_size = os.path.getsize(fc_file.name)
+        header_size = 4 + 4 + 4 + 1
+        data_size = file_size - header_size
+        bitrate = (data_size * 8) / duration_sec
+        print(f"Duration: {duration_sec:.2f}s")
+        print(f"Bitrate: {bitrate:.1f} bps")
+        print(f"{'='*50}\n")
+        status = f"✅ Decoded! {duration_sec:.1f}s | {bitrate:.0f} bps"
         return (codec.sample_rate_output, decoded_wav), status
 # --- Gradio Interface ---
+with gr.Blocks(title="FocalCodec 160 bps", theme=gr.themes.Soft()) as iface:
     gr.Markdown("# 🎙️ FocalCodec at 160 bps")
     gr.Markdown(f"**Neural speech codec at insanely low bitrate!** Using `{MODEL_CONFIG}`")
+    gr.Markdown("⚠️ **Optimized for speech only** - not suitable for music | 🔥 **1600x compression ratio!**")
     with gr.Tab("🎤 Encode Audio"):
+        gr.Markdown("### Compress audio to ~160 bps with optimal bit packing")
         with gr.Row():
             audio_input = gr.Audio(
             with gr.Column():
                 audio_output = gr.Audio(
                     type="numpy",
+                    label="🔊 Decoded Output (16kHz)"
                 )
                 file_output = gr.File(
+                    label="💾 Download Compressed .fc File"
                 )
+                status_output = gr.Textbox(label="📊 Status", lines=2)
         encode_btn = gr.Button("🔄 Encode & Decode", variant="primary", size="lg")
         encode_btn.click(
         )
         gr.Markdown("### How it works:")
+        gr.Markdown("- ✅ Automatically resamples to 16kHz")
+        gr.Markdown("- ✅ Converts stereo to mono")
+        gr.Markdown("- ✅ Encodes to discrete tokens (~12.5 tokens/sec)")
+        gr.Markdown("- ✅ Packs tokens using only needed bits (no waste!)")
+        gr.Markdown("- ✅ Decodes tokens back to audio")
+        gr.Markdown("- 📈 Check console for detailed bitrate analysis!")
     with gr.Tab("📂 Decode from .fc File"):
         gr.Markdown("### Decode previously compressed audio")
             with gr.Column():
                 decoded_output = gr.Audio(
                     type="numpy",
+                    label="🔊 Decoded Audio"
                 )
+                decode_status = gr.Textbox(label="📊 Status", lines=2)
         decode_btn = gr.Button("🔊 Decode Audio", variant="primary", size="lg")
         decode_btn.click(
             inputs=[fc_input],
             outputs=[decoded_output, decode_status]
         )
+        gr.Markdown("### Note:")
+        gr.Markdown("Upload a .fc file created by this tool to decode it back to audio.")
     with gr.Tab("ℹ️ About"):
         gr.Markdown("""
         ## FocalCodec - Ultra Low Bitrate Neural Audio Codec
+        ### 🎯 Compression Ratios:
+        | Format | Bitrate | 1-Hour File Size | Compression |
+        |--------|---------|------------------|-------------|
+        | **Uncompressed PCM** (16kHz mono) | 256 kbps | ~115 MB | 1x |
+        | **MP3** (standard) | 128 kbps | ~57 MB | 2x |
+        | **Opus** (voice optimized) | 16 kbps | ~7.2 MB | 16x |
+        | **FocalCodec** | **0.16 kbps** | **~72 KB** | **1600x** 🔥 |
+        ### 💡 Use Cases:
+        - 📞 **Ultra-low bandwidth voice calls** (satellite, deep space)
+        - 🤖 **AI-generated podcasts** (NotebookLM-style apps)
+        - 🌍 **Low-bandwidth regions** (2G networks)
+        - 📻 **Emergency communications** (disaster relief)
+        - 🎓 **Educational content distribution** (offline learning)
+        - 💾 **Voice memo storage** (years of recordings in MB)
+        ### ⚖️ Trade-offs:
+        **Pros:**
+        - ✅ Insanely efficient compression (1600x!)
+        - ✅ Speech remains highly intelligible
+        - ✅ Works on any sample rate (auto-resamples)
+        - ✅ Tiny storage/bandwidth requirements
+        **Cons:**
         - ❌ Voice characteristics may change
+        - ❌ Emotional nuances can be lost
+        - ❌ Occasional pronunciation artifacts
+        - ❌ Not suitable for music or non-speech audio
+        ### 🔧 Technical Details:
+        - **Model:** `lucadellalib/focalcodec_12_5hz`
+        - **Sample Rate:** 16 kHz
+        - **Token Rate:** ~12.5 tokens/second
+        - **Bits per Token:** 13 bits (auto-detected, optimally packed)
+        - **Target Bitrate:** 160 bps (12.5 × 13 = 162.5 bps)
+        - **File Format:** Custom binary format with metadata header
+        ### 🧮 How We Achieve 160 bps:
+        Traditional approach would waste bits:
+        ```
+        Token (0-8191) → int16 (16 bits) → 16 × 12.5 = 200 bps ❌
+        Wasting 3 bits per token!
+        ```
+        Our optimal approach:
+        ```
+        Token (0-8191) → 13 bits exactly → 13 × 12.5 = 162.5 bps ✅
+        Zero waste!
+        ```
+        ### 🔬 Debug Information:
+        Check the **console/terminal** for detailed encoding information:
+        - Actual token rate and range
+        - Bits per token (detected automatically)
+        - Expected vs actual bitrate
+        - File size breakdown (header vs data)
+        - Compression efficiency
+        ### 📚 Example Use Case - AI Podcast Library:
+        Imagine storing **1000 hours** of AI-generated podcasts:
+        - **Uncompressed:** 115 GB
+        - **MP3:** 57 GB
+        - **Opus:** 7.2 GB
+        - **FocalCodec:** **72 MB** 🤯
+        You could fit an entire podcast library on a USB flash drive!
         ---
+        ### 🔗 Links:
+        - [FocalCodec GitHub](https://github.com/lucadellalib/focalcodec)
+        - [Research Paper](https://arxiv.org/abs/2410.03608)
+        ### 🏗️ Built with:
+        - PyTorch + TorchAudio
+        - Gradio
+        - FocalCodec (Luca Della Libera et al.)
         """)
 if __name__ == "__main__":
+    print("\n" + "="*50)
+    print("🎙️  FocalCodec 160 bps Demo")
+    print("="*50 + "\n")
     iface.launch()