Spaces:

MihaiPopa-1
/

FocalCodec-Demo

Sleeping

App Files Files Community

MihaiPopa-1 committed on Dec 1, 2025

Commit

ed3b7f8

verified ·

1 Parent(s): ac56286

Update app.py

Browse files

Files changed (1) hide show

app.py +134 -94

app.py CHANGED Viewed

@@ -53,11 +53,12 @@ def save_tokens_raw(toks, fc_file_path):
     toks_cpu = toks.cpu().numpy().flatten()
     max_token = int(toks_cpu.max())
     print(f"\n=== Saving Raw Tokens ===")
-    print(f"Token shape: {toks.shape}")
-    print(f"Token range: 0 to {max_token}")
-    print(f"Num tokens: {len(toks_cpu)}")
     # Determine bits needed
     if max_token <= 1:
@@ -101,9 +102,15 @@ def save_tokens_raw(toks, fc_file_path):
         bits = format(int(tok), f'0{bits_needed}b')
         bit_array.extend([int(b) for b in bits])
     # Pad to byte boundary
     while len(bit_array) % 8 != 0:
         bit_array.append(0)
     # Pack into bytes
     packed_bits = np.packbits(np.array(bit_array, dtype=np.uint8))
@@ -114,7 +121,7 @@ def save_tokens_raw(toks, fc_file_path):
     file_size = os.path.getsize(fc_file_path)
-    print(f"File size: {file_size} bytes (pure data, no header)")
     print(f"========================\n")
     return file_size, bits_needed, len(toks_cpu), toks.shape
@@ -124,40 +131,59 @@ def load_tokens_raw(fc_file_path, bits_per_token, num_tokens, original_shape):
     """Load raw tokens from headerless binary file"""
     print(f"\n=== Loading Raw Tokens ===")
-    print(f"Expected bits/token: {bits_per_token}")
-    print(f"Expected num tokens: {num_tokens}")
-    print(f"Expected shape: {original_shape}")
     # Read all bytes
     with open(fc_file_path, 'rb') as f:
         packed_data = np.frombuffer(f.read(), dtype=np.uint8)
     # Unpack bits
     unpacked_bits = np.unpackbits(packed_data)
     # Extract exact number of bits needed
-    total_bits = num_tokens * bits_per_token
-    token_bits = unpacked_bits[:total_bits]
     # Reconstruct tokens
     tokens = []
     for i in range(num_tokens):
-        start = i * bits_per_token
-        end = start + bits_per_token
-        token_bits_slice = token_bits[start:end]
-        # Convert binary to integer
         token_value = 0
         for bit in token_bits_slice:
-            token_value = (token_value << 1) | bit
         tokens.append(token_value)
     # Reshape to original shape
-    tokens_array = np.array(tokens, dtype=np.int64).reshape(original_shape)
     tokens_tensor = torch.from_numpy(tokens_array)
-    print(f"Loaded tokens: {tokens_tensor.shape}")
-    print(f"Token range: {tokens_tensor.min().item()} to {tokens_tensor.max().item()}")
     print(f"==========================\n")
     return tokens_tensor
@@ -168,7 +194,8 @@ last_encoding_metadata = {
     'bits_per_token': None,
     'num_tokens': None,
     'shape': None,
-    'duration': None
 }
@@ -236,7 +263,7 @@ def encode_decode_focal(audio_input):
             print(f"Duration: {duration_sec:.2f}s")
             print(f"Token rate: {token_rate:.2f} tokens/sec")
-            print("\n--- Decoding ---")
             rec_sig = codec.toks_to_sig(toks)
             print(f"Reconstructed signal shape: {rec_sig.shape}")
@@ -250,8 +277,9 @@ def encode_decode_focal(audio_input):
         last_encoding_metadata = {
             'bits_per_token': bits_per_token,
             'num_tokens': num_tokens,
-            'shape': shape,
-            'duration': duration_sec
         }
         # Calculate bitrates
@@ -259,10 +287,20 @@ def encode_decode_focal(audio_input):
         theoretical_bitrate = token_rate * bits_per_token
         print(f"--- Results ---")
-        print(f"File bitrate: {bitrate:.1f} bps (pure data)")
         print(f"Theoretical: {theoretical_bitrate:.1f} bps")
         print(f"Target: 160 bps")
-        print(f"Efficiency: {(160/bitrate)*100:.1f}% of target")
         print(f"{'='*50}\n")
         # Prepare output
@@ -271,8 +309,8 @@ def encode_decode_focal(audio_input):
         if len(decoded_wav_output.shape) == 0:
             decoded_wav_output = decoded_wav_output.reshape(1)
-        metadata_info = f"\n\nℹ️ SAVE THIS: bits={bits_per_token}, tokens={num_tokens}, shape={shape}"
-        status_msg = f"✅ {duration_sec:.1f}s | {file_size}B | {bitrate:.0f} bps | {bits_per_token} bits/tok{metadata_info}"
         return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
@@ -293,25 +331,31 @@ def decode_from_fc_file(fc_file, bits_per_token_input, num_tokens_input, batch_s
     if fc_file is None:
         return None, "❌ Please upload a .fc file"
-    # Try to use provided metadata, or fall back to last encoding
     try:
-        bits_per_token = int(bits_per_token_input) if bits_per_token_input else last_encoding_metadata.get('bits_per_token')
-        num_tokens = int(num_tokens_input) if num_tokens_input else last_encoding_metadata.get('num_tokens')
-        if batch_size_input and seq_length_input:
             shape = (int(batch_size_input), int(seq_length_input))
         else:
-            shape = last_encoding_metadata.get('shape')
-        if not all([bits_per_token, num_tokens, shape]):
-            return None, "❌ Please provide metadata (bits/token, num tokens, batch, seq_length) OR encode a file first"
-    except Exception as e:
-        return None, f"❌ Invalid metadata format: {str(e)}"
-    try:
         print(f"\n{'='*50}")
-        print(f"Decoding from file: {fc_file.name}")
         # Load tokens
         toks = load_tokens_raw(fc_file.name, bits_per_token, num_tokens, shape)
@@ -336,14 +380,14 @@ def decode_from_fc_file(fc_file, bits_per_token_input, num_tokens_input, batch_s
         print(f"Bitrate: {bitrate:.1f} bps")
         print(f"{'='*50}\n")
-        status = f"✅ Decoded! {duration_sec:.1f}s | {bitrate:.0f} bps | {bits_per_token} bits/token"
         return (codec.sample_rate_output, decoded_wav), status
     except Exception as e:
         import traceback
         traceback.print_exc()
-        return None, f"❌ Error: {str(e)}"
 # --- Gradio Interface ---
@@ -370,7 +414,7 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
                 file_output = gr.File(
                     label="💾 Download Compressed .fc File (headerless)"
                 )
-                status_output = gr.Textbox(label="📊 Status", lines=4)
         encode_btn = gr.Button("🔄 Encode & Decode", variant="primary", size="lg")
         encode_btn.click(
@@ -380,9 +424,9 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
         )
         gr.Markdown("### ⚠️ Important:")
-        gr.Markdown("- The .fc file contains ONLY raw token data (no metadata/header)")
-        gr.Markdown("- **Save the metadata** from the status message to decode later!")
-        gr.Markdown("- You need: bits per token, number of tokens, and shape")
     with gr.Tab("📂 Decode from .fc File"):
         gr.Markdown("### Decode raw .fc file (requires metadata)")
@@ -394,42 +438,42 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
                     file_types=[".fc"]
                 )
-                gr.Markdown("#### Metadata (required for decoding):")
                 with gr.Row():
                     bits_input = gr.Number(
                         label="Bits per token",
-                        value=13,
-                        precision=0,
-                        info="Usually 13 for this model"
                     )
                     tokens_input = gr.Number(
                         label="Number of tokens",
-                        precision=0,
-                        info="Total tokens in file"
                     )
                 with gr.Row():
                     batch_input = gr.Number(
                         label="Batch size",
-                        value=1,
-                        precision=0,
-                        info="Usually 1"
                     )
                     seq_input = gr.Number(
                         label="Sequence length",
-                        precision=0,
-                        info="Tokens per batch"
                     )
-                gr.Markdown("💡 If you just encoded a file, leave these blank to use saved metadata")
             with gr.Column():
                 decoded_output = gr.Audio(
                     type="numpy",
                     label="🔊 Decoded Audio"
                 )
-                decode_status = gr.Textbox(label="📊 Status", lines=2)
         decode_btn = gr.Button("🔊 Decode Audio", variant="primary", size="lg")
         decode_btn.click(
@@ -444,46 +488,42 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
         ### 🎯 Pure Token Format (No Headers!)
-        This version saves **ONLY the compressed tokens** with no metadata overhead.
-        **Benefits:**
-        - ✅ Absolute minimum file size
-        - ✅ True 160 bps (no header padding)
-        - ✅ Maximum compression efficiency
-        **Trade-off:**
-        - ⚠️ You must save the metadata separately to decode
-        - Required info: bits per token, number of tokens, shape
-        ### 📊 Compression Ratios:
-        | Format | Bitrate | 1-Hour File Size |
-        |--------|---------|------------------|
-        | Uncompressed PCM | 256 kbps | ~115 MB |
-        | MP3 | 128 kbps | ~57 MB |
-        | Opus | 16 kbps | ~7.2 MB |
-        | **FocalCodec** | **0.16 kbps** | **~72 KB** 🔥 |
-        ### 🔧 Technical Details:
-        - **Token Rate:** ~12.5 tokens/sec
-        - **Bits per Token:** 13 bits (for most speech)
-        - **Bitrate:** 12.5 × 13 = 162.5 bps ≈ **160 bps**
-        - **Format:** Raw bit-packed tokens (no header)
-        ### 📝 Example Metadata:
-        After encoding, you'll see:
-        ```
-        ℹ️ SAVE THIS: bits=13, tokens=113, shape=(1, 113)
-        ```
-        Save this to decode the file later!
-        ### 💡 Pro Tip:
-        If you're building a system, embed the metadata in a separate JSON file:
         ```json
         {
-          "audio.fc": {
-            "bits_per_token": 13,
-            "num_tokens": 113,
             "shape": [1, 113],
             "duration": 9.04
           }

     toks_cpu = toks.cpu().numpy().flatten()
     max_token = int(toks_cpu.max())
+    min_token = int(toks_cpu.min())
     print(f"\n=== Saving Raw Tokens ===")
+    print(f"Original shape: {toks.shape}")
+    print(f"Flattened tokens: {len(toks_cpu)}")
+    print(f"Token range: {min_token} to {max_token}")
     # Determine bits needed
     if max_token <= 1:
         bits = format(int(tok), f'0{bits_needed}b')
         bit_array.extend([int(b) for b in bits])
+    print(f"Total bits: {len(bit_array)}")
     # Pad to byte boundary
+    padding = 0
     while len(bit_array) % 8 != 0:
         bit_array.append(0)
+        padding += 1
+    print(f"Padding bits: {padding}")
     # Pack into bytes
     packed_bits = np.packbits(np.array(bit_array, dtype=np.uint8))
     file_size = os.path.getsize(fc_file_path)
+    print(f"File size: {file_size} bytes")
     print(f"========================\n")
     return file_size, bits_needed, len(toks_cpu), toks.shape
     """Load raw tokens from headerless binary file"""
     print(f"\n=== Loading Raw Tokens ===")
+    print(f"File: {fc_file_path}")
+    print(f"Bits per token: {bits_per_token}")
+    print(f"Num tokens: {num_tokens}")
+    print(f"Target shape: {original_shape}")
     # Read all bytes
     with open(fc_file_path, 'rb') as f:
         packed_data = np.frombuffer(f.read(), dtype=np.uint8)
+    print(f"Read {len(packed_data)} bytes")
     # Unpack bits
     unpacked_bits = np.unpackbits(packed_data)
+    print(f"Unpacked to {len(unpacked_bits)} bits")
     # Extract exact number of bits needed
+    total_bits_needed = num_tokens * bits_per_token
+    print(f"Need {total_bits_needed} bits for {num_tokens} tokens")
+    if len(unpacked_bits) < total_bits_needed:
+        raise ValueError(f"Not enough bits in file! Have {len(unpacked_bits)}, need {total_bits_needed}")
+    token_bits = unpacked_bits[:total_bits_needed]
     # Reconstruct tokens
     tokens = []
     for i in range(num_tokens):
+        start_bit = i * bits_per_token
+        end_bit = start_bit + bits_per_token
+        token_bits_slice = token_bits[start_bit:end_bit]
+        # Convert binary array to integer
         token_value = 0
         for bit in token_bits_slice:
+            token_value = (token_value << 1) | int(bit)
         tokens.append(token_value)
+    print(f"Reconstructed {len(tokens)} tokens")
+    print(f"Token range: {min(tokens)} to {max(tokens)}")
     # Reshape to original shape
+    tokens_array = np.array(tokens, dtype=np.int64)
+    # Validate shape
+    if tokens_array.size != np.prod(original_shape):
+        raise ValueError(f"Shape mismatch! Have {tokens_array.size} tokens, need {np.prod(original_shape)}")
+    tokens_array = tokens_array.reshape(original_shape)
     tokens_tensor = torch.from_numpy(tokens_array)
+    print(f"Final tensor shape: {tokens_tensor.shape}")
+    print(f"Final token range: {tokens_tensor.min().item()} to {tokens_tensor.max().item()}")
     print(f"==========================\n")
     return tokens_tensor
     'bits_per_token': None,
     'num_tokens': None,
     'shape': None,
+    'duration': None,
+    'filename': None
 }
             print(f"Duration: {duration_sec:.2f}s")
             print(f"Token rate: {token_rate:.2f} tokens/sec")
+            print("\n--- Decoding (test) ---")
             rec_sig = codec.toks_to_sig(toks)
             print(f"Reconstructed signal shape: {rec_sig.shape}")
         last_encoding_metadata = {
             'bits_per_token': bits_per_token,
             'num_tokens': num_tokens,
+            'shape': tuple(shape),
+            'duration': duration_sec,
+            'filename': fc_file_path
         }
         # Calculate bitrates
         theoretical_bitrate = token_rate * bits_per_token
         print(f"--- Results ---")
+        print(f"File bitrate: {bitrate:.1f} bps")
         print(f"Theoretical: {theoretical_bitrate:.1f} bps")
         print(f"Target: 160 bps")
+        print(f"Efficiency: {(bitrate/160)*100:.1f}% of target")
+        # TEST: Try to decode immediately to verify
+        print(f"\n--- Verification: Decoding saved file ---")
+        try:
+            test_toks = load_tokens_raw(fc_file_path, bits_per_token, num_tokens, shape)
+            print(f"✅ Verification successful!")
+            print(f"Tokens match: {torch.equal(toks.cpu(), test_toks)}")
+        except Exception as e:
+            print(f"❌ Verification failed: {e}")
         print(f"{'='*50}\n")
         # Prepare output
         if len(decoded_wav_output.shape) == 0:
             decoded_wav_output = decoded_wav_output.reshape(1)
+        metadata_str = f"bits={bits_per_token}, tokens={num_tokens}, shape={shape}"
+        status_msg = f"✅ {duration_sec:.1f}s | {file_size}B | {bitrate:.0f} bps | {bits_per_token} bits/tok\n\n📋 METADATA: {metadata_str}"
         return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
     if fc_file is None:
         return None, "❌ Please upload a .fc file"
     try:
+        # Parse metadata
+        if bits_per_token_input and num_tokens_input and batch_size_input and seq_length_input:
+            # Use provided values
+            bits_per_token = int(bits_per_token_input)
+            num_tokens = int(num_tokens_input)
             shape = (int(batch_size_input), int(seq_length_input))
+            print("Using manually provided metadata")
         else:
+            # Use saved metadata
+            if not last_encoding_metadata.get('bits_per_token'):
+                return None, "❌ No metadata available! Either encode a file first OR provide all metadata fields"
+            bits_per_token = last_encoding_metadata['bits_per_token']
+            num_tokens = last_encoding_metadata['num_tokens']
+            shape = last_encoding_metadata['shape']
+            print("Using saved metadata from last encoding")
         print(f"\n{'='*50}")
+        print(f"Decoding file: {fc_file.name}")
+        print(f"Metadata: bits={bits_per_token}, tokens={num_tokens}, shape={shape}")
+        # Validate
+        if num_tokens != shape[0] * shape[1]:
+            return None, f"❌ Shape mismatch! {num_tokens} tokens != {shape[0]}×{shape[1]} = {shape[0]*shape[1]}"
         # Load tokens
         toks = load_tokens_raw(fc_file.name, bits_per_token, num_tokens, shape)
         print(f"Bitrate: {bitrate:.1f} bps")
         print(f"{'='*50}\n")
+        status = f"✅ Decoded successfully!\n{duration_sec:.1f}s | {file_size}B | {bitrate:.0f} bps | {bits_per_token} bits/tok"
         return (codec.sample_rate_output, decoded_wav), status
     except Exception as e:
         import traceback
         traceback.print_exc()
+        return None, f"❌ Decoding error: {str(e)}"
 # --- Gradio Interface ---
                 file_output = gr.File(
                     label="💾 Download Compressed .fc File (headerless)"
                 )
+                status_output = gr.Textbox(label="📊 Status", lines=5)
         encode_btn = gr.Button("🔄 Encode & Decode", variant="primary", size="lg")
         encode_btn.click(
         )
         gr.Markdown("### ⚠️ Important:")
+        gr.Markdown("- The .fc file contains ONLY raw token data (no metadata)")
+        gr.Markdown("- **Copy the METADATA from the status box** to decode later!")
+        gr.Markdown("- Format: `bits=13, tokens=113, shape=(1, 113)`")
     with gr.Tab("📂 Decode from .fc File"):
         gr.Markdown("### Decode raw .fc file (requires metadata)")
                     file_types=[".fc"]
                 )
+                gr.Markdown("#### 📋 Metadata (from encoding step):")
+                gr.Markdown("Leave blank to use last encoded file's metadata")
                 with gr.Row():
                     bits_input = gr.Number(
                         label="Bits per token",
+                        placeholder="e.g., 13",
+                        precision=0
                     )
                     tokens_input = gr.Number(
                         label="Number of tokens",
+                        placeholder="e.g., 113",
+                        precision=0
                     )
                 with gr.Row():
                     batch_input = gr.Number(
                         label="Batch size",
+                        placeholder="e.g., 1",
+                        precision=0
                     )
                     seq_input = gr.Number(
                         label="Sequence length",
+                        placeholder="e.g., 113",
+                        precision=0
                     )
+                gr.Markdown("💡 **Example:** If metadata says `bits=13, tokens=113, shape=(1, 113)`")
+                gr.Markdown("Enter: bits=13, tokens=113, batch=1, seq=113")
             with gr.Column():
                 decoded_output = gr.Audio(
                     type="numpy",
                     label="🔊 Decoded Audio"
                 )
+                decode_status = gr.Textbox(label="📊 Status", lines=3)
         decode_btn = gr.Button("🔊 Decode Audio", variant="primary", size="lg")
         decode_btn.click(
         ### 🎯 Pure Token Format (No Headers!)
+        This version saves **ONLY the compressed tokens** with zero overhead.
+        ### 📊 Compression:
+        - **Uncompressed:** 256 kbps → 115 MB/hour
+        - **FocalCodec:** 160 bps → **72 KB/hour** (1600x smaller!)
+        ### 🔧 How to Use:
+        **Encoding:**
+        1. Upload/record audio
+        2. Click "Encode & Decode"
+        3. **COPY THE METADATA** from status (important!)
+        4. Download .fc file
+        **Decoding:**
+        1. Upload .fc file
+        2. Enter metadata OR leave blank if you just encoded
+        3. Click "Decode Audio"
+        ### 📝 Metadata Format:
+        ```
+        bits=13, tokens=113, shape=(1, 113)
+        ```
+        Means:
+        - 13 bits per token
+        - 113 total tokens
+        - Batch size = 1
+        - Sequence length = 113
+        ### 💡 Storage Tip:
+        Store metadata in a companion JSON file:
         ```json
         {
+          "recording_001.fc": {
+            "bits": 13,
+            "tokens": 113,
             "shape": [1, 113],
             "duration": 9.04
           }