MihaiPopa-1 committed on
Commit
625e20b
·
verified ·
1 Parent(s): 5146984

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +174 -194
app.py CHANGED
@@ -4,7 +4,6 @@ import gradio as gr
4
  import os
5
  import tempfile
6
  import numpy as np
7
- import struct
8
 
9
  # Define the model ID for the 0.16 kbps codec config
10
  MODEL_CONFIG = "lucadellalib/focalcodec_12_5hz"
@@ -49,18 +48,18 @@ except Exception as e:
49
  codec = None
50
 
51
 
52
- def save_compressed_codes_optimal(toks, codes, fc_file_path, codec):
53
- """Save codes with optimal bit packing to achieve true 160 bps"""
54
 
55
- codes_cpu = codes.cpu().numpy()
56
- toks_cpu = toks.cpu().numpy()
57
 
58
- print(f"\n=== Optimal Compression ===")
59
- print(f"Codes shape: {codes.shape}")
60
- print(f"Codes dtype: {codes.dtype}")
 
61
 
62
- # Determine actual bits needed based on token range
63
- max_token = int(toks_cpu.max())
64
  if max_token <= 1:
65
  bits_needed = 1
66
  elif max_token <= 3:
@@ -94,85 +93,45 @@ def save_compressed_codes_optimal(toks, codes, fc_file_path, codec):
94
  else:
95
  bits_needed = 16
96
 
97
- print(f"Token range: 0 to {max_token}")
98
- print(f"Bits needed per token: {bits_needed}")
99
 
100
- # If codes are already binary (batch, time, bits), use them directly
101
- if len(codes.shape) == 3 and codes.dtype in [torch.bool, torch.uint8]:
102
- print(f"Using binary codes directly: {codes.shape[2]} bits per token")
103
- # Pack the binary codes
104
- codes_flat = codes_cpu.flatten()
105
- packed_bits = np.packbits(codes_flat)
106
- bits_per_token = codes.shape[2]
107
- num_tokens = codes.shape[1]
108
-
109
- else:
110
- # Pack tokens manually using exact bit width
111
- print(f"Packing tokens with {bits_needed} bits each")
112
- toks_flat = toks_cpu.flatten().astype(np.uint32)
113
- num_tokens = len(toks_flat)
114
-
115
- # Convert to binary string and pack
116
- total_bits = num_tokens * bits_needed
117
-
118
- # Create bit array
119
- bit_array = []
120
- for tok in toks_flat:
121
- # Convert to binary with exact bit width
122
- bits = format(int(tok), f'0{bits_needed}b')
123
- bit_array.extend([int(b) for b in bits])
124
-
125
- # Pad to byte boundary
126
- while len(bit_array) % 8 != 0:
127
- bit_array.append(0)
128
-
129
- # Pack into bytes
130
- packed_bits = np.packbits(np.array(bit_array, dtype=np.uint8))
131
- bits_per_token = bits_needed
132
-
133
- # Write to file
134
  with open(fc_file_path, 'wb') as f:
135
- # Magic number
136
- f.write(b'FC01')
137
-
138
- # Metadata
139
- f.write(struct.pack('<I', toks.shape[0])) # batch size
140
- f.write(struct.pack('<I', num_tokens)) # number of tokens
141
- f.write(struct.pack('<B', bits_per_token)) # bits per token
142
-
143
- # Packed data
144
  f.write(packed_bits.tobytes())
145
 
146
  file_size = os.path.getsize(fc_file_path)
147
- header_size = 4 + 4 + 4 + 1 # magic + 2 ints + 1 byte
148
- data_size = file_size - header_size
149
 
150
- print(f"File size: {file_size} bytes (header: {header_size}B, data: {data_size}B)")
151
- print(f"===========================\n")
152
 
153
- return file_size, bits_per_token, data_size
154
 
155
 
156
- def load_compressed_codes_optimal(fc_file_path):
157
- """Load optimally packed codes"""
158
 
 
 
 
 
 
 
159
  with open(fc_file_path, 'rb') as f:
160
- # Verify magic
161
- magic = f.read(4)
162
- if magic != b'FC01':
163
- raise ValueError("Invalid .fc file!")
164
-
165
- # Read metadata
166
- batch_size = struct.unpack('<I', f.read(4))[0]
167
- num_tokens = struct.unpack('<I', f.read(4))[0]
168
- bits_per_token = struct.unpack('<B', f.read(1))[0]
169
-
170
- # Read packed data
171
  packed_data = np.frombuffer(f.read(), dtype=np.uint8)
172
 
173
- print(f"\n=== Loading Optimal Codes ===")
174
- print(f"Batch: {batch_size}, Tokens: {num_tokens}, Bits/token: {bits_per_token}")
175
-
176
  # Unpack bits
177
  unpacked_bits = np.unpackbits(packed_data)
178
 
@@ -193,20 +152,33 @@ def load_compressed_codes_optimal(fc_file_path):
193
  token_value = (token_value << 1) | bit
194
  tokens.append(token_value)
195
 
196
- tokens_array = np.array(tokens, dtype=np.int64).reshape(batch_size, -1)
 
197
  tokens_tensor = torch.from_numpy(tokens_array)
198
 
199
  print(f"Loaded tokens: {tokens_tensor.shape}")
200
- print(f"==============================\n")
 
201
 
202
  return tokens_tensor
203
 
204
 
 
 
 
 
 
 
 
 
 
205
  def encode_decode_focal(audio_input):
206
  """
207
  Processes input audio through the 160 bps FocalCodec, saves the tokens,
208
  and returns both the decoded WAV and the path to the FC file for download.
209
  """
 
 
210
  if codec is None:
211
  return None, None, "❌ ERROR: Model failed to load. Check console for details."
212
 
@@ -264,36 +236,33 @@ def encode_decode_focal(audio_input):
264
  print(f"Duration: {duration_sec:.2f}s")
265
  print(f"Token rate: {token_rate:.2f} tokens/sec")
266
 
267
- # Get binary codes
268
- codes = codec.toks_to_codes(toks)
269
- print(f"Codes shape: {codes.shape}")
270
- print(f"Codes dtype: {codes.dtype}")
271
- if len(codes.shape) == 3:
272
- print(f"Bits per token (from codes): {codes.shape[2]}")
273
-
274
  print("\n--- Decoding ---")
275
  rec_sig = codec.toks_to_sig(toks)
276
  print(f"Reconstructed signal shape: {rec_sig.shape}")
277
 
278
- # --- Save with optimal bit packing ---
279
  temp_dir = tempfile.mkdtemp()
280
  fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
281
 
282
- file_size, bits_per_token, data_size = save_compressed_codes_optimal(
283
- toks, codes, fc_file_path, codec
284
- )
 
 
 
 
 
 
285
 
286
  # Calculate bitrates
287
- total_bitrate = (file_size * 8) / duration_sec
288
- data_bitrate = (data_size * 8) / duration_sec
289
  theoretical_bitrate = token_rate * bits_per_token
290
 
291
  print(f"--- Results ---")
292
- print(f"Total bitrate: {total_bitrate:.1f} bps (with header)")
293
- print(f"Data bitrate: {data_bitrate:.1f} bps (data only)")
294
  print(f"Theoretical: {theoretical_bitrate:.1f} bps")
295
  print(f"Target: 160 bps")
296
- print(f"Efficiency: {(160/data_bitrate)*100:.1f}% of target")
297
  print(f"{'='*50}\n")
298
 
299
  # Prepare output
@@ -302,7 +271,8 @@ def encode_decode_focal(audio_input):
302
  if len(decoded_wav_output.shape) == 0:
303
  decoded_wav_output = decoded_wav_output.reshape(1)
304
 
305
- status_msg = f" {duration_sec:.1f}s | {file_size}B | {data_bitrate:.0f} bps | {bits_per_token} bits/tok | target: 160 bps"
 
306
 
307
  return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
308
 
@@ -314,8 +284,8 @@ def encode_decode_focal(audio_input):
314
  return None, None, error_msg
315
 
316
 
317
- def decode_from_fc_file(fc_file):
318
- """Decode audio from uploaded .fc file"""
319
 
320
  if codec is None:
321
  return None, "❌ Model not loaded"
@@ -323,12 +293,28 @@ def decode_from_fc_file(fc_file):
323
  if fc_file is None:
324
  return None, "❌ Please upload a .fc file"
325
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  try:
327
  print(f"\n{'='*50}")
328
  print(f"Decoding from file: {fc_file.name}")
329
 
330
  # Load tokens
331
- toks = load_compressed_codes_optimal(fc_file.name)
332
 
333
  if torch.cuda.is_available():
334
  toks = toks.cuda()
@@ -344,15 +330,13 @@ def decode_from_fc_file(fc_file):
344
  # Calculate stats
345
  duration_sec = decoded_wav.shape[0] / codec.sample_rate_output
346
  file_size = os.path.getsize(fc_file.name)
347
- header_size = 4 + 4 + 4 + 1
348
- data_size = file_size - header_size
349
- bitrate = (data_size * 8) / duration_sec
350
 
351
  print(f"Duration: {duration_sec:.2f}s")
352
  print(f"Bitrate: {bitrate:.1f} bps")
353
  print(f"{'='*50}\n")
354
 
355
- status = f"✅ Decoded! {duration_sec:.1f}s | {bitrate:.0f} bps"
356
 
357
  return (codec.sample_rate_output, decoded_wav), status
358
 
@@ -363,13 +347,13 @@ def decode_from_fc_file(fc_file):
363
 
364
 
365
  # --- Gradio Interface ---
366
- with gr.Blocks(title="FocalCodec 160 bps") as iface:
367
  gr.Markdown("# 🎙️ FocalCodec at 160 bps")
368
  gr.Markdown(f"**Neural speech codec at insanely low bitrate!** Using `{MODEL_CONFIG}`")
369
- gr.Markdown("⚠️ **Optimized for speech only** - not suitable for music | 🔥 **1600x compression ratio!**")
370
 
371
  with gr.Tab("🎤 Encode Audio"):
372
- gr.Markdown("### Compress audio to ~160 bps with optimal bit packing")
373
 
374
  with gr.Row():
375
  audio_input = gr.Audio(
@@ -384,9 +368,9 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
384
  label="🔊 Decoded Output (16kHz)"
385
  )
386
  file_output = gr.File(
387
- label="💾 Download Compressed .fc File"
388
  )
389
- status_output = gr.Textbox(label="📊 Status", lines=2)
390
 
391
  encode_btn = gr.Button("🔄 Encode & Decode", variant="primary", size="lg")
392
  encode_btn.click(
@@ -395,22 +379,50 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
395
  outputs=[audio_output, file_output, status_output]
396
  )
397
 
398
- gr.Markdown("### How it works:")
399
- gr.Markdown("- Automatically resamples to 16kHz")
400
- gr.Markdown("- Converts stereo to mono")
401
- gr.Markdown("- Encodes to discrete tokens (~12.5 tokens/sec)")
402
- gr.Markdown("- ✅ Packs tokens using only needed bits (no waste!)")
403
- gr.Markdown("- ✅ Decodes tokens back to audio")
404
- gr.Markdown("- 📈 Check console for detailed bitrate analysis!")
405
 
406
  with gr.Tab("📂 Decode from .fc File"):
407
- gr.Markdown("### Decode previously compressed audio")
408
 
409
  with gr.Row():
410
- fc_input = gr.File(
411
- label="Upload .fc File",
412
- file_types=[".fc"]
413
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
  with gr.Column():
416
  decoded_output = gr.Audio(
@@ -422,101 +434,69 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
422
  decode_btn = gr.Button("🔊 Decode Audio", variant="primary", size="lg")
423
  decode_btn.click(
424
  fn=decode_from_fc_file,
425
- inputs=[fc_input],
426
  outputs=[decoded_output, decode_status]
427
  )
428
-
429
- gr.Markdown("### Note:")
430
- gr.Markdown("Upload a .fc file created by this tool to decode it back to audio.")
431
 
432
  with gr.Tab("ℹ️ About"):
433
  gr.Markdown("""
434
  ## FocalCodec - Ultra Low Bitrate Neural Audio Codec
435
 
436
- ### 🎯 Compression Ratios:
437
- | Format | Bitrate | 1-Hour File Size | Compression |
438
- |--------|---------|------------------|-------------|
439
- | **Uncompressed PCM** (16kHz mono) | 256 kbps | ~115 MB | 1x |
440
- | **MP3** (standard) | 128 kbps | ~57 MB | 2x |
441
- | **Opus** (voice optimized) | 16 kbps | ~7.2 MB | 16x |
442
- | **FocalCodec** | **0.16 kbps** | **~72 KB** | **1600x** 🔥 |
443
-
444
- ### 💡 Use Cases:
445
- - 📞 **Ultra-low bandwidth voice calls** (satellite, deep space)
446
- - 🤖 **AI-generated podcasts** (NotebookLM-style apps)
447
- - 🌍 **Low-bandwidth regions** (2G networks)
448
- - 📻 **Emergency communications** (disaster relief)
449
- - 🎓 **Educational content distribution** (offline learning)
450
- - 💾 **Voice memo storage** (years of recordings in MB)
451
-
452
- ### ⚖️ Trade-offs:
453
-
454
- **Pros:**
455
- - ✅ Insanely efficient compression (1600x!)
456
- - ✅ Speech remains highly intelligible
457
- - ✅ Works on any sample rate (auto-resamples)
458
- - ✅ Tiny storage/bandwidth requirements
459
-
460
- **Cons:**
461
- - ❌ Voice characteristics may change
462
- - ❌ Emotional nuances can be lost
463
- - ❌ Occasional pronunciation artifacts
464
- - ❌ Not suitable for music or non-speech audio
465
 
466
- ### 🔧 Technical Details:
467
- - **Model:** `lucadellalib/focalcodec_12_5hz`
468
- - **Sample Rate:** 16 kHz
469
- - **Token Rate:** ~12.5 tokens/second
470
- - **Bits per Token:** 13 bits (auto-detected, optimally packed)
471
- - **Target Bitrate:** 160 bps (12.5 × 13 = 162.5 bps)
472
- - **File Format:** Custom binary format with metadata header
473
 
474
- ### 🧮 How We Achieve 160 bps:
 
 
 
475
 
476
- Traditional approach would waste bits:
477
- ```
478
- Token (0-8191) int16 (16 bits) 16 × 12.5 = 200 bps ❌
479
- Wasting 3 bits per token!
480
- ```
481
-
482
- Our optimal approach:
483
- ```
484
- Token (0-8191) → 13 bits exactly → 13 × 12.5 = 162.5 bps ✅
485
- Zero waste!
486
- ```
487
 
488
- ### 🔬 Debug Information:
489
- Check the **console/terminal** for detailed encoding information:
490
- - Actual token rate and range
491
- - Bits per token (detected automatically)
492
- - Expected vs actual bitrate
493
- - File size breakdown (header vs data)
494
- - Compression efficiency
495
 
496
- ### 📚 Example Use Case - AI Podcast Library:
 
 
 
 
497
 
498
- Imagine storing **1000 hours** of AI-generated podcasts:
499
- - **Uncompressed:** 115 GB
500
- - **MP3:** 57 GB
501
- - **Opus:** 7.2 GB
502
- - **FocalCodec:** **72 MB** 🤯
503
 
504
- You could fit an entire podcast library on a USB flash drive!
 
 
 
 
 
 
 
 
 
 
 
 
 
505
 
506
  ---
507
 
508
- ### 🔗 Links:
509
- - [FocalCodec GitHub](https://github.com/lucadellalib/focalcodec)
510
- - [Research Paper](https://arxiv.org/abs/2410.03608)
511
-
512
- ### 🏗️ Built with:
513
- - PyTorch + TorchAudio
514
- - Gradio
515
- - FocalCodec (Luca Della Libera et al.)
516
  """)
517
 
518
  if __name__ == "__main__":
519
  print("\n" + "="*50)
520
- print("🎙️ FocalCodec 160 bps Demo")
521
  print("="*50 + "\n")
522
  iface.launch()
 
4
  import os
5
  import tempfile
6
  import numpy as np
 
7
 
8
  # Define the model ID for the 0.16 kbps codec config
9
  MODEL_CONFIG = "lucadellalib/focalcodec_12_5hz"
 
48
  codec = None
49
 
50
 
51
+ def save_tokens_raw(toks, fc_file_path):
52
+ """Save tokens as raw binary with NO header - pure tokens only"""
53
 
54
+ toks_cpu = toks.cpu().numpy().flatten()
55
+ max_token = int(toks_cpu.max())
56
 
57
+ print(f"\n=== Saving Raw Tokens ===")
58
+ print(f"Token shape: {toks.shape}")
59
+ print(f"Token range: 0 to {max_token}")
60
+ print(f"Num tokens: {len(toks_cpu)}")
61
 
62
+ # Determine bits needed
 
63
  if max_token <= 1:
64
  bits_needed = 1
65
  elif max_token <= 3:
 
93
  else:
94
  bits_needed = 16
95
 
96
+ print(f"Bits per token: {bits_needed}")
 
97
 
98
+ # Create bit array
99
+ bit_array = []
100
+ for tok in toks_cpu:
101
+ bits = format(int(tok), f'0{bits_needed}b')
102
+ bit_array.extend([int(b) for b in bits])
103
+
104
+ # Pad to byte boundary
105
+ while len(bit_array) % 8 != 0:
106
+ bit_array.append(0)
107
+
108
+ # Pack into bytes
109
+ packed_bits = np.packbits(np.array(bit_array, dtype=np.uint8))
110
+
111
+ # Write ONLY the packed data (no header!)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  with open(fc_file_path, 'wb') as f:
 
 
 
 
 
 
 
 
 
113
  f.write(packed_bits.tobytes())
114
 
115
  file_size = os.path.getsize(fc_file_path)
 
 
116
 
117
+ print(f"File size: {file_size} bytes (pure data, no header)")
118
+ print(f"========================\n")
119
 
120
+ return file_size, bits_needed, len(toks_cpu), toks.shape
121
 
122
 
123
+ def load_tokens_raw(fc_file_path, bits_per_token, num_tokens, original_shape):
124
+ """Load raw tokens from headerless binary file"""
125
 
126
+ print(f"\n=== Loading Raw Tokens ===")
127
+ print(f"Expected bits/token: {bits_per_token}")
128
+ print(f"Expected num tokens: {num_tokens}")
129
+ print(f"Expected shape: {original_shape}")
130
+
131
+ # Read all bytes
132
  with open(fc_file_path, 'rb') as f:
 
 
 
 
 
 
 
 
 
 
 
133
  packed_data = np.frombuffer(f.read(), dtype=np.uint8)
134
 
 
 
 
135
  # Unpack bits
136
  unpacked_bits = np.unpackbits(packed_data)
137
 
 
152
  token_value = (token_value << 1) | bit
153
  tokens.append(token_value)
154
 
155
+ # Reshape to original shape
156
+ tokens_array = np.array(tokens, dtype=np.int64).reshape(original_shape)
157
  tokens_tensor = torch.from_numpy(tokens_array)
158
 
159
  print(f"Loaded tokens: {tokens_tensor.shape}")
160
+ print(f"Token range: {tokens_tensor.min().item()} to {tokens_tensor.max().item()}")
161
+ print(f"==========================\n")
162
 
163
  return tokens_tensor
164
 
165
 
166
+ # Global variables to store metadata for decoding
167
+ last_encoding_metadata = {
168
+ 'bits_per_token': None,
169
+ 'num_tokens': None,
170
+ 'shape': None,
171
+ 'duration': None
172
+ }
173
+
174
+
175
  def encode_decode_focal(audio_input):
176
  """
177
  Processes input audio through the 160 bps FocalCodec, saves the tokens,
178
  and returns both the decoded WAV and the path to the FC file for download.
179
  """
180
+ global last_encoding_metadata
181
+
182
  if codec is None:
183
  return None, None, "❌ ERROR: Model failed to load. Check console for details."
184
 
 
236
  print(f"Duration: {duration_sec:.2f}s")
237
  print(f"Token rate: {token_rate:.2f} tokens/sec")
238
 
 
 
 
 
 
 
 
239
  print("\n--- Decoding ---")
240
  rec_sig = codec.toks_to_sig(toks)
241
  print(f"Reconstructed signal shape: {rec_sig.shape}")
242
 
243
+ # --- Save raw tokens (no header) ---
244
  temp_dir = tempfile.mkdtemp()
245
  fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
246
 
247
+ file_size, bits_per_token, num_tokens, shape = save_tokens_raw(toks, fc_file_path)
248
+
249
+ # Store metadata globally for decoding
250
+ last_encoding_metadata = {
251
+ 'bits_per_token': bits_per_token,
252
+ 'num_tokens': num_tokens,
253
+ 'shape': shape,
254
+ 'duration': duration_sec
255
+ }
256
 
257
  # Calculate bitrates
258
+ bitrate = (file_size * 8) / duration_sec
 
259
  theoretical_bitrate = token_rate * bits_per_token
260
 
261
  print(f"--- Results ---")
262
+ print(f"File bitrate: {bitrate:.1f} bps (pure data)")
 
263
  print(f"Theoretical: {theoretical_bitrate:.1f} bps")
264
  print(f"Target: 160 bps")
265
+ print(f"Efficiency: {(160/bitrate)*100:.1f}% of target")
266
  print(f"{'='*50}\n")
267
 
268
  # Prepare output
 
271
  if len(decoded_wav_output.shape) == 0:
272
  decoded_wav_output = decoded_wav_output.reshape(1)
273
 
274
+ metadata_info = f"\n\nℹ️ SAVE THIS: bits={bits_per_token}, tokens={num_tokens}, shape={shape}"
275
+ status_msg = f"✅ {duration_sec:.1f}s | {file_size}B | {bitrate:.0f} bps | {bits_per_token} bits/tok{metadata_info}"
276
 
277
  return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
278
 
 
284
  return None, None, error_msg
285
 
286
 
287
+ def decode_from_fc_file(fc_file, bits_per_token_input, num_tokens_input, batch_size_input, seq_length_input):
288
+ """Decode audio from uploaded .fc file using provided metadata"""
289
 
290
  if codec is None:
291
  return None, "❌ Model not loaded"
 
293
  if fc_file is None:
294
  return None, "❌ Please upload a .fc file"
295
 
296
+ # Try to use provided metadata, or fall back to last encoding
297
+ try:
298
+ bits_per_token = int(bits_per_token_input) if bits_per_token_input else last_encoding_metadata.get('bits_per_token')
299
+ num_tokens = int(num_tokens_input) if num_tokens_input else last_encoding_metadata.get('num_tokens')
300
+
301
+ if batch_size_input and seq_length_input:
302
+ shape = (int(batch_size_input), int(seq_length_input))
303
+ else:
304
+ shape = last_encoding_metadata.get('shape')
305
+
306
+ if not all([bits_per_token, num_tokens, shape]):
307
+ return None, "❌ Please provide metadata (bits/token, num tokens, batch, seq_length) OR encode a file first"
308
+
309
+ except Exception as e:
310
+ return None, f"❌ Invalid metadata format: {str(e)}"
311
+
312
  try:
313
  print(f"\n{'='*50}")
314
  print(f"Decoding from file: {fc_file.name}")
315
 
316
  # Load tokens
317
+ toks = load_tokens_raw(fc_file.name, bits_per_token, num_tokens, shape)
318
 
319
  if torch.cuda.is_available():
320
  toks = toks.cuda()
 
330
  # Calculate stats
331
  duration_sec = decoded_wav.shape[0] / codec.sample_rate_output
332
  file_size = os.path.getsize(fc_file.name)
333
+ bitrate = (file_size * 8) / duration_sec
 
 
334
 
335
  print(f"Duration: {duration_sec:.2f}s")
336
  print(f"Bitrate: {bitrate:.1f} bps")
337
  print(f"{'='*50}\n")
338
 
339
+ status = f"✅ Decoded! {duration_sec:.1f}s | {bitrate:.0f} bps | {bits_per_token} bits/token"
340
 
341
  return (codec.sample_rate_output, decoded_wav), status
342
 
 
347
 
348
 
349
  # --- Gradio Interface ---
350
+ with gr.Blocks(title="FocalCodec 160 bps", theme=gr.themes.Soft()) as iface:
351
  gr.Markdown("# 🎙️ FocalCodec at 160 bps")
352
  gr.Markdown(f"**Neural speech codec at insanely low bitrate!** Using `{MODEL_CONFIG}`")
353
+ gr.Markdown("⚠️ **Optimized for speech only** | 🔥 **Pure tokens, no header overhead!**")
354
 
355
  with gr.Tab("🎤 Encode Audio"):
356
+ gr.Markdown("### Compress audio to ~160 bps (pure tokens, no header)")
357
 
358
  with gr.Row():
359
  audio_input = gr.Audio(
 
368
  label="🔊 Decoded Output (16kHz)"
369
  )
370
  file_output = gr.File(
371
+ label="💾 Download Compressed .fc File (headerless)"
372
  )
373
+ status_output = gr.Textbox(label="📊 Status", lines=4)
374
 
375
  encode_btn = gr.Button("🔄 Encode & Decode", variant="primary", size="lg")
376
  encode_btn.click(
 
379
  outputs=[audio_output, file_output, status_output]
380
  )
381
 
382
+ gr.Markdown("### ⚠️ Important:")
383
+ gr.Markdown("- The .fc file contains ONLY raw token data (no metadata/header)")
384
+ gr.Markdown("- **Save the metadata** from the status message to decode later!")
385
+ gr.Markdown("- You need: bits per token, number of tokens, and shape")
 
 
 
386
 
387
  with gr.Tab("📂 Decode from .fc File"):
388
+ gr.Markdown("### Decode raw .fc file (requires metadata)")
389
 
390
  with gr.Row():
391
+ with gr.Column():
392
+ fc_input = gr.File(
393
+ label="Upload .fc File",
394
+ file_types=[".fc"]
395
+ )
396
+
397
+ gr.Markdown("#### Metadata (required for decoding):")
398
+
399
+ with gr.Row():
400
+ bits_input = gr.Number(
401
+ label="Bits per token",
402
+ value=13,
403
+ precision=0,
404
+ info="Usually 13 for this model"
405
+ )
406
+ tokens_input = gr.Number(
407
+ label="Number of tokens",
408
+ precision=0,
409
+ info="Total tokens in file"
410
+ )
411
+
412
+ with gr.Row():
413
+ batch_input = gr.Number(
414
+ label="Batch size",
415
+ value=1,
416
+ precision=0,
417
+ info="Usually 1"
418
+ )
419
+ seq_input = gr.Number(
420
+ label="Sequence length",
421
+ precision=0,
422
+ info="Tokens per batch"
423
+ )
424
+
425
+ gr.Markdown("💡 If you just encoded a file, leave these blank to use saved metadata")
426
 
427
  with gr.Column():
428
  decoded_output = gr.Audio(
 
434
  decode_btn = gr.Button("🔊 Decode Audio", variant="primary", size="lg")
435
  decode_btn.click(
436
  fn=decode_from_fc_file,
437
+ inputs=[fc_input, bits_input, tokens_input, batch_input, seq_input],
438
  outputs=[decoded_output, decode_status]
439
  )
 
 
 
440
 
441
  with gr.Tab("ℹ️ About"):
442
  gr.Markdown("""
443
  ## FocalCodec - Ultra Low Bitrate Neural Audio Codec
444
 
445
+ ### 🎯 Pure Token Format (No Headers!)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
446
 
447
+ This version saves **ONLY the compressed tokens** with no metadata overhead.
 
 
 
 
 
 
448
 
449
+ **Benefits:**
450
+ - ✅ Absolute minimum file size
451
+ - ✅ True 160 bps (no header padding)
452
+ - ✅ Maximum compression efficiency
453
 
454
+ **Trade-off:**
455
+ - ⚠️ You must save the metadata separately to decode
456
+ - Required info: bits per token, number of tokens, shape
 
 
 
 
 
 
 
 
457
 
458
+ ### 📊 Compression Ratios:
459
+ | Format | Bitrate | 1-Hour File Size |
460
+ |--------|---------|------------------|
461
+ | Uncompressed PCM | 256 kbps | ~115 MB |
462
+ | MP3 | 128 kbps | ~57 MB |
463
+ | Opus | 16 kbps | ~7.2 MB |
464
+ | **FocalCodec** | **0.16 kbps** | **~72 KB** 🔥 |
465
 
466
+ ### 🔧 Technical Details:
467
+ - **Token Rate:** ~12.5 tokens/sec
468
+ - **Bits per Token:** 13 bits (for most speech)
469
+ - **Bitrate:** 12.5 × 13 = 162.5 bps ≈ **160 bps**
470
+ - **Format:** Raw bit-packed tokens (no header)
471
 
472
+ ### 📝 Example Metadata:
473
+ After encoding, you'll see:
474
+ ```
475
+ ℹ️ SAVE THIS: bits=13, tokens=113, shape=(1, 113)
476
+ ```
477
 
478
+ Save this to decode the file later!
479
+
480
+ ### 💡 Pro Tip:
481
+ If you're building a system, embed the metadata in a separate JSON file:
482
+ ```json
483
+ {
484
+ "audio.fc": {
485
+ "bits_per_token": 13,
486
+ "num_tokens": 113,
487
+ "shape": [1, 113],
488
+ "duration": 9.04
489
+ }
490
+ }
491
+ ```
492
 
493
  ---
494
 
495
+ 🔗 [FocalCodec GitHub](https://github.com/lucadellalib/focalcodec)
 
 
 
 
 
 
 
496
  """)
497
 
498
  if __name__ == "__main__":
499
  print("\n" + "="*50)
500
+ print("🎙️ FocalCodec 160 bps Demo (Headerless Format)")
501
  print("="*50 + "\n")
502
  iface.launch()