MihaiPopa-1 committed on
Commit
01e25a6
·
verified ·
1 Parent(s): 91d64e3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +274 -148
app.py CHANGED
@@ -49,113 +49,157 @@ except Exception as e:
49
  codec = None
50
 
51
 
52
- # --- SAVE function (encoding) ---
53
- def save_compressed_tokens(toks, fc_file_path, codec):
54
- """Save tokens in the most compressed format with metadata for decoding"""
55
 
56
- toks_cpu = toks.cpu()
57
- min_tok = toks_cpu.min().item()
58
- max_tok = toks_cpu.max().item()
59
 
60
- print(f"\n=== Saving Tokens ===")
61
- print(f"Shape: {toks.shape}")
62
- print(f"Range: {min_tok} to {max_tok}")
63
 
64
- # Determine bit width
65
- if max_tok <= 1:
66
- bits_per_token = 1
67
- dtype_code = 0
68
- elif max_tok <= 15:
69
- bits_per_token = 4
70
- dtype_code = 1
71
- elif max_tok <= 255:
72
- bits_per_token = 8
73
- dtype_code = 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
  else:
75
- bits_per_token = 16
76
- dtype_code = 3
77
 
78
- # Convert to numpy
79
- toks_np = toks_cpu.numpy().flatten()
80
 
81
- # Pack data
82
- if bits_per_token == 1:
83
- packed = np.packbits(toks_np.astype(np.uint8))
84
- elif bits_per_token == 4:
85
- if len(toks_np) % 2:
86
- toks_np = np.append(toks_np, 0)
87
- packed = ((toks_np[::2] << 4) | toks_np[1::2]).astype(np.uint8)
88
- elif bits_per_token == 8:
89
- packed = toks_np.astype(np.uint8)
90
- else: # 16-bit
91
- packed = toks_np.astype(np.int16)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
 
93
- # Write file with header
94
  with open(fc_file_path, 'wb') as f:
95
- # Magic number (to verify it's our format)
96
- f.write(b'FC01') # FocalCodec version 0.1
97
 
98
  # Metadata
99
- f.write(struct.pack('<B', dtype_code)) # Data type (1 byte)
100
- f.write(struct.pack('<I', toks.shape[0])) # Batch size
101
- f.write(struct.pack('<I', toks.shape[1])) # Sequence length
102
- f.write(struct.pack('<I', len(toks_np))) # Total tokens
103
 
104
- # Packed token data
105
- f.write(packed.tobytes())
106
 
107
  file_size = os.path.getsize(fc_file_path)
108
- print(f"Saved {file_size} bytes ({bits_per_token} bits/token)")
109
- print(f"====================\n")
110
 
111
- return file_size, bits_per_token
 
 
 
112
 
113
 
114
- # --- LOAD function (decoding) ---
115
- def load_compressed_tokens(fc_file_path):
116
- """Load and unpack tokens from .fc file"""
117
 
118
  with open(fc_file_path, 'rb') as f:
119
- # Verify magic number
120
  magic = f.read(4)
121
  if magic != b'FC01':
122
- raise ValueError("Invalid .fc file format!")
123
 
124
  # Read metadata
125
- dtype_code = struct.unpack('<B', f.read(1))[0]
126
  batch_size = struct.unpack('<I', f.read(4))[0]
127
- seq_length = struct.unpack('<I', f.read(4))[0]
128
- total_tokens = struct.unpack('<I', f.read(4))[0]
129
 
130
  # Read packed data
131
  packed_data = np.frombuffer(f.read(), dtype=np.uint8)
132
 
133
- print(f"\n=== Loading Tokens ===")
134
- print(f"Dtype code: {dtype_code}")
135
- print(f"Shape: ({batch_size}, {seq_length})")
 
 
136
 
137
- # Unpack based on dtype
138
- if dtype_code == 0: # 1-bit
139
- unpacked = np.unpackbits(packed_data)[:total_tokens]
140
- elif dtype_code == 1: # 4-bit
141
- high = (packed_data >> 4) & 0x0F
142
- low = packed_data & 0x0F
143
- unpacked = np.empty(len(packed_data) * 2, dtype=np.uint8)
144
- unpacked[::2] = high
145
- unpacked[1::2] = low
146
- unpacked = unpacked[:total_tokens]
147
- elif dtype_code == 2: # 8-bit
148
- unpacked = packed_data[:total_tokens]
149
- else: # 16-bit
150
- unpacked = np.frombuffer(packed_data.tobytes(), dtype=np.int16)[:total_tokens]
151
 
152
- # Reshape to original shape
153
- toks = torch.from_numpy(unpacked.astype(np.int64)).reshape(batch_size, seq_length)
 
 
 
 
 
 
 
 
 
 
154
 
155
- print(f"Loaded tokens: {toks.shape}")
156
- print(f"======================\n")
157
 
158
- return toks
 
 
 
159
 
160
 
161
  def encode_decode_focal(audio_input):
@@ -172,28 +216,28 @@ def encode_decode_focal(audio_input):
172
  try:
173
  sr, wav_numpy = audio_input
174
 
175
- print(f"Input audio: sample_rate={sr}, shape={wav_numpy.shape}, dtype={wav_numpy.dtype}")
 
 
176
 
177
  # Handle stereo to mono conversion
178
  if len(wav_numpy.shape) > 1:
179
- if wav_numpy.shape[1] == 2: # Stereo
180
  wav_numpy = wav_numpy.mean(axis=1)
181
  print("Converted stereo to mono")
182
- elif wav_numpy.shape[0] == 2: # Channels first
183
  wav_numpy = wav_numpy.mean(axis=0)
184
  print("Converted stereo to mono (channels first)")
185
 
186
  # Ensure float32 and normalize
187
  wav_numpy = wav_numpy.astype(np.float32)
188
  if wav_numpy.max() > 1.0 or wav_numpy.min() < -1.0:
189
- wav_numpy = wav_numpy / 32768.0 # Normalize int16 to float
190
 
191
- # Convert to torch tensor [1, samples]
192
  sig = torch.from_numpy(wav_numpy).unsqueeze(0)
193
 
194
- print(f"Tensor shape before resample: {sig.shape}")
195
-
196
- # Resample to 16kHz (required by FocalCodec)
197
  if sr != codec.sample_rate_input:
198
  print(f"Resampling from {sr}Hz to {codec.sample_rate_input}Hz...")
199
  resampler = torchaudio.transforms.Resample(
@@ -202,50 +246,68 @@ def encode_decode_focal(audio_input):
202
  )
203
  sig = resampler(sig)
204
 
205
- print(f"Tensor shape after resample: {sig.shape}")
206
 
207
- # Move to GPU if available
208
  if torch.cuda.is_available():
209
  sig = sig.cuda()
210
 
211
  # --- Encode and Decode ---
212
  with torch.no_grad():
213
- print("Encoding to tokens...")
214
  toks = codec.sig_to_toks(sig)
 
 
 
 
215
  print(f"Tokens shape: {toks.shape}")
216
  print(f"Token range: {toks.min().item()} to {toks.max().item()}")
 
 
217
 
218
- print("Decoding tokens to audio...")
 
 
 
 
 
 
 
219
  rec_sig = codec.toks_to_sig(toks)
220
  print(f"Reconstructed signal shape: {rec_sig.shape}")
221
 
222
- # --- Save the compressed tokens ---
223
  temp_dir = tempfile.mkdtemp()
224
  fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
225
 
226
- file_size, bits_per_token = save_compressed_tokens(toks, fc_file_path, codec)
 
 
227
 
228
- # Calculate stats
229
- duration_sec = sig.shape[-1] / codec.sample_rate_input
230
- actual_bitrate = (file_size * 8) / duration_sec
 
231
 
232
- print(f"Duration: {duration_sec:.2f}s")
233
- print(f"File size: {file_size} bytes")
234
- print(f"Actual bitrate: {actual_bitrate:.1f} bps")
 
 
 
 
235
 
236
- # Move audio back to CPU for Gradio output
237
  decoded_wav_output = rec_sig.cpu().numpy().squeeze()
238
 
239
- # Ensure proper shape for Gradio
240
  if len(decoded_wav_output.shape) == 0:
241
  decoded_wav_output = decoded_wav_output.reshape(1)
242
 
243
- status_msg = f"โœ… Duration: {duration_sec:.1f}s | File: {file_size} bytes | Bitrate: {actual_bitrate:.0f} bps ({bits_per_token} bits/token)"
244
 
245
  return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
246
 
247
  except Exception as e:
248
- error_msg = f"โŒ Processing error: {str(e)}"
249
  print(error_msg)
250
  import traceback
251
  traceback.print_exc()
@@ -262,24 +324,35 @@ def decode_from_fc_file(fc_file):
262
  return None, "โŒ Please upload a .fc file"
263
 
264
  try:
265
- # Load tokens from file
266
- toks = load_compressed_tokens(fc_file.name)
 
 
 
267
 
268
  if torch.cuda.is_available():
269
  toks = toks.cuda()
270
 
271
  # Decode to audio
272
  with torch.no_grad():
 
273
  rec_sig = codec.toks_to_sig(toks)
 
274
 
275
  decoded_wav = rec_sig.cpu().numpy().squeeze()
276
 
277
- # Calculate duration
278
  duration_sec = decoded_wav.shape[0] / codec.sample_rate_output
279
  file_size = os.path.getsize(fc_file.name)
280
- bitrate = (file_size * 8) / duration_sec
 
 
281
 
282
- status = f"โœ… Decoded successfully! Duration: {duration_sec:.1f}s | Bitrate: {bitrate:.0f} bps"
 
 
 
 
283
 
284
  return (codec.sample_rate_output, decoded_wav), status
285
 
@@ -290,13 +363,13 @@ def decode_from_fc_file(fc_file):
290
 
291
 
292
  # --- Gradio Interface ---
293
- with gr.Blocks(title="FocalCodec 160 bps") as iface:
294
  gr.Markdown("# ๐ŸŽ™๏ธ FocalCodec at 160 bps")
295
  gr.Markdown(f"**Neural speech codec at insanely low bitrate!** Using `{MODEL_CONFIG}`")
296
- gr.Markdown("โš ๏ธ **Optimized for speech only** - not suitable for music")
297
 
298
  with gr.Tab("๐ŸŽค Encode Audio"):
299
- gr.Markdown("### Compress audio to 160 bps tokens")
300
 
301
  with gr.Row():
302
  audio_input = gr.Audio(
@@ -308,12 +381,12 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
308
  with gr.Column():
309
  audio_output = gr.Audio(
310
  type="numpy",
311
- label="Decoded Output (16kHz)"
312
  )
313
  file_output = gr.File(
314
- label="Download Compressed .fc File"
315
  )
316
- status_output = gr.Textbox(label="Status", lines=2)
317
 
318
  encode_btn = gr.Button("๐Ÿ”„ Encode & Decode", variant="primary", size="lg")
319
  encode_btn.click(
@@ -323,10 +396,12 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
323
  )
324
 
325
  gr.Markdown("### How it works:")
326
- gr.Markdown("- Automatically resamples to 16kHz")
327
- gr.Markdown("- Converts stereo to mono")
328
- gr.Markdown("- Encodes to discrete tokens (~160 bps)")
329
- gr.Markdown("- Decodes tokens back to audio")
 
 
330
 
331
  with gr.Tab("๐Ÿ“‚ Decode from .fc File"):
332
  gr.Markdown("### Decode previously compressed audio")
@@ -340,9 +415,9 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
340
  with gr.Column():
341
  decoded_output = gr.Audio(
342
  type="numpy",
343
- label="Decoded Audio"
344
  )
345
- decode_status = gr.Textbox(label="Status", lines=2)
346
 
347
  decode_btn = gr.Button("๐Ÿ”Š Decode Audio", variant="primary", size="lg")
348
  decode_btn.click(
@@ -350,47 +425,98 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
350
  inputs=[fc_input],
351
  outputs=[decoded_output, decode_status]
352
  )
 
 
 
353
 
354
  with gr.Tab("โ„น๏ธ About"):
355
  gr.Markdown("""
356
  ## FocalCodec - Ultra Low Bitrate Neural Audio Codec
357
 
358
- ### Compression Ratios:
359
- - **Uncompressed PCM** (16kHz mono): 256 kbps
360
- - **MP3** (standard): ~128 kbps
361
- - **Opus** (voice): ~16 kbps
362
- - **FocalCodec**: **0.16 kbps** (160 bps) ๐Ÿ”ฅ
363
-
364
- ### That's 1600x compression!
365
-
366
- For a 1-hour podcast:
367
- - Uncompressed: ~115 MB
368
- - FocalCodec: **~72 KB**
369
-
370
- ### Use Cases:
371
- - ๐Ÿ“ž Ultra-low bandwidth voice calls
372
- - ๐Ÿค– AI-generated podcasts
373
- - ๐ŸŒ Low-bandwidth regions
374
- - ๐Ÿ“ป Emergency communications
375
-
376
- ### Trade-offs:
377
- - โœ… Extremely efficient compression
378
- - โœ… Speech remains intelligible
 
 
 
 
379
  - โŒ Voice characteristics may change
380
- - โŒ Not suitable for music
381
- - โŒ Some pronunciation artifacts
382
-
383
- ### Technical Details:
384
- - Model: `lucadellalib/focalcodec_12_5hz`
385
- - Sample Rate: 16 kHz
386
- - Token Rate: 12.5 Hz
387
- - Bits per Token: Auto-detected (1/4/8/16 bit)
388
- - Target Bitrate: 160 bps
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
 
390
  ---
391
 
392
- ๐Ÿ”— [GitHub Repository](https://github.com/lucadellalib/focalcodec)
 
 
 
 
 
 
 
393
  """)
394
 
395
  if __name__ == "__main__":
 
 
 
396
  iface.launch()
 
49
  codec = None
50
 
51
 
52
+ def save_compressed_codes_optimal(toks, codes, fc_file_path, codec):
53
+ """Save codes with optimal bit packing to achieve true 160 bps"""
 
54
 
55
+ codes_cpu = codes.cpu().numpy()
56
+ toks_cpu = toks.cpu().numpy()
 
57
 
58
+ print(f"\n=== Optimal Compression ===")
59
+ print(f"Codes shape: {codes.shape}")
60
+ print(f"Codes dtype: {codes.dtype}")
61
 
62
+ # Determine actual bits needed based on token range
63
+ max_token = int(toks_cpu.max())
64
+ if max_token <= 1:
65
+ bits_needed = 1
66
+ elif max_token <= 3:
67
+ bits_needed = 2
68
+ elif max_token <= 7:
69
+ bits_needed = 3
70
+ elif max_token <= 15:
71
+ bits_needed = 4
72
+ elif max_token <= 31:
73
+ bits_needed = 5
74
+ elif max_token <= 63:
75
+ bits_needed = 6
76
+ elif max_token <= 127:
77
+ bits_needed = 7
78
+ elif max_token <= 255:
79
+ bits_needed = 8
80
+ elif max_token <= 511:
81
+ bits_needed = 9
82
+ elif max_token <= 1023:
83
+ bits_needed = 10
84
+ elif max_token <= 2047:
85
+ bits_needed = 11
86
+ elif max_token <= 4095:
87
+ bits_needed = 12
88
+ elif max_token <= 8191:
89
+ bits_needed = 13
90
+ elif max_token <= 16383:
91
+ bits_needed = 14
92
+ elif max_token <= 32767:
93
+ bits_needed = 15
94
  else:
95
+ bits_needed = 16
 
96
 
97
+ print(f"Token range: 0 to {max_token}")
98
+ print(f"Bits needed per token: {bits_needed}")
99
 
100
+ # If codes are already binary (batch, time, bits), use them directly
101
+ if len(codes.shape) == 3 and codes.dtype in [torch.bool, torch.uint8]:
102
+ print(f"Using binary codes directly: {codes.shape[2]} bits per token")
103
+ # Pack the binary codes
104
+ codes_flat = codes_cpu.flatten()
105
+ packed_bits = np.packbits(codes_flat)
106
+ bits_per_token = codes.shape[2]
107
+ num_tokens = codes.shape[1]
108
+
109
+ else:
110
+ # Pack tokens manually using exact bit width
111
+ print(f"Packing tokens with {bits_needed} bits each")
112
+ toks_flat = toks_cpu.flatten().astype(np.uint32)
113
+ num_tokens = len(toks_flat)
114
+
115
+ # Convert to binary string and pack
116
+ total_bits = num_tokens * bits_needed
117
+
118
+ # Create bit array
119
+ bit_array = []
120
+ for tok in toks_flat:
121
+ # Convert to binary with exact bit width
122
+ bits = format(int(tok), f'0{bits_needed}b')
123
+ bit_array.extend([int(b) for b in bits])
124
+
125
+ # Pad to byte boundary
126
+ while len(bit_array) % 8 != 0:
127
+ bit_array.append(0)
128
+
129
+ # Pack into bytes
130
+ packed_bits = np.packbits(np.array(bit_array, dtype=np.uint8))
131
+ bits_per_token = bits_needed
132
 
133
+ # Write to file
134
  with open(fc_file_path, 'wb') as f:
135
+ # Magic number
136
+ f.write(b'FC01')
137
 
138
  # Metadata
139
+ f.write(struct.pack('<I', toks.shape[0])) # batch size
140
+ f.write(struct.pack('<I', num_tokens)) # number of tokens
141
+ f.write(struct.pack('<B', bits_per_token)) # bits per token
 
142
 
143
+ # Packed data
144
+ f.write(packed_bits.tobytes())
145
 
146
  file_size = os.path.getsize(fc_file_path)
147
+ header_size = 4 + 4 + 4 + 1 # magic + 2 ints + 1 byte
148
+ data_size = file_size - header_size
149
 
150
+ print(f"File size: {file_size} bytes (header: {header_size}B, data: {data_size}B)")
151
+ print(f"===========================\n")
152
+
153
+ return file_size, bits_per_token, data_size
154
 
155
 
156
+ def load_compressed_codes_optimal(fc_file_path):
157
+ """Load optimally packed codes"""
 
158
 
159
  with open(fc_file_path, 'rb') as f:
160
+ # Verify magic
161
  magic = f.read(4)
162
  if magic != b'FC01':
163
+ raise ValueError("Invalid .fc file!")
164
 
165
  # Read metadata
 
166
  batch_size = struct.unpack('<I', f.read(4))[0]
167
+ num_tokens = struct.unpack('<I', f.read(4))[0]
168
+ bits_per_token = struct.unpack('<B', f.read(1))[0]
169
 
170
  # Read packed data
171
  packed_data = np.frombuffer(f.read(), dtype=np.uint8)
172
 
173
+ print(f"\n=== Loading Optimal Codes ===")
174
+ print(f"Batch: {batch_size}, Tokens: {num_tokens}, Bits/token: {bits_per_token}")
175
+
176
+ # Unpack bits
177
+ unpacked_bits = np.unpackbits(packed_data)
178
 
179
+ # Extract exact number of bits needed
180
+ total_bits = num_tokens * bits_per_token
181
+ token_bits = unpacked_bits[:total_bits]
 
 
 
 
 
 
 
 
 
 
 
182
 
183
+ # Reconstruct tokens
184
+ tokens = []
185
+ for i in range(num_tokens):
186
+ start = i * bits_per_token
187
+ end = start + bits_per_token
188
+ token_bits_slice = token_bits[start:end]
189
+
190
+ # Convert binary to integer
191
+ token_value = 0
192
+ for bit in token_bits_slice:
193
+ token_value = (token_value << 1) | bit
194
+ tokens.append(token_value)
195
 
196
+ tokens_array = np.array(tokens, dtype=np.int64).reshape(batch_size, -1)
197
+ tokens_tensor = torch.from_numpy(tokens_array)
198
 
199
+ print(f"Loaded tokens: {tokens_tensor.shape}")
200
+ print(f"==============================\n")
201
+
202
+ return tokens_tensor
203
 
204
 
205
  def encode_decode_focal(audio_input):
 
216
  try:
217
  sr, wav_numpy = audio_input
218
 
219
+ print(f"\n{'='*50}")
220
+ print(f"Processing new audio...")
221
+ print(f"Input audio: sample_rate={sr}, shape={wav_numpy.shape}")
222
 
223
  # Handle stereo to mono conversion
224
  if len(wav_numpy.shape) > 1:
225
+ if wav_numpy.shape[1] == 2:
226
  wav_numpy = wav_numpy.mean(axis=1)
227
  print("Converted stereo to mono")
228
+ elif wav_numpy.shape[0] == 2:
229
  wav_numpy = wav_numpy.mean(axis=0)
230
  print("Converted stereo to mono (channels first)")
231
 
232
  # Ensure float32 and normalize
233
  wav_numpy = wav_numpy.astype(np.float32)
234
  if wav_numpy.max() > 1.0 or wav_numpy.min() < -1.0:
235
+ wav_numpy = wav_numpy / 32768.0
236
 
237
+ # Convert to torch tensor
238
  sig = torch.from_numpy(wav_numpy).unsqueeze(0)
239
 
240
+ # Resample to 16kHz
 
 
241
  if sr != codec.sample_rate_input:
242
  print(f"Resampling from {sr}Hz to {codec.sample_rate_input}Hz...")
243
  resampler = torchaudio.transforms.Resample(
 
246
  )
247
  sig = resampler(sig)
248
 
249
+ print(f"Signal shape: {sig.shape}")
250
 
 
251
  if torch.cuda.is_available():
252
  sig = sig.cuda()
253
 
254
  # --- Encode and Decode ---
255
  with torch.no_grad():
256
+ print("\n--- Encoding ---")
257
  toks = codec.sig_to_toks(sig)
258
+
259
+ duration_sec = sig.shape[-1] / codec.sample_rate_input
260
+ token_rate = toks.shape[1] / duration_sec
261
+
262
  print(f"Tokens shape: {toks.shape}")
263
  print(f"Token range: {toks.min().item()} to {toks.max().item()}")
264
+ print(f"Duration: {duration_sec:.2f}s")
265
+ print(f"Token rate: {token_rate:.2f} tokens/sec")
266
 
267
+ # Get binary codes
268
+ codes = codec.toks_to_codes(toks)
269
+ print(f"Codes shape: {codes.shape}")
270
+ print(f"Codes dtype: {codes.dtype}")
271
+ if len(codes.shape) == 3:
272
+ print(f"Bits per token (from codes): {codes.shape[2]}")
273
+
274
+ print("\n--- Decoding ---")
275
  rec_sig = codec.toks_to_sig(toks)
276
  print(f"Reconstructed signal shape: {rec_sig.shape}")
277
 
278
+ # --- Save with optimal bit packing ---
279
  temp_dir = tempfile.mkdtemp()
280
  fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
281
 
282
+ file_size, bits_per_token, data_size = save_compressed_codes_optimal(
283
+ toks, codes, fc_file_path, codec
284
+ )
285
 
286
+ # Calculate bitrates
287
+ total_bitrate = (file_size * 8) / duration_sec
288
+ data_bitrate = (data_size * 8) / duration_sec
289
+ theoretical_bitrate = token_rate * bits_per_token
290
 
291
+ print(f"--- Results ---")
292
+ print(f"Total bitrate: {total_bitrate:.1f} bps (with header)")
293
+ print(f"Data bitrate: {data_bitrate:.1f} bps (data only)")
294
+ print(f"Theoretical: {theoretical_bitrate:.1f} bps")
295
+ print(f"Target: 160 bps")
296
+ print(f"Efficiency: {(160/data_bitrate)*100:.1f}% of target")
297
+ print(f"{'='*50}\n")
298
 
299
+ # Prepare output
300
  decoded_wav_output = rec_sig.cpu().numpy().squeeze()
301
 
 
302
  if len(decoded_wav_output.shape) == 0:
303
  decoded_wav_output = decoded_wav_output.reshape(1)
304
 
305
+ status_msg = f"โœ… {duration_sec:.1f}s | {file_size}B | {data_bitrate:.0f} bps | {bits_per_token} bits/tok | target: 160 bps"
306
 
307
  return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
308
 
309
  except Exception as e:
310
+ error_msg = f"โŒ Error: {str(e)}"
311
  print(error_msg)
312
  import traceback
313
  traceback.print_exc()
 
324
  return None, "โŒ Please upload a .fc file"
325
 
326
  try:
327
+ print(f"\n{'='*50}")
328
+ print(f"Decoding from file: {fc_file.name}")
329
+
330
+ # Load tokens
331
+ toks = load_compressed_codes_optimal(fc_file.name)
332
 
333
  if torch.cuda.is_available():
334
  toks = toks.cuda()
335
 
336
  # Decode to audio
337
  with torch.no_grad():
338
+ print("Decoding tokens to audio...")
339
  rec_sig = codec.toks_to_sig(toks)
340
+ print(f"Reconstructed signal shape: {rec_sig.shape}")
341
 
342
  decoded_wav = rec_sig.cpu().numpy().squeeze()
343
 
344
+ # Calculate stats
345
  duration_sec = decoded_wav.shape[0] / codec.sample_rate_output
346
  file_size = os.path.getsize(fc_file.name)
347
+ header_size = 4 + 4 + 4 + 1
348
+ data_size = file_size - header_size
349
+ bitrate = (data_size * 8) / duration_sec
350
 
351
+ print(f"Duration: {duration_sec:.2f}s")
352
+ print(f"Bitrate: {bitrate:.1f} bps")
353
+ print(f"{'='*50}\n")
354
+
355
+ status = f"โœ… Decoded! {duration_sec:.1f}s | {bitrate:.0f} bps"
356
 
357
  return (codec.sample_rate_output, decoded_wav), status
358
 
 
363
 
364
 
365
  # --- Gradio Interface ---
366
+ with gr.Blocks(title="FocalCodec 160 bps", theme=gr.themes.Soft()) as iface:
367
  gr.Markdown("# ๐ŸŽ™๏ธ FocalCodec at 160 bps")
368
  gr.Markdown(f"**Neural speech codec at insanely low bitrate!** Using `{MODEL_CONFIG}`")
369
+ gr.Markdown("โš ๏ธ **Optimized for speech only** - not suitable for music | ๐Ÿ”ฅ **1600x compression ratio!**")
370
 
371
  with gr.Tab("๐ŸŽค Encode Audio"):
372
+ gr.Markdown("### Compress audio to ~160 bps with optimal bit packing")
373
 
374
  with gr.Row():
375
  audio_input = gr.Audio(
 
381
  with gr.Column():
382
  audio_output = gr.Audio(
383
  type="numpy",
384
+ label="๐Ÿ”Š Decoded Output (16kHz)"
385
  )
386
  file_output = gr.File(
387
+ label="๐Ÿ’พ Download Compressed .fc File"
388
  )
389
+ status_output = gr.Textbox(label="๐Ÿ“Š Status", lines=2)
390
 
391
  encode_btn = gr.Button("๐Ÿ”„ Encode & Decode", variant="primary", size="lg")
392
  encode_btn.click(
 
396
  )
397
 
398
  gr.Markdown("### How it works:")
399
+ gr.Markdown("- โœ… Automatically resamples to 16kHz")
400
+ gr.Markdown("- โœ… Converts stereo to mono")
401
+ gr.Markdown("- โœ… Encodes to discrete tokens (~12.5 tokens/sec)")
402
+ gr.Markdown("- โœ… Packs tokens using only needed bits (no waste!)")
403
+ gr.Markdown("- โœ… Decodes tokens back to audio")
404
+ gr.Markdown("- ๐Ÿ“ˆ Check console for detailed bitrate analysis!")
405
 
406
  with gr.Tab("๐Ÿ“‚ Decode from .fc File"):
407
  gr.Markdown("### Decode previously compressed audio")
 
415
  with gr.Column():
416
  decoded_output = gr.Audio(
417
  type="numpy",
418
+ label="๐Ÿ”Š Decoded Audio"
419
  )
420
+ decode_status = gr.Textbox(label="๐Ÿ“Š Status", lines=2)
421
 
422
  decode_btn = gr.Button("๐Ÿ”Š Decode Audio", variant="primary", size="lg")
423
  decode_btn.click(
 
425
  inputs=[fc_input],
426
  outputs=[decoded_output, decode_status]
427
  )
428
+
429
+ gr.Markdown("### Note:")
430
+ gr.Markdown("Upload a .fc file created by this tool to decode it back to audio.")
431
 
432
  with gr.Tab("โ„น๏ธ About"):
433
  gr.Markdown("""
434
  ## FocalCodec - Ultra Low Bitrate Neural Audio Codec
435
 
436
+ ### ๐ŸŽฏ Compression Ratios:
437
+ | Format | Bitrate | 1-Hour File Size | Compression |
438
+ |--------|---------|------------------|-------------|
439
+ | **Uncompressed PCM** (16kHz mono) | 256 kbps | ~115 MB | 1x |
440
+ | **MP3** (standard) | 128 kbps | ~57 MB | 2x |
441
+ | **Opus** (voice optimized) | 16 kbps | ~7.2 MB | 16x |
442
+ | **FocalCodec** | **0.16 kbps** | **~72 KB** | **1600x** ๐Ÿ”ฅ |
443
+
444
+ ### ๐Ÿ’ก Use Cases:
445
+ - ๐Ÿ“ž **Ultra-low bandwidth voice calls** (satellite, deep space)
446
+ - ๐Ÿค– **AI-generated podcasts** (NotebookLM-style apps)
447
+ - ๐ŸŒ **Low-bandwidth regions** (2G networks)
448
+ - ๐Ÿ“ป **Emergency communications** (disaster relief)
449
+ - ๐ŸŽ“ **Educational content distribution** (offline learning)
450
+ - ๐Ÿ’พ **Voice memo storage** (years of recordings in MB)
451
+
452
+ ### โš–๏ธ Trade-offs:
453
+
454
+ **Pros:**
455
+ - โœ… Insanely efficient compression (1600x!)
456
+ - โœ… Speech remains highly intelligible
457
+ - ✅ Works on any sample rate (auto-resamples)
458
+ - โœ… Tiny storage/bandwidth requirements
459
+
460
+ **Cons:**
461
  - โŒ Voice characteristics may change
462
+ - โŒ Emotional nuances can be lost
463
+ - โŒ Occasional pronunciation artifacts
464
+ - โŒ Not suitable for music or non-speech audio
465
+
466
+ ### ๐Ÿ”ง Technical Details:
467
+ - **Model:** `lucadellalib/focalcodec_12_5hz`
468
+ - **Sample Rate:** 16 kHz
469
+ - **Token Rate:** ~12.5 tokens/second
470
+ - **Bits per Token:** 13 bits (auto-detected, optimally packed)
471
+ - **Target Bitrate:** 160 bps (12.5 × 13 = 162.5 bps)
472
+ - **File Format:** Custom binary format with metadata header
473
+
474
+ ### ๐Ÿงฎ How We Achieve 160 bps:
475
+
476
+ Traditional approach would waste bits:
477
+ ```
478
+ Token (0-8191) → int16 (16 bits) → 16 × 12.5 = 200 bps ❌
479
+ Wasting 3 bits per token!
480
+ ```
481
+
482
+ Our optimal approach:
483
+ ```
484
+ Token (0-8191) → 13 bits exactly → 13 × 12.5 = 162.5 bps ✅
485
+ Zero waste!
486
+ ```
487
+
488
+ ### ๐Ÿ”ฌ Debug Information:
489
+ Check the **console/terminal** for detailed encoding information:
490
+ - Actual token rate and range
491
+ - Bits per token (detected automatically)
492
+ - Expected vs actual bitrate
493
+ - File size breakdown (header vs data)
494
+ - Compression efficiency
495
+
496
+ ### ๐Ÿ“š Example Use Case - AI Podcast Library:
497
+
498
+ Imagine storing **1000 hours** of AI-generated podcasts:
499
+ - **Uncompressed:** 115 GB
500
+ - **MP3:** 57 GB
501
+ - **Opus:** 7.2 GB
502
+ - **FocalCodec:** **72 MB** ๐Ÿคฏ
503
+
504
+ You could fit an entire podcast library on a USB flash drive!
505
 
506
  ---
507
 
508
+ ### ๐Ÿ”— Links:
509
+ - [FocalCodec GitHub](https://github.com/lucadellalib/focalcodec)
510
+ - [Research Paper](https://arxiv.org/abs/2410.03608)
511
+
512
+ ### ๐Ÿ—๏ธ Built with:
513
+ - PyTorch + TorchAudio
514
+ - Gradio
515
+ - FocalCodec (Luca Della Libera et al.)
516
  """)
517
 
518
  if __name__ == "__main__":
519
+ print("\n" + "="*50)
520
+ print("๐ŸŽ™๏ธ FocalCodec 160 bps Demo")
521
+ print("="*50 + "\n")
522
  iface.launch()