MihaiPopa-1 committed on
Commit
ed3b7f8
·
verified ·
1 Parent(s): ac56286

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +134 -94
app.py CHANGED
@@ -53,11 +53,12 @@ def save_tokens_raw(toks, fc_file_path):
53
 
54
  toks_cpu = toks.cpu().numpy().flatten()
55
  max_token = int(toks_cpu.max())
 
56
 
57
  print(f"\n=== Saving Raw Tokens ===")
58
- print(f"Token shape: {toks.shape}")
59
- print(f"Token range: 0 to {max_token}")
60
- print(f"Num tokens: {len(toks_cpu)}")
61
 
62
  # Determine bits needed
63
  if max_token <= 1:
@@ -101,9 +102,15 @@ def save_tokens_raw(toks, fc_file_path):
101
  bits = format(int(tok), f'0{bits_needed}b')
102
  bit_array.extend([int(b) for b in bits])
103
 
 
 
104
  # Pad to byte boundary
 
105
  while len(bit_array) % 8 != 0:
106
  bit_array.append(0)
 
 
 
107
 
108
  # Pack into bytes
109
  packed_bits = np.packbits(np.array(bit_array, dtype=np.uint8))
@@ -114,7 +121,7 @@ def save_tokens_raw(toks, fc_file_path):
114
 
115
  file_size = os.path.getsize(fc_file_path)
116
 
117
- print(f"File size: {file_size} bytes (pure data, no header)")
118
  print(f"========================\n")
119
 
120
  return file_size, bits_needed, len(toks_cpu), toks.shape
@@ -124,40 +131,59 @@ def load_tokens_raw(fc_file_path, bits_per_token, num_tokens, original_shape):
124
  """Load raw tokens from headerless binary file"""
125
 
126
  print(f"\n=== Loading Raw Tokens ===")
127
- print(f"Expected bits/token: {bits_per_token}")
128
- print(f"Expected num tokens: {num_tokens}")
129
- print(f"Expected shape: {original_shape}")
 
130
 
131
  # Read all bytes
132
  with open(fc_file_path, 'rb') as f:
133
  packed_data = np.frombuffer(f.read(), dtype=np.uint8)
134
 
 
 
135
  # Unpack bits
136
  unpacked_bits = np.unpackbits(packed_data)
 
137
 
138
  # Extract exact number of bits needed
139
- total_bits = num_tokens * bits_per_token
140
- token_bits = unpacked_bits[:total_bits]
 
 
 
 
 
141
 
142
  # Reconstruct tokens
143
  tokens = []
144
  for i in range(num_tokens):
145
- start = i * bits_per_token
146
- end = start + bits_per_token
147
- token_bits_slice = token_bits[start:end]
148
 
149
- # Convert binary to integer
150
  token_value = 0
151
  for bit in token_bits_slice:
152
- token_value = (token_value << 1) | bit
 
153
  tokens.append(token_value)
154
 
 
 
 
155
  # Reshape to original shape
156
- tokens_array = np.array(tokens, dtype=np.int64).reshape(original_shape)
 
 
 
 
 
 
157
  tokens_tensor = torch.from_numpy(tokens_array)
158
 
159
- print(f"Loaded tokens: {tokens_tensor.shape}")
160
- print(f"Token range: {tokens_tensor.min().item()} to {tokens_tensor.max().item()}")
161
  print(f"==========================\n")
162
 
163
  return tokens_tensor
@@ -168,7 +194,8 @@ last_encoding_metadata = {
168
  'bits_per_token': None,
169
  'num_tokens': None,
170
  'shape': None,
171
- 'duration': None
 
172
  }
173
 
174
 
@@ -236,7 +263,7 @@ def encode_decode_focal(audio_input):
236
  print(f"Duration: {duration_sec:.2f}s")
237
  print(f"Token rate: {token_rate:.2f} tokens/sec")
238
 
239
- print("\n--- Decoding ---")
240
  rec_sig = codec.toks_to_sig(toks)
241
  print(f"Reconstructed signal shape: {rec_sig.shape}")
242
 
@@ -250,8 +277,9 @@ def encode_decode_focal(audio_input):
250
  last_encoding_metadata = {
251
  'bits_per_token': bits_per_token,
252
  'num_tokens': num_tokens,
253
- 'shape': shape,
254
- 'duration': duration_sec
 
255
  }
256
 
257
  # Calculate bitrates
@@ -259,10 +287,20 @@ def encode_decode_focal(audio_input):
259
  theoretical_bitrate = token_rate * bits_per_token
260
 
261
  print(f"--- Results ---")
262
- print(f"File bitrate: {bitrate:.1f} bps (pure data)")
263
  print(f"Theoretical: {theoretical_bitrate:.1f} bps")
264
  print(f"Target: 160 bps")
265
- print(f"Efficiency: {(160/bitrate)*100:.1f}% of target")
 
 
 
 
 
 
 
 
 
 
266
  print(f"{'='*50}\n")
267
 
268
  # Prepare output
@@ -271,8 +309,8 @@ def encode_decode_focal(audio_input):
271
  if len(decoded_wav_output.shape) == 0:
272
  decoded_wav_output = decoded_wav_output.reshape(1)
273
 
274
- metadata_info = f"\n\nℹ️ SAVE THIS: bits={bits_per_token}, tokens={num_tokens}, shape={shape}"
275
- status_msg = f"βœ… {duration_sec:.1f}s | {file_size}B | {bitrate:.0f} bps | {bits_per_token} bits/tok{metadata_info}"
276
 
277
  return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
278
 
@@ -293,25 +331,31 @@ def decode_from_fc_file(fc_file, bits_per_token_input, num_tokens_input, batch_s
293
  if fc_file is None:
294
  return None, "❌ Please upload a .fc file"
295
 
296
- # Try to use provided metadata, or fall back to last encoding
297
  try:
298
- bits_per_token = int(bits_per_token_input) if bits_per_token_input else last_encoding_metadata.get('bits_per_token')
299
- num_tokens = int(num_tokens_input) if num_tokens_input else last_encoding_metadata.get('num_tokens')
300
-
301
- if batch_size_input and seq_length_input:
 
302
  shape = (int(batch_size_input), int(seq_length_input))
 
303
  else:
304
- shape = last_encoding_metadata.get('shape')
305
-
306
- if not all([bits_per_token, num_tokens, shape]):
307
- return None, "❌ Please provide metadata (bits/token, num tokens, batch, seq_length) OR encode a file first"
 
 
 
 
308
 
309
- except Exception as e:
310
- return None, f"❌ Invalid metadata format: {str(e)}"
311
-
312
- try:
313
  print(f"\n{'='*50}")
314
- print(f"Decoding from file: {fc_file.name}")
 
 
 
 
 
315
 
316
  # Load tokens
317
  toks = load_tokens_raw(fc_file.name, bits_per_token, num_tokens, shape)
@@ -336,14 +380,14 @@ def decode_from_fc_file(fc_file, bits_per_token_input, num_tokens_input, batch_s
336
  print(f"Bitrate: {bitrate:.1f} bps")
337
  print(f"{'='*50}\n")
338
 
339
- status = f"βœ… Decoded! {duration_sec:.1f}s | {bitrate:.0f} bps | {bits_per_token} bits/token"
340
 
341
  return (codec.sample_rate_output, decoded_wav), status
342
 
343
  except Exception as e:
344
  import traceback
345
  traceback.print_exc()
346
- return None, f"❌ Error: {str(e)}"
347
 
348
 
349
  # --- Gradio Interface ---
@@ -370,7 +414,7 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
370
  file_output = gr.File(
371
  label="πŸ’Ύ Download Compressed .fc File (headerless)"
372
  )
373
- status_output = gr.Textbox(label="πŸ“Š Status", lines=4)
374
 
375
  encode_btn = gr.Button("πŸ”„ Encode & Decode", variant="primary", size="lg")
376
  encode_btn.click(
@@ -380,9 +424,9 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
380
  )
381
 
382
  gr.Markdown("### ⚠️ Important:")
383
- gr.Markdown("- The .fc file contains ONLY raw token data (no metadata/header)")
384
- gr.Markdown("- **Save the metadata** from the status message to decode later!")
385
- gr.Markdown("- You need: bits per token, number of tokens, and shape")
386
 
387
  with gr.Tab("πŸ“‚ Decode from .fc File"):
388
  gr.Markdown("### Decode raw .fc file (requires metadata)")
@@ -394,42 +438,42 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
394
  file_types=[".fc"]
395
  )
396
 
397
- gr.Markdown("#### Metadata (required for decoding):")
 
398
 
399
  with gr.Row():
400
  bits_input = gr.Number(
401
  label="Bits per token",
402
- value=13,
403
- precision=0,
404
- info="Usually 13 for this model"
405
  )
406
  tokens_input = gr.Number(
407
  label="Number of tokens",
408
- precision=0,
409
- info="Total tokens in file"
410
  )
411
 
412
  with gr.Row():
413
  batch_input = gr.Number(
414
  label="Batch size",
415
- value=1,
416
- precision=0,
417
- info="Usually 1"
418
  )
419
  seq_input = gr.Number(
420
  label="Sequence length",
421
- precision=0,
422
- info="Tokens per batch"
423
  )
424
 
425
- gr.Markdown("πŸ’‘ If you just encoded a file, leave these blank to use saved metadata")
 
426
 
427
  with gr.Column():
428
  decoded_output = gr.Audio(
429
  type="numpy",
430
  label="πŸ”Š Decoded Audio"
431
  )
432
- decode_status = gr.Textbox(label="πŸ“Š Status", lines=2)
433
 
434
  decode_btn = gr.Button("πŸ”Š Decode Audio", variant="primary", size="lg")
435
  decode_btn.click(
@@ -444,46 +488,42 @@ with gr.Blocks(title="FocalCodec 160 bps") as iface:
444
 
445
  ### 🎯 Pure Token Format (No Headers!)
446
 
447
- This version saves **ONLY the compressed tokens** with no metadata overhead.
448
-
449
- **Benefits:**
450
- - ✅ Absolute minimum file size
451
- - ✅ True 160 bps (no header padding)
452
- - ✅ Maximum compression efficiency
453
-
454
- **Trade-off:**
455
- - ⚠️ You must save the metadata separately to decode
456
- - Required info: bits per token, number of tokens, shape
457
-
458
- ### 📊 Compression Ratios:
459
- | Format | Bitrate | 1-Hour File Size |
460
- |--------|---------|------------------|
461
- | Uncompressed PCM | 256 kbps | ~115 MB |
462
- | MP3 | 128 kbps | ~57 MB |
463
- | Opus | 16 kbps | ~7.2 MB |
464
- | **FocalCodec** | **0.16 kbps** | **~72 KB** 🔥 |
465
-
466
- ### 🔧 Technical Details:
467
- - **Token Rate:** ~12.5 tokens/sec
468
- - **Bits per Token:** 13 bits (for most speech)
469
- - **Bitrate:** 12.5 × 13 = 162.5 bps ≈ **160 bps**
470
- - **Format:** Raw bit-packed tokens (no header)
471
-
472
- ### 📝 Example Metadata:
473
- After encoding, you'll see:
474
- ```
475
- ℹ️ SAVE THIS: bits=13, tokens=113, shape=(1, 113)
476
- ```
477
 
478
- Save this to decode the file later!
 
 
 
 
479
 
480
- ### 💡 Pro Tip:
481
- If you're building a system, embed the metadata in a separate JSON file:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
482
  ```json
483
  {
484
- "audio.fc": {
485
- "bits_per_token": 13,
486
- "num_tokens": 113,
487
  "shape": [1, 113],
488
  "duration": 9.04
489
  }
 
53
 
54
  toks_cpu = toks.cpu().numpy().flatten()
55
  max_token = int(toks_cpu.max())
56
+ min_token = int(toks_cpu.min())
57
 
58
  print(f"\n=== Saving Raw Tokens ===")
59
+ print(f"Original shape: {toks.shape}")
60
+ print(f"Flattened tokens: {len(toks_cpu)}")
61
+ print(f"Token range: {min_token} to {max_token}")
62
 
63
  # Determine bits needed
64
  if max_token <= 1:
 
102
  bits = format(int(tok), f'0{bits_needed}b')
103
  bit_array.extend([int(b) for b in bits])
104
 
105
+ print(f"Total bits: {len(bit_array)}")
106
+
107
  # Pad to byte boundary
108
+ padding = 0
109
  while len(bit_array) % 8 != 0:
110
  bit_array.append(0)
111
+ padding += 1
112
+
113
+ print(f"Padding bits: {padding}")
114
 
115
  # Pack into bytes
116
  packed_bits = np.packbits(np.array(bit_array, dtype=np.uint8))
 
121
 
122
  file_size = os.path.getsize(fc_file_path)
123
 
124
+ print(f"File size: {file_size} bytes")
125
  print(f"========================\n")
126
 
127
  return file_size, bits_needed, len(toks_cpu), toks.shape
 
131
  """Load raw tokens from headerless binary file"""
132
 
133
  print(f"\n=== Loading Raw Tokens ===")
134
+ print(f"File: {fc_file_path}")
135
+ print(f"Bits per token: {bits_per_token}")
136
+ print(f"Num tokens: {num_tokens}")
137
+ print(f"Target shape: {original_shape}")
138
 
139
  # Read all bytes
140
  with open(fc_file_path, 'rb') as f:
141
  packed_data = np.frombuffer(f.read(), dtype=np.uint8)
142
 
143
+ print(f"Read {len(packed_data)} bytes")
144
+
145
  # Unpack bits
146
  unpacked_bits = np.unpackbits(packed_data)
147
+ print(f"Unpacked to {len(unpacked_bits)} bits")
148
 
149
  # Extract exact number of bits needed
150
+ total_bits_needed = num_tokens * bits_per_token
151
+ print(f"Need {total_bits_needed} bits for {num_tokens} tokens")
152
+
153
+ if len(unpacked_bits) < total_bits_needed:
154
+ raise ValueError(f"Not enough bits in file! Have {len(unpacked_bits)}, need {total_bits_needed}")
155
+
156
+ token_bits = unpacked_bits[:total_bits_needed]
157
 
158
  # Reconstruct tokens
159
  tokens = []
160
  for i in range(num_tokens):
161
+ start_bit = i * bits_per_token
162
+ end_bit = start_bit + bits_per_token
163
+ token_bits_slice = token_bits[start_bit:end_bit]
164
 
165
+ # Convert binary array to integer
166
  token_value = 0
167
  for bit in token_bits_slice:
168
+ token_value = (token_value << 1) | int(bit)
169
+
170
  tokens.append(token_value)
171
 
172
+ print(f"Reconstructed {len(tokens)} tokens")
173
+ print(f"Token range: {min(tokens)} to {max(tokens)}")
174
+
175
  # Reshape to original shape
176
+ tokens_array = np.array(tokens, dtype=np.int64)
177
+
178
+ # Validate shape
179
+ if tokens_array.size != np.prod(original_shape):
180
+ raise ValueError(f"Shape mismatch! Have {tokens_array.size} tokens, need {np.prod(original_shape)}")
181
+
182
+ tokens_array = tokens_array.reshape(original_shape)
183
  tokens_tensor = torch.from_numpy(tokens_array)
184
 
185
+ print(f"Final tensor shape: {tokens_tensor.shape}")
186
+ print(f"Final token range: {tokens_tensor.min().item()} to {tokens_tensor.max().item()}")
187
  print(f"==========================\n")
188
 
189
  return tokens_tensor
 
194
  'bits_per_token': None,
195
  'num_tokens': None,
196
  'shape': None,
197
+ 'duration': None,
198
+ 'filename': None
199
  }
200
 
201
 
 
263
  print(f"Duration: {duration_sec:.2f}s")
264
  print(f"Token rate: {token_rate:.2f} tokens/sec")
265
 
266
+ print("\n--- Decoding (test) ---")
267
  rec_sig = codec.toks_to_sig(toks)
268
  print(f"Reconstructed signal shape: {rec_sig.shape}")
269
 
 
277
  last_encoding_metadata = {
278
  'bits_per_token': bits_per_token,
279
  'num_tokens': num_tokens,
280
+ 'shape': tuple(shape),
281
+ 'duration': duration_sec,
282
+ 'filename': fc_file_path
283
  }
284
 
285
  # Calculate bitrates
 
287
  theoretical_bitrate = token_rate * bits_per_token
288
 
289
  print(f"--- Results ---")
290
+ print(f"File bitrate: {bitrate:.1f} bps")
291
  print(f"Theoretical: {theoretical_bitrate:.1f} bps")
292
  print(f"Target: 160 bps")
293
+ print(f"Efficiency: {(bitrate/160)*100:.1f}% of target")
294
+
295
+ # TEST: Try to decode immediately to verify
296
+ print(f"\n--- Verification: Decoding saved file ---")
297
+ try:
298
+ test_toks = load_tokens_raw(fc_file_path, bits_per_token, num_tokens, shape)
299
+ print(f"βœ… Verification successful!")
300
+ print(f"Tokens match: {torch.equal(toks.cpu(), test_toks)}")
301
+ except Exception as e:
302
+ print(f"❌ Verification failed: {e}")
303
+
304
  print(f"{'='*50}\n")
305
 
306
  # Prepare output
 
309
  if len(decoded_wav_output.shape) == 0:
310
  decoded_wav_output = decoded_wav_output.reshape(1)
311
 
312
+ metadata_str = f"bits={bits_per_token}, tokens={num_tokens}, shape={shape}"
313
+ status_msg = f"βœ… {duration_sec:.1f}s | {file_size}B | {bitrate:.0f} bps | {bits_per_token} bits/tok\n\nπŸ“‹ METADATA: {metadata_str}"
314
 
315
  return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
316
 
 
331
  if fc_file is None:
332
  return None, "❌ Please upload a .fc file"
333
 
 
334
  try:
335
+ # Parse metadata
336
+ if bits_per_token_input and num_tokens_input and batch_size_input and seq_length_input:
337
+ # Use provided values
338
+ bits_per_token = int(bits_per_token_input)
339
+ num_tokens = int(num_tokens_input)
340
  shape = (int(batch_size_input), int(seq_length_input))
341
+ print("Using manually provided metadata")
342
  else:
343
+ # Use saved metadata
344
+ if not last_encoding_metadata.get('bits_per_token'):
345
+ return None, "❌ No metadata available! Either encode a file first OR provide all metadata fields"
346
+
347
+ bits_per_token = last_encoding_metadata['bits_per_token']
348
+ num_tokens = last_encoding_metadata['num_tokens']
349
+ shape = last_encoding_metadata['shape']
350
+ print("Using saved metadata from last encoding")
351
 
 
 
 
 
352
  print(f"\n{'='*50}")
353
+ print(f"Decoding file: {fc_file.name}")
354
+ print(f"Metadata: bits={bits_per_token}, tokens={num_tokens}, shape={shape}")
355
+
356
+ # Validate
357
+ if num_tokens != shape[0] * shape[1]:
358
+ return None, f"❌ Shape mismatch! {num_tokens} tokens != {shape[0]}Γ—{shape[1]} = {shape[0]*shape[1]}"
359
 
360
  # Load tokens
361
  toks = load_tokens_raw(fc_file.name, bits_per_token, num_tokens, shape)
 
380
  print(f"Bitrate: {bitrate:.1f} bps")
381
  print(f"{'='*50}\n")
382
 
383
+ status = f"βœ… Decoded successfully!\n{duration_sec:.1f}s | {file_size}B | {bitrate:.0f} bps | {bits_per_token} bits/tok"
384
 
385
  return (codec.sample_rate_output, decoded_wav), status
386
 
387
  except Exception as e:
388
  import traceback
389
  traceback.print_exc()
390
+ return None, f"❌ Decoding error: {str(e)}"
391
 
392
 
393
  # --- Gradio Interface ---
 
414
  file_output = gr.File(
415
  label="πŸ’Ύ Download Compressed .fc File (headerless)"
416
  )
417
+ status_output = gr.Textbox(label="πŸ“Š Status", lines=5)
418
 
419
  encode_btn = gr.Button("πŸ”„ Encode & Decode", variant="primary", size="lg")
420
  encode_btn.click(
 
424
  )
425
 
426
  gr.Markdown("### ⚠️ Important:")
427
+ gr.Markdown("- The .fc file contains ONLY raw token data (no metadata)")
428
+ gr.Markdown("- **Copy the METADATA from the status box** to decode later!")
429
+ gr.Markdown("- Format: `bits=13, tokens=113, shape=(1, 113)`")
430
 
431
  with gr.Tab("πŸ“‚ Decode from .fc File"):
432
  gr.Markdown("### Decode raw .fc file (requires metadata)")
 
438
  file_types=[".fc"]
439
  )
440
 
441
+ gr.Markdown("#### πŸ“‹ Metadata (from encoding step):")
442
+ gr.Markdown("Leave blank to use last encoded file's metadata")
443
 
444
  with gr.Row():
445
  bits_input = gr.Number(
446
  label="Bits per token",
447
+ placeholder="e.g., 13",
448
+ precision=0
 
449
  )
450
  tokens_input = gr.Number(
451
  label="Number of tokens",
452
+ placeholder="e.g., 113",
453
+ precision=0
454
  )
455
 
456
  with gr.Row():
457
  batch_input = gr.Number(
458
  label="Batch size",
459
+ placeholder="e.g., 1",
460
+ precision=0
 
461
  )
462
  seq_input = gr.Number(
463
  label="Sequence length",
464
+ placeholder="e.g., 113",
465
+ precision=0
466
  )
467
 
468
+ gr.Markdown("πŸ’‘ **Example:** If metadata says `bits=13, tokens=113, shape=(1, 113)`")
469
+ gr.Markdown("Enter: bits=13, tokens=113, batch=1, seq=113")
470
 
471
  with gr.Column():
472
  decoded_output = gr.Audio(
473
  type="numpy",
474
  label="πŸ”Š Decoded Audio"
475
  )
476
+ decode_status = gr.Textbox(label="πŸ“Š Status", lines=3)
477
 
478
  decode_btn = gr.Button("πŸ”Š Decode Audio", variant="primary", size="lg")
479
  decode_btn.click(
 
488
 
489
  ### 🎯 Pure Token Format (No Headers!)
490
 
491
+ This version saves **ONLY the compressed tokens** with zero overhead.
492
+
493
+ ### 📊 Compression:
494
+ - **Uncompressed:** 256 kbps → 115 MB/hour
495
+ - **FocalCodec:** 160 bps → **72 KB/hour** (1600x smaller!)
496
+
497
+ ### 🔧 How to Use:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
498
 
499
+ **Encoding:**
500
+ 1. Upload/record audio
501
+ 2. Click "Encode & Decode"
502
+ 3. **COPY THE METADATA** from status (important!)
503
+ 4. Download .fc file
504
 
505
+ **Decoding:**
506
+ 1. Upload .fc file
507
+ 2. Enter metadata OR leave blank if you just encoded
508
+ 3. Click "Decode Audio"
509
+
510
+ ### 📝 Metadata Format:
511
+ ```
512
+ bits=13, tokens=113, shape=(1, 113)
513
+ ```
514
+ Means:
515
+ - 13 bits per token
516
+ - 113 total tokens
517
+ - Batch size = 1
518
+ - Sequence length = 113
519
+
520
+ ### 💡 Storage Tip:
521
+ Store metadata in a companion JSON file:
522
  ```json
523
  {
524
+ "recording_001.fc": {
525
+ "bits": 13,
526
+ "tokens": 113,
527
  "shape": [1, 113],
528
  "duration": 9.04
529
  }