Spaces:

MihaiPopa-1
/

FocalCodec-Demo

Sleeping

App Files Files Community

MihaiPopa-1 committed 11 days ago

Commit

38b610c

verified ·

1 Parent(s): cb8da7c

Update app.py

Browse files

Files changed (1) hide show

app.py +26 -26

app.py CHANGED Viewed

@@ -61,68 +61,68 @@ def encode_decode_focal(audio_input):
     try:
         sr, wav_numpy = audio_input
-        print(f"Input audio: sample_rate={sr}, shape={wav_numpy.shape}, dtype={wav_numpy.dtype}")
         # Handle stereo to mono conversion
         if len(wav_numpy.shape) > 1:
-            if wav_numpy.shape[1] == 2:  # Stereo
-                wav_numpy = wav_numpy.mean(axis=1)  # Average both channels
-                print("Converted stereo to mono")
-            elif wav_numpy.shape[0] == 2:  # Channels first
                 wav_numpy = wav_numpy.mean(axis=0)
-                print("Converted stereo to mono (channels first)")
         # Ensure float32 and normalize
         wav_numpy = wav_numpy.astype(np.float32)
         if wav_numpy.max() > 1.0 or wav_numpy.min() < -1.0:
-            wav_numpy = wav_numpy / 32768.0  # Normalize int16 to float
         # Convert to torch tensor [1, samples]
         sig = torch.from_numpy(wav_numpy).unsqueeze(0)
-        print(f"Tensor shape before resample: {sig.shape}")
-        # Resample to 16kHz (required by FocalCodec)
         if sr != codec.sample_rate_input:
-            print(f"Resampling from {sr}Hz to {codec.sample_rate_input}Hz...")
             resampler = torchaudio.transforms.Resample(
                 orig_freq=sr,
                 new_freq=codec.sample_rate_input
             )
             sig = resampler(sig)
-        print(f"Tensor shape after resample: {sig.shape}")
-        # Move to GPU if available
         if torch.cuda.is_available():
             sig = sig.cuda()
         # --- Encode and Decode ---
         with torch.no_grad():
-            print("Encoding to tokens...")
             toks = codec.sig_to_toks(sig)
-            print(f"Tokens shape: {toks.shape}")
-            print("Decoding tokens to audio...")
             rec_sig = codec.toks_to_sig(toks)
-            print(f"Reconstructed signal shape: {rec_sig.shape}")
-        # --- Save the compressed tokens to a temporary .fc file ---
         temp_dir = tempfile.mkdtemp()
         fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
-        torch.save(toks.cpu(), fc_file_path)
         file_size_bytes = os.path.getsize(fc_file_path)
-        print(f"Tokens saved to {fc_file_path} ({file_size_bytes} bytes)")
-        # Move audio back to CPU for Gradio output
         decoded_wav_output = rec_sig.cpu().numpy().squeeze()
-        # Ensure proper shape for Gradio
         if len(decoded_wav_output.shape) == 0:
             decoded_wav_output = decoded_wav_output.reshape(1)
-        status_msg = f"✅ Success! Compressed tokens: {file_size_bytes} bytes"
         return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg

     try:
         sr, wav_numpy = audio_input
         # Handle stereo to mono conversion
         if len(wav_numpy.shape) > 1:
+            if wav_numpy.shape[1] == 2:
+                wav_numpy = wav_numpy.mean(axis=1)
+            elif wav_numpy.shape[0] == 2:
                 wav_numpy = wav_numpy.mean(axis=0)
         # Ensure float32 and normalize
         wav_numpy = wav_numpy.astype(np.float32)
         if wav_numpy.max() > 1.0 or wav_numpy.min() < -1.0:
+            wav_numpy = wav_numpy / 32768.0
         # Convert to torch tensor [1, samples]
         sig = torch.from_numpy(wav_numpy).unsqueeze(0)
+        # Resample to 16kHz
         if sr != codec.sample_rate_input:
             resampler = torchaudio.transforms.Resample(
                 orig_freq=sr,
                 new_freq=codec.sample_rate_input
             )
             sig = resampler(sig)
         if torch.cuda.is_available():
             sig = sig.cuda()
         # --- Encode and Decode ---
         with torch.no_grad():
             toks = codec.sig_to_toks(sig)
             rec_sig = codec.toks_to_sig(toks)
+            # Get binary codes for true compression
+            codes = codec.toks_to_codes(toks)
+        # --- Save as truly compressed binary file ---
         temp_dir = tempfile.mkdtemp()
         fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
+        # Convert codes to binary and pack
+        codes_cpu = codes.cpu().numpy().astype(np.uint8)
+        packed_bits = np.packbits(codes_cpu.flatten())
+        with open(fc_file_path, 'wb') as f:
+            f.write(packed_bits.tobytes())
+        # Calculate stats
         file_size_bytes = os.path.getsize(fc_file_path)
+        duration_sec = sig.shape[-1] / codec.sample_rate_input
+        expected_size = (160 * duration_sec) / 8
+        actual_bitrate = (file_size_bytes * 8) / duration_sec
+        print(f"Duration: {duration_sec:.2f}s")
+        print(f"File size: {file_size_bytes} bytes (expected: ~{expected_size:.0f} bytes)")
+        print(f"Actual bitrate: {actual_bitrate:.0f} bps")
+        # Move audio back to CPU
         decoded_wav_output = rec_sig.cpu().numpy().squeeze()
         if len(decoded_wav_output.shape) == 0:
             decoded_wav_output = decoded_wav_output.reshape(1)
+        status_msg = f"✅ Duration: {duration_sec:.1f}s | File: {file_size_bytes} bytes | Bitrate: {actual_bitrate:.0f} bps"
         return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg