MihaiPopa-1 committed on
Commit
38b610c
·
verified ·
1 Parent(s): cb8da7c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +26 -26
app.py CHANGED
@@ -61,68 +61,68 @@ def encode_decode_focal(audio_input):
61
  try:
62
  sr, wav_numpy = audio_input
63
 
64
- print(f"Input audio: sample_rate={sr}, shape={wav_numpy.shape}, dtype={wav_numpy.dtype}")
65
-
66
  # Handle stereo to mono conversion
67
  if len(wav_numpy.shape) > 1:
68
- if wav_numpy.shape[1] == 2: # Stereo
69
- wav_numpy = wav_numpy.mean(axis=1) # Average both channels
70
- print("Converted stereo to mono")
71
- elif wav_numpy.shape[0] == 2: # Channels first
72
  wav_numpy = wav_numpy.mean(axis=0)
73
- print("Converted stereo to mono (channels first)")
74
 
75
  # Ensure float32 and normalize
76
  wav_numpy = wav_numpy.astype(np.float32)
77
  if wav_numpy.max() > 1.0 or wav_numpy.min() < -1.0:
78
- wav_numpy = wav_numpy / 32768.0 # Normalize int16 to float
79
 
80
  # Convert to torch tensor [1, samples]
81
  sig = torch.from_numpy(wav_numpy).unsqueeze(0)
82
 
83
- print(f"Tensor shape before resample: {sig.shape}")
84
-
85
- # Resample to 16kHz (required by FocalCodec)
86
  if sr != codec.sample_rate_input:
87
- print(f"Resampling from {sr}Hz to {codec.sample_rate_input}Hz...")
88
  resampler = torchaudio.transforms.Resample(
89
  orig_freq=sr,
90
  new_freq=codec.sample_rate_input
91
  )
92
  sig = resampler(sig)
93
 
94
- print(f"Tensor shape after resample: {sig.shape}")
95
-
96
- # Move to GPU if available
97
  if torch.cuda.is_available():
98
  sig = sig.cuda()
99
 
100
  # --- Encode and Decode ---
101
  with torch.no_grad():
102
- print("Encoding to tokens...")
103
  toks = codec.sig_to_toks(sig)
104
- print(f"Tokens shape: {toks.shape}")
105
-
106
- print("Decoding tokens to audio...")
107
  rec_sig = codec.toks_to_sig(toks)
108
- print(f"Reconstructed signal shape: {rec_sig.shape}")
 
 
109
 
110
- # --- Save the compressed tokens to a temporary .fc file ---
111
  temp_dir = tempfile.mkdtemp()
112
  fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
113
- torch.save(toks.cpu(), fc_file_path)
114
 
 
 
 
 
 
 
 
 
115
  file_size_bytes = os.path.getsize(fc_file_path)
116
- print(f"Tokens saved to {fc_file_path} ({file_size_bytes} bytes)")
 
 
 
 
 
 
117
 
118
- # Move audio back to CPU for Gradio output
119
  decoded_wav_output = rec_sig.cpu().numpy().squeeze()
120
 
121
- # Ensure proper shape for Gradio
122
  if len(decoded_wav_output.shape) == 0:
123
  decoded_wav_output = decoded_wav_output.reshape(1)
124
 
125
- status_msg = f"✅ Success! Compressed tokens: {file_size_bytes} bytes"
126
 
127
  return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
128
 
 
61
  try:
62
  sr, wav_numpy = audio_input
63
 
 
 
64
  # Handle stereo to mono conversion
65
  if len(wav_numpy.shape) > 1:
66
+ if wav_numpy.shape[1] == 2:
67
+ wav_numpy = wav_numpy.mean(axis=1)
68
+ elif wav_numpy.shape[0] == 2:
 
69
  wav_numpy = wav_numpy.mean(axis=0)
 
70
 
71
  # Ensure float32 and normalize
72
  wav_numpy = wav_numpy.astype(np.float32)
73
  if wav_numpy.max() > 1.0 or wav_numpy.min() < -1.0:
74
+ wav_numpy = wav_numpy / 32768.0
75
 
76
  # Convert to torch tensor [1, samples]
77
  sig = torch.from_numpy(wav_numpy).unsqueeze(0)
78
 
79
+ # Resample to 16kHz
 
 
80
  if sr != codec.sample_rate_input:
 
81
  resampler = torchaudio.transforms.Resample(
82
  orig_freq=sr,
83
  new_freq=codec.sample_rate_input
84
  )
85
  sig = resampler(sig)
86
 
 
 
 
87
  if torch.cuda.is_available():
88
  sig = sig.cuda()
89
 
90
  # --- Encode and Decode ---
91
  with torch.no_grad():
 
92
  toks = codec.sig_to_toks(sig)
 
 
 
93
  rec_sig = codec.toks_to_sig(toks)
94
+
95
+ # Get binary codes for true compression
96
+ codes = codec.toks_to_codes(toks)
97
 
98
+ # --- Save as truly compressed binary file ---
99
  temp_dir = tempfile.mkdtemp()
100
  fc_file_path = os.path.join(temp_dir, "compressed_tokens.fc")
 
101
 
102
+ # Convert codes to binary and pack
103
+ codes_cpu = codes.cpu().numpy().astype(np.uint8)
104
+ packed_bits = np.packbits(codes_cpu.flatten())
105
+
106
+ with open(fc_file_path, 'wb') as f:
107
+ f.write(packed_bits.tobytes())
108
+
109
+ # Calculate stats
110
  file_size_bytes = os.path.getsize(fc_file_path)
111
+ duration_sec = sig.shape[-1] / codec.sample_rate_input
112
+ expected_size = (160 * duration_sec) / 8
113
+ actual_bitrate = (file_size_bytes * 8) / duration_sec
114
+
115
+ print(f"Duration: {duration_sec:.2f}s")
116
+ print(f"File size: {file_size_bytes} bytes (expected: ~{expected_size:.0f} bytes)")
117
+ print(f"Actual bitrate: {actual_bitrate:.0f} bps")
118
 
119
+ # Move audio back to CPU
120
  decoded_wav_output = rec_sig.cpu().numpy().squeeze()
121
 
 
122
  if len(decoded_wav_output.shape) == 0:
123
  decoded_wav_output = decoded_wav_output.reshape(1)
124
 
125
+ status_msg = f"✅ Duration: {duration_sec:.1f}s | File: {file_size_bytes} bytes | Bitrate: {actual_bitrate:.0f} bps"
126
 
127
  return (codec.sample_rate_output, decoded_wav_output), fc_file_path, status_msg
128