humair025 committed on
Commit
fc1def1
·
verified ·
1 Parent(s): 9522478

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -183
app.py CHANGED
@@ -1,6 +1,6 @@
1
  import subprocess
2
  import sys
3
- import os
4
 
5
  # Auto-install neucodec if missing
6
  try:
@@ -12,195 +12,77 @@ except ImportError:
12
  # Other imports
13
  import gradio as gr
14
  import torch
 
 
 
15
  import librosa
16
  import soundfile as sf
17
  import numpy as np
18
- from neucodec import DistillNeuCodec
19
 
20
  # Load model on CPU
21
  model = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec")
22
- model.eval()
23
- model.to("cpu")
24
-
25
- # utils
26
- def ensure_dir(d):
27
- if not os.path.exists(d):
28
- os.makedirs(d)
29
-
30
- OUT_DIR = "neucodec_out"
31
- ensure_dir(OUT_DIR)
32
-
33
- def _audio_to_tensor(y, sr, target_sr=16000):
34
- # return (1, 1, T) tensor at target_sr
35
- if sr != target_sr:
36
- y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
37
- sr = target_sr
38
- # normalize (optional) and convert to torch float
39
- y = np.asarray(y, dtype=np.float32)
40
- t = torch.from_numpy(y).unsqueeze(0).unsqueeze(0) # (1, 1, T)
41
- return t, sr
42
-
43
- def _codes_to_text(codes):
44
- """
45
- Convert code(s) to a plain text format that's easy to copy/paste:
46
- - If codes is a Tensor -> a single line of ints
47
- - If codes is a list/tuple of tensors -> each tensor's tokens are placed on their own line
48
- Returns token_text (str).
49
- """
50
- if isinstance(codes, torch.Tensor):
51
- arr = codes.squeeze(0).cpu().numpy()
52
- if arr.ndim == 1:
53
- lines = [" ".join(map(str, arr.astype(int).tolist()))]
54
- else:
55
- # e.g. (C, T) or (T, C) - flatten each row
56
- lines = [" ".join(map(str, row.astype(int).tolist())) for row in arr]
57
- elif isinstance(codes, (list, tuple)):
58
- lines = []
59
- for c in codes:
60
- a = c.squeeze(0).cpu().numpy()
61
- if a.ndim == 1:
62
- lines.append(" ".join(map(str, a.astype(int).tolist())))
63
- else:
64
- # flatten rows
65
- lines.extend(" ".join(map(str, row.astype(int).tolist())) for row in a)
66
- else:
67
- raise ValueError("Unsupported code format for serialization: %r" % type(codes))
68
- token_text = "\n".join(lines)
69
- return token_text
70
-
71
- def _text_to_codes(token_text):
72
- """
73
- Parse the token_text format produced by _codes_to_text back into a list of torch tensors.
74
- Each line becomes a tensor of shape (1, T). Return list-of-tensors.
75
- """
76
- lines = [ln.strip() for ln in token_text.strip().splitlines() if ln.strip()]
77
- if len(lines) == 0:
78
- raise ValueError("No tokens found in input.")
79
- parsed = []
80
- for ln in lines:
81
- # accept commas or spaces
82
- ln = ln.replace(",", " ")
83
- parts = [p for p in ln.split() if p]
84
- ints = list(map(int, parts))
85
- t = torch.tensor(ints, dtype=torch.long).unsqueeze(0) # shape (1, T)
86
- parsed.append(t)
87
- return parsed
88
-
89
- # --- Core operations ---
90
-
91
- def encode_and_reconstruct(audio_file):
92
- """
93
- - Load uploaded audio_file (filepath)
94
- - Encode with DistillNeuCodec -> produce token text + token file
95
- - Decode back to waveform -> save reconstructed.wav (24k)
96
- - Return (recon_path, token_text, token_file_path)
97
- """
98
- if audio_file is None or audio_file == "":
99
- return None, "No audio uploaded.", None
100
-
101
- # load with librosa (preserve original sr then convert)
102
- y, sr = librosa.load(audio_file, sr=None, mono=True)
103
- t, sr16000 = _audio_to_tensor(y, sr, target_sr=16000) # model expects 16k input typically
104
- t = t.to("cpu")
105
-
106
  with torch.no_grad():
107
- # encode_code may return a tensor or a list/tuple depending on model
108
- fsq_codes = model.encode_code(t) # encode
109
- # create token-friendly text
110
- token_text = _codes_to_text(fsq_codes)
111
-
112
- # print to console (visible when running locally)
113
- print("==== Audio tokens (copyable) ====")
114
- print(token_text)
115
- print("=================================")
116
-
117
- # save token file
118
- token_file_path = os.path.join(OUT_DIR, "audio_tokens.txt")
119
- with open(token_file_path, "w", encoding="utf-8") as f:
120
- f.write(token_text)
121
-
122
- # decode to waveform
123
  recon = model.decode_code(fsq_codes)
124
- # recon tensor -> CPU numpy
125
- if isinstance(recon, torch.Tensor):
126
- recon_wav = recon.squeeze().cpu().numpy()
127
- else:
128
- recon_wav = np.asarray(recon)
129
-
130
- recon_path = os.path.join(OUT_DIR, "reconstructed.wav")
131
- # model outputs 24000Hz in your original script — keep that unchanged
132
- sf.write(recon_path, recon_wav, 24000)
133
-
134
- return recon_path, token_text, token_file_path
135
-
136
- def decode_tokens_to_audio(token_text):
137
- """
138
- - Accept token text (as produced by encode_and_reconstruct)
139
- - Parse it back to code tensors and call model.decode_code(...)
140
- - Save decoded audio and return path
141
- """
142
- if token_text is None or token_text.strip() == "":
143
- return None, "No tokens provided."
144
-
145
- try:
146
- parsed_codes = _text_to_codes(token_text)
147
- except Exception as e:
148
- return None, f"Failed to parse tokens: {e}"
149
-
150
- try:
151
- with torch.no_grad():
152
- # Many neucodec models accept a list of 1D tensors (1, T) per codebook or a single Tensor.
153
- # We'll pass the list; if the model expects a single tensor, it will often still accept it or raise.
154
- recon = model.decode_code(parsed_codes)
155
- except Exception as e:
156
- return None, f"Decoding failed: {e}"
157
-
158
- if isinstance(recon, torch.Tensor):
159
- recon_wav = recon.squeeze().cpu().numpy()
160
- else:
161
- recon_wav = np.asarray(recon)
162
-
163
- recon_path = os.path.join(OUT_DIR, "decoded_from_tokens.wav")
164
- sf.write(recon_path, recon_wav, 24000)
165
- return recon_path, "Decoded successfully."
166
-
167
- # --- Gradio UI ---
168
-
169
- with gr.Blocks(title="DistillNeuCodec — encode tokens & decode tokens (CPU)") as demo:
170
- gr.Markdown("## DistillNeuCodec — Encode → tokens (copyable) and Decode → audio\n"
171
- "Upload audio to produce tokens (plain text, one line per codebook). Copy/paste the tokens into the decoder tab to reconstruct from tokens.")
172
- with gr.Tab("Encode & Reconstruct"):
173
- inp_audio = gr.Audio(type="filepath", label="Upload audio (any sr)")
174
- encode_btn = gr.Button("Encode & Reconstruct")
175
- out_audio = gr.Audio(type="filepath", label="Reconstructed Audio (24k)")
176
- out_tokens = gr.Textbox(label="Audio tokens (copy this text)", lines=8)
177
- token_file = gr.File(label="Download token file")
178
-
179
- def _encode_click(aud):
180
- recon_path, token_text, token_file_path = encode_and_reconstruct(aud)
181
- # token_file_path will be a text file with tokens
182
- return recon_path, token_text, token_file_path
183
-
184
- encode_btn.click(fn=_encode_click, inputs=[inp_audio], outputs=[out_audio, out_tokens, token_file])
185
-
186
- with gr.Tab("Decode from Tokens"):
187
- tokens_input = gr.Textbox(label="Paste tokens here (exactly as produced above). One codebook per line.", lines=8)
188
- decode_btn = gr.Button("Decode tokens → audio")
189
- decoded_audio = gr.Audio(type="filepath", label="Decoded Audio (24k)")
190
- decode_status = gr.Textbox(label="Status / Errors", interactive=False)
191
-
192
- def _decode_click(tokens_text):
193
- recon_path, status = decode_tokens_to_audio(tokens_text)
194
- # recon_path could be None on error
195
- return recon_path, status
196
-
197
- decode_btn.click(fn=_decode_click, inputs=[tokens_input], outputs=[decoded_audio, decode_status])
198
-
199
- gr.Markdown("### Notes\n"
200
- "- The token text is plain, space-separated integers. Each line corresponds to one set of tokens (e.g., one codebook). Copy/paste lines exactly to decode.\n"
201
- "- If your tokens came from a single-line encode, paste the single line. If multiple lines, paste all lines.\n"
202
- "- If you prefer a machine format, download `audio_tokens.txt` and upload a text file with the same format to the decoder tab.\n"
203
- "- Decoding may fail if the token shape doesn't match what the model expects; if that happens I'll print the decoder error in the status box.")
204
 
205
  if __name__ == "__main__":
206
- demo.launch()
 
1
  import subprocess
2
  import sys
3
+ import time
4
 
5
  # Auto-install neucodec if missing
6
  try:
 
12
  # Other imports
13
  import gradio as gr
14
  import torch
15
+ import torchaudio
16
+ from torchaudio import transforms as T
17
+ from neucodec import DistillNeuCodec
18
  import librosa
19
  import soundfile as sf
20
  import numpy as np
 
21
 
22
  # Load model on CPU
23
  model = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec")
24
+ model.eval() # CPU only
25
+
26
+ def reconstruct_audio(audio_file):
27
+ # Start timer
28
+ start_time = time.time()
29
+
30
+ # Load audio with librosa
31
+ y, sr = librosa.load(audio_file, sr=None, mono=True) # Keep original sr
32
+ orig_sr = sr
33
+ orig_len = len(y)
34
+
35
+ # Resample to 16kHz if needed for model encoding
36
+ if sr != 16000:
37
+ y = librosa.resample(y, orig_sr=sr, target_sr=16000)
38
+ sr = 16000
39
+
40
+ # Convert to tensor (1, 1, T)
41
+ y_tensor = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)
42
+
43
+ # Encode & decode
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
  with torch.no_grad():
45
+ fsq_codes = model.encode_code(y_tensor)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  recon = model.decode_code(fsq_codes)
47
+
48
+ recon = recon.squeeze().cpu().numpy()
49
+
50
+ # Save reconstructed audio
51
+ recon_path = "reconstructed.wav"
52
+ sf.write(recon_path, recon, 24000)
53
+
54
+ # End timer
55
+ elapsed_time = time.time() - start_time
56
+
57
+ # Metadata
58
+ metadata = {
59
+ "original_sr": orig_sr,
60
+ "original_length_samples": orig_len,
61
+ "resampled_sr": sr,
62
+ "reconstructed_sr": 24000,
63
+ "num_tokens": fsq_codes.shape,
64
+ "processing_time_sec": round(elapsed_time, 3),
65
+ "input_file": audio_file,
66
+ "output_file": recon_path
67
+ }
68
+
69
+ # Print info
70
+ print("\n=== Audio Reconstruction Info ===")
71
+ for k, v in metadata.items():
72
+ print(f"{k}: {v}")
73
+
74
+ # Return both reconstructed file and metadata for Gradio
75
+ return recon_path, f"Tokens: {fsq_codes.shape}, Processing time: {elapsed_time:.3f}s"
76
+
77
+ # Gradio interface
78
+ iface = gr.Interface(
79
+ fn=reconstruct_audio,
80
+ inputs=gr.Audio(type="filepath", label="Upload Audio"),
81
+ outputs=[gr.Audio(type="filepath", label="Reconstructed Audio"),
82
+ gr.Textbox(label="Info")],
83
+ title="Audio Reconstruction with DistillNeuCodec (CPU + Librosa)",
84
+ description="Upload any audio file, and this app will reconstruct it using DistillNeuCodec at 24kHz on CPU. Metadata and token info are also displayed."
85
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
  if __name__ == "__main__":
88
+ iface.launch()