Spaces:

humair025
/

neucodec

Sleeping

App Files Community

humair025 committed 28 days ago

Commit

fc1def1

verified ·

1 Parent(s): 9522478

Update app.py

Browse files

Files changed (1) hide show

app.py +65 -183

app.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import subprocess
 import sys
-import os
 # Auto-install neucodec if missing
 try:
@@ -12,195 +12,77 @@ except ImportError:
 # Other imports
 import gradio as gr
 import torch
 import librosa
 import soundfile as sf
 import numpy as np
-from neucodec import DistillNeuCodec
 # Load model on CPU
 model = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec")
-model.eval()
-model.to("cpu")
-# utils
-def ensure_dir(d):
-    if not os.path.exists(d):
-        os.makedirs(d)
-OUT_DIR = "neucodec_out"
-ensure_dir(OUT_DIR)
-def _audio_to_tensor(y, sr, target_sr=16000):
-    # return (1, 1, T) tensor at target_sr
-    if sr != target_sr:
-        y = librosa.resample(y, orig_sr=sr, target_sr=target_sr)
-        sr = target_sr
-    # normalize (optional) and convert to torch float
-    y = np.asarray(y, dtype=np.float32)
-    t = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)  # (1, 1, T)
-    return t, sr
-def _codes_to_text(codes):
-    """
-    Convert code(s) to a plain text format that's easy to copy/paste:
-    - If codes is a Tensor -> a single line of ints
-    - If codes is a list/tuple of tensors -> each tensor's tokens are placed on their own line
-    Returns token_text (str).
-    """
-    if isinstance(codes, torch.Tensor):
-        arr = codes.squeeze(0).cpu().numpy()
-        if arr.ndim == 1:
-            lines = [" ".join(map(str, arr.astype(int).tolist()))]
-        else:
-            # e.g. (C, T) or (T, C) - flatten each row
-            lines = [" ".join(map(str, row.astype(int).tolist())) for row in arr]
-    elif isinstance(codes, (list, tuple)):
-        lines = []
-        for c in codes:
-            a = c.squeeze(0).cpu().numpy()
-            if a.ndim == 1:
-                lines.append(" ".join(map(str, a.astype(int).tolist())))
-            else:
-                # flatten rows
-                lines.extend(" ".join(map(str, row.astype(int).tolist())) for row in a)
-    else:
-        raise ValueError("Unsupported code format for serialization: %r" % type(codes))
-    token_text = "\n".join(lines)
-    return token_text
-def _text_to_codes(token_text):
-    """
-    Parse the token_text format produced by _codes_to_text back into a list of torch tensors.
-    Each line becomes a tensor of shape (1, T). Return list-of-tensors.
-    """
-    lines = [ln.strip() for ln in token_text.strip().splitlines() if ln.strip()]
-    if len(lines) == 0:
-        raise ValueError("No tokens found in input.")
-    parsed = []
-    for ln in lines:
-        # accept commas or spaces
-        ln = ln.replace(",", " ")
-        parts = [p for p in ln.split() if p]
-        ints = list(map(int, parts))
-        t = torch.tensor(ints, dtype=torch.long).unsqueeze(0)  # shape (1, T)
-        parsed.append(t)
-    return parsed
-# --- Core operations ---
-def encode_and_reconstruct(audio_file):
-    """
-    - Load uploaded audio_file (filepath)
-    - Encode with DistillNeuCodec -> produce token text + token file
-    - Decode back to waveform -> save reconstructed.wav (24k)
-    - Return (recon_path, token_text, token_file_path)
-    """
-    if audio_file is None or audio_file == "":
-        return None, "No audio uploaded.", None
-    # load with librosa (preserve original sr then convert)
-    y, sr = librosa.load(audio_file, sr=None, mono=True)
-    t, sr16000 = _audio_to_tensor(y, sr, target_sr=16000)  # model expects 16k input typically
-    t = t.to("cpu")
     with torch.no_grad():
-        # encode_code may return a tensor or a list/tuple depending on model
-        fsq_codes = model.encode_code(t)  # encode
-        # create token-friendly text
-        token_text = _codes_to_text(fsq_codes)
-        # print to console (visible when running locally)
-        print("==== Audio tokens (copyable) ====")
-        print(token_text)
-        print("=================================")
-        # save token file
-        token_file_path = os.path.join(OUT_DIR, "audio_tokens.txt")
-        with open(token_file_path, "w", encoding="utf-8") as f:
-            f.write(token_text)
-        # decode to waveform
         recon = model.decode_code(fsq_codes)
-    # recon tensor -> CPU numpy
-    if isinstance(recon, torch.Tensor):
-        recon_wav = recon.squeeze().cpu().numpy()
-    else:
-        recon_wav = np.asarray(recon)
-    recon_path = os.path.join(OUT_DIR, "reconstructed.wav")
-    # model outputs 24000Hz in your original script — keep that unchanged
-    sf.write(recon_path, recon_wav, 24000)
-    return recon_path, token_text, token_file_path
-def decode_tokens_to_audio(token_text):
-    """
-    - Accept token text (as produced by encode_and_reconstruct)
-    - Parse it back to code tensors and call model.decode_code(...)
-    - Save decoded audio and return path
-    """
-    if token_text is None or token_text.strip() == "":
-        return None, "No tokens provided."
-    try:
-        parsed_codes = _text_to_codes(token_text)
-    except Exception as e:
-        return None, f"Failed to parse tokens: {e}"
-    try:
-        with torch.no_grad():
-            # Many neucodec models accept a list of 1D tensors (1, T) per codebook or a single Tensor.
-            # We'll pass the list; if the model expects a single tensor, it will often still accept it or raise.
-            recon = model.decode_code(parsed_codes)
-    except Exception as e:
-        return None, f"Decoding failed: {e}"
-    if isinstance(recon, torch.Tensor):
-        recon_wav = recon.squeeze().cpu().numpy()
-    else:
-        recon_wav = np.asarray(recon)
-    recon_path = os.path.join(OUT_DIR, "decoded_from_tokens.wav")
-    sf.write(recon_path, recon_wav, 24000)
-    return recon_path, "Decoded successfully."
-# --- Gradio UI ---
-with gr.Blocks(title="DistillNeuCodec — encode tokens & decode tokens (CPU)") as demo:
-    gr.Markdown("## DistillNeuCodec — Encode → tokens (copyable) and Decode → audio\n"
-                "Upload audio to produce tokens (plain text, one line per codebook). Copy/paste the tokens into the decoder tab to reconstruct from tokens.")
-    with gr.Tab("Encode & Reconstruct"):
-        inp_audio = gr.Audio(type="filepath", label="Upload audio (any sr)")
-        encode_btn = gr.Button("Encode & Reconstruct")
-        out_audio = gr.Audio(type="filepath", label="Reconstructed Audio (24k)")
-        out_tokens = gr.Textbox(label="Audio tokens (copy this text)", lines=8)
-        token_file = gr.File(label="Download token file")
-        def _encode_click(aud):
-            recon_path, token_text, token_file_path = encode_and_reconstruct(aud)
-            # token_file_path will be a text file with tokens
-            return recon_path, token_text, token_file_path
-        encode_btn.click(fn=_encode_click, inputs=[inp_audio], outputs=[out_audio, out_tokens, token_file])
-    with gr.Tab("Decode from Tokens"):
-        tokens_input = gr.Textbox(label="Paste tokens here (exactly as produced above). One codebook per line.", lines=8)
-        decode_btn = gr.Button("Decode tokens → audio")
-        decoded_audio = gr.Audio(type="filepath", label="Decoded Audio (24k)")
-        decode_status = gr.Textbox(label="Status / Errors", interactive=False)
-        def _decode_click(tokens_text):
-            recon_path, status = decode_tokens_to_audio(tokens_text)
-            # recon_path could be None on error
-            return recon_path, status
-        decode_btn.click(fn=_decode_click, inputs=[tokens_input], outputs=[decoded_audio, decode_status])
-    gr.Markdown("### Notes\n"
-                "- The token text is plain, space-separated integers. Each line corresponds to one set of tokens (e.g., one codebook). Copy/paste lines exactly to decode.\n"
-                "- If your tokens came from a single-line encode, paste the single line. If multiple lines, paste all lines.\n"
-                "- If you prefer a machine format, download `audio_tokens.txt` and upload a text file with the same format to the decoder tab.\n"
-                "- Decoding may fail if the token shape doesn't match what the model expects; if that happens I'll print the decoder error in the status box.")
 if __name__ == "__main__":
-    demo.launch()

 import subprocess
 import sys
+import time
 # Auto-install neucodec if missing
 try:
 # Other imports
 import gradio as gr
 import torch
+import torchaudio
+from torchaudio import transforms as T
+from neucodec import DistillNeuCodec
 import librosa
 import soundfile as sf
 import numpy as np
 # Load model on CPU
 model = DistillNeuCodec.from_pretrained("neuphonic/distill-neucodec")
+model.eval()  # CPU only
+def reconstruct_audio(audio_file):
+    # Start timer
+    start_time = time.time()
+    # Load audio with librosa
+    y, sr = librosa.load(audio_file, sr=None, mono=True)  # Keep original sr
+    orig_sr = sr
+    orig_len = len(y)
+    # Resample to 16kHz if needed for model encoding
+    if sr != 16000:
+        y = librosa.resample(y, orig_sr=sr, target_sr=16000)
+        sr = 16000
+    # Convert to tensor (1, 1, T)
+    y_tensor = torch.from_numpy(y).unsqueeze(0).unsqueeze(0)
+    # Encode & decode
     with torch.no_grad():
+        fsq_codes = model.encode_code(y_tensor)
         recon = model.decode_code(fsq_codes)
+    recon = recon.squeeze().cpu().numpy()
+    # Save reconstructed audio
+    recon_path = "reconstructed.wav"
+    sf.write(recon_path, recon, 24000)
+    # End timer
+    elapsed_time = time.time() - start_time
+    # Metadata
+    metadata = {
+        "original_sr": orig_sr,
+        "original_length_samples": orig_len,
+        "resampled_sr": sr,
+        "reconstructed_sr": 24000,
+        "num_tokens": fsq_codes.shape,
+        "processing_time_sec": round(elapsed_time, 3),
+        "input_file": audio_file,
+        "output_file": recon_path
+    }
+    # Print info
+    print("\n=== Audio Reconstruction Info ===")
+    for k, v in metadata.items():
+        print(f"{k}: {v}")
+    # Return the reconstructed file path and a one-line summary string for Gradio
+    return recon_path, f"Tokens: {fsq_codes.shape}, Processing time: {elapsed_time:.3f}s"
+# Gradio interface
+iface = gr.Interface(
+    fn=reconstruct_audio,
+    inputs=gr.Audio(type="filepath", label="Upload Audio"),
+    outputs=[gr.Audio(type="filepath", label="Reconstructed Audio"),
+             gr.Textbox(label="Info")],
+    title="Audio Reconstruction with DistillNeuCodec (CPU + Librosa)",
+    description="Upload any audio file, and this app will reconstruct it using DistillNeuCodec at 24kHz on CPU. Metadata and token info are also displayed."
+)
 if __name__ == "__main__":
+    iface.launch()