ak36 committed on
Commit
3e21dc5
Β·
verified Β·
1 Parent(s): 2a80404

Upload folder using huggingface_hub

Browse files
README.md ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Roombox
3
+ emoji: πŸ¦€
4
+ colorFrom: pink
5
+ colorTo: red
6
+ sdk: gradio
7
+ sdk_version: 5.29.0
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
__pycache__/gtw.cpython-313.pyc ADDED
Binary file (5.62 kB). View file
 
__pycache__/spatial.cpython-313.pyc ADDED
Binary file (1.36 kB). View file
 
__pycache__/synthesis.cpython-313.pyc ADDED
Binary file (5.43 kB). View file
 
app.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import io, re, zipfile
3
+ from typing import Tuple, List
4
+
5
+ import gradio as gr
6
+ import numpy as np
7
+ import soundfile as sf
8
+
9
+ from synthesis import synthesize, preload_model
10
+
11
+ SR = 24_000
12
+ DIST_M = 1.0
13
+ AZ_LOOKUP = {"left": -45, "right": 45} # extend as needed
14
+
15
+ # ---------------------------------------------------------------------------
16
+ # 1. Minimal TTS helper (model cache lives inside synthesize)
17
+ # ---------------------------------------------------------------------------
18
def _tts(text: str, az_deg: float) -> np.ndarray:
    """Render `text` as a spatialised (2, T) stereo array at azimuth `az_deg`.

    Distance and sample rate come from the module constants DIST_M / SR.
    """
    stereo = synthesize(text, az_deg=az_deg, dist_m=DIST_M, sr=SR)
    return stereo  # shape (2, T)
20
+
21
+ # ---------------------------------------------------------------------------
22
+ # 2. Parse textarea ➜ list[(side, wav)]
23
+ # ---------------------------------------------------------------------------
24
# Accept optional whitespace inside the brackets so inputs like
# "[S1][ left] Hello" — the exact format shown in the UI placeholder and
# help text — parse correctly (the previous pattern rejected the space).
LINE_RE = re.compile(r"\[\s*S\d+\s*\]\s*\[\s*(left|right)\s*\]\s*(.+)", re.I)

def parse_script(script: str) -> List[Tuple[str, np.ndarray]]:
    """Parse the textarea script into a list of (side, stereo_wav) tracks.

    Each valid line looks like "[S1][left] some text"; non-matching lines
    are silently skipped. Raises gr.Error if no line matches at all.
    """
    tracks = []
    for ln in script.strip().splitlines():
        m = LINE_RE.match(ln.strip())
        if not m:
            continue  # ignore blank/garbage lines rather than failing
        side, text = m.group(1).lower(), m.group(2).strip()
        tracks.append((side, _tts(text, AZ_LOOKUP[side])))
    if not tracks:
        raise gr.Error("No valid lines found. Format: [S1][ left] Hello …")
    return tracks
37
+
38
+ # ---------------------------------------------------------------------------
39
+ # 3. Mix per side
40
+ # ---------------------------------------------------------------------------
41
+ def _pad(pcm: np.ndarray, T: int) -> np.ndarray:
42
+ return np.pad(pcm, ((0, 0), (0, T - pcm.shape[1])), "constant")
43
+
44
def render(script: str):
    """Synthesize the script and return the per-side mixes, the full dialog,
    and a downloadable zip archive.

    Returns a 4-tuple matching the Gradio outputs:
    (SR, left), (SR, right), (SR, dialog) as (rate, (T, 2)) audio tuples,
    plus the path of a zip file containing all three WAVs.
    """
    import tempfile

    tracks = parse_script(script)
    left = [w for side, w in tracks if side == "left"]
    right = [w for side, w in tracks if side == "right"]

    def combine(wavs):
        # Overlay all clips of one side, zero-padding to the longest clip.
        if not wavs:
            return np.zeros((2, 1), dtype=np.float32)
        T = max(w.shape[1] for w in wavs)
        return sum(_pad(w, T) for w in wavs)

    left_mix = combine(left)
    right_mix = combine(right)
    dialog = left_mix + right_mix

    # gr.File expects a filepath, not raw bytes — persist the archive to a
    # temp file and hand Gradio the path (previously raw bytes were returned,
    # which the File component cannot serve).
    zip_data = _zip_bytes({
        "left_speaker.wav": left_mix.T,
        "right_speaker.wav": right_mix.T,
        "dialog_mix.wav": dialog.T,
    })
    with tempfile.NamedTemporaryFile(suffix=".zip", delete=False) as tmp:
        tmp.write(zip_data)
        zip_path = tmp.name

    return (
        (SR, left_mix.T),
        (SR, right_mix.T),
        (SR, dialog.T),
        zip_path,
    )
69
+
70
+ # ---------------------------------------------------------------------------
71
+ # 4. Utility – ZIP builder
72
+ # ---------------------------------------------------------------------------
73
def _zip_bytes(files: dict) -> bytes:
    """Encode each array as a 16-bit PCM WAV and bundle them into a ZIP.

    files: mapping of archive filename -> (T, 2) float array.
    Returns the ZIP archive as raw bytes.
    """
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as bundle:
        for name, pcm in files.items():
            encoded = io.BytesIO()
            sf.write(encoded, pcm, SR, subtype="PCM_16")
            bundle.writestr(name, encoded.getvalue())
    return archive.getvalue()
81
+
82
+ # ---------------------------------------------------------------------------
83
+ # 5. Gradio UI
84
+ # ---------------------------------------------------------------------------
85
# ---------------------------------------------------------------------------
# 5. Gradio UI
# ---------------------------------------------------------------------------
with gr.Blocks(title="Spatial Dialog Synth (Dia)") as demo:
    gr.Markdown("### Spatial Dialog Synth\n"
                "Enter lines in the format `[S1][ left] Hello …` / `[S2][ right] …`")

    with gr.Row():
        # Left column - Input and Download
        with gr.Column(scale=1):
            script_in = gr.Textbox(lines=8, placeholder="[S1][ left] Hello world…", label="Script")
            gen_btn = gr.Button("Generate", variant="primary")
            zip_output = gr.File(label="Download all (zip)")

        # Right column - Audio outputs
        with gr.Column(scale=1):
            left_audio = gr.Audio(label="Left speaker")
            right_audio = gr.Audio(label="Right speaker")
            mix_audio = gr.Audio(label="Dialog mix")

    # One script in -> three audio streams + one zip download out.
    gen_btn.click(
        fn=render,
        inputs=script_in,
        outputs=[left_audio, right_audio, mix_audio, zip_output]
    )

# ---------------------------------------------------------------------------
# 6. Pre-warm Dia so first user click is instant
# ---------------------------------------------------------------------------
preload_model()  # blocks ~30 s only on very first container start

demo.launch()
docker/Dockerfile ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM pytorch/pytorch:2.6.0-cuda12.6-cudnn9-devel

#–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# 1. Hugging Face cache lives in /data (.hf Space volume)
#–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
ENV HF_HOME=/data/.huggingface

WORKDIR /workspace/spatial-dia
ENV PYTHONUNBUFFERED=1

#–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
# 2. Boot script: pre-fetch weights once, then launch Gradio
#–––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––––
COPY entrypoint.sh /entrypoint.sh
RUN chmod +x /entrypoint.sh

# Single CMD: the earlier interim CMD ["/bin/bash"] was dead (only the
# last CMD in a Dockerfile takes effect) and has been removed, along with
# stray "@@" diff-hunk markers that are invalid Dockerfile instructions
# and would abort `docker build`.
CMD ["/entrypoint.sh"]
entrypoint.sh ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/usr/bin/env bash
set -e

# 0) Make sure cache dir exists (Space volume mounted at runtime)
mkdir -p "${HF_HOME:-/data/.huggingface}"

# 1) One-shot warm-up (skipped after first boot)
# NOTE(review): snapshot_download with local_files_only=False still pings the
# Hub to check the latest revision even when everything is cached — the warm-up
# is merely fast after the first boot, not fully skipped. Confirm this is the
# intended trade-off for offline starts.
python - <<'PY'
from huggingface_hub import snapshot_download
for repo in ("nari-labs/Dia-1.6B", "descriptinc/descript-audio-codec"):
    snapshot_download(repo, local_files_only=False)  # honours HF_HOME
PY

# 2) Start the Gradio app
exec python app.py
gtw.py ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # gtw.py – ZeroBAS‑faithful GTW, batch‑vectorised
2
+ import torch, math
3
+ from torch import Tensor
4
+ import torch.nn.functional as F
5
+
6
+ def _lagrange_weights(d: Tensor, taps: int = 8) -> Tensor:
7
+ """Return (B, taps) weights for 0 ≀ dβ€―<β€―1."""
8
+ n = torch.arange(taps, device=d.device, dtype=d.dtype) # 0..7
9
+ w = torch.ones(d.shape + (taps,), dtype=d.dtype, device=d.device)
10
+ for k in range(taps):
11
+ others = torch.cat([n[:k], n[k+1:]])
12
+ w[..., k] = torch.prod((d.unsqueeze(-1) - others) / (n[k] - others), dim=-1)
13
+ return w # (B, taps)
14
+
15
def gtw_shift(x: Tensor, delay: Tensor) -> Tensor:
    """
    ZeroBAS-style GTW: constant ITD per clip.

    x:     (B, T) batch of mono signals
    delay: (B,) or any constant-valued (B, T); positive values phase-advance
           the signal (it plays earlier)
    Returns a (B, T) tensor on x's device.
    """
    # Normalise `delay` to shape (B,).
    if delay.dim() == 0:
        delay = delay.unsqueeze(0)
    if delay.dim() == 2:  # squeeze if constant
        # A (B, T) delay is only accepted when every timestep is identical.
        if not torch.allclose(delay, delay[:, :1].expand_as(delay)):
            raise ValueError("delay must be constant per item")
        delay = delay[:, 0]

    taps, pad = 8, 4
    total = -delay  # ① Positive Δ ⇒ phase-advance
    # Split the shift into an integer part (applied as a cyclic roll below)
    # and a fractional part in [0, 1) (applied via Lagrange interpolation).
    d_int = torch.floor(total).to(torch.int64)
    d_frac = (total - d_int).float()  # 0 ≤ d_frac < 1

    # Per-item 8-tap Lagrange kernel, flipped because conv1d correlates.
    # NOTE(review): conv1d with groups=x.size(0) over a 1-channel input only
    # shape-checks for B == 1 (in_channels must be divisible by groups) —
    # confirm intended batch sizes before using B > 1.
    kernel = _lagrange_weights(d_frac, taps).flip(-1).unsqueeze(1)
    y = torch.nn.functional.conv1d(
        x.unsqueeze(1), kernel, padding=pad, groups=x.size(0)
    ).squeeze(1)

    # Undo the symmetric conv padding and trim back to the input length.
    y = y.roll(-pad, dims=1)[..., : x.size(1)]

    # Apply the integer part of the shift as a cyclic roll per item.
    for b in range(x.size(0)):
        if d_int[b] != 0:
            y[b] = torch.roll(y[b], int(-d_int[b]), 0)
    return y
44
+
45
+
46
+ def _linear_weights(d: torch.Tensor) -> torch.Tensor:
47
+ # (B,) -> (B,2)
48
+ return torch.stack([1.0 - d, d], dim=-1)
49
+
50
# (Removed a redundant mid-file `import torch` here — torch is already
# imported at the top of this module.)

def gtw_shift_linear(x: torch.Tensor,
                     delay: torch.Tensor,
                     *, debug: bool = False) -> torch.Tensor:
    """
    Linear-interpolation fractional delay.

    • Positive delay → advance (earlier), just like ZeroBAS / the tests
    • Negative delay → retard (later)
    • When `delay` is an *exact integer*, the output is a pure cyclic roll,
      matching the reference tests.

    Shapes
    ------
    x     : (B, T)
    delay : (B,)
    """
    B, T = x.shape
    dtype, dev = x.dtype, x.device

    delay = delay.to(dtype)                          # ensure same dtype/device
    int_part = delay.round().to(torch.int64)         # nearest integer
    is_integer = torch.isclose(delay, int_part.to(dtype), atol=1e-7)

    # ── Common path: direct gather-style linear interpolation ────────────
    # Source index for each output sample; clamped so edges repeat rather
    # than wrap (non-integer delays therefore do NOT roll cyclically).
    n = torch.arange(T, device=dev, dtype=dtype).unsqueeze(0)   # (1,T)
    src = n + delay.unsqueeze(1)                                # (B,T)
    src_clamped = torch.clamp(src, 0, T - 1)

    i0 = src_clamped.floor().to(torch.long)                     # (B,T)
    frac = (src_clamped - i0.to(dtype))
    i1 = torch.clamp(i0 + 1, max=T - 1)

    y = (1.0 - frac) * x.gather(1, i0) + frac * x.gather(1, i1)

    # ── Overwrite rows whose delay is an exact integer with a cyclic roll ─
    for b in range(B):
        if is_integer[b]:
            shift = -int(int_part[b].item())   # advance ⇔ negative roll
            if shift:
                y[b] = torch.roll(x[b], shifts=shift, dims=0)

    if debug:
        print("delay :", delay)
        print("is_integer :", is_integer)
        print("int_part :", int_part)
    return y
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/nari-labs/dia.git@main  # TTS model
2
+ git+https://github.com/descriptinc/descript-audio-codec.git@main  # DAC
3
+ soundfile
4
+ numpy
5
+ torchmetrics[audio]  # SI-SDR metric
6
+ pytest
7
+ gradio>=4.27.0
8
+ huggingface-hub>=0.23.0
smoke_test.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Quick sanity‑check: make Dia speak one sentence and write mono WAV.
3
+ Run inside the container: python smoke_test.py
4
+ """
5
+ import argparse
6
+ import soundfile as sf
7
+ import torch
8
+
9
+ from dia.model import Dia
10
+
11
+ # Parse command line arguments
12
+ parser = argparse.ArgumentParser(description="Dia model smoke test")
13
+ parser.add_argument("--device", type=str, default=None, help="Force device (e.g., 'cuda', 'cpu')")
14
+ args = parser.parse_args()
15
+
16
+ # Determine device
17
+ if args.device:
18
+ device = torch.device(args.device)
19
+ elif torch.cuda.is_available():
20
+ device = torch.device("cuda")
21
+ elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
22
+ device = torch.device("mps")
23
+ else:
24
+ device = torch.device("cpu")
25
+
26
+ print(f"Using device: {device}")
27
+
28
+ # Load Dia model
29
+ print("Loading Dia model...")
30
+ try:
31
+ model = Dia.from_pretrained("nari-labs/Dia-1.6B", compute_dtype="float16", device=device)
32
+ print("Model loaded successfully")
33
+ except Exception as e:
34
+ print(f"Error loading Dia model: {e}")
35
+ raise
36
+
37
+ # Generate audio
38
+ text = "[S1] Hello world, this is Dia on a clean build!"
39
+ print(f"Generating audio for: {text}")
40
+ waveform = model.generate(text) # returns (T,) float32 numpy, 24 kHz
41
+
42
+ print("Shape:", waveform.shape, "dtype:", waveform.dtype)
43
+ sf.write("dia_hello.wav", waveform, 24000)
44
+ print("Audio saved to dia_hello.wav")
spatial.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+
3
def ild_gain(distance_m: torch.Tensor,
             clamp_min: float = 0.2,
             clamp_max: float = 5.0) -> torch.Tensor:
    """
    Inverse-square (1/d²) level gain for one ear.

    distance_m: scalar or tensor of shape (B,), clamped into
                [clamp_min, clamp_max] before use.
    Returns gain factor(s) of the same shape. Note that for distances
    below 1 m the gain exceeds 1 (up to 1/clamp_min² = 25 by default).
    """
    clamped = torch.clamp(distance_m, min=clamp_min, max=clamp_max)
    return 1.0 / clamped.pow(2)
13
+
14
def apply_ild(left: torch.Tensor, right: torch.Tensor,
              gain_left: torch.Tensor, gain_right: torch.Tensor) -> torch.Tensor:
    """
    Scale each channel by its ILD gain and stack into stereo.

    left, right           : (B, T) signals
    gain_left, gain_right : (B,) per-item gains
    Returns (B, 2, T) with channel 0 = left, channel 1 = right.
    """
    scaled_left = left * gain_left[:, None]
    scaled_right = right * gain_right[:, None]
    return torch.stack((scaled_left, scaled_right), dim=1)
synthesis.py ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ mono β†’ GTW (ITD) β†’ ILD β†’ stereo (2,T)
3
+
4
+ Exports
5
+ -------
6
+ binauralize(mono, az_deg, dist_m, sr) -> torch.Tensor[2,T]
7
+ synthesize(text, az_deg=0, dist_m=1.0, sr=24000) -> np.ndarray
8
+ preload_model() -> None # eager weight load
9
+ """
10
+ from __future__ import annotations
11
+ import os, functools, torch, numpy as np
12
+
13
+ import gtw, spatial
14
+
15
+ # ───────────────────────────────────────────────────────────────
16
+ # Global perf & cache
17
+ # ───────────────────────────────────────────────────────────────
18
+ torch.backends.cudnn.benchmark = True # cuDNN autotune
19
+ os.environ.setdefault("HF_HOME", "/data/.huggingface") # HF cache path
20
+
21
+ # ───────────────────────────────────────────────────────────────
22
+ # Geometry helpers
23
+ # ───────────────────────────────────────────────────────────────
24
+ _SPEED_OF_SOUND = 343.0
25
+ _EAR_OFFSET_M = 0.087
26
+
27
+ def _itd_samples(az_deg: float, sr: int) -> float:
28
+ az_rad = np.deg2rad(az_deg)
29
+ delta_m = 2.0 * _EAR_OFFSET_M * np.sin(az_rad)
30
+ return (delta_m / _SPEED_OF_SOUND) * sr
31
+
32
+ # ───────────────────────────────────────────────────────────────
33
+ # Dia loader (cached)
34
+ # ───────────────────────────────────────────────────────────────
35
from dia.model import Dia  # heavy import but only once; path made consistent with smoke_test.py

@functools.lru_cache(maxsize=1)
def _load_dia() -> "Dia":
    """Load and cache the Dia TTS model on the best available device.

    lru_cache(maxsize=1) makes this a process-wide singleton: the weights
    are downloaded/pinned exactly once per process.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = Dia.from_pretrained(
        "nari-labs/Dia-1.6B",
        compute_dtype="float16",
        device=device
    )
    # If Dia happens to be nn.Module, compile for a tiny win
    if isinstance(model, torch.nn.Module):
        model = model.eval()
        try:
            model = torch.compile(model, mode="reduce-overhead")
        except Exception:
            # torch.compile is best-effort; fall back to eager on any failure.
            pass
    return model
53
+
54
def preload_model() -> None:
    """Eagerly load Dia so the first user request pays no model-load cost."""
    _load_dia()  # cached: a no-op on every call after the first
57
+
58
+ # ───────────────────────────────────────────────────────────────
59
+ # Spatialisation core
60
+ # ───────────────────────────────────────────────────────────────
61
def binauralize(mono: torch.Tensor,
                az_deg: float,
                dist_m: float,
                sr: int = 24_000) -> torch.Tensor:
    """Spatialise a mono clip into stereo: ITD via GTW shift, then ILD gains.

    mono   : (T,) tensor
    az_deg : source azimuth in degrees; positive azimuth shifts/attenuates
             per the geometry below (left ear closer for +az — TODO confirm
             sign convention against the listening tests)
    dist_m : nominal source distance in metres
    sr     : sample rate used to convert the ITD to samples
    Returns a (2, T) stereo tensor on mono's device.
    Raises ValueError if mono is not 1-D.
    """
    if mono.dim() != 1:
        raise ValueError("mono must be 1-D (T,) tensor")

    # ITD via GTW: only the farther ear receives a (positive) delay.
    itd = _itd_samples(az_deg, sr)
    delay_left = torch.tensor(max(-itd, 0.0), dtype=mono.dtype, device=mono.device)
    delay_right = torch.tensor(max(itd, 0.0), dtype=mono.dtype, device=mono.device)
    left = gtw.gtw_shift(mono.unsqueeze(0), delay_left).squeeze(0)
    right = gtw.gtw_shift(mono.unsqueeze(0), delay_right).squeeze(0)

    # ILD: approximate per-ear distances; floor at 5 cm so the 1/d² gain
    # cannot blow up for very close sources.
    az_rad = np.deg2rad(az_deg)
    delta = 2.0 * _EAR_OFFSET_M * np.sin(az_rad)
    dist_L = max(dist_m - delta, 0.05)
    dist_R = max(dist_m + delta, 0.05)
    gL = spatial.ild_gain(torch.tensor(dist_L, dtype=mono.dtype, device=mono.device))
    gR = spatial.ild_gain(torch.tensor(dist_R, dtype=mono.dtype, device=mono.device))

    # Stack the gained channels into a (2, T) stereo tensor.
    stereo = spatial.apply_ild(
        left.unsqueeze(0), right.unsqueeze(0), gL.view(1), gR.view(1)
    ).squeeze(0)
    return stereo
87
+
88
+ # ───────────────────────────────────────────────────────────────
89
+ # Public wrapper
90
+ # ───────────────────────────────────────────────────────────────
91
def synthesize(text: str,
               az_deg: float = 0.0,
               dist_m: float = 1.0,
               sr: int = 24_000) -> np.ndarray:
    """
    Cached Dia → mono → spatialise → stereo NumPy array.
    First-ever call downloads weights; later calls are instant.

    text   : Dia-formatted script, e.g. "[S1] Hello …"
    az_deg : azimuth in degrees, forwarded to binauralize()
    dist_m : nominal source distance in metres
    sr     : sample rate for the ITD computation
    Returns a (2, T) float numpy array on the CPU.
    """
    model = _load_dia()
    with torch.inference_mode():
        # NOTE(review): assumes generate() returns a (T,) float32 numpy
        # array and that the model object exposes a `.device` attribute —
        # confirm against the installed dia version.
        mono_np = model.generate(text)  # (T,) float32
        mono = torch.from_numpy(mono_np).to(model.device)
        return binauralize(mono, az_deg, dist_m, sr).cpu().numpy()
synthesize_test.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Integration checks for synthesize(): stereo shape and L/R energy split.

Requires the Dia weights; intended to run inside the container via pytest.
(The unused `pytest` import was removed.)
"""
import numpy as np
from synthesis import synthesize

stereo = synthesize("one two three", az_deg=15, dist_m=1.2, sr=24_000)

# Shape & basic energy split
assert stereo.shape[0] == 2
assert np.abs(stereo[0]).mean() != 0
assert np.abs(stereo[1]).mean() != 0

# Centre check: swapping the azimuth sign must swap the channel energy
# ordering. Compare mean ABSOLUTE amplitude — the signed mean of an audio
# waveform is ~0, so the original signed comparison was meaningless.
stereo2 = synthesize("one two three", az_deg=-15, dist_m=1.2, sr=24_000)
assert np.abs(stereo[0]).mean() > np.abs(stereo[1]).mean()
assert np.abs(stereo2[0]).mean() < np.abs(stereo2[1]).mean()
+ assert stereo2[0].mean() < stereo2[1].mean()