# roombox / app.py
# ak36's picture
# Upload folder using huggingface_hub
# 3e21dc5 verified
# app.py
import io, re, zipfile
from typing import Tuple, List
import gradio as gr
import numpy as np
import soundfile as sf
from synthesis import synthesize, preload_model
# Sample rate shared by the whole app: passed to synthesize() as `sr`, used
# when writing the WAV files and reported to the Gradio audio outputs
# (presumably Hz — confirm against synthesize()).
SR = 24_000
# Source distance passed to synthesize() as `dist_m`
# (presumably metres, going by the name — confirm against synthesize()).
DIST_M = 1.0
# Side tag -> azimuth passed to synthesize() as `az_deg`.
# NOTE: adding a side here is not enough on its own; LINE_RE only matches
# "left"/"right" and render() only mixes those two sides.
AZ_LOOKUP = {"left": -45, "right": 45} # extend as needed
# ---------------------------------------------------------------------------
# 1. Minimal TTS helper (model cache lives inside synthesize)
# ---------------------------------------------------------------------------
def _tts(text: str, az_deg: float) -> np.ndarray:
    """Synthesize one line of speech placed at the given azimuth.

    Thin wrapper around ``synthesize`` that fills in the module-wide
    distance (``DIST_M``) and sample rate (``SR``).

    Args:
        text: The sentence to speak.
        az_deg: Azimuth forwarded to ``synthesize`` as ``az_deg``.

    Returns:
        Whatever ``synthesize`` returns; per the original inline note this is
        a ``(2, T)`` array.
    """
    spatial_kwargs = {"az_deg": az_deg, "dist_m": DIST_M, "sr": SR}
    waveform = synthesize(text, **spatial_kwargs)  # (2,T)
    return waveform
# ---------------------------------------------------------------------------
# 2. Parse textarea ➜ list[(side, wav)]
# ---------------------------------------------------------------------------
# Matches "[S<n>][<side>] <text>", case-insensitively.  Whitespace is allowed
# inside the side tag so that the "[S1][ left] ..." form advertised by the UI
# hint, the textbox placeholder and the error message below is accepted, as
# well as the compact "[S1][left] ..." form.
LINE_RE = re.compile(r"\[S\d+\]\s*\[\s*(left|right)\s*\]\s*(.+)", re.I)


def parse_script(script: str) -> List[Tuple[str, np.ndarray]]:
    """Turn the textarea contents into synthesized per-line tracks.

    Every line is stripped and matched against ``LINE_RE``.  Lines that do
    not match are skipped silently (best effort); matching lines are
    synthesized with ``_tts`` at the azimuth ``AZ_LOOKUP`` holds for their
    side.

    Args:
        script: Raw multi-line script text.  ``None`` is treated like an
            empty script.

    Returns:
        A list of ``(side, wav)`` tuples in script order, where ``side`` is
        the lower-cased side tag (``"left"`` or ``"right"``) and ``wav`` is
        the value returned by ``_tts``.

    Raises:
        gr.Error: If no line of the script matched the expected format.
    """
    tracks = []
    for ln in (script or "").strip().splitlines():
        m = LINE_RE.match(ln.strip())
        if not m:
            continue
        # The pattern is case-insensitive, so normalise before the lookup.
        side, text = m.group(1).lower(), m.group(2).strip()
        tracks.append((side, _tts(text, AZ_LOOKUP[side])))
    if not tracks:
        raise gr.Error("No valid lines found. Format: [S1][ left] Hello …")
    return tracks
# ---------------------------------------------------------------------------
# 3. Mix per side
# ---------------------------------------------------------------------------
def _pad(pcm: np.ndarray, T: int) -> np.ndarray:
    """Right-pad ``pcm`` with zeros along axis 1 until it has ``T`` columns.

    Axis 0 is left untouched.  ``T`` must not be smaller than the current
    number of columns; ``np.pad`` rejects a negative pad width.
    """
    missing = T - pcm.shape[1]
    rows_untouched = (0, 0)
    cols_extended = (0, missing)
    return np.pad(pcm, (rows_untouched, cols_extended), mode="constant")
def render(script: str):
    """Synthesize a script and build the per-side and combined mixes.

    The script is parsed with ``parse_script``; the resulting tracks are
    grouped by side and the tracks of one side are overlaid (summed sample by
    sample after zero-padding to the longest one, not concatenated).  The
    dialog mix is the sum of both side mixes.

    Args:
        script: Raw multi-line script text from the textbox.

    Returns:
        A 4-tuple matching the Gradio outputs, in order:
        ``(SR, left_mix.T)``, ``(SR, right_mix.T)``, ``(SR, dialog.T)`` and
        the path of a ZIP file holding the three mixes as WAV files.

    Raises:
        gr.Error: Propagated from ``parse_script`` when no line is valid.
    """
    import tempfile  # local import: only needed for the download file

    tracks = parse_script(script)
    left = [w for side, w in tracks if side == "left"]
    right = [w for side, w in tracks if side == "right"]

    def combine(wavs):
        # A side without any line still needs a stereo array so that the
        # audio outputs and the archive always receive data.
        if not wavs:
            return np.zeros((2, 1), dtype=np.float32)
        T = max(w.shape[1] for w in wavs)
        return sum(_pad(w, T) for w in wavs)

    left_mix = combine(left)
    right_mix = combine(right)

    # The two sides generally differ in length; adding them directly only
    # works when one of them is the single-sample placeholder, otherwise
    # NumPy raises a broadcast error.  Pad both to the common length first.
    total = max(left_mix.shape[1], right_mix.shape[1])
    dialog = _pad(left_mix, total) + _pad(right_mix, total)

    archive = _zip_bytes({
        "left_speaker.wav": left_mix.T,
        "right_speaker.wav": right_mix.T,
        "dialog_mix.wav": dialog.T,
    })
    # gr.File expects a path on disk as its output value, not raw bytes, so
    # the archive is persisted to a temporary file.  delete=False keeps it
    # alive after the handle is closed so Gradio can still read it.
    with tempfile.NamedTemporaryFile(
        prefix="dialog_", suffix=".zip", delete=False
    ) as fh:
        fh.write(archive)
        zip_path = fh.name

    return (
        (SR, left_mix.T),
        (SR, right_mix.T),
        (SR, dialog.T),
        zip_path,
    )
# ---------------------------------------------------------------------------
# 4. Utility – ZIP builder
# ---------------------------------------------------------------------------
def _zip_bytes(files: dict) -> bytes:
    """Pack named audio arrays into an in-memory ZIP of 16-bit PCM WAV files.

    Args:
        files: Mapping of archive member name to the audio data handed to
            ``soundfile.write`` (the caller in this file passes arrays
            transposed to frames-by-channels).

    Returns:
        The bytes of a deflate-compressed ZIP archive with one WAV member
        per entry of ``files``, each written at sample rate ``SR``.
    """
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for fname, data in files.items():
            wav_buf = io.BytesIO()
            # A BytesIO has no file name, so soundfile cannot infer the
            # container from an extension; without an explicit format the
            # write fails with "No format specified ...".
            sf.write(wav_buf, data, SR, format="WAV", subtype="PCM_16")
            zf.writestr(fname, wav_buf.getvalue())
    # Read the buffer only after the ZipFile is closed, i.e. once the
    # central directory has been written.
    return buf.getvalue()
# ---------------------------------------------------------------------------
# 5. Gradio UI
# ---------------------------------------------------------------------------
# Two-column layout: script entry, trigger button and archive download on the
# left; the three audio players on the right.  Components are created in the
# order they should appear on the page.
with gr.Blocks(title="Spatial Dialog Synth (Dia)") as demo:
    gr.Markdown(
        "### Spatial Dialog Synth\n"
        "Enter lines in the format `[S1][ left] Hello …` / `[S2][ right] …`"
    )

    with gr.Row():
        # Left column - input and download
        with gr.Column(scale=1):
            script_in = gr.Textbox(
                label="Script",
                lines=8,
                placeholder="[S1][ left] Hello world…",
            )
            gen_btn = gr.Button("Generate", variant="primary")
            zip_output = gr.File(label="Download all (zip)")

        # Right column - audio outputs
        with gr.Column(scale=1):
            left_audio, right_audio, mix_audio = [
                gr.Audio(label=caption)
                for caption in ("Left speaker", "Right speaker", "Dialog mix")
            ]

    # The output order must match the tuple returned by render().
    render_targets = [left_audio, right_audio, mix_audio, zip_output]
    gen_btn.click(fn=render, inputs=script_in, outputs=render_targets)
# ---------------------------------------------------------------------------
# 6. Pre-warm Dia so first user click is instant
# ---------------------------------------------------------------------------
# Load the model before the server starts accepting requests, so the cost is
# paid at start-up rather than on the first "Generate" click.
# NOTE(review): both calls run at import time (there is no
# `if __name__ == "__main__":` guard), so importing this module also starts
# the app.
preload_model() # blocks ~30 s only on very first container start
# Start serving the Blocks app defined above.
demo.launch()