# app.py
import io
import re
import zipfile
from typing import Tuple, List

import gradio as gr
import numpy as np
import soundfile as sf

from synthesis import synthesize, preload_model

SR = 24_000  # sample rate passed to synthesize() and used for every output
DIST_M = 1.0  # source distance handed to synthesize() as dist_m
# Side keyword -> azimuth in degrees.  Keys must be lowercase: parse_script()
# lower-cases the matched keyword before looking it up here.
AZ_LOOKUP = {"left": -45, "right": 45}  # extend as needed


# ---------------------------------------------------------------------------
# 1. Minimal TTS helper (model cache lives inside synthesize)
# ---------------------------------------------------------------------------
def _tts(text: str, az_deg: float) -> np.ndarray:
    """Synthesize *text* at azimuth *az_deg* using the module-wide DIST_M / SR.

    Returns whatever ``synthesize`` returns; the rest of this file treats it
    as a ``(2, T)`` array (channels first).
    """
    return synthesize(text, az_deg=az_deg, dist_m=DIST_M, sr=SR)  # (2,T)


# ---------------------------------------------------------------------------
# 2. Parse textarea ➜ list[(side, wav)]
# ---------------------------------------------------------------------------
# The side alternation is derived from AZ_LOOKUP so that adding a key there is
# enough to make the parser accept it.  Whitespace inside the brackets is
# tolerated so that both "[S1][left] …" and "[S1][ left] …" (the form shown in
# the error message below) are recognised.
_SIDE_ALTERNATION = "|".join(re.escape(side) for side in AZ_LOOKUP)
LINE_RE = re.compile(
    rf"\[\s*S\d+\s*\]\s*\[\s*({_SIDE_ALTERNATION})\s*\]\s*(.+)",
    re.I,
)


def parse_script(script: str) -> List[Tuple[str, np.ndarray]]:
    """Turn the textarea content into ``(side, waveform)`` pairs.

    Each line is stripped and matched against ``LINE_RE``; lines that do not
    match (blank lines, free text, unknown sides) are skipped silently.  Every
    matching line is synthesized immediately via ``_tts``.

    Raises:
        gr.Error: if no line in *script* matches the expected format.
    """
    tracks = []
    for raw_line in script.strip().splitlines():
        match = LINE_RE.match(raw_line.strip())
        if match is None:
            continue
        side = match.group(1).lower()
        text = match.group(2).strip()
        tracks.append((side, _tts(text, AZ_LOOKUP[side])))

    if not tracks:
        raise gr.Error("No valid lines found. Format: [S1][ left] Hello …")
    return tracks
# ---------------------------------------------------------------------------
# 3. Mix per side
# ---------------------------------------------------------------------------
def _pad(pcm: np.ndarray, T: int) -> np.ndarray:
    """Zero-pad *pcm* on the right along axis 1 so it has *T* columns.

    *T* must be >= ``pcm.shape[1]``; ``np.pad`` rejects negative widths.
    """
    return np.pad(pcm, ((0, 0), (0, T - pcm.shape[1])), "constant")


def _mix_tracks(wavs) -> np.ndarray:
    """Sum a list of ``(2, T_i)`` arrays after padding them to a common length.

    All tracks start at sample 0, i.e. they are overlaid, not concatenated.
    An empty list yields a single silent stereo sample, ``zeros((2, 1))``.
    """
    if not wavs:
        return np.zeros((2, 1), dtype=np.float32)
    longest = max(w.shape[1] for w in wavs)
    return sum(_pad(w, longest) for w in wavs)


def render(script: str):
    """Synthesize *script* and build the four Gradio outputs.

    Returns a 4-tuple: ``(SR, samples)`` pairs for the left mix, the right mix
    and the combined dialog (samples transposed to ``(T, 2)``), followed by the
    path of a ZIP file containing the three mixes as WAV files.

    Raises:
        gr.Error: propagated from ``parse_script`` when no line is valid.
    """
    # Local import: tempfile is only needed here and the file's import block
    # lives outside this section.
    import tempfile

    tracks = parse_script(script)
    # NOTE(review): only "left" and "right" are rendered; sides added to
    # AZ_LOOKUP later would be synthesized but dropped here.
    left_mix = _mix_tracks([w for side, w in tracks if side == "left"])
    right_mix = _mix_tracks([w for side, w in tracks if side == "right"])

    # The two sides generally differ in length, so a plain `left + right`
    # fails to broadcast; pad both to the longer one before summing.
    dialog = _mix_tracks([left_mix, right_mix])

    archive = _zip_bytes({
        "left_speaker.wav": left_mix.T,
        "right_speaker.wav": right_mix.T,
        "dialog_mix.wav": dialog.T,
    })
    # A gr.File output expects a file path rather than raw bytes, so persist
    # the archive to a temp file and hand its name to Gradio.
    with tempfile.NamedTemporaryFile(
        prefix="spatial_dialog_", suffix=".zip", delete=False
    ) as tmp:
        tmp.write(archive)

    return (
        (SR, left_mix.T),
        (SR, right_mix.T),
        (SR, dialog.T),
        tmp.name,
    )


# ---------------------------------------------------------------------------
# 4. Utility – ZIP builder
# ---------------------------------------------------------------------------
def _zip_bytes(files: dict) -> bytes:
    """Encode each array in *files* as 16-bit PCM WAV and return a ZIP archive.

    *files* maps archive member names to sample arrays; every array is written
    at the module-wide sample rate ``SR``.
    """
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for fname, data in files.items():
            wav_buf = io.BytesIO()
            # format= is mandatory for file-like targets: soundfile can only
            # infer the container from a file name extension.
            sf.write(wav_buf, data, SR, format="WAV", subtype="PCM_16")
            zf.writestr(fname, wav_buf.getvalue())
    return buf.getvalue()
# ---------------------------------------------------------------------------
# 5. Gradio UI
# ---------------------------------------------------------------------------
# The format examples below use "[left]" / "[right]" without inner whitespace,
# which is the spelling the line parser in this file is guaranteed to accept.
with gr.Blocks(title="Spatial Dialog Synth (Dia)") as demo:
    gr.Markdown(
        "### Spatial Dialog Synth\n"
        "Enter lines in the format `[S1][left] Hello …` / `[S2][right] …`"
    )

    with gr.Row():
        # Left column - Input and Download
        with gr.Column(scale=1):
            script_in = gr.Textbox(
                lines=8,
                placeholder="[S1][left] Hello world…",
                label="Script",
            )
            gen_btn = gr.Button("Generate", variant="primary")
            zip_output = gr.File(label="Download all (zip)")

        # Right column - Audio outputs
        with gr.Column(scale=1):
            left_audio = gr.Audio(label="Left speaker")
            right_audio = gr.Audio(label="Right speaker")
            mix_audio = gr.Audio(label="Dialog mix")

    # render() returns its four values in exactly this output order.
    gen_btn.click(
        fn=render,
        inputs=script_in,
        outputs=[left_audio, right_audio, mix_audio, zip_output],
    )

# ---------------------------------------------------------------------------
# 6. Pre-warm Dia so first user click is instant
# ---------------------------------------------------------------------------
preload_model()  # blocks ~30 s only on very first container start
demo.launch()