|
|
|
|
|
import io, re, zipfile |
|
|
from typing import Tuple, List |
|
|
|
|
|
import gradio as gr |
|
|
import numpy as np |
|
|
import soundfile as sf |
|
|
|
|
|
from synthesis import synthesize, preload_model |
|
|
|
|
|
# Sample rate handed to the synthesizer (`sr=`), to the WAV writer and to the
# Gradio audio outputs.
SR = 24_000

# Source distance forwarded to the synthesizer as `dist_m`
# (presumably metres, judging by the name -- TODO confirm).
DIST_M = 1.0

# Script side keyword -> azimuth passed to the synthesizer as `az_deg`.
AZ_LOOKUP = {
    "left": -45,
    "right": 45,
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _tts(text: str, az_deg: float) -> np.ndarray:
    """Synthesize *text* at the given azimuth.

    Thin wrapper around ``synthesize`` that pins the distance and sample
    rate to the module-level ``DIST_M`` and ``SR`` values.

    Args:
        text: The utterance to synthesize.
        az_deg: Azimuth forwarded to ``synthesize`` as ``az_deg``.

    Returns:
        Whatever ``synthesize`` returns (annotated as a NumPy array; the rest
        of this file indexes it as 2-D with the sample axis second).
    """
    waveform = synthesize(text, az_deg=az_deg, dist_m=DIST_M, sr=SR)
    return waveform
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# One script line: "[S<digits>][<side>] <text>", where <side> is "left" or
# "right" (case-insensitive). Group 1 is the side, group 2 the spoken text.
# Whitespace inside the side brackets is tolerated because the format shown to
# users in this file is "[S1][ left] Hello ..." (note the space before "left"),
# which a strict "\[(left|right)\]" would never match.
LINE_RE = re.compile(r"\[S\d+\]\s*\[\s*(left|right)\s*\]\s*(.+)", re.I)
|
|
|
|
|
def parse_script(script: str) -> List[Tuple[str, np.ndarray]]:
    """Turn a multi-line script into synthesized ``(side, waveform)`` pairs.

    Each line is stripped and matched against ``LINE_RE``; lines that do not
    match are skipped silently. For every matching line the text is
    synthesized via ``_tts`` at the azimuth that ``AZ_LOOKUP`` assigns to the
    line's side.

    Args:
        script: Raw script text, one utterance per line.

    Returns:
        A list of ``(side, waveform)`` tuples in script order, where ``side``
        is the lower-cased ``"left"`` or ``"right"``.

    Raises:
        gr.Error: If no line of the script matches the expected format.
    """
    tracks = []
    stripped_lines = (raw.strip() for raw in script.strip().splitlines())

    # Match objects are always truthy, so filter(None, ...) drops exactly the
    # lines for which LINE_RE.match returned None.
    for match in filter(None, map(LINE_RE.match, stripped_lines)):
        side_token, utterance = match.group(1, 2)
        side = side_token.lower()
        waveform = _tts(utterance.strip(), AZ_LOOKUP[side])
        tracks.append((side, waveform))

    if tracks:
        return tracks
    raise gr.Error("No valid lines found. Format: [S1][ left] Hello …")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _pad(pcm: np.ndarray, T: int) -> np.ndarray: |
|
|
return np.pad(pcm, ((0, 0), (0, T - pcm.shape[1])), "constant") |
|
|
|
|
|
def render(script: str):
    """Synthesize a script and build the per-speaker and combined mixes.

    Fixes over the previous version:

    * The combined mix used ``left_mix + right_mix`` directly, which raises a
      NumPy broadcasting ``ValueError`` whenever both sides contain audio of
      different lengths. Both mixes are now zero-padded to a common length
      before they are summed.
    * The fourth return value feeds a ``gr.File`` output, which expects a
      file path rather than raw ``bytes``. The zip archive is therefore
      written to a temporary file and its path is returned.

    Args:
        script: Raw script text, passed to ``parse_script``.

    Returns:
        A 4-tuple ``(left, right, dialog, zip_path)``. The first three items
        are ``(SR, array)`` pairs holding the transposed mixes; ``zip_path``
        is the path of a temporary ``.zip`` file containing
        ``left_speaker.wav``, ``right_speaker.wav`` and ``dialog_mix.wav``.

    Raises:
        gr.Error: Propagated from ``parse_script`` when no line is valid.
    """
    # Local import: only this function needs it, and it keeps the edit
    # independent of the file's import header.
    import tempfile

    tracks = parse_script(script)
    left = [wav for side, wav in tracks if side == "left"]
    right = [wav for side, wav in tracks if side == "right"]

    def combine(wavs):
        """Sum waveforms after zero-padding them to the longest one."""
        # NOTE(review): every waveform starts at sample 0, so several lines
        # for the same side overlap instead of playing one after another --
        # confirm this is the intended behaviour.
        if not wavs:
            # Placeholder: one silent frame (assumes 2-channel audio with
            # shape (channels, samples) -- TODO confirm against `synthesize`).
            return np.zeros((2, 1), dtype=np.float32)
        T = max(w.shape[1] for w in wavs)
        return sum(_pad(w, T) for w in wavs)

    left_mix = combine(left)
    right_mix = combine(right)

    # Align both sides on the sample axis before mixing; without this, two
    # non-empty sides of unequal length cannot be added.
    total = max(left_mix.shape[1], right_mix.shape[1])
    dialog = _pad(left_mix, total) + _pad(right_mix, total)

    zip_blob = _zip_bytes({
        "left_speaker.wav": left_mix.T,
        "right_speaker.wav": right_mix.T,
        "dialog_mix.wav": dialog.T,
    })

    # delete=False: the file must outlive this function so the UI can serve
    # it. Nothing here removes it afterwards.
    with tempfile.NamedTemporaryFile(
        prefix="spatial_dialog_", suffix=".zip", delete=False
    ) as zip_file:
        zip_file.write(zip_blob)
        zip_path = zip_file.name

    return (
        (SR, left_mix.T),
        (SR, right_mix.T),
        (SR, dialog.T),
        zip_path,
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _zip_bytes(files: dict) -> bytes:
    """Encode arrays as 16-bit PCM WAV files and pack them into a zip archive.

    Args:
        files: Mapping of archive member name to the audio array written by
            ``soundfile.write`` at sample rate ``SR`` (the caller in this
            file passes transposed mixes, presumably shaped
            ``(samples, channels)`` -- TODO confirm).

    Returns:
        The bytes of a deflate-compressed zip archive with one WAV member per
        entry in *files*.
    """
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for fname, data in files.items():
            wav_buf = io.BytesIO()
            # An in-memory buffer has no filename extension for soundfile to
            # infer the container from, so the format has to be given
            # explicitly; omitting it makes the write fail.
            sf.write(wav_buf, data, SR, format="WAV", subtype="PCM_16")
            zf.writestr(fname, wav_buf.getvalue())
    return buf.getvalue()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Gradio UI: a script box with a Generate button and zip download on one
# side, and three audio players (left, right, combined) on the other.
with gr.Blocks(title="Spatial Dialog Synth (Dia)") as demo:
    gr.Markdown(
        "### Spatial Dialog Synth\n"
        "Enter lines in the format `[S1][ left] Hello …` / `[S2][ right] …`"
    )

    with gr.Row():
        # Input column: script text, trigger button and the zip download.
        with gr.Column(scale=1):
            script_in = gr.Textbox(
                lines=8,
                placeholder="[S1][ left] Hello world…",
                label="Script",
            )
            gen_btn = gr.Button("Generate", variant="primary")
            zip_output = gr.File(label="Download all (zip)")

        # Output column: one player per mix returned by `render`.
        with gr.Column(scale=1):
            left_audio = gr.Audio(label="Left speaker")
            right_audio = gr.Audio(label="Right speaker")
            mix_audio = gr.Audio(label="Dialog mix")

    # The order of `outputs` must match the order of the tuple that `render`
    # returns.
    gen_btn.click(
        fn=render,
        inputs=script_in,
        outputs=[left_audio, right_audio, mix_audio, zip_output],
    )

# Runs before the server starts (presumably so the first request does not pay
# the model-loading cost -- confirm in `synthesis.preload_model`).
preload_model()

demo.launch()
|
|
|