File size: 4,227 Bytes
3e21dc5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
# app.py
import io, re, zipfile
from typing import Tuple, List

import gradio as gr
import numpy as np
import soundfile as sf

from synthesis import synthesize, preload_model

# Sample rate handed to synthesize() as `sr` and used when writing the WAV files.
SR        = 24_000
# Value passed to synthesize() as `dist_m` for every line.
# NOTE(review): the name suggests metres — confirm against synthesis.synthesize.
DIST_M    = 1.0
# Maps the side tag parsed from a script line to the `az_deg` passed to synthesize().
# LINE_RE only accepts the tags listed here, so a new side must be added in both places.
AZ_LOOKUP = {"left": -45, "right": 45}  # extend as needed

# ---------------------------------------------------------------------------
# 1. Minimal TTS helper (model cache lives inside synthesize)
# ---------------------------------------------------------------------------
def _tts(text: str, az_deg: float) -> np.ndarray:
    """Synthesize one script line at the given azimuth.

    Delegates to ``synthesize`` with the module-wide ``DIST_M`` and ``SR``.

    Args:
        text: The words to speak.
        az_deg: Azimuth forwarded to ``synthesize`` as ``az_deg``.

    Returns:
        Whatever ``synthesize`` returns; callers in this file index it as a
        2-D array with samples along axis 1 (presumably ``(2, T)`` stereo —
        confirm against ``synthesis.synthesize``).
    """
    rendered = synthesize(text, az_deg=az_deg, dist_m=DIST_M, sr=SR)
    return rendered

# ---------------------------------------------------------------------------
# 2. Parse textarea ➜ list[(side, wav)]
# ---------------------------------------------------------------------------
# One script line: "[S<n>][<side>] <text>".  Whitespace is tolerated inside both
# bracket groups so that the documented form "[S1][ left] Hello" is accepted as
# well as the compact "[S1][left] Hello".  Matching is case-insensitive.
LINE_RE = re.compile(r"\[\s*S\d+\s*\]\s*\[\s*(left|right)\s*\]\s*(.+)", re.I)

def parse_script(script: str) -> List[Tuple[str, np.ndarray]]:
    """Turn the textarea contents into synthesized per-line tracks.

    Each line that matches ``LINE_RE`` is synthesized via ``_tts`` at the
    azimuth that ``AZ_LOOKUP`` assigns to its side.  Lines that do not match
    (blank lines, comments, malformed entries) are skipped silently.

    Args:
        script: Raw multi-line script text.

    Returns:
        A list of ``(side, wav)`` pairs in script order, where ``side`` is the
        lower-cased tag (``"left"`` or ``"right"``) and ``wav`` is the array
        returned by ``_tts``.

    Raises:
        gr.Error: If no line in ``script`` matches the expected format.
    """
    tracks = []
    for ln in script.strip().splitlines():
        m = LINE_RE.match(ln.strip())
        if not m:
            continue
        # Lower-case the tag so "[LEFT]" and "[left]" share one AZ_LOOKUP key.
        side, text = m.group(1).lower(), m.group(2).strip()
        tracks.append((side, _tts(text, AZ_LOOKUP[side])))
    if not tracks:
        raise gr.Error("No valid lines found. Format: [S1][ left] Hello …")
    return tracks

# ---------------------------------------------------------------------------
# 3. Mix per side
# ---------------------------------------------------------------------------
def _pad(pcm: np.ndarray, T: int) -> np.ndarray:
    """Right-pad a 2-D sample array with zeros along axis 1 up to length ``T``.

    Args:
        pcm: 2-D array whose second axis is the one being extended.
        T: Target length of axis 1; expected to be at least ``pcm.shape[1]``.

    Returns:
        A new array of shape ``(pcm.shape[0], T)`` whose leading columns are
        ``pcm`` and whose trailing columns are zero.
    """
    missing = T - pcm.shape[1]
    # Axis 0 is left untouched; axis 1 grows only at its end.
    widths = ((0, 0), (0, missing))
    return np.pad(pcm, widths, "constant")

def render(script: str):
    """Synthesize a script and build the per-side mixes plus a ZIP download.

    All lines tagged "left" are summed into one track and all lines tagged
    "right" into another (each line starts at sample 0; shorter lines are
    zero-padded).  The two side mixes are then summed into a dialog mix.

    Args:
        script: Raw script text as entered in the UI.

    Returns:
        A 4-tuple matching the click handler's outputs:
        ``(SR, left)``, ``(SR, right)``, ``(SR, dialog)`` — each array
        transposed to samples-first — followed by the path of a temporary
        ZIP file that contains the three mixes as WAV files.

    Raises:
        gr.Error: Propagated from ``parse_script`` when no line is valid.
    """
    # Function-scope import keeps this block self-contained.
    import tempfile

    tracks = parse_script(script)
    left   = [w for side, w in tracks if side == "left"]
    right  = [w for side, w in tracks if side == "right"]

    def combine(wavs):
        # A side with no lines still needs an array so the outputs stay valid.
        if not wavs:
            return np.zeros((2, 1), dtype=np.float32)
        T = max(w.shape[1] for w in wavs)
        return sum(_pad(w, T) for w in wavs)

    left_mix  = combine(left)
    right_mix = combine(right)

    # The two sides generally differ in length; adding them directly only
    # works when one of them is the 1-sample placeholder.  Pad both to a
    # common length so the sum is always well-defined.
    mix_len = max(left_mix.shape[1], right_mix.shape[1])
    dialog  = _pad(left_mix, mix_len) + _pad(right_mix, mix_len)

    archive = _zip_bytes({
        "left_speaker.wav":  left_mix.T,
        "right_speaker.wav": right_mix.T,
        "dialog_mix.wav":    dialog.T,
    })

    # The gr.File output takes a file path rather than raw bytes, so persist
    # the archive.  delete=False keeps it on disk after the handle is closed
    # so Gradio can serve it.
    with tempfile.NamedTemporaryFile(
        prefix="spatial_dialog_", suffix=".zip", delete=False
    ) as fh:
        fh.write(archive)
        zip_path = fh.name

    return (
        (SR, left_mix.T),
        (SR, right_mix.T),
        (SR, dialog.T),
        zip_path,
    )

# ---------------------------------------------------------------------------
# 4. Utility – ZIP builder
# ---------------------------------------------------------------------------
def _zip_bytes(files: dict) -> bytes:
    """Encode arrays as 16-bit PCM WAV files and pack them into a ZIP archive.

    Args:
        files: Mapping of archive member name to the sample array to encode.
            Each array is handed to ``soundfile.write`` unchanged, at the
            module-wide sample rate ``SR``.

    Returns:
        The bytes of the finished (deflate-compressed) ZIP archive.
    """
    buf = io.BytesIO()
    with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as zf:
        for fname, data in files.items():
            wav_buf = io.BytesIO()
            # A BytesIO has no file name, so soundfile cannot infer the
            # container from an extension; the format must be given explicitly.
            sf.write(wav_buf, data, SR, subtype="PCM_16", format="WAV")
            zf.writestr(fname, wav_buf.getvalue())
    # Read the buffer only after the with-block: closing the ZipFile is what
    # writes the central directory.
    return buf.getvalue()

# ---------------------------------------------------------------------------
# 5. Gradio UI
# ---------------------------------------------------------------------------
# Two-column page: script input, trigger button and ZIP download on the left;
# the three audio players on the right.  Components appear in the order they
# are constructed inside each container.
with gr.Blocks(title="Spatial Dialog Synth (Dia)") as demo:
    # Short usage hint shown at the top of the page.
    gr.Markdown("### Spatial Dialog Synth\n"
                "Enter lines in the format `[S1][ left] Hello …` / `[S2][ right] …`")
    
    with gr.Row():
        # Left column - Input and Download
        with gr.Column(scale=1):
            script_in = gr.Textbox(lines=8, placeholder="[S1][ left] Hello world…", label="Script")
            gen_btn = gr.Button("Generate", variant="primary")
            zip_output = gr.File(label="Download all (zip)")
        
        # Right column - Audio outputs
        with gr.Column(scale=1):
            left_audio = gr.Audio(label="Left speaker")
            right_audio = gr.Audio(label="Right speaker")
            mix_audio = gr.Audio(label="Dialog mix")

    # render() returns four values; they are assigned to `outputs` positionally,
    # so the order here must match render()'s return tuple.
    gen_btn.click(
        fn=render,
        inputs=script_in,
        outputs=[left_audio, right_audio, mix_audio, zip_output]
    )
    
# ---------------------------------------------------------------------------
# 6. Pre-warm Dia so first user click is instant
# ---------------------------------------------------------------------------
# NOTE(review): both calls below run at import time — there is no
# `if __name__ == "__main__":` guard — so importing this module loads the
# model and starts the server.
preload_model()          # blocks ~30 s only on very first container start

# Start the Gradio server for the Blocks app defined above.
demo.launch()