File size: 8,187 Bytes
e88b235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d94b53d
e88b235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a2774f
e88b235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a2774f
e88b235
 
f75e3fe
 
 
e88b235
f75e3fe
e88b235
 
 
f75e3fe
e88b235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7a2774f
e88b235
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<title>LiveEdit · Realtime</title>
<style>
  :root { --bg:#0e0f13; --panel:#171922; --line:#2a2e3a; --fg:#e8e8ee; --accent:#c084fc; --good:#86efac; }
  * { box-sizing: border-box; }
  body { margin:0; background:var(--bg); color:var(--fg); font-family:-apple-system,BlinkMacSystemFont,"Segoe UI",Helvetica,Arial,sans-serif; }
  .wrap { max-width:1100px; margin:0 auto; padding:24px 18px 60px; }
  h1 { font-size:1.5rem; margin:0 0 4px; }
  .sub { color:#9aa0ad; font-size:.95rem; margin:0 0 18px; line-height:1.5; }
  .sub a { color:var(--accent); }
  .controls { display:flex; gap:10px; flex-wrap:wrap; align-items:center; margin-bottom:16px; }
  input[type=text] { flex:1; min-width:260px; background:var(--panel); border:1px solid var(--line); color:var(--fg); padding:12px 14px; border-radius:10px; font-size:1rem; }
  button { background:var(--accent); color:#1a1024; border:0; padding:12px 18px; border-radius:10px; font-size:1rem; font-weight:600; cursor:pointer; }
  button:disabled { opacity:.5; cursor:not-allowed; }
  .timer { font-family:ui-monospace,Menlo,monospace; color:var(--good); background:#11210f; border:1px solid #234; padding:8px 12px; border-radius:8px; display:none; }
  .grid { display:grid; grid-template-columns:1fr 1fr; gap:14px; }
  .card { background:var(--panel); border:1px solid var(--line); border-radius:14px; overflow:hidden; }
  .card h2 { font-size:.85rem; text-transform:uppercase; letter-spacing:.05em; color:#9aa0ad; margin:0; padding:10px 14px; border-bottom:1px solid var(--line); }
  .media { aspect-ratio:832/480; background:#000; display:flex; align-items:center; justify-content:center; }
  .media video, .media img { width:100%; height:100%; object-fit:cover; display:block; }
  .placeholder { color:#5a6070; font-size:.9rem; }
  .status { margin-top:14px; color:#9aa0ad; font-size:.9rem; min-height:1.2em; }
  @media (max-width:780px){ .grid{ grid-template-columns:1fr; } }
</style>
</head>
<body>
<div class="wrap">
  <h1>🌀 StreamDiffusionV2 · Realtime Webcam Diffusion</h1>
  <p class="sub">
    Live demo of <a href="https://streamdiffusionv2.github.io/" target="_blank">StreamDiffusionV2</a>
    (MLSys 2026 Best Paper) on Wan2.1-T2V-1.3B. It streams your webcam through a causal video-diffusion
    model with a <b>sink-token rolling KV cache</b> &mdash; built for <i>continuous</i> streaming, so it
    keeps flowing without the window-shift burst. Type a style prompt, click <b>Start</b> to grab ZeroGPU
    for ~60s. <b>Change the prompt anytime</b> &mdash; it updates the live stream.
  </p>

  <div class="controls">
    <input id="instruction" type="text" placeholder="style prompt · e.g. psychedelic neon dream · van gogh · cyberpunk city" />
    <button id="startBtn">▶ Start session</button>
    <span id="timer" class="timer"><span id="count">58</span>s</span>
  </div>

  <div class="grid">
    <div class="card">
      <h2>Your webcam</h2>
      <div class="media"><video id="cam" autoplay muted playsinline></video></div>
    </div>
    <div class="card">
      <h2>Edited (live)</h2>
      <div class="media"><img id="out" alt="" /><span id="outPh" class="placeholder">edited stream appears here</span></div>
    </div>
  </div>

  <div id="status" class="status"></div>
</div>

<canvas id="grab" width="640" height="360" style="display:none"></canvas>

<script type="module">
import { Client } from "https://cdn.jsdelivr.net/npm/@gradio/client/dist/index.min.js";

// --- DOM handles (ids match the markup above) ---
// <video> showing the raw webcam feed.
const camEl = document.getElementById("cam");
// <img> whose src is swapped to display each edited frame.
const outEl = document.getElementById("out");
// Placeholder text shown until the first edited frame is displayed.
const outPh = document.getElementById("outPh");
const startBtn = document.getElementById("startBtn");
// Text input holding the style prompt.
const instr = document.getElementById("instruction");
// Countdown badge (timerEl) and the number inside it (countEl).
const timerEl = document.getElementById("timer");
const countEl = document.getElementById("count");
const statusEl = document.getElementById("status");
// Hidden canvas used to rasterise webcam frames before upload.
const grab = document.getElementById("grab");
const gctx = grab.getContext("2d");

// --- Session state ---
// Gradio client, created on first use by ensureClient().
let client = null;
// Webcam stream from getUserMedia, created on first use by ensureCam().
let stream = null;
// setInterval handles for the frame uploader and the countdown (null when idle).
let captureTimer = null;
let countdownTimer = null;
// True while a session is in progress; gates re-entry and live prompt updates.
let running = false;

const FPS = 30;                // webcam frames sent per second (backend drops backlog)
// Starting value of the on-screen countdown, in seconds.
const SESSION_SECONDS = 58;

// --- Low-latency player state ---
// StreamDiffusionV2 streams steadily (~14fps), so we do NOT jitter-buffer.
// Keep a tiny queue and aggressively drop the backlog so the preview always
// shows the most recent edit (minimal action->reaction delay).
// Frames waiting to be displayed, oldest first (pushed at the end, shifted from the front).
let playQueue = [];
const MAX_QUEUE = 3;           // ~0.2s worth; drop older frames past this
// requestAnimationFrame timestamp (ms) at which the last frame was displayed.
let lastShown = 0;
/**
 * Display loop: re-arms itself on every animation frame and shows at most one
 * queued frame per 1/30 s.
 *
 * Before displaying, the queue is trimmed to its newest MAX_QUEUE entries so
 * the preview never falls far behind the incoming stream.
 *
 * @param {number} ts - Timestamp passed in by requestAnimationFrame (ms).
 */
function playLoop(ts){
  const minGapMs = 1000 / 30;

  // Drop the oldest frames once the backlog exceeds the cap.
  const overCap = playQueue.length > MAX_QUEUE;
  if (overCap) {
    playQueue = playQueue.slice(-MAX_QUEUE);
  }

  const hasFrame = playQueue.length > 0;
  const gapElapsed = ts - lastShown >= minGapMs;
  if (hasFrame && gapElapsed) {
    outEl.src = playQueue.shift();
    // The placeholder is only useful until the first frame is on screen.
    outPh.style.display = "none";
    lastShown = ts;
  }

  requestAnimationFrame(playLoop);
}
// Start the display loop at module load; playLoop re-schedules itself on every frame.
requestAnimationFrame(playLoop);

/**
 * Shows a message in the status line element.
 *
 * @param {*} t - Value assigned to the element's `textContent` as-is.
 */
function setStatus(t){
  statusEl.textContent = t;
}

/**
 * Returns the shared Gradio client, connecting to this page's origin the
 * first time it is needed and reusing that connection afterwards.
 *
 * @returns {Promise<object>} The connected client.
 */
async function ensureClient(){
  if (client) {
    return client;
  }
  const { origin } = window.location;
  client = await Client.connect(origin);
  return client;
}

/**
 * Opens the webcam once and attaches it to the preview <video>.
 *
 * Later calls are no-ops while a stream is already held. A rejected
 * `play()` promise is ignored so it does not abort session start.
 *
 * @returns {Promise<void>}
 */
async function ensureCam(){
  if (stream) {
    return;
  }

  const constraints = { video: { width: 832, height: 480 }, audio: false };
  stream = await navigator.mediaDevices.getUserMedia(constraints);
  camEl.srcObject = stream;

  const playback = camEl.play();
  await playback.catch(() => {});
}

// Promise for the prompt upload currently in progress, or null when idle.
let instructionPending = null;
// Set when the prompt was requested again while an upload was in flight.
let instructionDirty = false;

/**
 * POSTs the prompt field's current value until no newer request is waiting.
 * Network errors and non-2xx replies are logged, never thrown.
 *
 * @returns {Promise<void>}
 */
async function postLatestInstruction(){
  do {
    instructionDirty = false;
    try {
      const res = await fetch("/instruction", {
        method:"POST", headers:{ "Content-Type":"application/json" },
        // Read the field at send time so a coalesced request carries the newest text.
        body: JSON.stringify({ instruction: instr.value || "" })
      });
      if (!res.ok) console.warn(`Prompt update rejected: HTTP ${res.status}`);
    } catch (err) {
      // Best-effort: a failed prompt update must not abort the session.
      console.warn("Prompt update failed:", err);
    }
  } while (instructionDirty);
}

/**
 * Sends the current style prompt to the backend (`POST /instruction`).
 *
 * At most one request is in flight at a time. Calls made while a request is
 * pending are coalesced into a single follow-up POST with the latest prompt,
 * so rapid typing cannot leave the backend on a stale value through
 * out-of-order completion.
 *
 * @returns {Promise<void>} Resolves once the latest prompt has been sent (or
 *   the attempt has failed); never rejects.
 */
async function sendInstruction(){
  if (instructionPending) {
    instructionDirty = true;
    return instructionPending;
  }
  // `.finally` runs asynchronously, so the handle is always assigned before it is reset.
  instructionPending = postLatestInstruction().finally(() => {
    instructionPending = null;
  });
  return instructionPending;
}

/**
 * Starts uploading webcam frames: every 1/FPS s the current video frame is
 * drawn onto the hidden canvas, JPEG-encoded (quality 0.6) and POSTed to
 * `/frame`.
 *
 * Safe to call while a capture is already running: the previous interval is
 * cleared first, so its handle is never overwritten and leaked (which would
 * leave an uploader that stopAll() cannot stop).
 */
function startCapture(){
  if (captureTimer) {
    clearInterval(captureTimer);
    captureTimer = null;
  }
  captureTimer = setInterval(() => {
    // videoWidth is 0 until the camera has delivered its first frame.
    if (!camEl.videoWidth) return;
    gctx.drawImage(camEl, 0, 0, grab.width, grab.height);
    grab.toBlob(async (blob) => {
      if (!blob) return;
      try {
        await fetch("/frame", { method:"POST", body: blob });
      } catch (e) {
        // Deliberately ignored: a lost frame is superseded by the next tick.
      }
    }, "image/jpeg", 0.6);
  }, 1000 / FPS);
}

/**
 * Returns the page to its idle state: marks the session as not running,
 * cancels the frame-upload and countdown intervals if they are active, hides
 * the countdown badge and re-arms the Start button.
 */
function stopAll(){
  running = false;

  const captureHandle = captureTimer;
  if (captureHandle) {
    clearInterval(captureHandle);
    captureTimer = null;
  }

  const countdownHandle = countdownTimer;
  if (countdownHandle) {
    clearInterval(countdownHandle);
    countdownTimer = null;
  }

  timerEl.style.display = "none";
  Object.assign(startBtn, { disabled: false, textContent: "▶ Start session" });
}

/**
 * Shows the countdown badge and ticks it down once per second from
 * SESSION_SECONDS to 0, then stops itself.
 *
 * The interval id is captured locally so each tick cancels only its own
 * timer. A previous countdown that is still running is cancelled on entry,
 * so restarting cannot leave a stray ticker behind or let the old ticker
 * cancel the new one through the shared `countdownTimer` handle.
 */
function startCountdown(){
  if (countdownTimer) {
    clearInterval(countdownTimer);
    countdownTimer = null;
  }

  let remaining = SESSION_SECONDS;
  countEl.textContent = remaining;
  timerEl.style.display = "inline-block";

  const timerId = setInterval(() => {
    remaining -= 1;
    countEl.textContent = Math.max(0, remaining);
    if (remaining <= 0) {
      clearInterval(timerId);
      // Only release the shared handle if it still refers to this countdown.
      if (countdownTimer === timerId) countdownTimer = null;
    }
  }, 1000);
  countdownTimer = timerId;
}

// While a session is live, forward prompt edits to the backend both on every
// keystroke ("input") and when the edit is committed ("change").
const pushPromptIfLive = () => {
  if (running) sendInstruction();
};
for (const eventName of ["change", "input"]) {
  instr.addEventListener(eventName, pushPromptIfLive);
}

// Start-button handler: runs one complete streaming session.
//   1. Acquire the webcam and the Gradio client.
//   2. Submit /run_session and wait for the "__READY__" marker.
//   3. On "__READY__", begin uploading frames and start the countdown.
//   4. Queue every other payload for display until the stream ends.
// Whatever happens, stopAll() restores the idle UI afterwards.
startBtn.addEventListener("click", async () => {
  if (running) {
    return;
  }
  startBtn.disabled = true;

  try {
    setStatus("Requesting webcam…");
    await ensureCam();

    setStatus("Connecting…");
    await ensureClient();

    running = true;
    playQueue = [];
    startBtn.textContent = "◌ Acquiring ZeroGPU…";
    await sendInstruction();
    setStatus("Queued for ZeroGPU — webcam streaming starts once the GPU is acquired…");

    const session = client.submit("/run_session", {});
    let editedCount = 0;

    for await (const event of session) {
      // Only data events carrying a non-null first item are of interest.
      const carriesPayload = event.type === "data" && event.data && event.data[0] != null;
      if (!carriesPayload) {
        continue;
      }

      const payload = event.data[0];
      if (payload !== "__READY__") {
        // Edited frame: hand it to the display loop, which drops backlog and paces output.
        playQueue.push(payload);
        editedCount += 1;
        if (editedCount % 12 === 0) {
          setStatus(`Streaming… ${editedCount} edited frames`);
        }
        continue;
      }

      // GPU is now allocated — only now start capturing & sending frames.
      startBtn.textContent = "● Live";
      await sendInstruction();
      startCapture();
      startCountdown();
      setStatus("ZeroGPU acquired — streaming your webcam through LiveEdit…");
    }

    if (editedCount) {
      setStatus("Session ended. Click Start to run another ~60s session.");
    } else {
      setStatus("Session ended before any frames were produced — try again.");
    }
  } catch (e) {
    const detail = (e && e.message) || e;
    setStatus("Error: " + detail);
  } finally {
    stopAll();
  }
});
</script>
</body>
</html>