shreyas-joshi Cursor committed on
Commit
8eeaf9e
·
1 Parent(s): 7348977

Fix audio jitter: remove real-time sleep, add ms_start to sentence events

Browse files

- Remove per-frame asyncio.sleep pacing. Frames are now sent as fast as
synthesis allows, letting the client buffer audio ahead of playback.
- Track cumulative_samples per chapter; include ms_start (ms from
chapter start) in every sentence JSON event so the client can fire
highlights at the correct playback position via getStreamTimeConsumed.
- Default prefetch raised to 6 in client (was 3).

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (1) hide show
  1. backend/server.py +16 -7
backend/server.py CHANGED
@@ -372,6 +372,11 @@ async def websocket_endpoint(websocket: WebSocket):
372
  )
373
 
374
  last_key = None
 
 
 
 
 
375
  try:
376
  control_task: asyncio.Task[str] | None = asyncio.create_task(websocket.receive_text())
377
 
@@ -423,26 +428,30 @@ async def websocket_endpoint(websocket: WebSocket):
423
 
424
  if cancel_event.is_set():
425
  break
 
426
  key = (p_idx + start_paragraph, s_idx, sentence)
427
  if key != last_key:
428
  last_key = key
 
 
 
429
  await websocket.send_json(
430
  {
431
  "type": "sentence",
432
  "text": sentence,
433
  "paragraph_index": int(p_idx + start_paragraph),
434
  "sentence_index": int(s_idx),
 
435
  }
436
  )
437
  await websocket.send_bytes(audio_frame)
 
 
438
 
439
- # Pace frames close to real-time so UI updates (sentence highlighting)
440
- # match what is audible, even when synthesis runs faster than realtime.
441
- if realtime:
442
- try:
443
- await asyncio.sleep(len(audio_frame) / (2 * app.state.tts.sample_rate))
444
- except Exception:
445
- pass
446
 
447
  if control_task is not None:
448
  control_task.cancel()
 
372
  )
373
 
374
  last_key = None
375
+ # Cumulative samples sent so far — used to stamp ms_start on each
376
+ # sentence event so the client can fire highlights at the right
377
+ # playback position rather than at message-arrival time.
378
+ cumulative_samples = 0
379
+ sample_rate = app.state.tts.sample_rate
380
  try:
381
  control_task: asyncio.Task[str] | None = asyncio.create_task(websocket.receive_text())
382
 
 
428
 
429
  if cancel_event.is_set():
430
  break
431
+
432
  key = (p_idx + start_paragraph, s_idx, sentence)
433
  if key != last_key:
434
  last_key = key
435
+ # ms_start lets the client fire this highlight exactly when
436
+ # the audio reaches this sentence, regardless of buffering.
437
+ ms_start = (cumulative_samples * 1000) // sample_rate
438
  await websocket.send_json(
439
  {
440
  "type": "sentence",
441
  "text": sentence,
442
  "paragraph_index": int(p_idx + start_paragraph),
443
  "sentence_index": int(s_idx),
444
+ "ms_start": ms_start,
445
  }
446
  )
447
  await websocket.send_bytes(audio_frame)
448
+ # Track cumulative audio sent (int16 = 2 bytes per sample).
449
+ cumulative_samples += len(audio_frame) // 2
450
 
451
+ # No real-time sleep: send frames as fast as synthesis allows.
452
+ # The client buffers audio and fires highlights via ms_start +
453
+ # getStreamTimeConsumed, so no pacing is needed here.
454
+ # For offline downloads (realtime=False) the same path applies.
 
 
 
455
 
456
  if control_task is not None:
457
  control_task.cancel()