agkavin committed on Mar 11

Commit

9400b83

1 Parent(s): a4cc15e

avatars

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.gitattributes +27 -10
.gitignore +3 -3
backend/api/pipeline.py +306 -289
backend/api/server.py +157 -217
backend/avatars/christine/coords.pkl +3 -0
backend/avatars/christine/full_imgs/00000000.png +3 -0
backend/avatars/christine/full_imgs/00000001.png +3 -0
backend/avatars/christine/full_imgs/00000002.png +3 -0
backend/avatars/christine/full_imgs/00000003.png +3 -0
backend/avatars/christine/full_imgs/00000004.png +3 -0
backend/avatars/christine/full_imgs/00000005.png +3 -0
backend/avatars/christine/full_imgs/00000006.png +3 -0
backend/avatars/christine/full_imgs/00000007.png +3 -0
backend/avatars/christine/full_imgs/00000008.png +3 -0
backend/avatars/christine/full_imgs/00000009.png +3 -0
backend/avatars/christine/full_imgs/00000010.png +3 -0
backend/avatars/christine/full_imgs/00000011.png +3 -0
backend/avatars/christine/full_imgs/00000012.png +3 -0
backend/avatars/christine/full_imgs/00000013.png +3 -0
backend/avatars/christine/full_imgs/00000014.png +3 -0
backend/avatars/christine/full_imgs/00000015.png +3 -0
backend/avatars/christine/full_imgs/00000016.png +3 -0
backend/avatars/christine/full_imgs/00000017.png +3 -0
backend/avatars/christine/full_imgs/00000018.png +3 -0
backend/avatars/christine/full_imgs/00000019.png +3 -0
backend/avatars/christine/full_imgs/00000020.png +3 -0
backend/avatars/christine/full_imgs/00000021.png +3 -0
backend/avatars/christine/full_imgs/00000022.png +3 -0
backend/avatars/christine/full_imgs/00000023.png +3 -0
backend/avatars/christine/full_imgs/00000024.png +3 -0
backend/avatars/christine/full_imgs/00000025.png +3 -0
backend/avatars/christine/mask/00000000.png +3 -0
backend/avatars/christine/mask/00000001.png +3 -0
backend/avatars/christine/mask/00000002.png +3 -0
backend/avatars/christine/mask/00000003.png +3 -0
backend/avatars/christine/mask/00000004.png +3 -0
backend/avatars/christine/mask/00000005.png +3 -0
backend/avatars/christine/mask/00000006.png +3 -0
backend/avatars/christine/mask/00000007.png +3 -0
backend/avatars/christine/mask/00000008.png +3 -0
backend/avatars/christine/mask/00000009.png +3 -0
backend/avatars/christine/mask/00000010.png +3 -0
backend/avatars/christine/mask/00000011.png +3 -0
backend/avatars/christine/mask/00000012.png +3 -0
backend/avatars/christine/mask/00000013.png +3 -0
backend/avatars/christine/mask/00000014.png +3 -0
backend/avatars/christine/mask/00000015.png +3 -0
backend/avatars/christine/mask/00000016.png +3 -0
backend/avatars/christine/mask/00000017.png +3 -0
backend/avatars/christine/mask/00000018.png +3 -0

.gitattributes CHANGED Viewed

@@ -1,10 +1,28 @@
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
-*.gguf filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
 *.jpg filter=lfs diff=lfs merge=lfs -text
 *.jpeg filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
@@ -13,10 +31,9 @@
 *.mp3 filter=lfs diff=lfs merge=lfs -text
 *.mp4 filter=lfs diff=lfs merge=lfs -text
 *.webm filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.hdf5 filter=lfs diff=lfs merge=lfs -text

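+# Git LFS attributes for large binary files.
+# Each pattern below is routed through LFS (filter/diff/merge=lfs) and marked
+# -text so Git does not treat the files as text (no EOL conversion, no text diff).
+# Only include file types that are typically large or binary.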
+# common model formats
 *.bin filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
 *.onnx filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
 *.safetensors filter=lfs diff=lfs merge=lfs -text
+*.gguf filter=lfs diff=lfs merge=lfs -text
 *.pkl filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.hdf5 filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+# archives
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+# media assets
 *.jpg filter=lfs diff=lfs merge=lfs -text
 *.jpeg filter=lfs diff=lfs merge=lfs -text
 *.png filter=lfs diff=lfs merge=lfs -text
 *.mp3 filter=lfs diff=lfs merge=lfs -text
 *.mp4 filter=lfs diff=lfs merge=lfs -text
 *.webm filter=lfs diff=lfs merge=lfs -text
+# project-specific large paths
+backend/personaplex-7b-v1-bnb-4bit/model_bnb_4bit.pt filter=lfs diff=lfs merge=lfs -text
+backend/avatars/**/* filter=lfs diff=lfs merge=lfs -text
+src/musetalk/models/**/* filter=lfs diff=lfs merge=lfs -text

.gitignore CHANGED Viewed

@@ -7,9 +7,9 @@ __pycache__/
 # are tracked via Git LFS (see .gitattributes)
 # Avatar image frames (pre-computed, regenerated by precompute_avatar.py)
-backend/avatars/*/full_imgs/
-backend/avatars/*/mask/
-backend/avatars/*/*.pkl
 # Frontend dependencies
 frontend/node_modules/

 # are tracked via Git LFS (see .gitattributes)
 # Avatar image frames (pre-computed, regenerated by precompute_avatar.py)
+# backend/avatars/*/full_imgs/
+# backend/avatars/*/mask/
+# backend/avatars/*/*.pkl
 # Frontend dependencies
 frontend/node_modules/

backend/api/pipeline.py CHANGED Viewed

@@ -1,8 +1,21 @@
 """
-Speech-to-Video Pipeline Orchestrator
-=====================================
-Coordinates TTS → MuseTalk → LiveKit publishing.
-Optimized for low latency (<200ms initial response).
 """
 from __future__ import annotations
@@ -13,162 +26,38 @@ from typing import Optional
 import numpy as np
-# Use relative imports for standalone
 import sys
 from pathlib import Path
-_backend_dir = Path(__file__).parent.parent.parent
 if str(_backend_dir) not in sys.path:
     sys.path.insert(0, str(_backend_dir))
 from config import (
-    CHUNK_DURATION,
     FRAMES_PER_CHUNK,
     TTS_SAMPLE_RATE,
-    TTS_SAMPLES_PER_CHUNK,
     VIDEO_FPS,
-    SYSTEM_PROMPT,
 )
 from tts.kokoro_tts import KokoroTTS
-from musetalk.worker import MuseTalkWorker, AVChunk
-from sync.av_sync import AVSyncGate, SimpleAVSync
 from publisher.livekit_publisher import AVPublisher, IdleFrameGenerator
 log = logging.getLogger(__name__)
-class SpeechToVideoPipeline:
-    """
-    Main pipeline: Text → TTS → MuseTalk → LiveKit
-    Optimized for smooth, synchronized AV output.
-    """
-    def __init__(
-        self,
-        tts: KokoroTTS,
-        musetalk: MuseTalkWorker,
-        publisher: AVPublisher,
-        avatar_assets,
-    ):
-        self._tts = tts
-        self._musetalk = musetalk
-        self._publisher = publisher
-        self._avatar_assets = avatar_assets
-        self._idle_generator = IdleFrameGenerator(
-            avatar_assets,
-            target_width=publisher._video_width,
-            target_height=publisher._video_height,
-        )
-        self._av_sync = SimpleAVSync(video_fps=VIDEO_FPS)
-        self._running = False
-        self._idle_task: Optional[asyncio.Task] = None
-        log.info("SpeechToVideoPipeline initialized")
-    async def start(self):
-        """Start the pipeline."""
-        self._running = True
-        self._idle_task = asyncio.create_task(self._idle_loop())
-        log.info("Pipeline started")
-    async def stop(self):
-        """Stop the pipeline."""
-        self._running = False
-        if self._idle_task:
-            self._idle_task.cancel()
-            try:
-                await self._idle_task
-            except asyncio.CancelledError:
-                pass
-        log.info("Pipeline stopped")
-    async def speak(self, text: str) -> float:
-        """
-        Process text and generate synchronized AV output.
-        Args:
-            text: Text to speak
-        Returns:
-            Start latency in seconds
-        """
-        start_time = time.monotonic()
-        # Process in chunks for low latency
-        chunk_id = 0
-        current_pts = 0.0
-        # Stream TTS and process through MuseTalk
-        async for audio_chunk, pts_start, pts_end in self._tts.synthesize_stream(text):
-            # Process through MuseTalk
-            av_chunk = await self._musetalk.process_chunk(
-                audio_pcm=audio_chunk,
-                chunk_id=chunk_id,
-                pts_start=pts_start,
-                pts_end=pts_end,
-                is_last=False,
-            )
-            # Publish synchronized AV
-            await self._publisher.publish_av_chunk(
-                audio=av_chunk.audio_pcm,
-                video_frames=av_chunk.video_frames,
-                pts_start=pts_start,
-            )
-            chunk_id += 1
-            current_pts = pts_end
-        latency = time.monotonic() - start_time
-        log.info(f"Speech completed in {latency:.3f}s")
-        return latency
-    async def _idle_loop(self):
-        """Idle animation loop when not speaking."""
-        frame_interval = 1.0 / VIDEO_FPS
-        session_start = time.monotonic()
-        log.info("Idle loop started")
-        try:
-            while self._running:
-                frame_start = time.monotonic()
-                # Get idle frame
-                idle_frame = self._idle_generator.next_frame()
-                # Calculate PTS
-                pts_us = int((frame_start - session_start) * 1_000_000)
-                # Publish video frame
-                await self._publisher.publish_video_frame(idle_frame, pts_us)
-                # Maintain frame rate
-                elapsed = time.monotonic() - frame_start
-                sleep_time = frame_interval - elapsed
-                if sleep_time > 0:
-                    await asyncio.sleep(sleep_time)
-                elif sleep_time < -0.01:
-                    log.warning("Frame took too long: %.3fs", -sleep_time)
-        except asyncio.CancelledError:
-            log.info("Idle loop cancelled")
-            raise
 class StreamingPipeline:
     """
-    Streaming version of the pipeline for real-time text input.
-    Processes text incrementally for lower latency.
     """
     def __init__(
         self,
         tts: KokoroTTS,
@@ -180,200 +69,328 @@ class StreamingPipeline:
         self._musetalk = musetalk
         self._publisher = publisher
         self._avatar_assets = avatar_assets
         self._idle_generator = IdleFrameGenerator(
             avatar_assets,
             target_width=publisher._video_width,
             target_height=publisher._video_height,
         )
         self._running = False
-        self._processing = False
-        self._speaking = False          # True while _speak_text is active
-        self._idle_task: Optional[asyncio.Task] = None
-        # Queue holds (video_frame, audio_slice_or_None) tuples.
-        # The idle loop drains at 25fps and publishes both in lockstep.
-        # Size 256 ≈ ~10s of video at 25fps.
-        self._video_queue: asyncio.Queue = asyncio.Queue(maxsize=256)
         self._text_queue: asyncio.Queue = asyncio.Queue()
-        log.info("StreamingPipeline initialized")
     async def start(self):
-        """Start the pipeline."""
         self._running = True
-        self._idle_task = asyncio.create_task(self._idle_loop())
         log.info("StreamingPipeline started")
     async def stop(self):
-        """Stop the pipeline."""
         self._running = False
-        if self._idle_task:
-            self._idle_task.cancel()
-            try:
-                await self._idle_task
-            except asyncio.CancelledError:
-                pass
         log.info("StreamingPipeline stopped")
     async def push_text(self, text: str):
         """
-        Push text to be spoken.
-        Non-blocking - starts processing immediately.
         """
-        await self._text_queue.put(text)
-        if not self._processing:
-            self._processing = True  # Set before task creation to prevent double-spawn
-            asyncio.create_task(self._process_queue())
-    async def _process_queue(self):
-        """Process text queue."""
-        self._processing = True
         try:
             while self._running:
                 try:
-                    text = await asyncio.wait_for(
-                        self._text_queue.get(),
-                        timeout=0.1
-                    )
                 except asyncio.TimeoutError:
-                    break
-                await self._speak_text(text)
-        finally:
-            self._processing = False
-    async def _speak_text(self, text: str):
         """
-        Speak text with sub-batch streaming.
-        Flow per Kokoro audio chunk:
-          1. Whisper encoder — once (~40 ms)
-          2. For each sub-batch of 4 frames:
-             a. MuseTalk UNet (~100 ms)
-             b. Chop audio into per-frame slices
-             c. Push (frame, audio_slice) tuples to queue
-          The idle loop drains at 25fps, publishing video + audio in lockstep.
         """
-        start_time = time.monotonic()
-        first_batch_logged = False
-        chunk_id = 0
-        BATCH = self._musetalk.BATCH_FRAMES  # 4
-        self._speaking = True
         try:
-            async for audio_chunk, pts_start, pts_end in self._tts.synthesize_stream(text):
-                # Flatten audio
-                audio_flat = audio_chunk.flatten() if audio_chunk.ndim > 1 else audio_chunk
-                audio_dur  = len(audio_flat) / TTS_SAMPLE_RATE
-                total_frames = max(1, round(audio_dur * VIDEO_FPS))
-                samples_per_frame = len(audio_flat) / total_frames
-                # Phase 1 — Whisper (once for the whole Kokoro chunk)
                 t0 = time.monotonic()
-                feats, _ = await self._musetalk.extract_features(audio_flat)
-                whisper_ms = (time.monotonic() - t0) * 1000
-                # Phase 2 — stream sub-batches of BATCH frames
                 for batch_start in range(0, total_frames, BATCH):
                     n = min(BATCH, total_frames - batch_start)
-                    t1 = time.monotonic()
                     frames = await self._musetalk.generate_batch(feats, batch_start, n)
-                    unet_ms = (time.monotonic() - t1) * 1000
-                    if not first_batch_logged:
-                        elapsed = (time.monotonic() - start_time) * 1000
-                        log.info("first batch: whisper %.0fms  unet %.0fms  total %.0fms  "
-                                 "(%d frames)", whisper_ms, unet_ms, elapsed, n)
-                        first_batch_logged = True
-                    # Slice matching audio for this sub-batch
-                    a_start = int(batch_start * samples_per_frame)
-                    a_end   = int((batch_start + n) * samples_per_frame)
-                    batch_audio = audio_flat[a_start:a_end]
-                    # Chop into per-frame audio and push (frame, audio) tuples
-                    frame_samples = int(samples_per_frame)
-                    for fi, vf in enumerate(frames):
-                        fa_s = fi * frame_samples
-                        fa_e = min((fi + 1) * frame_samples, len(batch_audio))
-                        per_frame_audio = batch_audio[fa_s:fa_e] if fa_e > fa_s else None
-                        try:
-                            self._video_queue.put_nowait((vf, per_frame_audio))
-                        except asyncio.QueueFull:
-                            log.warning("Video queue full — dropping oldest frame (audio gap risk)")
-                            try:
-                                self._video_queue.get_nowait()
-                            except asyncio.QueueEmpty:
-                                pass
-                            self._video_queue.put_nowait((vf, per_frame_audio))
-                chunk_id += 1
-        finally:
-            self._speaking = False
-        latency = time.monotonic() - start_time
-        log.info("Text spoken in %.3fs (%d tts chunks)", latency, chunk_id)
-    async def _idle_loop(self):
-        """Idle animation loop — drains video queue at 25fps.
-        During speech (_speaking=True):
-          - Pulls (frame, audio) tuples from queue.
-          - If the queue is momentarily empty, block-waits up to 500ms
-            for the next sub-batch instead of flashing to idle.
-          - Publishes video + audio in lockstep.
-        When idle:
-          - Queue is empty → shows base.mp4 loop, no audio.
         """
-        frame_interval = 1.0 / VIDEO_FPS
         session_start = time.monotonic()
         try:
             while self._running:
-                frame_start = time.monotonic()
-                frame = None
-                audio_slice = None
-                # --- pull next frame ---
                 try:
-                    item = self._video_queue.get_nowait()
-                    frame, audio_slice = item
                 except asyncio.QueueEmpty:
-                    if self._speaking:
-                        # UNet is still generating — wait for next batch
-                        try:
-                            item = await asyncio.wait_for(
-                                self._video_queue.get(), timeout=0.5,
-                            )
-                            frame, audio_slice = item
-                        except asyncio.TimeoutError:
-                            # Safety: if nothing arrived in 500ms, show idle
-                            frame = self._idle_generator.next_frame()
                     else:
                         frame = self._idle_generator.next_frame()
-                pts_us = int((frame_start - session_start) * 1_000_000)
                 await self._publisher.publish_video_frame(frame, pts_us)
-                # Publish per-frame audio if present (lockstep with video)
                 if audio_slice is not None and len(audio_slice) > 0:
-                    await self._publisher.publish_audio_chunk(audio_slice, 0.0)
-                elapsed = time.monotonic() - frame_start
                 sleep_time = frame_interval - elapsed
                 if sleep_time > 0:
                     await asyncio.sleep(sleep_time)
         except asyncio.CancelledError:
             raise

 """
+Three-Queue Parallel Pipeline  (api/ canonical version)
+========================================================
+Promoted from e2e/pipeline.py — this is now the default StreamingPipeline
+used by api/server.py.
+Architecture:
+  _tts_producer   → _tts_queue(6)    → _whisper_worker
+  _whisper_worker → _whisper_queue(3) → _unet_worker
+  _unet_worker    → _frame_queue(64)  → _publish_loop (VIDEO_FPS drain)
+Key properties:
+  - TTS (CPU/ONNX) runs ahead of Whisper/UNet, absorbing inter-fragment
+    Kokoro reinit time in the bounded queue buffer. No inter-sentence stall.
+  - _publish_loop holds the last speech frame during inter-batch gaps instead
+    of flashing to idle — prevents LiveKit bitrate drops from irregular delivery.
+  - Audio PTS tracked via monotonic sample counter.
+  - stop() cancels all tasks and drains queues — safe to restart cleanly.
 """
 from __future__ import annotations
 import numpy as np
 import sys
 from pathlib import Path
+_backend_dir = Path(__file__).parent.parent
 if str(_backend_dir) not in sys.path:
     sys.path.insert(0, str(_backend_dir))
 from config import (
     FRAMES_PER_CHUNK,
     TTS_SAMPLE_RATE,
     VIDEO_FPS,
 )
 from tts.kokoro_tts import KokoroTTS
+from musetalk.worker import MuseTalkWorker
 from publisher.livekit_publisher import AVPublisher, IdleFrameGenerator
 log = logging.getLogger(__name__)
+# Sentinel: distinguishes "queue was empty" from the None end-of-utterance marker
+_QUEUE_EMPTY = object()
 class StreamingPipeline:
     """
+    Three-queue parallel pipeline: text → TTS → Whisper → UNet → LiveKit.
+    Public interface:
+      await pipeline.start()
+      await pipeline.push_text("Hello world.")
+      await pipeline.stop()
     """
     def __init__(
         self,
         tts: KokoroTTS,
         self._musetalk = musetalk
         self._publisher = publisher
         self._avatar_assets = avatar_assets
+        # Use idle.png from the avatar folder if available (static frame, no flicker)
+        _avatar_idle_png = (
+            Path(__file__).parent.parent / "avatars" / avatar_assets.name / "idle.png"
+        )
         self._idle_generator = IdleFrameGenerator(
             avatar_assets,
+            image_path=str(_avatar_idle_png) if _avatar_idle_png.exists() else None,
             target_width=publisher._video_width,
             target_height=publisher._video_height,
         )
         self._running = False
+        # ── three-stage async queues ──────────────────────────────────────────
+        # Unbounded: holds raw text requests
         self._text_queue: asyncio.Queue = asyncio.Queue()
+        # Stage 1→2: Kokoro audio chunks.  6 slots ≈ 2 full sentences of
+        # buffering — absorbs the Kokoro create_stream() reinit gap (~50-100ms)
+        # between sentence fragments so _whisper_worker does not stall.
+        self._tts_queue: asyncio.Queue = asyncio.Queue(maxsize=6)
+        # Stage 2→3: Whisper features.  Small — GPU is the bottleneck here.
+        self._whisper_queue: asyncio.Queue = asyncio.Queue(maxsize=3)
+        # Stage 3→publish: composited RGBA frames + per-frame audio.
+        # 64 slots ≈ 64 / VIDEO_FPS seconds of video (2.56s at 25fps) — enough
+        # headroom that the publish loop rarely runs dry between UNet batches.
+        self._frame_queue: asyncio.Queue = asyncio.Queue(maxsize=64)
+        # ── worker task handles ───────────────────────────────────────────────
+        self._tts_task: Optional[asyncio.Task] = None
+        self._whisper_task: Optional[asyncio.Task] = None
+        self._unet_task: Optional[asyncio.Task] = None
+        self._publish_task: Optional[asyncio.Task] = None
+        self._log_task: Optional[asyncio.Task] = None
+        log.info("StreamingPipeline (3-queue) initialized")
+    # ── lifecycle ─────────────────────────────────────────────────────────────
+    def _task_done_cb(self, task: asyncio.Task):
+        """Log unhandled exceptions from worker tasks immediately."""
+        if task.cancelled():
+            return
+        exc = task.exception()
+        if exc is not None:
+            log.error("Worker task '%s' crashed: %s", task.get_name(), exc, exc_info=exc)
     async def start(self):
+        """Spawn all worker coroutines and start the pipeline."""
         self._running = True
+        self._tts_task     = asyncio.create_task(self._tts_producer(),     name="tts_producer")
+        self._whisper_task = asyncio.create_task(self._whisper_worker(),   name="whisper_worker")
+        self._unet_task    = asyncio.create_task(self._unet_worker(),      name="unet_worker")
+        self._publish_task = asyncio.create_task(self._publish_loop(),     name="publish_loop")
+        self._log_task     = asyncio.create_task(self._log_queue_depths(), name="log_depths")
+        for t in (self._tts_task, self._whisper_task, self._unet_task,
+                  self._publish_task, self._log_task):
+            t.add_done_callback(self._task_done_cb)
         log.info("StreamingPipeline started")
     async def stop(self):
+        """Cancel all workers, drain queues, and reset state."""
         self._running = False
+        for task in (
+            self._tts_task,
+            self._whisper_task,
+            self._unet_task,
+            self._publish_task,
+            self._log_task,
+        ):
+            if task and not task.done():
+                task.cancel()
+                try:
+                    await task
+                except asyncio.CancelledError:
+                    pass
+        # Drain all queues — no stale data on reconnect
+        for q in (
+            self._text_queue,
+            self._tts_queue,
+            self._whisper_queue,
+            self._frame_queue,
+        ):
+            while not q.empty():
+                try:
+                    q.get_nowait()
+                except asyncio.QueueEmpty:
+                    break
         log.info("StreamingPipeline stopped")
+    # ── public API ────────────────────────────────────────────────────────────
     async def push_text(self, text: str):
+        """Enqueue text to be spoken.  Non-blocking; returns immediately."""
+        await self._text_queue.put(text)
+    # ── Stage 1: TTS producer ─────────────────────────────────────────────────
+    async def _tts_producer(self):
         """
+        Reads text from _text_queue, streams Kokoro audio into _tts_queue.
+        Sentinel convention: None is pushed after each utterance to signal
+        end-of-utterance to downstream workers.
+        NOTE: text is passed directly to synthesize_stream() — no outer
+        _split_to_fragments() call here.  synthesize_stream() handles splitting
+        internally, preventing double-split and PTS reset at fragment boundaries.
         """
         try:
             while self._running:
                 try:
+                    text = await asyncio.wait_for(self._text_queue.get(), timeout=0.1)
                 except asyncio.TimeoutError:
+                    continue
+                log.debug("tts_producer: utterance (%d chars)", len(text))
+                first_chunk = True
+                async for audio, pts_s, pts_e in self._tts.synthesize_stream(text):
+                    audio_flat = audio.flatten() if audio.ndim > 1 else audio
+                    if first_chunk:
+                        log.debug("tts_producer: first chunk pts=%.3f→%.3f len=%d",
+                                  pts_s, pts_e, len(audio_flat))
+                        first_chunk = False
+                    await self._tts_queue.put((audio_flat, pts_s, pts_e))
+                # End-of-utterance sentinel
+                await self._tts_queue.put(None)
+                log.debug("tts_producer: utterance done")
+        except asyncio.CancelledError:
+            raise
+        except Exception:
+            log.exception("tts_producer: unhandled exception — worker stopped")
+            raise
+    # ── Stage 2: Whisper worker ───────────────────────────────────────────────
+    async def _whisper_worker(self):
         """
+        Consumes audio chunks from _tts_queue, runs Whisper encoder, pushes
+        (feats, audio_flat, pts_s, pts_e, total_frames) into _whisper_queue.
+        Forwards None sentinel downstream on end-of-utterance.
         """
         try:
+            while self._running:
+                item = await self._tts_queue.get()
+                if item is None:
+                    await self._whisper_queue.put(None)
+                    continue
+                audio_flat, pts_s, pts_e = item
                 t0 = time.monotonic()
+                feats, total_frames = await self._musetalk.extract_features(audio_flat)
+                log.debug(
+                    "whisper_worker: %.0fms audio → %d frames  (took %.0fms)",
+                    len(audio_flat) / TTS_SAMPLE_RATE * 1000,
+                    total_frames,
+                    (time.monotonic() - t0) * 1000,
+                )
+                await self._whisper_queue.put((feats, audio_flat, pts_s, pts_e, total_frames))
+        except asyncio.CancelledError:
+            raise
+        except Exception:
+            log.exception("whisper_worker: unhandled exception — worker stopped")
+            raise
+    # ── Stage 3: UNet worker ──────────────────────────────────────────────────
+    async def _unet_worker(self):
+        """
+        Consumes Whisper features from _whisper_queue, runs MuseTalk UNet in
+        batches of self._musetalk.BATCH_FRAMES frames, pushes
+        (frame_rgba, audio_slice) into _frame_queue.  Forwards None sentinel
+        downstream.
+        """
+        BATCH = self._musetalk.BATCH_FRAMES
+        try:
+            while self._running:
+                item = await self._whisper_queue.get()
+                if item is None:
+                    await self._frame_queue.put(None)
+                    continue
+                feats, audio_flat, pts_s, pts_e, total_frames = item
+                spf = len(audio_flat) / max(total_frames, 1)
+                first_batch = True
                 for batch_start in range(0, total_frames, BATCH):
                     n = min(BATCH, total_frames - batch_start)
+                    t0 = time.monotonic()
                     frames = await self._musetalk.generate_batch(feats, batch_start, n)
+                    if first_batch:
+                        log.debug("unet_worker: first batch %d frames  (%.0fms)",
+                                  n, (time.monotonic() - t0) * 1000)
+                        first_batch = False
+                    for fi, frame in enumerate(frames):
+                        a_s = int((batch_start + fi) * spf)
+                        a_e = min(int((batch_start + fi + 1) * spf), len(audio_flat))
+                        audio_slice = audio_flat[a_s:a_e] if a_e > a_s else None
+                        await self._frame_queue.put((frame, audio_slice))
+        except asyncio.CancelledError:
+            raise
+        except Exception:
+            log.exception("unet_worker: unhandled exception — worker stopped")
+            raise
+    # ── Publish loop ──────────────────────────────────────────────────────────
+    async def _publish_loop(self):
         """
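+        Ticks at VIDEO_FPS on every iteration, whether or not a speech frame is
+        available (best effort: sleep-paced, overruns of more than 10ms are logged).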
+        Frame selection priority per tick:
+          1. Next speech frame from _frame_queue  (non-blocking get_nowait)
+          2. If speaking and queue empty: hold last speech frame
+             (UNet is generating the next batch — freeze beats idle flash)
+          3. Utterance sentinel (None): switch to idle immediately
+          4. Truly idle: idle frame
+        """
+        frame_interval = 1.0 / VIDEO_FPS  # seconds per tick, e.g. 40ms @ 25fps or 62.5ms @ 16fps
         session_start = time.monotonic()
+        audio_pts_samples = 0
+        is_speaking = False
+        last_speech_frame = None
+        hold_count = 0
         try:
             while self._running:
+                tick_start = time.monotonic()
+                # ── non-blocking frame pick ───────────────────────────────────
                 try:
+                    item = self._frame_queue.get_nowait()
                 except asyncio.QueueEmpty:
+                    item = _QUEUE_EMPTY
+                if item is _QUEUE_EMPTY:
+                    if is_speaking and last_speech_frame is not None:
+                        frame = last_speech_frame
+                        audio_slice = None
+                        hold_count += 1
                     else:
                         frame = self._idle_generator.next_frame()
+                        audio_slice = None
+                elif item is None:
+                    # End-of-utterance sentinel
+                    if is_speaking:
+                        log.info(
+                            "publish_loop: utterance ended → idle"
+                            "  (held %d frames for inter-batch gaps)", hold_count
+                        )
+                        is_speaking = False
+                        last_speech_frame = None
+                        hold_count = 0
+                    frame = self._idle_generator.next_frame()
+                    audio_slice = None
+                else:
+                    # Real speech frame
+                    frame, audio_slice = item
+                    last_speech_frame = frame
+                    if not is_speaking:
+                        log.info(
+                            "publish_loop: speaking started  (frame_q=%d)",
+                            self._frame_queue.qsize(),
+                        )
+                        is_speaking = True
+                        hold_count = 0
+                # ── video publish ─────────────────────────────────────────────
+                pts_us = int((tick_start - session_start) * 1_000_000)
                 await self._publisher.publish_video_frame(frame, pts_us)
+                # ── audio publish ─────────────────────────────────────────────
                 if audio_slice is not None and len(audio_slice) > 0:
+                    audio_pts_sec = audio_pts_samples / TTS_SAMPLE_RATE
+                    await self._publisher.publish_audio_chunk(audio_slice, audio_pts_sec)
+                    audio_pts_samples += len(audio_slice)
+                # ── pace to VIDEO_FPS ─────────────────────────────────────────
+                elapsed = time.monotonic() - tick_start
                 sleep_time = frame_interval - elapsed
                 if sleep_time > 0:
                     await asyncio.sleep(sleep_time)
+                elif sleep_time < -0.010:
+                    log.warning(
+                        "publish_loop: over budget by %.0fms",
+                        -sleep_time * 1000,
+                    )
         except asyncio.CancelledError:
             raise
+    # ── Debug helper ──────────────────────────────────────────────────────────
+    async def _log_queue_depths(self):
+        """Log queue depths every 2 seconds for pipeline health monitoring."""
+        while self._running:
+            tts_q  = self._tts_queue.qsize()
+            whi_q  = self._whisper_queue.qsize()
+            frm_q  = self._frame_queue.qsize()
+            lvl = logging.INFO if (tts_q or whi_q or frm_q) else logging.DEBUG
+            log.log(
+                lvl,
+                "queues — text=%d  tts=%d/%d  whisper=%d/%d  frame=%d/%d",
+                self._text_queue.qsize(),
+                tts_q,  self._tts_queue.maxsize,
+                whi_q,  self._whisper_queue.maxsize,
+                frm_q,  self._frame_queue.maxsize,
+            )
+            await asyncio.sleep(2.0)

backend/api/server.py CHANGED Viewed

@@ -1,39 +1,40 @@
 """
-Speech-to-Video Server
-====================
-FastAPI server for text-to-speech-to-video pipeline.
-Uses Kokoro TTS + MuseTalk + LiveKit.
 """
 from __future__ import annotations
 import asyncio
 import logging
-import os
 import sys
 import time
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Optional
-# NOTE: do NOT load root .env (parent project sets SPEECHX_AVATAR=christine).
-# speech_to_video/backend/config.py has all defaults we need.
-# Add local backend to path (PRIORITY over parent)
-import sys
-from pathlib import Path
-# Get the directory containing this file (backend/api/)
 _current_file = Path(__file__).resolve()
-_api_dir = _current_file.parent          # backend/api/
-_backend_dir = _api_dir.parent           # backend/
-_speech_to_video_dir = _backend_dir.parent  # speech_to_video/
-# Add paths in order of priority
-for p in [_backend_dir, _speech_to_video_dir]:
     if str(p) not in sys.path:
         sys.path.insert(0, str(p))
-# Now import after path is set
 import uvicorn
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
@@ -48,20 +49,17 @@ from config import (
     LIVEKIT_API_KEY,
     LIVEKIT_API_SECRET,
     LIVEKIT_ROOM_NAME,
-    VIDEO_WIDTH,
-    VIDEO_HEIGHT,
     VIDEO_FPS,
     DEFAULT_AVATAR,
     DEVICE,
 )
 from tts.kokoro_tts import KokoroTTS
-from musetalk.worker import load_musetalk_models, MuseTalkWorker
 from publisher.livekit_publisher import AVPublisher
 from api.pipeline import StreamingPipeline
 import torch
-torch.set_float32_matmul_precision('high')  # Use TF32 on Ampere+ for ~5-10% free speedup
-# If torch.compile Triton JIT fails (e.g. first-run slow compile, SIGINT), fall back to eager
 torch._dynamo.config.suppress_errors = True
 log = logging.getLogger(__name__)
@@ -70,51 +68,73 @@ logging.basicConfig(
     format="%(asctime)s  %(levelname)-7s  %(name)s  %(message)s",
 )
-# Global state
-_pipeline: Optional[StreamingPipeline] = None
-_room: Optional[rtc.Room] = None
-_publisher: Optional[AVPublisher] = None
-_models_loaded = False
 @asynccontextmanager
 async def lifespan(app: FastAPI):
-    """Load models at startup."""
-    global _models_loaded
     log.info("=== Speech-to-Video Server Starting ===")
-    log.info(f"Device: {DEVICE}")
-    log.info(f"Avatar: {DEFAULT_AVATAR}")
-    # Models are loaded lazily on first request
-    _models_loaded = True
-    log.info("=== Server Ready ===")
-    yield
-    # Cleanup
-    global _pipeline, _room, _publisher
     if _pipeline:
         await _pipeline.stop()
-        _pipeline = None
-    # Stop publisher BEFORE disconnecting the room — unpublish_track requires an
-    # active room connection; room.disconnect() tears down the session first.
     if _publisher:
         await _publisher.stop()
-        _publisher = None
     if _room:
         await _room.disconnect()
-        _room = None
     log.info("=== Server Shutdown ===")
 app = FastAPI(
-    title="Speech-to-Video",
-    description="Text → Kokoro TTS → MuseTalk → LiveKit pipeline",
-    version="1.0.0",
     lifespan=lifespan,
 )
@@ -126,260 +146,180 @@ app.add_middleware(
 )
 class SpeakRequest(BaseModel):
     text: str
     voice: Optional[str] = None
     speed: Optional[float] = None
 class TokenRequest(BaseModel):
-    room_name: str = "speech-to-video-room"
     identity: str = "user"
-# ──────────────────────────────────────────────────────────────────────────────
-# Endpoints
-# ──────────────────────────────────────────────────────────────────────────────
 @app.get("/health")
 async def health():
-    """Liveness probe."""
     return {
         "status": "ok",
-        "models_loaded": _models_loaded,
-        "pipeline_active": _pipeline is not None and getattr(_pipeline, '_running', False),
     }
 @app.get("/status")
 async def status():
-    """Get server status."""
-    import torch
     vram = {}
     if torch.cuda.is_available():
         vram = {
             "allocated_gb": round(torch.cuda.memory_allocated() / 1024**3, 2),
-            "reserved_gb": round(torch.cuda.memory_reserved() / 1024**3, 2),
         }
     return {
-        "models_loaded": _models_loaded,
-        "pipeline_active": _pipeline is not None and getattr(_pipeline, '_running', False),
         "avatar": DEFAULT_AVATAR,
         "device": DEVICE,
         "vram": vram,
     }
 @app.post("/connect")
 async def connect():
-    """
-    Connect to LiveKit room and start the pipeline.
-    """
     global _room, _publisher, _pipeline
-    if _pipeline is not None and getattr(_pipeline, '_running', False):
         raise HTTPException(status_code=400, detail="Already connected")
     log.info("Connecting to LiveKit room...")
     try:
-        # Load models
-        log.info("Loading MuseTalk models...")
-        musetalk_bundle = load_musetalk_models(
-            avatar_name=DEFAULT_AVATAR,
-            device=DEVICE,
-        )
-        log.info("Loading Kokoro TTS...")
-        tts = KokoroTTS()
-        # Create LiveKit room
         room = rtc.Room()
-        # Generate token for backend agent
-        token = lk_api.AccessToken(
-            LIVEKIT_API_KEY,
-            LIVEKIT_API_SECRET,
-        ).with_identity("backend-agent").with_name("Speech-to-Video Agent")
         token.with_grants(lk_api.VideoGrants(
             room_join=True,
             room=LIVEKIT_ROOM_NAME,
             can_publish=True,
             can_subscribe=True,
         ))
-        # Determine actual video dimensions from precomputed avatar frames
-        first_frame = musetalk_bundle.avatar_assets.frame_list[0]
-        actual_h, actual_w = first_frame.shape[:2]  # cv2 shape is (H, W, C)
-        log.info(f"Avatar frame size: {actual_w}x{actual_h}")
-        # Create publisher
         publisher = AVPublisher(
             room,
             video_width=actual_w,
             video_height=actual_h,
             video_fps=VIDEO_FPS,
         )
-        # Create MuseTalk worker
-        musetalk_worker = MuseTalkWorker(musetalk_bundle)
-        # Create pipeline
         pipeline = StreamingPipeline(
-            tts=tts,
             musetalk=musetalk_worker,
             publisher=publisher,
-            avatar_assets=musetalk_bundle.avatar_assets,
-        )
-        # Connect to room
-        await room.connect(
-            url=LIVEKIT_URL,
-            token=token.to_jwt(),
         )
-        log.info(f"Connected to LiveKit: {LIVEKIT_ROOM_NAME}")
-        # Start publishing
         await publisher.start()
-        # Start pipeline
         await pipeline.start()
-        # Warm up Whisper + Kokoro synchronously (~0.5s each, no Triton JIT).
-        # UNet warm-up runs in background only when torch.compile is active.
-        import os
-        import numpy as np
-        log.info("Warming up Whisper + TTS...")
-        dummy_audio = np.zeros(int(0.32 * 24000), dtype=np.float32)  # 320ms silence
-        _feats, _ = await musetalk_worker.extract_features(dummy_audio)
-        # Warm Kokoro ONNX session (first call initializes thread pool)
-        tts.synthesize_full("Hello.")
-        log.info("Whisper + TTS warm-up done")
-        if os.environ.get("MUSETALK_TORCH_COMPILE", "0") == "1":
-            log.info("torch.compile enabled — UNet JIT starting in background...")
-            async def _background_unet_warmup():
-                try:
-                    _batch_n = min(8, len(musetalk_bundle.avatar_assets.frame_list))
-                    await musetalk_worker.generate_batch(_feats, 0, _batch_n)
-                    log.info("UNet warm-up / torch.compile complete")
-                except Exception as _e:
-                    log.warning("UNet background warm-up failed (non-fatal): %s", _e)
-            asyncio.ensure_future(_background_unet_warmup())
-        else:
-            # Eager mode: run one synchronous warm-up pass to prime CUDA kernels
-            log.info("Warming up UNet (eager mode)...")
-            _batch_n = min(8, len(musetalk_bundle.avatar_assets.frame_list))
-            await musetalk_worker.generate_batch(_feats, 0, _batch_n)
-            log.info("UNet warm-up complete")
-        # Store references
-        _room = room
         _publisher = publisher
-        _pipeline = pipeline
-        return {
-            "status": "connected",
-            "room": LIVEKIT_ROOM_NAME,
-            "url": LIVEKIT_URL,
-        }
-    except Exception as e:
-        log.error(f"Connection failed: {e}", exc_info=True)
-        raise HTTPException(status_code=500, detail=str(e))
 @app.post("/disconnect")
 async def disconnect():
-    """Disconnect from LiveKit."""
     global _room, _publisher, _pipeline
     if _pipeline is None:
         raise HTTPException(status_code=400, detail="Not connected")
     log.info("Disconnecting...")
     if _pipeline:
         await _pipeline.stop()
     if _publisher:
         await _publisher.stop()
     if _room:
         await _room.disconnect()
-    _room = None
-    _publisher = None
-    _pipeline = None
     return {"status": "disconnected"}
 @app.post("/speak")
 async def speak(request: SpeakRequest):
-    """
-    Speak text through the avatar.
-    Returns latency metrics.
-    """
-    global _pipeline
-    if _pipeline is None or not getattr(_pipeline, '_running', False):
         raise HTTPException(status_code=400, detail="Not connected")
-    start_time = time.monotonic()
-    # Push text to pipeline
     await _pipeline.push_text(request.text)
-    # Calculate latency
-    latency_ms = (time.monotonic() - start_time) * 1000
-    return {
-        "status": "processing",
-        "latency_ms": round(latency_ms, 1),
-    }
 @app.post("/get-token")
 @app.get("/livekit-token")
 async def get_token(request: TokenRequest = TokenRequest()):
-    """Get LiveKit token for frontend.
-    Does NOT require the pipeline to be connected — tokens are issued
-    from the API key/secret alone, same pattern as Avatar_gen/backend/agent.py.
-    The frontend passes roomName + identity in the POST body.
-    """
-    room = request.room_name or LIVEKIT_ROOM_NAME
-    identity = request.identity or "frontend-user"
-    token = lk_api.AccessToken(
-        LIVEKIT_API_KEY,
-        LIVEKIT_API_SECRET,
-    ).with_identity(identity).with_name(identity)
     token.with_grants(lk_api.VideoGrants(
         room_join=True,
         room=room,
         can_publish=True,
         can_subscribe=True,
     ))
-    return {
-        "token": token.to_jwt(),
-        "url": LIVEKIT_URL,
-        "room": room,
-    }
-# ──────────────────────────────────────────────────────────────────────────────
-# Entry point
-# ──────────────────────────────────────────────────────────────────────────────
 if __name__ == "__main__":
-    uvicorn.run(
-        app,  # Direct app reference instead of string
-        host=HOST,
-        port=PORT,
-        reload=False,
-        log_level="info",
-    )

 """
+Speech-to-Video Server  (api/ — warm-load version)
+====================================================
+Models are loaded ONCE at server startup (lifespan), not at /connect.
+This means /connect is instant for subsequent sessions.
+Model loading split:
+  lifespan  → MuseTalk bundle + Kokoro TTS + UNet warmup (stay in VRAM)
+  /connect  → Room, Publisher, MuseTalkWorker, Pipeline (per-session)
+  /disconnect → session objects torn down; models stay loaded
+Run:
+  cd backend && python api/server.py
+  # or: uvicorn api.server:app --host 0.0.0.0 --port 8767
 """
 from __future__ import annotations
 import asyncio
 import logging
 import sys
 import time
 from contextlib import asynccontextmanager
 from pathlib import Path
 from typing import Optional
+# ── path setup ────────────────────────────────────────────────────────────────
 _current_file = Path(__file__).resolve()
+_api_dir      = _current_file.parent        # backend/api/
+_backend_dir  = _api_dir.parent             # backend/
+_project_dir  = _backend_dir.parent         # speech_to_video/
+for p in [_backend_dir, _project_dir]:
     if str(p) not in sys.path:
         sys.path.insert(0, str(p))
+# ── imports ───────────────────────────────────────────────────────────────────
+import numpy as np
 import uvicorn
 from fastapi import FastAPI, HTTPException
 from fastapi.middleware.cors import CORSMiddleware
     LIVEKIT_API_KEY,
     LIVEKIT_API_SECRET,
     LIVEKIT_ROOM_NAME,
     VIDEO_FPS,
     DEFAULT_AVATAR,
     DEVICE,
 )
 from tts.kokoro_tts import KokoroTTS
+from musetalk.worker import load_musetalk_models, MuseTalkWorker, MuseTalkBundle
 from publisher.livekit_publisher import AVPublisher
 from api.pipeline import StreamingPipeline
 import torch
+torch.set_float32_matmul_precision("high")
 torch._dynamo.config.suppress_errors = True
 log = logging.getLogger(__name__)
     format="%(asctime)s  %(levelname)-7s  %(name)s  %(message)s",
 )
+# ── global model state (loaded once, lives for server lifetime) ───────────────
+_musetalk_bundle: Optional[MuseTalkBundle] = None
+_tts: Optional[KokoroTTS] = None
+# ── session state (created/destroyed on connect/disconnect) ──────────────────
+_pipeline:  Optional[StreamingPipeline] = None
+_room:      Optional[rtc.Room]          = None
+_publisher: Optional[AVPublisher]       = None
+# ── lifespan: load models once at startup ────────────────────────────────────
 @asynccontextmanager
 async def lifespan(app: FastAPI):
+    global _musetalk_bundle, _tts
+    t_start = time.monotonic()
     log.info("=== Speech-to-Video Server Starting ===")
+    log.info("Device: %s  Avatar: %s", DEVICE, DEFAULT_AVATAR)
+    # 1. Load MuseTalk (VAE + UNet + Whisper + avatar latents)
+    log.info("Loading MuseTalk models...")
+    _musetalk_bundle = await asyncio.to_thread(
+        load_musetalk_models, DEFAULT_AVATAR, DEVICE
+    )
+    log.info("MuseTalk loaded  (%.1fs)", time.monotonic() - t_start)
+    # 2. Load Kokoro TTS
+    log.info("Loading Kokoro TTS...")
+    _tts = await asyncio.to_thread(KokoroTTS)
+    log.info("Kokoro TTS loaded")
+    # 3. UNet warmup — prime GPU caches
+    worker_tmp = MuseTalkWorker(_musetalk_bundle)
+    dummy_audio = np.zeros(int(0.32 * 24_000), dtype=np.float32)
+    feats, _ = await worker_tmp.extract_features(dummy_audio)
+    t0 = time.monotonic()
+    n = min(8, len(_musetalk_bundle.avatar_assets.frame_list))
+    await worker_tmp.generate_batch(feats, 0, n)
+    log.info("UNet warm-up done  (%.1fs)", time.monotonic() - t0)
+    worker_tmp.shutdown()
+    _tts.synthesize_full("Hello.")
+    log.info("TTS warm-up done")
+    log.info("=== Server ready in %.1fs — waiting for /connect (port %d) ===",
+             time.monotonic() - t_start, PORT)
+    yield  # ── server running ────────────────────────────────────────────────
+    # ── shutdown ──────────────────────────────────────────────────────────────
+    global _pipeline, _room, _publisher
     if _pipeline:
         await _pipeline.stop()
     if _publisher:
         await _publisher.stop()
     if _room:
         await _room.disconnect()
     log.info("=== Server Shutdown ===")
+# ── FastAPI app ───────────────────────────────────────────────────────────────
 app = FastAPI(
+    title="Speech-to-Video (api — 3-queue)",
+    description="Text → Kokoro TTS → Whisper → MuseTalk → LiveKit",
+    version="2.0.0",
     lifespan=lifespan,
 )
 )
+# ── request models ────────────────────────────────────────────────────────────
+# Body of POST /speak.  Only `text` is forwarded to the pipeline by the /speak
+# handler in this file; `voice` and `speed` are accepted but not read there.
 class SpeakRequest(BaseModel):
     text: str
     voice: Optional[str] = None
     speed: Optional[float] = None
+# Body of the token endpoints (/get-token, /livekit-token).  Both fields have
+# defaults, so the handler can also be called with no body at all.
 class TokenRequest(BaseModel):
+    room_name: str = LIVEKIT_ROOM_NAME
     identity: str = "user"
+# ── /health and /status ───────────────────────────────────────────────────────
 @app.get("/health")
 async def health():
     return {
         "status": "ok",
+        "models_loaded": _musetalk_bundle is not None and _tts is not None,
+        "pipeline_active": _pipeline is not None and getattr(_pipeline, "_running", False),
     }
 @app.get("/status")
 async def status():
     vram = {}
     if torch.cuda.is_available():
         vram = {
             "allocated_gb": round(torch.cuda.memory_allocated() / 1024**3, 2),
+            "reserved_gb":  round(torch.cuda.memory_reserved()  / 1024**3, 2),
         }
     return {
+        "pipeline": "api-3-queue",
+        "models_loaded": _musetalk_bundle is not None,
+        "pipeline_active": _pipeline is not None and getattr(_pipeline, "_running", False),
         "avatar": DEFAULT_AVATAR,
         "device": DEVICE,
         "vram": vram,
     }
+# ── /connect ──────────────────────────────────────────────────────────────────
 @app.post("/connect")
 async def connect():
     global _room, _publisher, _pipeline
+    if _musetalk_bundle is None or _tts is None:
+        raise HTTPException(status_code=503, detail="Server still loading models")
+    if _pipeline is not None and getattr(_pipeline, "_running", False):
         raise HTTPException(status_code=400, detail="Already connected")
     log.info("Connecting to LiveKit room...")
+    t0 = time.monotonic()
     try:
+        first_frame = _musetalk_bundle.avatar_assets.frame_list[0]
+        actual_h, actual_w = first_frame.shape[:2]
         room = rtc.Room()
+        token = (
+            lk_api.AccessToken(LIVEKIT_API_KEY, LIVEKIT_API_SECRET)
+            .with_identity("backend-agent")
+            .with_name("Speech-to-Video Agent")
+        )
         token.with_grants(lk_api.VideoGrants(
             room_join=True,
             room=LIVEKIT_ROOM_NAME,
             can_publish=True,
             can_subscribe=True,
         ))
         publisher = AVPublisher(
             room,
             video_width=actual_w,
             video_height=actual_h,
             video_fps=VIDEO_FPS,
         )
+        # MuseTalkWorker wraps the already-loaded bundle — no model reload
+        musetalk_worker = MuseTalkWorker(_musetalk_bundle)
         pipeline = StreamingPipeline(
+            tts=_tts,
             musetalk=musetalk_worker,
             publisher=publisher,
+            avatar_assets=_musetalk_bundle.avatar_assets,
         )
+        await room.connect(url=LIVEKIT_URL, token=token.to_jwt())
+        log.info("Connected to LiveKit: %s", LIVEKIT_ROOM_NAME)
         await publisher.start()
         await pipeline.start()
+        # Fast warmup (models already hot in VRAM)
+        dummy_audio = np.zeros(int(0.32 * 24_000), dtype=np.float32)
+        feats, _ = await musetalk_worker.extract_features(dummy_audio)
+        n = min(8, len(_musetalk_bundle.avatar_assets.frame_list))
+        await musetalk_worker.generate_batch(feats, 0, n)
+        log.info("Session warm-up done")
+        _room      = room
         _publisher = publisher
+        _pipeline  = pipeline
+        log.info("/connect done in %.1fs", time.monotonic() - t0)
+        return {"status": "connected", "room": LIVEKIT_ROOM_NAME, "url": LIVEKIT_URL}
+    except Exception as exc:
+        log.error("Connection failed: %s", exc, exc_info=True)
+        raise HTTPException(status_code=500, detail=str(exc))
+# ── /disconnect ───────────────────────────────────────────────────────────────
 @app.post("/disconnect")
 async def disconnect():
     global _room, _publisher, _pipeline
     if _pipeline is None:
         raise HTTPException(status_code=400, detail="Not connected")
     log.info("Disconnecting...")
     if _pipeline:
         await _pipeline.stop()
     if _publisher:
         await _publisher.stop()
     if _room:
         await _room.disconnect()
+    _room = _publisher = _pipeline = None
+    # NOTE: _musetalk_bundle and _tts are intentionally NOT cleared —
+    # models stay in VRAM so the next /connect is instant.
+    log.info("Disconnected — models remain loaded for next session")
     return {"status": "disconnected"}
+# ── /speak ────────────────────────────────────────────────────────────────────
 @app.post("/speak")
 async def speak(request: SpeakRequest):
+    if _pipeline is None or not getattr(_pipeline, "_running", False):
         raise HTTPException(status_code=400, detail="Not connected")
+    t0 = time.monotonic()
     await _pipeline.push_text(request.text)
+    return {"status": "processing", "latency_ms": round((time.monotonic() - t0) * 1000, 1)}
+# ── /get-token ────────────────────────────────────────────────────────────────
 @app.post("/get-token")
 @app.get("/livekit-token")
 async def get_token(request: TokenRequest = TokenRequest()):
+    room     = request.room_name or LIVEKIT_ROOM_NAME
+    identity = request.identity  or "frontend-user"
+    token = (
+        lk_api.AccessToken(LIVEKIT_API_KEY, LIVEKIT_API_SECRET)
+        .with_identity(identity)
+        .with_name(identity)
+    )
     token.with_grants(lk_api.VideoGrants(
         room_join=True,
         room=room,
         can_publish=True,
         can_subscribe=True,
     ))
+    return {"token": token.to_jwt(), "url": LIVEKIT_URL, "room": room}
+# ── entry point ───────────────────────────────────────────────────────────────
 if __name__ == "__main__":
+    uvicorn.run(app, host=HOST, port=PORT, reload=False, log_level="info")

backend/avatars/christine/coords.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:842e96bc4fd963cc836e96b881dd1840cb00e7cd5f99a4ae9e707de32e6d6900
+size 777

backend/avatars/christine/full_imgs/00000000.png ADDED Viewed

Git LFS Details

SHA256: 20ad0f6ffec2bd8c0d868307eb6541eb2d0c8f579d5076031d98c6f7446d56a5
Pointer size: 131 Bytes
Size of remote file: 293 kB

backend/avatars/christine/full_imgs/00000001.png ADDED Viewed

Git LFS Details

SHA256: b96d4d3f9807ddbbdb8311986254d8765fbd47a02bca260eb333630e977c2ffc
Pointer size: 131 Bytes
Size of remote file: 425 kB

backend/avatars/christine/full_imgs/00000002.png ADDED Viewed

Git LFS Details

SHA256: 17cfafc5adfa3e550a0fad050ecd992b54b4699ec80a5441404ad648c4ec73de
Pointer size: 131 Bytes
Size of remote file: 413 kB

backend/avatars/christine/full_imgs/00000003.png ADDED Viewed

Git LFS Details

SHA256: 74ebded05f02e73bb627e0f49887e96c9caa02d8a14894afd13e783f1ff2d835
Pointer size: 131 Bytes
Size of remote file: 415 kB

backend/avatars/christine/full_imgs/00000004.png ADDED Viewed

Git LFS Details

SHA256: 64ac29c4d48b902a889479eb67ce43ff01c73bbe50b4fb5d2d6ab6bd024be9c9
Pointer size: 131 Bytes
Size of remote file: 389 kB

backend/avatars/christine/full_imgs/00000005.png ADDED Viewed

Git LFS Details

SHA256: 39f220ca43019e4e35332d7a7b269452e05eaaa5ee1d199f9052ca03b6624de8
Pointer size: 131 Bytes
Size of remote file: 393 kB

backend/avatars/christine/full_imgs/00000006.png ADDED Viewed

Git LFS Details

SHA256: 372e49e19501c1be1168cdee4541a06d8d9eae9b76adb00977dc3f274acdf68d
Pointer size: 131 Bytes
Size of remote file: 422 kB

backend/avatars/christine/full_imgs/00000007.png ADDED Viewed

Git LFS Details

SHA256: c0d8cef00028e1e3d462ae252c2fd5ec17213db40059f09ab84d5d05f97420c4
Pointer size: 131 Bytes
Size of remote file: 421 kB

backend/avatars/christine/full_imgs/00000008.png ADDED Viewed

Git LFS Details

SHA256: 58ca18422db51815da990b756f2c0f95f9a8e4b6aa6fafcb489fca29dbcf07bd
Pointer size: 131 Bytes
Size of remote file: 424 kB

backend/avatars/christine/full_imgs/00000009.png ADDED Viewed

Git LFS Details

SHA256: a0b26e2a8a5477abeb8bb17e7cc484aa80dc965c91b2469833cf70166c6f5182
Pointer size: 131 Bytes
Size of remote file: 418 kB

backend/avatars/christine/full_imgs/00000010.png ADDED Viewed

Git LFS Details

SHA256: 8fbd75ddd14d2d3bf8b41b771d69719aceb37a53af434c13502d2c682812aea7
Pointer size: 131 Bytes
Size of remote file: 417 kB

backend/avatars/christine/full_imgs/00000011.png ADDED Viewed

Git LFS Details

SHA256: b025ba7e94d89e43b25a246aa84f1c29301b663dd6034e8e2ebc8b07ea24e44f
Pointer size: 131 Bytes
Size of remote file: 416 kB

backend/avatars/christine/full_imgs/00000012.png ADDED Viewed

Git LFS Details

SHA256: 13e50f1653dfcf717edcf4c7742f6440a443ba979c459534893ae9656db8349a
Pointer size: 131 Bytes
Size of remote file: 415 kB

backend/avatars/christine/full_imgs/00000013.png ADDED Viewed

Git LFS Details

SHA256: 13e50f1653dfcf717edcf4c7742f6440a443ba979c459534893ae9656db8349a
Pointer size: 131 Bytes
Size of remote file: 415 kB

backend/avatars/christine/full_imgs/00000014.png ADDED Viewed

Git LFS Details

SHA256: b025ba7e94d89e43b25a246aa84f1c29301b663dd6034e8e2ebc8b07ea24e44f
Pointer size: 131 Bytes
Size of remote file: 416 kB

backend/avatars/christine/full_imgs/00000015.png ADDED Viewed

Git LFS Details

SHA256: 8fbd75ddd14d2d3bf8b41b771d69719aceb37a53af434c13502d2c682812aea7
Pointer size: 131 Bytes
Size of remote file: 417 kB

backend/avatars/christine/full_imgs/00000016.png ADDED Viewed

Git LFS Details

SHA256: a0b26e2a8a5477abeb8bb17e7cc484aa80dc965c91b2469833cf70166c6f5182
Pointer size: 131 Bytes
Size of remote file: 418 kB

backend/avatars/christine/full_imgs/00000017.png ADDED Viewed

Git LFS Details

SHA256: 58ca18422db51815da990b756f2c0f95f9a8e4b6aa6fafcb489fca29dbcf07bd
Pointer size: 131 Bytes
Size of remote file: 424 kB

backend/avatars/christine/full_imgs/00000018.png ADDED Viewed

Git LFS Details

SHA256: c0d8cef00028e1e3d462ae252c2fd5ec17213db40059f09ab84d5d05f97420c4
Pointer size: 131 Bytes
Size of remote file: 421 kB

backend/avatars/christine/full_imgs/00000019.png ADDED Viewed

Git LFS Details

SHA256: 372e49e19501c1be1168cdee4541a06d8d9eae9b76adb00977dc3f274acdf68d
Pointer size: 131 Bytes
Size of remote file: 422 kB

backend/avatars/christine/full_imgs/00000020.png ADDED Viewed

Git LFS Details

SHA256: 39f220ca43019e4e35332d7a7b269452e05eaaa5ee1d199f9052ca03b6624de8
Pointer size: 131 Bytes
Size of remote file: 393 kB

backend/avatars/christine/full_imgs/00000021.png ADDED Viewed

Git LFS Details

SHA256: 64ac29c4d48b902a889479eb67ce43ff01c73bbe50b4fb5d2d6ab6bd024be9c9
Pointer size: 131 Bytes
Size of remote file: 389 kB

backend/avatars/christine/full_imgs/00000022.png ADDED Viewed

Git LFS Details

SHA256: 74ebded05f02e73bb627e0f49887e96c9caa02d8a14894afd13e783f1ff2d835
Pointer size: 131 Bytes
Size of remote file: 415 kB

backend/avatars/christine/full_imgs/00000023.png ADDED Viewed

Git LFS Details

SHA256: 17cfafc5adfa3e550a0fad050ecd992b54b4699ec80a5441404ad648c4ec73de
Pointer size: 131 Bytes
Size of remote file: 413 kB

backend/avatars/christine/full_imgs/00000024.png ADDED Viewed

Git LFS Details

SHA256: b96d4d3f9807ddbbdb8311986254d8765fbd47a02bca260eb333630e977c2ffc
Pointer size: 131 Bytes
Size of remote file: 425 kB

backend/avatars/christine/full_imgs/00000025.png ADDED Viewed

Git LFS Details

SHA256: 20ad0f6ffec2bd8c0d868307eb6541eb2d0c8f579d5076031d98c6f7446d56a5
Pointer size: 131 Bytes
Size of remote file: 293 kB

backend/avatars/christine/mask/00000000.png ADDED Viewed

Git LFS Details

SHA256: 06907c26c0d2193822d529c63a5afc1820e0d63bfeed0b44f2e19a7e3c20a55f
Pointer size: 129 Bytes
Size of remote file: 6.54 kB

backend/avatars/christine/mask/00000001.png ADDED Viewed

Git LFS Details

SHA256: 0de4852caf063fb7883deeecdaf982b5f269008c5b09bc2bf5a0d180fb097681
Pointer size: 129 Bytes
Size of remote file: 6.53 kB

backend/avatars/christine/mask/00000002.png ADDED Viewed

Git LFS Details

SHA256: 4b2374ae4f18f7eb6a44eda4de695597dbc0c0c8b71b1af20f12df571ed6ed02
Pointer size: 129 Bytes
Size of remote file: 6.53 kB

backend/avatars/christine/mask/00000003.png ADDED Viewed

Git LFS Details

SHA256: 8dba2d9b0f07297bd4dc74337f5a9cd3c13b260da42238060b87bcbbc96e268a
Pointer size: 129 Bytes
Size of remote file: 6.62 kB

backend/avatars/christine/mask/00000004.png ADDED Viewed

Git LFS Details

SHA256: 536155c0cffc81088945588fee7c689eb1a7123dbe275e1686dd4f112c09e4da
Pointer size: 129 Bytes
Size of remote file: 6.62 kB

backend/avatars/christine/mask/00000005.png ADDED Viewed

Git LFS Details

SHA256: 27ef32778029353be2bea68b3450edaf585e0c36c03a9a4bdbc779a2401228b1
Pointer size: 129 Bytes
Size of remote file: 6.53 kB

backend/avatars/christine/mask/00000006.png ADDED Viewed

Git LFS Details

SHA256: 79c58a39b70b5ba2809726a3aad860d3ee027da47e27f8d95a798cc6173057a3
Pointer size: 129 Bytes
Size of remote file: 6.62 kB

backend/avatars/christine/mask/00000007.png ADDED Viewed

Git LFS Details

SHA256: b23425f0f8c744f63c6db3ebf50b20b059758cb025deae9dfe02c7dff821b4c3
Pointer size: 129 Bytes
Size of remote file: 6.54 kB

backend/avatars/christine/mask/00000008.png ADDED Viewed

Git LFS Details

SHA256: 58e052ea940446e84561cf3478d710bada6b5f84bc0ec9803eff67b3555e8133
Pointer size: 129 Bytes
Size of remote file: 6.69 kB

backend/avatars/christine/mask/00000009.png ADDED Viewed

Git LFS Details

SHA256: 31e918689f9023049a23c20e29346285242c2da11e732bdc4ce7b45abcbe06b9
Pointer size: 129 Bytes
Size of remote file: 6.69 kB

backend/avatars/christine/mask/00000010.png ADDED Viewed

Git LFS Details

SHA256: 558de7129bf48f92104e2ad60035742aeb0a049e5de737b80d7349259b4a760d
Pointer size: 129 Bytes
Size of remote file: 6.71 kB

backend/avatars/christine/mask/00000011.png ADDED Viewed

Git LFS Details

SHA256: 7d6d82a045e8f52c55c9ad979cd8bb88da9b4fcafbecc1189b81a6f355041d1a
Pointer size: 129 Bytes
Size of remote file: 6.53 kB

backend/avatars/christine/mask/00000012.png ADDED Viewed

Git LFS Details

SHA256: 9b878013518acf76a8e5e52a74a7900d9bf8aa109b2d93d13b414532d613dedf
Pointer size: 129 Bytes
Size of remote file: 6.59 kB

backend/avatars/christine/mask/00000013.png ADDED Viewed

Git LFS Details

SHA256: 9b878013518acf76a8e5e52a74a7900d9bf8aa109b2d93d13b414532d613dedf
Pointer size: 129 Bytes
Size of remote file: 6.59 kB

backend/avatars/christine/mask/00000014.png ADDED Viewed

Git LFS Details

SHA256: 7d6d82a045e8f52c55c9ad979cd8bb88da9b4fcafbecc1189b81a6f355041d1a
Pointer size: 129 Bytes
Size of remote file: 6.53 kB

backend/avatars/christine/mask/00000015.png ADDED Viewed

Git LFS Details

SHA256: 558de7129bf48f92104e2ad60035742aeb0a049e5de737b80d7349259b4a760d
Pointer size: 129 Bytes
Size of remote file: 6.71 kB

backend/avatars/christine/mask/00000016.png ADDED Viewed

Git LFS Details

SHA256: 31e918689f9023049a23c20e29346285242c2da11e732bdc4ce7b45abcbe06b9
Pointer size: 129 Bytes
Size of remote file: 6.69 kB

backend/avatars/christine/mask/00000017.png ADDED Viewed

Git LFS Details

SHA256: 58e052ea940446e84561cf3478d710bada6b5f84bc0ec9803eff67b3555e8133
Pointer size: 129 Bytes
Size of remote file: 6.69 kB

backend/avatars/christine/mask/00000018.png ADDED Viewed

Git LFS Details

SHA256: b23425f0f8c744f63c6db3ebf50b20b059758cb025deae9dfe02c7dff821b4c3
Pointer size: 129 Bytes
Size of remote file: 6.54 kB