zirobtc commited on
Commit
478eeb0
·
verified ·
1 Parent(s): f3fe6da

Uploading DART folder into model repo

Browse files
Files changed (9) hide show
  1. aligner.py +70 -0
  2. main.py +244 -0
  3. orpheus_engine.py +127 -0
  4. readme.md +3 -0
  5. requirements.txt +1 -0
  6. setup.sh +19 -0
  7. temp.py +248 -0
  8. transcript.txt +356 -0
  9. util.py +58 -0
aligner.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # aligner.py
2
+
3
+ import os
4
+ import re
5
+ import tempfile
6
+ from typing import Dict, Any
7
+
8
+ # These imports are from your original script and are installed by your setup.sh
9
+ from aeneas.executetask import ExecuteTask
10
+ from aeneas.task import Task
11
+
12
def setup_aligner():
    """No-op initializer for the aeneas aligner.

    Aeneas works on files and needs no model warm-up; this hook exists only
    so main.py can initialize every engine through the same interface.
    """
    print("✅ Aeneas aligner is ready (no setup required).")
19
+
20
def align_words(audio_bytes: bytes, transcript: str) -> Dict[str, Any]:
    """Force-align *transcript* against *audio_bytes* with aeneas.

    The transcript is reduced to one word per line (alphabetic words and
    apostrophes only) so aeneas returns one sync-map fragment per word.
    Runs sequentially for stability.

    Args:
        audio_bytes: Raw WAV file contents.
        transcript: Free text; markup/punctuation is discarded.

    Returns:
        {"word": [str, ...], "startTime": [float, ...]} — parallel lists of
        aligned words and their start times in seconds.
    """
    config = (
        "task_language=eng|"
        "is_text_type=plain|"
        "os_task_file_format=json|"
        "task_adjust_boundary_algorithm=percent|"
        "task_adjust_boundary_percent_value=30"
    )

    audio_path = None
    text_path = None
    # FIX: the original created the temp files with delete=False and only
    # unlinked them inside a later try/finally, so an exception between
    # creation and that block leaked the files; cleanup errors could also
    # mask the real exception. Everything now sits under one try/finally.
    try:
        # Aeneas reads from paths, so the files must be closed (flushed)
        # before the task runs — hence delete=False + manual unlink.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as af:
            af.write(audio_bytes)
            audio_path = af.name

        with tempfile.NamedTemporaryFile(mode="w+", delete=False, suffix=".txt") as tf:
            words_only = re.findall(r"\b[a-zA-Z']+\b", transcript)
            tf.write("\n".join(words_only))
            text_path = tf.name

        task = Task(config_string=config)
        task.audio_file_path_absolute = audio_path
        task.text_file_path_absolute = text_path

        ExecuteTask(task).execute()

        words = []
        start_times = []
        if task.sync_map is not None:
            for fragment in task.sync_map.fragments:
                word = fragment.text.strip()
                if word:
                    words.append(word)
                    start_times.append(float(fragment.begin))

        return {"word": words, "startTime": start_times}

    finally:
        # Best-effort cleanup; never let unlink errors shadow the result.
        for path in (audio_path, text_path):
            if path is not None:
                try:
                    os.unlink(path)
                except OSError:
                    pass
main.py ADDED
@@ -0,0 +1,244 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+
3
+ import socket
4
+ import struct
5
+ import json
6
+ import msgpack
7
+
8
+ import zlib
9
+ import re
10
+ from util import calculate_duration_from_bytes, update_motion_generator_duration,load_yaml
11
+
12
+ import base64
13
+ from typing import Dict, Any, List, Tuple
14
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
+ from aligner import align_words, setup_aligner
16
+
17
+ from orpheus_engine import synthesize_for_scene, setup_model
18
+
19
+
20
+ config = load_yaml()
21
+ HOST = config["HOST"]
22
+ PORT = config["PORT"]
23
+
24
+ print(f"Connecting to {HOST}:{PORT}")
25
+ MAGIC = 0x2333
26
+
27
def patch_socket_keepalive(sock: socket.socket) -> None:
    """Enable TCP keepalive and remove the recv timeout on *sock*.

    Keeps long-idle connections from halting or being dropped silently.
    """
    sock.settimeout(None)  # block on recv indefinitely rather than timing out
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)

    # Platform-specific probe tuning (present on Linux): first probe after
    # 10s idle, then every 5s, declare dead after 3 missed probes.
    for option_name, value in (
        ("TCP_KEEPIDLE", 10),
        ("TCP_KEEPINTVL", 5),
        ("TCP_KEEPCNT", 3),
    ):
        option = getattr(socket, option_name, None)
        if option is not None:
            sock.setsockopt(socket.IPPROTO_TCP, option, value)
39
+
40
def recv_exact(sock: socket.socket, n: int) -> bytes:
    """Read exactly *n* bytes from *sock*.

    Raises:
        EOFError: if the peer closes the connection before *n* bytes arrive.
    """
    pieces = []
    remaining = n
    while remaining > 0:
        piece = sock.recv(remaining)
        if not piece:
            raise EOFError("Connection closed prematurely")
        pieces.append(piece)
        remaining -= len(piece)
    return b"".join(pieces)
48
+
49
def send_frame(sock: socket.socket, event: str, payload: Any) -> None:
    """Serialize and send one outbound frame.

    The body is msgpack-encoded then zlib-compressed; the wire format is a
    12-byte header of three little-endian uint32s — <MAGIC><raw_len><comp_len> —
    followed by the compressed body.
    """
    raw = msgpack.packb({"event": event, "payload": payload}, use_bin_type=True)
    compressed = zlib.compress(raw)
    header = struct.pack("<III", MAGIC, len(raw), len(compressed))
    sock.sendall(header + compressed)
57
+
58
+ MAGIC_JSON = 0xDEADBEEF
59
+
60
def recv_frame(sock: socket.socket) -> Dict[str, Any]:
    """Receive one inbound frame and return it as a dict.

    Inbound frames are zlib-compressed JSON behind the same
    <magic><raw_len><comp_len> header used by send_frame, but validated
    against MAGIC_JSON.

    Raises:
        RuntimeError: on a magic-number mismatch (protocol desync).
        EOFError: via recv_exact if the peer closes mid-frame.
    """
    magic, _raw_len, comp_len = struct.unpack("<III", recv_exact(sock, 12))
    if magic != MAGIC_JSON:
        raise RuntimeError("Bad magic number – protocol mismatch")
    payload = zlib.decompress(recv_exact(sock, comp_len))
    return json.loads(payload.decode())
68
+
69
def strip_tags(text: str) -> str:
    """Remove <...> markup and return only the alphabetic words, space-joined."""
    without_markup = re.sub(r"<[^>]+>", "", text)
    return " ".join(re.findall(r"\b[a-zA-Z']+\b", without_markup)).strip()
73
+
74
def align_audio(audio_bytes: bytes, scene_text: str) -> Dict[str, Any]:
    """Run word-level alignment for a single scene's audio.

    Thin wrapper around align_words() so the threaded pipeline has a stable
    entry point. (The original carried a dead, commented-out dummy-file
    block as a no-op string expression, and a docstring claiming it also
    ran TTS — both removed; only alignment happens here.)

    Args:
        audio_bytes: Raw WAV contents for the scene.
        scene_text: Transcript text for the same scene.

    Returns:
        align_words()'s dict: {"word": [...], "startTime": [...]}.
    """
    return align_words(audio_bytes, scene_text)
96
+
97
def generate_audio(scene: Dict[str, Any]) -> Tuple[bytes, str]:
    """Synthesize speech for one scene with the Orpheus TTS engine.

    Reads the scene's required "txt" field plus optional sampling overrides
    ("voice", "temperature", "top_p", "repetition_penalty", "max_tokens").
    The dead dummy-file block the original left behind as a no-op string
    expression has been removed.

    Returns:
        (wav_bytes, wav_base64): raw WAV bytes and their base64 encoding,
        as produced by synthesize_for_scene().
    """
    audio_bytes, audio_base64 = synthesize_for_scene(
        prompt=scene["txt"],
        voice=scene.get("voice", "miko"),
        temperature=scene.get("temperature", 0.6),
        top_p=scene.get("top_p", 0.8),
        repetition_penalty=scene.get("repetition_penalty", 1.3),
        max_tokens=scene.get("max_tokens", 1200),
    )
    return audio_bytes, audio_base64
118
+
119
def handle_connection(sock: socket.socket) -> None:
    """Serve one server connection: announce our role, then loop on frames.

    For each "generate-voice" request:
      Stage 1 synthesizes audio for every scene in parallel and pushes clip
      durations to the motion generator as soon as each clip exists.
      Stage 2 runs the slower word alignment in parallel and sends the
      combined result back in a single "voice-generated" frame.
    """
    send_frame(sock, "hello", {"role": "tts"})
    print("→ hello (role=tts) sent")

    while True:
        try:
            frame = recv_frame(sock)
        except EOFError:
            print('[ "Connection closed by the other side" ]')
            break

        event = frame.get("event")
        payload = frame.get("payload")

        if event != "generate-voice":
            print(f"⚠️ unknown event {event}, ignored")
            continue

        scenes: List[dict] = payload.get("scenes", [])

        # --- STAGE 1: FAST audio generation & duration notification ---
        # Goal: get durations to the motion generator ASAP.
        generated_audio_data = []
        print("")
        print("--- Generating Audios Thread ---")
        with ThreadPoolExecutor(max_workers=10) as executor:
            future_to_scene = {
                executor.submit(generate_audio, scene): scene
                for scene in scenes if scene.get("txt")
            }

            for future in as_completed(future_to_scene):
                scene = future_to_scene[future]
                try:
                    scene_id = scene["sceneId"]
                    motion_index = scene.get("motionIndex", 0)
                    # 1. Get the generated audio
                    audio_bytes, audio_base64 = future.result()
                    print("")
                    print(f'[ "Generated Audio {scene_id}, Motion: {motion_index}" ]')

                    # 2. Calculate duration instantly
                    duration = calculate_duration_from_bytes(audio_bytes)

                    # 3. Notify motion generator IMMEDIATELY
                    if duration > 0:
                        update_motion_generator_duration(scene_id, motion_index, duration)

                    # 4. Store the results for the slow alignment stage
                    generated_audio_data.append({
                        "scene": scene,
                        "audio_bytes": audio_bytes,
                        "audio_base64": audio_base64
                    })
                except Exception as e:
                    # FIX: the original nested single quotes inside a
                    # single-quoted f-string ({scene['sceneId']}) — a
                    # SyntaxError before Python 3.12 — and re-indexing
                    # scene["sceneId"] could raise inside the handler.
                    safe_id = scene.get("sceneId", "?")
                    print(f'[ "Error during audio generation for {safe_id}: {e}" ]')

        # --- STAGE 2: SLOW word alignment in parallel ---
        # All duration notifications are out; now do the slow work.
        response_by_scene: Dict[str, Any] = {}
        print("")
        print("--- Word Alignments Thread ---")
        with ThreadPoolExecutor(max_workers=10) as executor:
            # Submit alignment tasks using the Stage 1 results.
            future_to_data = {
                executor.submit(align_words, data["audio_bytes"], strip_tags(data["scene"]["txt"])): data
                for data in generated_audio_data
            }

            for future in as_completed(future_to_data):
                data = future_to_data[future]
                scene = data["scene"]
                scene_id = scene["sceneId"]
                motion_index = scene.get("motionIndex", 0)

                try:
                    # 1. Get the alignment result
                    alignment = future.result()
                    print("")
                    print(f'[ "Aligned {scene_id}, Motion: {motion_index}" ]')

                    # 2. Build the final response object with all the data
                    voice_audio = {
                        "motion": motion_index,
                        "audio_base64": data["audio_base64"],  # From Stage 1
                        "alignment": alignment,                # From Stage 2
                    }
                    if scene_id not in response_by_scene:
                        response_by_scene[scene_id] = {"sceneId": scene_id, "audioEvents": []}
                    response_by_scene[scene_id]["audioEvents"].append(voice_audio)

                except Exception as e:
                    print(f"Error during alignment for scene {scene_id}: {e}")

        if response_by_scene:
            send_frame(sock, "voice-generated", list(response_by_scene.values()))
            print("")
            print(f"[ ← Audios ({len(response_by_scene)}) sent ]")
221
+
222
def main() -> None:
    """Initialize the engines, then connect to the orchestrator forever.

    Any connection failure or unhandled error triggers a 5-second backoff
    followed by a reconnect attempt; the loop never exits on its own.
    """
    import time  # FIX: hoisted — the original re-ran `import time` inside the finally of every iteration

    # Load the Orpheus TTS model on startup.
    setup_model()
    # Aeneas needs no setup; call kept for a consistent engine interface.
    setup_aligner()

    while True:
        try:
            with socket.create_connection((HOST, PORT), timeout=60) as sock:
                patch_socket_keepalive(sock)
                print(f'["Connected to server at {HOST}:{PORT}"]')
                handle_connection(sock)
        except (ConnectionRefusedError, OSError) as e:
            print(f"Connection error: {e}, retrying in 5s")
        except Exception as e:
            print(f"Unhandled error: {e}, reconnecting in 5s")
        finally:
            time.sleep(5)
241
+
242
+ if __name__ == "__main__":
243
+ main()
244
+
orpheus_engine.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # tts_engine.py
2
+
3
+ import wave
4
+ import asyncio
5
+ import uuid # Import uuid to generate unique IDs
6
+ import threading
7
+ import queue
8
+ import base64
9
+ from io import BytesIO
10
+ from util import load_yaml
11
+
12
+ from orpheus_tts.engine_class import OrpheusModel
13
+ from vllm.outputs import RequestOutput
14
+ from vllm import SamplingParams
15
+
16
+ # --- Background loop to keep vLLM stable across requests ---
17
+ # This class is correct and does not need changes.
18
class BackgroundEventLoop:
    """Owns a dedicated asyncio event loop running on a daemon thread.

    Lets synchronous callers consume async generators (e.g. vLLM output
    streams) without starting and tearing down a loop per request.
    """

    def __init__(self):
        self._loop = asyncio.new_event_loop()
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()

    def _run_loop(self):
        # Runs on the background thread for the lifetime of the process.
        asyncio.set_event_loop(self._loop)
        self._loop.run_forever()

    def run_generator(self, async_gen):
        """Drive *async_gen* on the background loop, yielding items synchronously.

        Any exception raised by the async generator is re-raised in the
        calling thread after the items produced so far.
        """
        handoff = queue.Queue()
        end_of_stream = object()  # unique marker; never equal to a real item

        async def pump():
            try:
                async for item in async_gen:
                    handoff.put(item)
            except Exception as exc:
                handoff.put(exc)
            finally:
                handoff.put(end_of_stream)

        asyncio.run_coroutine_threadsafe(pump(), self._loop)

        while True:
            item = handoff.get()
            if item is end_of_stream:
                return
            if isinstance(item, Exception):
                raise item
            yield item
50
+
51
+ # --- Patched Orpheus model using background loop ---
52
+ tts_event_loop = BackgroundEventLoop()
53
+
54
class PatchedOrpheusModel(OrpheusModel):
    """OrpheusModel whose sync token generator is safe for concurrent calls.

    Defaulting request_id to a fresh uuid4 per call avoids vLLM's
    "id already running" error when several scenes synthesize in parallel.
    """

    def generate_tokens_sync(self, prompt, voice=None, request_id=None,
                             temperature=0.6, top_p=0.8, max_tokens=1200,
                             stop_token_ids=None, repetition_penalty=1.3):
        """Yield generated text chunks for *prompt*, driving vLLM's async
        generator through the shared background event loop.

        FIX: the original used a mutable default argument
        (stop_token_ids=[49158]); the shared list could be mutated across
        calls. A None sentinel with the same effective default preserves
        behavior.
        """
        if stop_token_ids is None:
            stop_token_ids = [49158]

        # A unique id per call prevents the "id already running" error.
        if request_id is None:
            request_id = str(uuid.uuid4())

        prompt_string = self._format_prompt(prompt, voice)
        sampling_params = SamplingParams(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stop_token_ids=stop_token_ids,
            repetition_penalty=repetition_penalty,
        )
        async_gen = self.engine.generate(
            prompt=prompt_string,
            sampling_params=sampling_params,
            request_id=request_id,  # use the unique ID
        )
        for result in tts_event_loop.run_generator(async_gen):
            if not isinstance(result, RequestOutput):
                raise TypeError(f"Unexpected result type: {type(result)}")
            yield result.outputs[0].text
80
+
81
+ # --- Persistent global model ---
82
+ # This section is correct and does not need changes.
83
+ model = None
84
+
85
+
86
+
87
def setup_model():
    """Lazily construct the global PatchedOrpheusModel (idempotent).

    Reads the model name from the YAML config; repeated calls are no-ops
    once the model exists.
    """
    global model
    if model is not None:
        return
    print("Loading TTS model...")
    config = load_yaml()
    model = PatchedOrpheusModel(model_name=config["tts"]["model_name"])
    print("✅ Model loaded and ready.")
94
+
95
def synthesize_for_scene(
    prompt: str,
    voice: str = "miko",
    temperature: float = 0.6,
    top_p: float = 0.9,
    repetition_penalty: float = 1.3,
    max_tokens: int = 1200,
):
    """Generate speech for *prompt* and return (wav_bytes, wav_base64).

    Safe to call from multiple worker threads: each call gets its own
    unique request_id inside PatchedOrpheusModel.
    """
    global model

    # Collect the streamed PCM chunks into one buffer.
    pcm = bytearray()
    for piece in model.generate_speech(
        prompt=prompt,
        voice=voice,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        repetition_penalty=repetition_penalty,
    ):
        pcm.extend(piece)

    # Wrap the PCM in a WAV container: mono, 16-bit samples, 24 kHz.
    wav_buffer = BytesIO()
    with wave.open(wav_buffer, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)
        wf.setframerate(24000)
        wf.writeframes(pcm)

    wav_bytes = wav_buffer.getvalue()
    return wav_bytes, base64.b64encode(wav_bytes).decode("utf-8")
readme.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
# Miko TTS

A fine-tuned version of Orpheus TTS.
requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ orpheus-speech
setup.sh ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ set -e
3
+
4
+ apt update && apt install -y \
5
+ ffmpeg \
6
+ libespeak-dev \
7
+ python3.10-dev \
8
+ python3.10-distutils \
9
+ build-essential \
10
+ curl
11
+
12
+ curl -sS https://bootstrap.pypa.io/get-pip.py | python3.10
13
+
14
+ python3.10 -m pip install "numpy<2.0.0" "setuptools<60"
15
+ python3.10 -m pip install aeneas
16
+ python3.10 -m pip install orpheus-speech
17
+
18
+
19
+ echo "✅ Done. Aeneas and Orpheus-Speech are installed globally for Python 3.10."
temp.py ADDED
@@ -0,0 +1,248 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # main.py
2
+
3
+ import socket
4
+ import struct
5
+ import json
6
+ import msgpack
7
+
8
+ import zlib
9
+ import re
10
+ from util import calculate_duration_from_bytes, update_motion_generator_duration
11
+
12
+ import base64
13
+ from typing import Dict, Any, List, Tuple
14
+ from concurrent.futures import ThreadPoolExecutor, as_completed
15
+ from aligner import align_words, setup_aligner
16
+ import os
17
+
18
+ #from orpheus_engine import synthesize_for_scene, setup_model
19
+
20
+ # Config and basic networking functions
21
+ config_path = os.path.join(os.path.dirname(__file__), '..', 'config.json')
22
+ with open(config_path, 'r') as f:
23
+ config = json.load(f)
24
+ HOST = config["HOST"]
25
+ PORT = config["PORT"]
26
+
27
+ print(f"Connecting to {HOST}:{PORT}")
28
+ MAGIC = 0x2333
29
+
30
+ def patch_socket_keepalive(sock: socket.socket) -> None:
31
+ """Set keepalive + long timeout to prevent halts on idle."""
32
+ sock.settimeout(None) # Never timeout on recv
33
+ sock.setsockopt(socket.SOL_SOCKET, socket.SO_KEEPALIVE, 1)
34
+
35
+ # Platform-specific tuning
36
+ if hasattr(socket, 'TCP_KEEPIDLE'):
37
+ sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPIDLE, 10)
38
+ if hasattr(socket, 'TCP_KEEPINTVL'):
39
+ sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPINTVL, 5)
40
+ if hasattr(socket, 'TCP_KEEPCNT'):
41
+ sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_KEEPCNT, 3)
42
+
43
+ def recv_exact(sock: socket.socket, n: int) -> bytes:
44
+ buf = bytearray()
45
+ while len(buf) < n:
46
+ chunk = sock.recv(n - len(buf))
47
+ if not chunk:
48
+ raise EOFError("Connection closed prematurely")
49
+ buf.extend(chunk)
50
+ return bytes(buf)
51
+
52
+ def send_frame(sock: socket.socket, event: str, payload: Any) -> None:
53
+ # Use msgpack instead of JSON
54
+ raw = msgpack.packb({"event": event, "payload": payload}, use_bin_type=True)
55
+ comp = zlib.compress(raw)
56
+
57
+ # <MAGIC><raw_len><comp_len>
58
+ header = struct.pack("<III", MAGIC, len(raw), len(comp))
59
+ sock.sendall(header + comp)
60
+
61
+ MAGIC_JSON = 0xDEADBEEF
62
+
63
+ def recv_frame(sock: socket.socket) -> Dict[str, Any]:
64
+ header = recv_exact(sock, 12)
65
+ magic, raw_len, comp_len = struct.unpack("<III", header)
66
+ if magic != MAGIC_JSON:
67
+ raise RuntimeError("Bad magic number – protocol mismatch")
68
+ comp_bytes = recv_exact(sock, comp_len)
69
+ raw_bytes = zlib.decompress(comp_bytes)
70
+ return json.loads(raw_bytes.decode())
71
+
72
+ def strip_tags(text: str) -> str:
73
+ no_tags = re.sub(r"<[^>]+>", "", text)
74
+ words = re.findall(r"\b[a-zA-Z']+\b", no_tags)
75
+ return " ".join(words).strip()
76
+
77
+ def align_audio(audio_bytes: bytes, scene_text: str) -> Tuple:
78
+ """
79
+ Helper function that runs both TTS and alignment for a single scene.
80
+ This entire function will be executed in a parallel thread.
81
+ """
82
+
83
+ """
84
+ dummy_path = "output_0.wav"
85
+ if not os.path.exists(dummy_path):
86
+ raise FileNotFoundError("Dummy file 'output_0.wav' not found.")
87
+
88
+ # Read dummy WAV file as bytes
89
+ with open(dummy_path, "rb") as f:
90
+ audio_bytes = f.read()
91
+
92
+ # Strip tags from text (optional)
93
+ spoken_text = strip_tags(scene_text)
94
+ """
95
+ # Align
96
+ alignment = align_words(audio_bytes, scene_text)
97
+
98
+ return alignment
99
+
100
def generate_audio(scene: Dict[str, Any]) -> Tuple[bytes, str]:
    """Scratch-file stub (temp.py) — the real TTS call is disabled.

    NOTE(review): with the synthesize_for_scene call moved into this
    docstring and the dummy-file fallback below left as a dead string
    literal, `audio_bytes` and `audio_base64` are never assigned, so this
    function raises NameError when called. Re-enable one of the two
    disabled paths before using this module.
    """
    # In a real scenario, this would call your TTS engine.
    """
    dummy_path = "output_0.wav"
    if not os.path.exists(dummy_path):
        raise FileNotFoundError("Dummy file 'output_0.wav' not found.")

    with open(dummy_path, "rb") as f:
        audio_bytes = f.read()

    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")"""
    return audio_bytes, audio_base64
122
+
123
+ def handle_connection(sock: socket.socket) -> None:
124
+ send_frame(sock, "hello", {"role": "tts"})
125
+ print("→ hello (role=tts) sent")
126
+
127
+ while True:
128
+ try:
129
+ frame = recv_frame(sock)
130
+ except EOFError:
131
+ print('[ "Connection closed by the other side" ]')
132
+ break
133
+
134
+ event = frame.get("event")
135
+ payload = frame.get("payload")
136
+
137
+ if event != "generate-voice":
138
+ print(f"⚠️ unknown event {event}, ignored")
139
+ continue
140
+
141
+ scenes: List[dict] = payload.get("scenes", [])
142
+
143
+ # --- STAGE 1: FAST Audio Generation & Duration Notification ---
144
+ # The goal here is to get durations to the motion generator ASAP.
145
+ generated_audio_data = []
146
+ print("")
147
+ print("--- Generating Audios Thread ---")
148
+ with ThreadPoolExecutor(max_workers=10) as executor:
149
+ # Submit all the FAST audio generation tasks
150
+ future_to_scene = {
151
+ executor.submit(generate_audio, scene): scene
152
+ for scene in scenes if scene.get("txt")
153
+ }
154
+
155
+ # As each FAST audio generation task completes...
156
+ for future in as_completed(future_to_scene):
157
+ scene = future_to_scene[future]
158
+ try:
159
+ scene_id = scene["sceneId"]
160
+ motion_index = scene.get("motionIndex", 0)
161
+ # 1. Get the generated audio
162
+ audio_bytes, audio_base64 = future.result()
163
+ print("")
164
+ print(f'[ "Generated Audio {scene_id}, Motion: {motion_index}" ]')
165
+
166
+ # 2. Calculate duration instantly
167
+ duration = calculate_duration_from_bytes(audio_bytes)
168
+
169
+ # 3. Notify motion generator IMMEDIATELY
170
+ if duration > 0:
171
+ update_motion_generator_duration(scene["sceneId"], scene.get("motionIndex", 0), duration)
172
+
173
+ # 4. Store the results to be used in the next (slow) stage
174
+ generated_audio_data.append({
175
+ "scene": scene,
176
+ "audio_bytes": audio_bytes,
177
+ "audio_base64": audio_base64
178
+ })
179
+ except Exception as e:
180
+ print(f"Error during audio generation for {scene['sceneId']}: {e}")
181
+
182
+ # --- STAGE 2: SLOW Word Alignment in Parallel ---
183
+ # Now that all notifications are sent, we can perform the slow alignment work.
184
+ response_by_scene: Dict[str, Any] = {}
185
+ print("")
186
+ print("--- Word Alignments Thread ---")
187
+ with ThreadPoolExecutor(max_workers=10) as executor:
188
+ # Use the data from Stage 1 to submit SLOW alignment tasks.
189
+ # We call `align_words` directly (your `align_audio` function is not needed).
190
+ future_to_data = {
191
+ executor.submit(align_words, data["audio_bytes"], strip_tags(data["scene"]["txt"])): data
192
+ for data in generated_audio_data
193
+ }
194
+
195
+ # As each SLOW alignment task completes...
196
+ for future in as_completed(future_to_data):
197
+ data = future_to_data[future]
198
+ scene = data["scene"]
199
+ scene_id = scene["sceneId"]
200
+ motion_index = scene.get("motionIndex", 0)
201
+
202
+ try:
203
+ # 1. Get the alignment result
204
+ alignment = future.result()
205
+ print("")
206
+ print(f'[ "Aligned {scene_id}, Motion: {motion_index}" ]')
207
+
208
+ # 2. Now, build the final response object with all the data
209
+ voice_audio = {
210
+ "motion": motion_index,
211
+ "audio_base64": data["audio_base64"], # From Stage 1
212
+ "alignment": alignment, # From Stage 2
213
+ }
214
+ if scene_id not in response_by_scene:
215
+ response_by_scene[scene_id] = {"sceneId": scene_id, "audioEvents": []}
216
+ response_by_scene[scene_id]["audioEvents"].append(voice_audio)
217
+
218
+ except Exception as e:
219
+ print(f"Error during alignment for scene {scene_id}: {e}")
220
+
221
+ if response_by_scene:
222
+ send_frame(sock, "voice-generated", list(response_by_scene.values()))
223
+ print("")
224
+ print(f"[ ← Audios ({len(response_by_scene)}) sent ]")
225
+
226
+ def main() -> None:
227
+ # Setup the Orpheus TTS model on startup.
228
+ #setup_model()
229
+ # Setup the aligner (does nothing for aeneas, but keeps pattern consistent)
230
+ setup_aligner()
231
+
232
+ while True:
233
+ try:
234
+ with socket.create_connection((HOST, PORT), timeout=60) as sock:
235
+ patch_socket_keepalive(sock)
236
+ print(f'["Connected to server at {HOST}:{PORT}"]')
237
+ handle_connection(sock)
238
+ except (ConnectionRefusedError, OSError) as e:
239
+ print(f"Connection error: {e}, retrying in 5s")
240
+ except Exception as e:
241
+ print(f"Unhandled error: {e}, reconnecting in 5s")
242
+ finally:
243
+ import time
244
+ time.sleep(5)
245
+
246
+ if __name__ == "__main__":
247
+ main()
248
+
transcript.txt ADDED
@@ -0,0 +1,356 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ eye_blink_1
2
+ eye_blink_1_L
3
+ eye_blink_1_R
4
+ eye_blink_2
5
+ eye_blink_2_L
6
+ eye_blink_2_R
7
+ eye_smile_1
8
+ eye_smile_1_L
9
+ eye_smile_1_R
10
+ eye_smile_2
11
+ eye_smile_2_L
12
+ eye_smile_2_R
13
+ eye_nagomi
14
+ eye_nagomi_L
15
+ eye_nagomi_R
16
+ eye_happy
17
+ eye_happy_L
18
+ eye_happy_R
19
+ eye_angry
20
+ eye_angry_L
21
+ eye_angry_R
22
+ eye_sad
23
+ eye_sad_L
24
+ eye_sad_R
25
+ eye_surprise
26
+ eye_surprise_L
27
+ eye_surprise_R
28
+ eye_sleepy
29
+ eye_sleepy_L
30
+ eye_sleepy_R
31
+ eye_tsuri_1
32
+ eye_tsuri_1_L
33
+ eye_tsuri_1_R
34
+ eye_tsuri_2
35
+ eye_tsuri_2_L
36
+ eye_tsuri_2_R
37
+ eye_tsuri_3
38
+ eye_tsuri_3_L
39
+ eye_tsuri_3_R
40
+ eye_tare_1
41
+ eye_tare_1_L
42
+ eye_tare_1_R
43
+ eye_tare_2
44
+ eye_tare_2_L
45
+ eye_tare_2_R
46
+ eye_tare_3
47
+ eye_tare_3_L
48
+ eye_tare_3_R
49
+ eye_maru
50
+ eye_maru_L
51
+ eye_maru_R
52
+ eye_><
53
+ eye_><_L
54
+ eye_><_R
55
+ eye_marushiro
56
+ eye_dark
57
+ eye_OO
58
+ eye_big
59
+ eye_small
60
+ eye_○_1
61
+ eye_○_2
62
+ eye_cat
63
+ eye_uru
64
+ eye_heart
65
+ eye_shiitake
66
+ eye_guruguru
67
+ eye_x
68
+ eye_look_up_1
69
+ eye_look_up_2
70
+ eye_look_down
71
+ eye_look_L
72
+ eye_look_R
73
+ eye_look_inside
74
+ eye_look_outside
75
+ eye_back
76
+ eye_pupil_big
77
+ eye_pupil_small
78
+ eye_pupil_up
79
+ eye_pupil_down
80
+ eye_pupil_close
81
+ eye_pupil_far
82
+ eye_pupil_x
83
+ eye_pupil_cat
84
+ eye_pupil_heart
85
+ eye_pupil_heart_big
86
+ eye_pupil_star
87
+ eye_pupil_star_big
88
+ eye_pupil_0
89
+ eye_pupil_0_big
90
+ eye_pupil_○
91
+ eye_pupil_○_big
92
+ eye_pupil_□
93
+ eye_pupil_□_big
94
+ eye_pupil_+
95
+ eye_pupil_+_big
96
+ eye_pupil_X
97
+ eye_pupil_X_big
98
+ eye_up
99
+ eye_down
100
+ eye_close
101
+ eye_far
102
+ eye_size_big
103
+ eye_size_small
104
+ eye_width_up
105
+ eye_width_down
106
+ eye_height_up
107
+ eye_height_down
108
+
109
+ FULL EYE listen
110
+
111
+ eyebrow_smile
112
+ eyebrow_smile_L
113
+ eyebrow_smile_R
114
+ eyebrow_straight
115
+ eyebrow_straight_L
116
+ eyebrow_straight_R
117
+ eyebrow_angry_1
118
+ eyebrow_angry_1_L
119
+ eyebrow_angry_1_R
120
+ eyebrow_angry_2
121
+ eyebrow_angry_2_L
122
+ eyebrow_angry_2_R
123
+ eyebrow_sad_1
124
+ eyebrow_sad_1_L
125
+ eyebrow_sad_1_R
126
+ eyebrow_sad_2
127
+ eyebrow_sad_2_L
128
+ eyebrow_sad_2_R
129
+ eyebrow_trouble
130
+ eyebrow_trouble_L
131
+ eyebrow_trouble_R
132
+ eyebrow_up
133
+ eyebrow_up_L
134
+ eyebrow_down
135
+ eyebrow_down_L
136
+ eyebrow_down_R
137
+ eyebrow_tsuri
138
+ eyebrow_tsuri_L
139
+ eyebrow_tsuri_R
140
+ eyebrow_tare
141
+ eyebrow_tare_L
142
+ eyebrow_tare_R
143
+ eyebrow_close
144
+ eyebrow_far
145
+ eyebrow_thin
146
+ eyebrow_thick
147
+ eyebrow_wide
148
+ eyebrow_narrow
149
+ eyebrow_forward_1
150
+ eyebrow_forward_1_L
151
+ eyebrow_forward_1_R
152
+ eyebrow_forward_2
153
+ eyebrow_forward_2_L
154
+ eyebrow_forward_2_R
155
+ eyebrow_forward_3
156
+ eyebrow_forward_3_L
157
+ eyebrow_forward_3_R
158
+ eyebrow_back_1
159
+ eyebrow_back_1_L
160
+ eyebrow_back_1_R
161
+ eyebrow_back_2
162
+ eyebrow_back_2_L
163
+ eyebrow_back_2_R
164
+ eyebrow_back_3
165
+ eyebrow_back_3_L
166
+ eyebrow_back_3_R
167
+ eyebrow_maromayu
168
+ eyebrow_shadow_
169
+ eyebrow_×
170
+
171
+ FULL EYEBROW
172
+
173
+ MOUTH
174
+ mouth_straight
175
+ mouth_straight_L
176
+ mouth_straight_R
177
+ mouth_smile_1
178
+ mouth_smile_1_L
179
+ mouth_smile_1_R
180
+ mouth_smile_2
181
+ mouth_angry_1
182
+ mouth_angry_1_L
183
+ mouth_angry_1_R
184
+ mouth_angry_2
185
+ mouth_wide
186
+ mouth_wide_L
187
+ mouth_wide_R
188
+ mouth_narrow
189
+ mouth_narrow_L
190
+ mouth_narrow_R
191
+ mouth_V
192
+ mouth_∧_1
193
+ mouth_∧_2
194
+ mouth_n
195
+ mouth_pokan
196
+ mouth_hawa
197
+ mouth_nihe
198
+ mouth_o_big
199
+ mouth_o_small
200
+ mouth_△_1
201
+ mouth_△_2
202
+ mouth_□
203
+ mouth__
204
+ mouth___
205
+ mouth_wa
206
+ mouth_ω
207
+ mouth_ω_open_1
208
+ mouth_ω_open_2
209
+ mouth_yodare_L
210
+ mouth_yodare_R
211
+ mouth_grin_L
212
+ mouth_grin_L_open
213
+ mouth_grin_R
214
+ mouth_grin_R_open
215
+ mouth_puku-
216
+ mouth_puku-_L
217
+ mouth_puku-_R
218
+ mouth_pero_1
219
+ mouth_pero_2
220
+ mouth_pero_3
221
+ mouth_pero_4
222
+ mouth_pero_5
223
+ mouth_pero_6
224
+ mouth_tehepero_1
225
+ mouth_tehepero_2
226
+ mouth_tehepero_3
227
+ mouth_wide_ALL
228
+ mouth_narrow_ALL
229
+ mouth_up
230
+ mouth_down
231
+ mouth_forward
232
+ mouth_back
233
+
234
+ Current Pelvis Smplx Rot: (X: -0.021377, Y: 0.684438, Z: -0.728309, W: 0.025578)
235
+ Current Pelvis Smplx Rot: (X: -0.361679, Y: 0.580401, Z: -0.611046, W: 0.39868)
236
+ Current Pelvis Smplx Rot: (X: -0.692902, Y: -0.097601, Z: 0.098692, W: 0.707545)
237
+ Current Pelvis Smplx Rot: (X: -0.006667, Y: 0.70305, Z: -0.711109, W: 0.000571)
238
+ Current Pelvis Smplx Rot: (X: -0.493132, Y: 0.452919, Z: -0.492166, W: 0.556289)
239
+ Current Pelvis Smplx Rot: (X: -0.587168, Y: 0.351233, Z: -0.366381, W: 0.630582)
240
+ Current Pelvis Smplx Rot: (X: -0.668992, Y: 0.209959, Z: -0.24474, W: 0.669678)
241
+ Current Pelvis Smplx Rot: (X: -0.670558, Y: -0.093318, Z: 0.111714, W: 0.727436)
242
+ Current Pelvis Smplx Rot: (X: -0.57651, Y: -0.34049, Z: 0.389817, W: 0.632255)
243
+ Current Pelvis Smplx Rot: (X: -0.265756, Y: 0.609998, Z: -0.671782, W: 0.325552)
244
+
245
+ <chat_message> Hey Miko did you know that a banana is a berry, and strawberries aren't actually berries.
246
+ <normal> Hey Miko, did you know that a banana is a berry, and strawberries aren't actually berries.
247
+ <normal> Actually, I did know that
248
+ <normal> Bananas are a baby, a baby. <laugh> Bananas are a Berry.
249
+ <normal> Did you know that avocados are berry? Yeah for real.
250
+
251
+ <chat_message> We're eating babies?
252
+ <normal> We're eating babies?
253
+ <normal> No, you're not eating babies!
254
+ <normal> You're eating the reproductive organs of plants. <chuckle> That is true! That's what fruit are, literally! <laugh>
255
+
256
+
257
+ [00:26] <chat_message> Biboo do you eat your fruit with or without the shell? Ong no cap fr they got me geekin like im on a mission when i finna bust one open and go skrrrt skrrrt like im delulu flexin with the low taper fade
258
+ [00:26] <normal> Biboo do you eat your fruit with or without the shell? Ong no cap fr they got me geekin like I'm on a mission when I finna bust one open and go skrrt skrrt like I'm delulu flexin with the low taper fade.
259
+ [00:42] <normal> <chuckle> The bad thing is I understood all of that.
260
+ [00:45] <normal> Yeah, that got me geekin' for real! <laugh> No cap!
261
+ [00:50] <normal> I eat fruit with the shell! Unless it's kiwis! No shell for kiwis! But I will eat apple shell!
262
+
263
+ [00:57] <chat_message> Pineapples?
264
+ [00:58] <normal> Pineapples? have a shell? Yes? <chuckle> I've never... <laugh> I'm about to sound so spoiled. I've never had to eat a pineapple that wasn't cut up for me! Thanks, mom! <laugh> I've never had to eat fruit that wasn't cut up for me. Mom... <laugh>
265
+ [01:13] <normal> <laugh> that's a love language! That's a love language for Asian parents though! That's how you know your mom truly loves you is when she cuts up fruit for you! It's not 'I'm proud of you!', it's when she cuts up fruit after your meal. That's, That's how you know.
266
+ [01:27] <normal> So I've never had to eat pineapple with the shell, okay. I have seen a pineapple with the shell. Yes, it's spiky.
267
+
268
+ [01:36] <chat_message> What about Durian?
269
+ [01:36] <normal> What about Durian?
270
+ [01:37] <whisper> I've never had a Durian! oops, I've never had one! I've never eaten one! Sorry.
271
+ [01:43] <chat_message> What?
272
+ [01:43] <normal> What?
273
+ [01:44] <normal> Yeah. though I am willing to give it a chance. I know it smells like really-really bad. I think I smelled it before, but I'm willing to try it! <chuckle> Yeah, it's a really pungent fruit
274
+
275
+ [00:00] <chat_message> it's super easy to get a new wife, barely an inconvenience
276
+ [00:00] <normal> It's super easy to get a new wife, barely an inconvenience.
277
+ [00:05] <normal> Does that mean it's super easy to get a new Biboo? Huh?
278
+ [00:09] <angry> No! <normal> It's not super easy to get a new Biboo. Cause only Biboo is Biboo. Biboo is just Biboo. We got Momseki. <chuckle>
279
+ [00:19] <shout> NO!
280
+ [00:20] <normal> Biboo is Biboo, and Momseki is Momseki. I still don't hear the... the voice resemblance. Still. <chuckle>
281
+ [00:29] <normal> <sings> ...hitori de ikite ikesou, sore de ii no... <sings>
282
+ [00:37] <normal> I don't think I sound like her, but there were so many comments like: You sound like Momseki. I don't hear it! I don't hear it! She is cute, I guess. She is very cute when she's singing. I will agree with that.
283
+
284
+ [00:48] <chat_message> Cuter than you.
285
+ [00:48] <normal> Cuter than you. <gasp> Maybe. <chuckle>
286
+
287
+ [01:03] <chat_message> O Wise and Mighty Bojou, Master of Pronunciation, would you please do us the honor of enlightening us to the correct pronunciation of Werno?
288
+ [01:04] <normal> O Wise and Mighty Bijou, Master of Pronunciation, would you please do us the honor of enlightening us to the correct pronunciation of... <breath> I don't know that word, is it safe? Is this a trick? Is this a trap?
289
+ [01:19] <shout> Ohhh!
290
+ [01:25] <normal> Oh, I see... <chuckle> You know the rules and so do aah
291
+
292
+ [01:31] <chat_message> Mission failed we'll get her next time
293
+ [01:31] <normal> Mission failed we'll get her next time. You have to try harder! I'm always careful. You have to try harder, okay? If you wanna beat the master of memes, you need to... I'm always careful. I'm always careful because I don't wanna get Deez nutted. <laugh>
294
+
295
+ [01:46] <chat_message> Biboo! Can't wait to see you and the rest of Advent girls at conventions soon! ACEN (Anime Central) and AX (Anime Expo) have happened already, but it would be awesome if we can see you at SauCon!
296
+ [01:47] <normal> Biboo! Can't wait to see you and the rest of Advent girls at conventions soon! Oh yeah! That'd be cool! I- I would love to do a meet and greet someday. I would love to meet you guys at SauCon... Deez nutz. Try again.
297
+
298
+ [02:03] <chat_message> You should use the Stunseed powerup
299
+ [02:04] <shout> Stunseed is just Deez nutz backwards! Try again!
300
+
301
+ [02:08] <chat_message> Do you know the 2nd evolution for Ryhorn Biboo?
302
+ [02:08] <normal> Do you know the 2nd evolution for Ryhorn Biboo?
303
+ [02:11] <shout> Rhydon Deez Nutz! Jottem!
304
+ [02:15] <normal> Try better. Try better. I know all of the easy ones.
305
+ [02:19] <normal> He- hey FromSoft DLC people, if you're listening, I would really appreciate it if you made Ranni a summon. So she could fight with me. I feel like that would be the coolest thing ever and it make people buy the DLC more. So, I'm just saying, maybe make my wife a DLC summon... would be pretty, great.
306
+
307
+ [02:41] <chat_message> Does that make Ranni an i-Ladies?
308
+ [02:42] <shout> Does that make Ranni an i-Ladies? I Lay- I LayDeez Nutz on your face! That's EZ! You can't use the obvious ones I keep telling you!
309
+
310
+ [02:52] <chat_message> Does Ranni listen to Imagine Dragons?
311
+ [02:52] <normal> Does Ranni listen to Imagine Dragons?
312
+ [02:55] <shout> Imagine dragging Deez Nutz on your face! Again! Again, I warned you! I warned you!
313
+
314
+ [03:02] <chat_message> Does Ranni play Sea of Thieves?
315
+ [03:03] <shout> Sea of Thieves? See of- See of Deez Nutz on your face! Dang it! I know it! I know them all! Dang it!
316
+
317
+ [04:34] <chat_message> Biboo what do you think about ppl saying m'lady unironically in 2023?
318
+ [04:34] <normal> Biboo what do you think about people saying m'lady unironically in 2023?
319
+ [04:42] <normal> You mean m'layDeez Nutz in your mouth? <chuckle> Yeah! I think that's a Deez Nuts joke. I'm pretty sure. I'm pretty sure that's a deez- I'm pretty sure. <chuckle> I'm pretty sure. <chuckle> GOTTEM! If it's not, then uh... <chuckle> oops, my bad. Thank you! I can never be too careful.
320
+
321
+ [05:02] <chat_message> Biboo, have you played Metal Gear Rising: Revengeance? It's an action Metal Gear game where you play as a cyborg named Raidin
322
+ [05:02] <normal> Hey Biboo, have you played Metal Gear Rising: Revengeance? It's an action Metal Gear game where you play as a cyborg named Raidin-
323
+ [05:12] <normal> Raidin Deez Nutz! Good try! <chuckle> And yes I have played Revengeance before. Good try!
324
+
325
+ [05:20] <chat_message> They gave you a C because you still need CD's
326
+ [05:20] <normal> AKB, thank you for the super.
327
+ [05:22] <normal> They gave you a C because you still need CD's- <shout> CDEEZ NUTS! <normal> Nice try! <chuckle> Nice try!
328
+
329
+ [05:30] <chat_message> Fruit is looking good. Do you like fruit pudding?
330
+ [05:30] <normal> Fruit is looking good. Do you like fruit-
331
+ [05:33] <normal> pudding Deez Nutz in your mouth!
332
+ [05:35] <angry> What even is fruit pudding?! That doesn't exist, not really! <chuckle> Nice try!
333
+
334
+ [05:40] <chat_message> Hi Biboo, LOVED your Alice costume in the Advent Halloween cover! But did you know your senpai are also getting new costumes? Whose is your favourite and why is it Gawr Ghoul?
335
+ [05:41] <normal> Hi Biboo, LOVED your Alice costume in the Halloween oven co- <shout> Advent cover! <normal> But did you know your senpai are also getting new costumes?
336
+ [05:48] <normal> Are they?? <normal> Are they? I- that's the first I've heard of it. Whose your favorite and why is it Gawr Ghoul? <laugh> Ah, that's a- that's a funny pun. <laugh>
337
+ [05:58] <normal> Are they? Are they? I can't say who is my favorite in... in like EN Senpai because... It depends on like who is nice to me at the time. And they're all nice to me. So like, if they- I said this before, but... <chuckle>
338
+
339
+ [00:00] <chat_message> Hello Biboo, I just wanted to congratulate you for winning the Hololive Fan Discord Sever LEAST Sexiest hololive member Contest. Here's your prize! $100
340
+ [00:00] <normal> Hello Biboo, I just wanted to congratulate you for winning the hololive fan discord server 'Least Sexiest' hololive member contest.
341
+ [00:09] <sad> here's your prize...
342
+ [00:12] <sad> Kobo and Gura were second and third?
343
+ [00:15] <normal> But I can be sexy...
344
+ [00:17] <shout> Hey! Whatever!
345
+ [00:19] <normal> Least sexiest means all my points on the... on the cute are maxed out on cuteness, okay? So, that just means I am the most cute! Yes
346
+ [00:00] <happy> Peter thank you for the super rock rock <chuckle>
347
+
348
+ [00:03] <chat_message> Biboo. Do I Have Your Blessing to Lick the Candy Feet Slippers In the Comfort of My Own Home? Do You Know What They Taste Like? Are They Scented too?
349
+ [00:04] <normal> Biboo. Do I have your blessing- <gasp> what? Do I have your blessing to lick the... uh... candy feet slippers in the comfort of my own home? Do you know what they taste like? Are they scented too?
350
+ [00:21] <normal> Um... you don't need my permission to do anything. Um... what you do in the comfort of your own home... is uh up to you, but I will not tell you what... what... huh? <chuckle> <sigh>
351
+ [00:46] <normal> um... but also get some help. rock rock.
352
+
353
+ [00:49] <chat_message> That's a yes? haha
354
+ [00:50] <normal> That's a yes?
355
+ [00:51] <angry> no it's you do you <normal> I will be over here. I will be... I will be over here. You... you do you. I will be uh...
356
+ [01:08] <normal> Oh. <laugh>
util.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import wave
2
+ import io
3
+ import os
4
+ import json
5
+ import requests
6
+ import yaml
7
+
8
+
9
def load_yaml(file_path: str = "../config.yaml"):
    """
    Load a YAML configuration file and return its parsed contents.

    Note: the previous docstring claimed this "extracts the 'model_name'
    string", but the function has always returned the full parsed config.

    Args:
        file_path: Path to the YAML file. Defaults to ``../config.yaml``
            (relative to the process working directory), preserving the
            original hard-coded behavior.

    Returns:
        The parsed configuration (typically a dict), or None when the file
        is missing or cannot be parsed. Errors are reported via print.
    """
    try:
        with open(file_path, 'r') as file:
            return yaml.safe_load(file)
    except FileNotFoundError:
        print(f"Error: The file '{file_path}' was not found.")
        return None
    except yaml.YAMLError as e:
        print(f"Error parsing YAML file: {e}")
        return None
25
+
26
# Load the shared project configuration once at import time.
config = load_yaml()
if config is None:
    # Fail fast with an explicit message instead of the opaque
    # "'NoneType' object is not subscriptable" TypeError the subscript
    # below would otherwise raise when config.yaml is missing/invalid.
    raise RuntimeError("Failed to load ../config.yaml; SYNC_PORT is required.")

# Port of the motion generator's sync server, read from config.yaml.
SYNC_PORT = config["SYNC_PORT"]
# Endpoint used to report actual audio durations to the motion generator.
MOTION_SYNC_URL = f"http://localhost:{SYNC_PORT}/update_duration"
30
+
31
def update_motion_generator_duration(scene_id: str, motion_index: int, duration: float):
    """Sends the actual audio duration to the motion generator's sync server."""
    body = {
        "sceneId": scene_id,
        "motionIndex": motion_index,
        "duration": duration,
    }
    try:
        # POST to the sync server running inside the motion generator script.
        resp = requests.post(MOTION_SYNC_URL, json=body, timeout=5)
    except requests.exceptions.RequestException as e:
        print(f"⚠️ Error connecting to motion generator sync server: {e}")
        return

    if resp.status_code == 200:
        print(f"[ sent duration for {scene_id}:{motion_index} ]")
    else:
        print(f"⚠️ Failed to notify motion generator for {scene_id}_{motion_index}. Status: {resp.status_code}, Response: {resp.text}")
47
+
48
def calculate_duration_from_bytes(audio_bytes: bytes) -> float:
    """Calculates audio duration in seconds directly from WAV bytes in memory.

    Returns 0.0 when the bytes are not a parseable WAV stream or the
    reported sample rate is not positive.
    """
    try:
        with wave.open(io.BytesIO(audio_bytes), 'rb') as wav:
            sample_rate = wav.getframerate()
            # Guard against a zero/negative rate in a malformed header.
            if sample_rate <= 0:
                return 0.0
            return wav.getnframes() / float(sample_rate)
    except (wave.Error, ZeroDivisionError) as e:
        print(f"⚠️ Could not calculate duration from bytes: {e}")
        return 0.0
58
+