# tts_engine.py
"""Streaming TTS engine built on Orpheus / vLLM.

Drives vLLM's async generator on a dedicated background event loop so that
synchronous callers can stream audio safely, and patches OrpheusModel so
every request gets a unique vLLM request_id (avoids the
"request id already running" failure under concurrent calls).
"""

import asyncio
import base64
import queue
import threading
import uuid  # used to generate unique per-request IDs
import wave
from io import BytesIO

from vllm import SamplingParams
from vllm.outputs import RequestOutput

from orpheus_tts.engine_class import OrpheusModel
from util import load_yaml


# --- Background loop to keep vLLM stable across requests ---
class BackgroundEventLoop:
    """Owns a long-lived asyncio loop running on a daemon thread.

    vLLM's async engine must be driven from a single stable event loop;
    results are handed to the calling (synchronous) thread via a queue.
    """

    def __init__(self):
        self._loop = asyncio.new_event_loop()
        self._thread = threading.Thread(target=self._run_loop, daemon=True)
        self._thread.start()

    def _run_loop(self):
        # Bind the loop to the background thread and run it forever.
        asyncio.set_event_loop(self._loop)
        self._loop.run_forever()

    def run_generator(self, async_gen):
        """Consume *async_gen* on the background loop; yield items synchronously.

        Exceptions raised inside the async generator are re-raised in the
        caller's thread. A private sentinel object marks termination (normal
        or after an error).
        """
        q = queue.Queue()
        sentinel = object()

        async def producer():
            try:
                async for item in async_gen:
                    q.put(item)
            except Exception as e:
                # Forward the failure to the consuming thread instead of
                # letting it die silently on the background loop.
                q.put(e)
            finally:
                q.put(sentinel)

        asyncio.run_coroutine_threadsafe(producer(), self._loop)

        while True:
            item = q.get()
            if item is sentinel:
                break
            if isinstance(item, Exception):
                raise item
            yield item


# --- Patched Orpheus model using background loop ---
# Single shared loop for all TTS requests.
tts_event_loop = BackgroundEventLoop()


class PatchedOrpheusModel(OrpheusModel):
    """OrpheusModel with per-call unique request IDs for parallel safety."""

    def generate_tokens_sync(self, prompt, voice=None, request_id=None,
                             temperature=0.6, top_p=0.8, max_tokens=1200,
                             stop_token_ids=None, repetition_penalty=1.3):
        """Synchronously stream generated token text for *prompt*.

        Args:
            prompt: Text to synthesize.
            voice: Optional voice name passed to ``_format_prompt``.
            request_id: Optional vLLM request ID; a fresh UUID is generated
                when omitted, which fixes the "id already running" error
                when several requests run in parallel.
            temperature / top_p / max_tokens / repetition_penalty: Sampling
                parameters forwarded to vLLM.
            stop_token_ids: Stop tokens; defaults to ``[49158]``.

        Yields:
            str: Incremental generated text from each ``RequestOutput``.

        Raises:
            TypeError: If the engine yields something other than a
                ``RequestOutput``.
        """
        # None sentinel instead of a mutable default list (shared across
        # calls); [49158] is the original hard-coded stop token.
        if stop_token_ids is None:
            stop_token_ids = [49158]
        # If no request_id is provided, generate a new unique one.
        if request_id is None:
            request_id = str(uuid.uuid4())

        prompt_string = self._format_prompt(prompt, voice)
        sampling_params = SamplingParams(
            temperature=temperature,
            top_p=top_p,
            max_tokens=max_tokens,
            stop_token_ids=stop_token_ids,
            repetition_penalty=repetition_penalty,
        )
        async_gen = self.engine.generate(
            prompt=prompt_string,
            sampling_params=sampling_params,
            request_id=request_id,  # unique ID per call
        )
        for result in tts_event_loop.run_generator(async_gen):
            if not isinstance(result, RequestOutput):
                raise TypeError(f"Unexpected result type: {type(result)}")
            yield result.outputs[0].text


# --- Persistent global model ---
model = None


def setup_model():
    """Load the TTS model once; subsequent calls are no-ops."""
    global model
    if model is None:
        print("Loading TTS model...")
        config = load_yaml()
        model = PatchedOrpheusModel(model_name=config["tts"]["model_name"])
        print("✅ Model loaded and ready.")


def synthesize_for_scene(
    prompt: str,
    voice: str = "miko",
    temperature: float = 0.6,
    top_p: float = 0.9,
    repetition_penalty: float = 1.3,
    max_tokens: int = 1200,
):
    """Synthesize speech for *prompt*; return ``(wav_bytes, wav_base64)``.

    Safe to call in parallel: each call gets a unique vLLM request_id via
    PatchedOrpheusModel.

    Raises:
        RuntimeError: If ``setup_model()`` has not been called yet.
    """
    # Fail fast with a clear message instead of an AttributeError on None.
    if model is None:
        raise RuntimeError("TTS model not loaded; call setup_model() first.")

    # Collect raw PCM chunks streamed from the model.
    chunks = bytearray()
    for chunk in model.generate_speech(
        prompt=prompt,
        voice=voice,
        temperature=temperature,
        top_p=top_p,
        max_tokens=max_tokens,
        repetition_penalty=repetition_penalty,
    ):
        chunks.extend(chunk)

    # Wrap the PCM in a WAV container entirely in memory.
    buffer = BytesIO()
    with wave.open(buffer, "wb") as wf:
        wf.setnchannels(1)       # mono
        wf.setsampwidth(2)       # 16-bit samples
        wf.setframerate(24000)   # 24 kHz sample rate
        wf.writeframes(chunks)

    audio_bytes = buffer.getvalue()
    audio_base64 = base64.b64encode(audio_bytes).decode("utf-8")
    return audio_bytes, audio_base64