# app/engines/parler_engine.py
# Parler-TTS neural engine — instruction-controlled voice synthesis.
# Voice is described in plain English rather than selected by ID.
# Directly maps to Bantrly's grade-band persona system.
# Paper: "Parler-TTS: Parallel Inference for Text-to-Speech" (HuggingFace, 2024)

import time

import torch
import numpy as np
import soundfile as sf
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer

from engines.base import TTSEngine

# Checkpoint used for both the model weights and the tokenizer.
_MODEL_ID = "parler-tts/parler-tts-mini-v1"

# Model and tokenizer are loaded lazily on the first _get_model() call and then
# cached in these module-level globals so later calls reuse them.
_device = "cuda" if torch.cuda.is_available() else "cpu"
_model = None
_tokenizer = None


def _get_model():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    Both objects are loaded into locals and only then published to the
    module-level cache. If either load raises, the cache is left untouched, so
    the next call retries from scratch instead of returning a model paired
    with a ``None`` tokenizer.
    """
    global _model, _tokenizer
    if _model is None or _tokenizer is None:
        model = ParlerTTSForConditionalGeneration.from_pretrained(
            _MODEL_ID
        ).to(_device)
        tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)
        # Publish both together so the cache is never half-initialised.
        _model, _tokenizer = model, tokenizer
    return _model, _tokenizer


class ParlerEngine(TTSEngine):
    """TTS engine that selects a voice via a plain-English description.

    Each grade band in ``BAND_CONFIG`` carries a ``description`` string that is
    fed to the model alongside the text to speak.
    """

    name = "Parler-TTS (mini)"
    engine_type = "neural-local"
    cost_per_million_chars = 0.0
    is_production_ready = False  # 7-10s latency, needs optimization for production
    requires_internet = False

    # voice descriptions map directly to Moo persona specs
    BAND_CONFIG = {
        "K-2": {
            "description": (
                "A warm, friendly female voice speaking slowly and clearly, "
                "with an encouraging and celebratory tone, perfect for young children."
            ),
        },
        "3-5": {
            "description": (
                "A curious, energetic female voice speaking at a comfortable pace, "
                "with specific emphasis on key words and an upbeat coaching tone."
            ),
        },
        "6-8": {
            "description": (
                "A calm, direct female voice speaking at a natural pace, "
                "serious but approachable, validating feelings before coaching."
            ),
        },
        "9-12": {
            "description": (
                "A collegiate, honest male voice speaking at a brisk conversational pace, "
                "no hand-holding, multi-part reasoning, treating the listener as an adult."
            ),
        },
    }

    def synthesize(self, text: str, band: str, output_path: str) -> dict:
        """Synthesize ``text`` in the voice described for ``band`` and save a WAV.

        Args:
            text: The words to be spoken.
            band: Grade-band key passed to ``self.get_band_config`` (not defined
                in this file; presumably inherited from ``TTSEngine`` and
                resolving against ``BAND_CONFIG`` — confirm in the base class).
            output_path: Destination path *without* extension; ``".wav"`` is
                appended.

        Returns:
            A dict with ``audio_path`` (the written file), ``latency_seconds``
            (duration of the ``model.generate`` call only, rounded to
            milliseconds; model loading, tokenization and the file write are
            excluded), ``voice``, ``speed`` (always ``1.0``; no speed control
            is passed to the model) and ``engine``.
        """
        config = self.get_band_config(band)
        description = config["description"]

        model, tokenizer = _get_model()

        # The voice description conditions the model (input_ids); the text to
        # speak is supplied separately as the prompt (prompt_input_ids).
        input_ids = tokenizer(description, return_tensors="pt").input_ids.to(_device)
        prompt_ids = tokenizer(text, return_tensors="pt").input_ids.to(_device)

        # perf_counter is monotonic, so the measured interval cannot be skewed
        # by system clock adjustments the way time.time() can.
        start = time.perf_counter()
        with torch.no_grad():
            generation = model.generate(
                input_ids=input_ids,
                prompt_input_ids=prompt_ids,
            )
        latency = round(time.perf_counter() - start, 3)

        audio = generation.cpu().numpy().squeeze()
        full_path = output_path + ".wav"
        sf.write(full_path, audio, model.config.sampling_rate)

        return {
            "audio_path": full_path,
            "latency_seconds": latency,
            "voice": f"described:{band}",
            "speed": 1.0,
            "engine": self.name,
        }