Spaces:
Sleeping
Sleeping
| # app/engines/parler_engine.py | |
| # Parler-TTS neural engine — instruction-controlled voice synthesis. | |
| # Voice is described in plain English rather than selected by ID. | |
| # Directly maps to Bantrly's grade-band persona system. | |
| # Paper: "Natural language guidance of high-fidelity text-to-speech with synthetic annotations" (Lyth & King, 2024); Parler-TTS is Hugging Face's open reproduction of it. | |
| import time | |
| import torch | |
| import numpy as np | |
| import soundfile as sf | |
| from parler_tts import ParlerTTSForConditionalGeneration | |
| from transformers import AutoTokenizer | |
| from engines.base import TTSEngine | |
# Process-wide cache: the model and tokenizer are loaded lazily on the first
# _get_model() call (not at import time) and reused by every later call.
_device = "cuda" if torch.cuda.is_available() else "cpu"
_model = None
_tokenizer = None


def _get_model():
    """Return the cached ``(model, tokenizer)`` pair, loading it on first use.

    Both objects come from the ``parler-tts/parler-tts-mini-v1`` checkpoint;
    the model is moved to ``_device`` ("cuda" when available, else "cpu").

    Returns:
        A ``(model, tokenizer)`` tuple. The same objects are returned on
        every call once loading has succeeded.
    """
    global _model, _tokenizer
    if _model is None or _tokenizer is None:
        checkpoint = "parler-tts/parler-tts-mini-v1"
        # Load into locals first: if the tokenizer load raises, the globals
        # must stay unset so the next call retries instead of handing back
        # a (model, None) pair.
        model = ParlerTTSForConditionalGeneration.from_pretrained(
            checkpoint
        ).to(_device)
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
        _model, _tokenizer = model, tokenizer
    return _model, _tokenizer
class ParlerEngine(TTSEngine):
    """Parler-TTS engine: the voice is chosen by a plain-English description.

    Each grade band in ``BAND_CONFIG`` carries a free-text description of the
    desired speaker; that description is tokenized and passed to the model
    alongside the text to be spoken.
    """

    name = "Parler-TTS (mini)"
    engine_type = "neural-local"
    cost_per_million_chars = 0.0
    is_production_ready = False  # 7-10s latency, needs optimization for production
    requires_internet = False

    # voice descriptions map directly to Moo persona specs
    BAND_CONFIG = {
        "K-2": {
            "description": (
                "A warm, friendly female voice speaking slowly and clearly, "
                "with an encouraging and celebratory tone, perfect for young children."
            ),
        },
        "3-5": {
            "description": (
                "A curious, energetic female voice speaking at a comfortable pace, "
                "with specific emphasis on key words and an upbeat coaching tone."
            ),
        },
        "6-8": {
            "description": (
                "A calm, direct female voice speaking at a natural pace, "
                "serious but approachable, validating feelings before coaching."
            ),
        },
        "9-12": {
            "description": (
                "A collegiate, honest male voice speaking at a brisk conversational pace, "
                "no hand-holding, multi-part reasoning, treating the listener as an adult."
            ),
        },
    }

    def synthesize(self, text: str, band: str, output_path: str) -> dict:
        """Synthesize ``text`` in the voice described for ``band`` and save it.

        Args:
            text: The words to be spoken.
            band: Grade-band key used to look up the voice description via
                ``self.get_band_config`` (not defined in this class;
                presumably provided by ``TTSEngine`` - confirm).
            output_path: Destination path; ``".wav"`` is always appended.
                NOTE(review): callers presumably pass a path without an
                extension - confirm against the other engines.

        Returns:
            A dict with keys ``audio_path`` (the written file),
            ``latency_seconds`` (generation time rounded to milliseconds),
            ``voice`` (``"described:<band>"``), ``speed`` (always ``1.0``
            here) and ``engine`` (this engine's ``name``).
        """
        config = self.get_band_config(band)
        description = config["description"]
        model, tokenizer = _get_model()

        # The description conditions the voice; the prompt is the spoken text.
        input_ids = tokenizer(description, return_tensors="pt").input_ids.to(_device)
        prompt_ids = tokenizer(text, return_tensors="pt").input_ids.to(_device)

        # perf_counter() is monotonic, so the measurement cannot be skewed by
        # system clock adjustments the way time.time() can.
        start = time.perf_counter()
        with torch.no_grad():
            generation = model.generate(
                input_ids=input_ids,
                prompt_input_ids=prompt_ids,
            )
        # Copy to host inside the timed region: on CUDA the copy waits for
        # queued GPU work to finish, so stopping the clock before it could
        # under-report the real generation time.
        audio = generation.cpu().numpy().squeeze()
        latency = round(time.perf_counter() - start, 3)

        full_path = output_path + ".wav"
        sf.write(full_path, audio, model.config.sampling_rate)
        return {
            "audio_path": full_path,
            "latency_seconds": latency,
            "voice": f"described:{band}",
            "speed": 1.0,
            "engine": self.name,
        }