tts-eval-framework / app /engines /parler_engine.py
aankitdas's picture
first commit - working app locally
a3419b6
# app/engines/parler_engine.py
# Parler-TTS neural engine — instruction-controlled voice synthesis.
# Voice is described in plain English rather than selected by ID.
# Directly maps to Bantrly's grade-band persona system.
# Paper: Lyth & King, "Natural language guidance of high-fidelity text-to-speech
# with synthetic annotations" (2024); Parler-TTS is Hugging Face's open reproduction.
import time
import torch
import numpy as np
import soundfile as sf
from parler_tts import ParlerTTSForConditionalGeneration
from transformers import AutoTokenizer
from engines.base import TTSEngine
# Shared module-level state. The model and tokenizer start out as None and are
# filled in lazily by _get_model() on first use, then reused by later calls.
_device = "cuda" if torch.cuda.is_available() else "cpu"  # prefer the GPU when one is visible
_model = None
_tokenizer = None
def _get_model():
    """Return the shared Parler-TTS model and tokenizer, loading them on first use.

    Both objects are cached in the module-level ``_model`` / ``_tokenizer``
    globals so the checkpoint is loaded at most once per process.

    Each global is checked and populated independently. Previously only
    ``_model`` gated the load, so if the tokenizer load raised after the model
    had been cached, every later call returned ``(model, None)``.

    Returns:
        tuple: ``(model, tokenizer)`` — the model already moved to ``_device``.
    """
    global _model, _tokenizer
    checkpoint = "parler-tts/parler-tts-mini-v1"

    # Tokenizer first: it is the cheaper load, so a bad checkpoint name or a
    # missing cache fails before the model weights are pulled in.
    if _tokenizer is None:
        _tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    if _model is None:
        _model = ParlerTTSForConditionalGeneration.from_pretrained(
            checkpoint
        ).to(_device)
    return _model, _tokenizer
class ParlerEngine(TTSEngine):
    """Parler-TTS (mini) engine: the voice is chosen by a plain-English description.

    Each grade band in ``BAND_CONFIG`` carries a free-text ``description`` that
    is tokenized and passed to the model as the conditioning input, while the
    text to be spoken is passed as the prompt.
    """

    name = "Parler-TTS (mini)"
    engine_type = "neural-local"
    cost_per_million_chars = 0.0
    is_production_ready = False  # 7-10s latency, needs optimization for production
    requires_internet = False

    # voice descriptions map directly to Moo persona specs
    BAND_CONFIG = {
        "K-2": {
            "description": (
                "A warm, friendly female voice speaking slowly and clearly, "
                "with an encouraging and celebratory tone, perfect for young children."
            ),
        },
        "3-5": {
            "description": (
                "A curious, energetic female voice speaking at a comfortable pace, "
                "with specific emphasis on key words and an upbeat coaching tone."
            ),
        },
        "6-8": {
            "description": (
                "A calm, direct female voice speaking at a natural pace, "
                "serious but approachable, validating feelings before coaching."
            ),
        },
        "9-12": {
            "description": (
                "A collegiate, honest male voice speaking at a brisk conversational pace, "
                "no hand-holding, multi-part reasoning, treating the listener as an adult."
            ),
        },
    }

    def synthesize(self, text: str, band: str, output_path: str) -> dict:
        """Synthesize ``text`` in the voice described for ``band`` and save it as WAV.

        Args:
            text: The sentence(s) to speak; tokenized and passed to the model
                as ``prompt_input_ids``.
            band: Grade-band key used to look up the voice description via
                ``self.get_band_config`` (not defined in this class —
                presumably inherited from ``TTSEngine``; confirm there how
                unknown bands are handled).
            output_path: Destination path WITHOUT extension; ``".wav"`` is
                appended unconditionally.

        Returns:
            dict with keys ``audio_path`` (the written file), ``latency_seconds``
            (time spent in ``model.generate`` only, rounded to 3 decimals),
            ``voice``, ``speed`` and ``engine``.
        """
        config = self.get_band_config(band)
        description = config["description"]

        model, tokenizer = _get_model()
        input_ids = tokenizer(description, return_tensors="pt").input_ids.to(_device)
        prompt_ids = tokenizer(text, return_tensors="pt").input_ids.to(_device)

        # CUDA kernels are launched asynchronously, so drain the queue on both
        # sides of the timed region; otherwise the measurement can include
        # earlier pending work or miss kernels still running after generate()
        # returns.
        on_cuda = _device == "cuda"
        if on_cuda:
            torch.cuda.synchronize()

        # perf_counter() is monotonic and high-resolution; time.time() can jump
        # when the system clock is adjusted.
        start = time.perf_counter()
        with torch.no_grad():
            generation = model.generate(
                input_ids=input_ids,
                prompt_input_ids=prompt_ids,
            )
        if on_cuda:
            torch.cuda.synchronize()
        latency = round(time.perf_counter() - start, 3)

        # Move to host memory and drop singleton dimensions before writing.
        audio = generation.cpu().numpy().squeeze()
        full_path = output_path + ".wav"
        sf.write(full_path, audio, model.config.sampling_rate)

        return {
            "audio_path": full_path,
            "latency_seconds": latency,
            "voice": f"described:{band}",
            "speed": 1.0,
            "engine": self.name,
        }