Spaces:

aankitdas
/

tts-eval-framework

Sleeping

App Files Files Community

tts-eval-framework / app /engines /parler_engine.py

aankitdas

first commit - working app locally

a3419b6 about 2 months ago

raw

history blame contribute delete

3.21 kB

	# app/engines/parler_engine.py
	# Parler-TTS neural engine — instruction-controlled voice synthesis.
	# Voice is described in plain English rather than selected by ID.
	# Directly maps to Bantrly's grade-band persona system.
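	# Paper: "Natural language guidance of high-fidelity text-to-speech with
	# synthetic annotations" (Lyth & King, 2024); Parler-TTS is Hugging Face's
	# open-source reproduction of that work.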

	import time
	import torch
	import numpy as np
	import soundfile as sf
	from parler_tts import ParlerTTSForConditionalGeneration
	from transformers import AutoTokenizer

	from engines.base import TTSEngine

	# Model and tokenizer are loaded lazily on the first _get_model() call (not at
	# import time) and then cached in these module-level globals for reuse.
	_device = "cuda" if torch.cuda.is_available() else "cpu"
	_model = None
	_tokenizer = None


	def _get_model():
	    """Return the cached ``(model, tokenizer)`` pair, loading both on first use.

	    The model is moved to ``_device`` once, when it is first loaded.

	    Returns:
	        tuple: ``(model, tokenizer)`` for the
	        ``parler-tts/parler-tts-mini-v1`` checkpoint.

	    Raises:
	        Whatever either ``from_pretrained`` call raises. In that case the
	        cache is left untouched, so the next call retries the full load.
	    """
	    global _model, _tokenizer
	    if _model is None:
	        checkpoint = "parler-tts/parler-tts-mini-v1"
	        model = ParlerTTSForConditionalGeneration.from_pretrained(
	            checkpoint
	        ).to(_device)
	        tokenizer = AutoTokenizer.from_pretrained(checkpoint)
	        # Publish both globals together, only after both loads succeeded.
	        # Assigning _model first would leave a half-initialised cache if the
	        # tokenizer load failed, and later calls would return (model, None).
	        _model, _tokenizer = model, tokenizer
	    return _model, _tokenizer


	class ParlerEngine(TTSEngine):
	    """Parler-TTS engine whose voice is chosen by a plain-English description.

	    Each grade band in ``BAND_CONFIG`` carries a natural-language voice
	    description that conditions generation, instead of a fixed voice ID.
	    """

	    name = "Parler-TTS (mini)"
	    engine_type = "neural-local"
	    cost_per_million_chars = 0.0
	    is_production_ready = False  # 7-10s latency, needs optimization for production
	    requires_internet = False

	    # voice descriptions map directly to Moo persona specs
	    BAND_CONFIG = {
	        "K-2": {
	            "description": (
	                "A warm, friendly female voice speaking slowly and clearly, "
	                "with an encouraging and celebratory tone, perfect for young children."
	            ),
	        },
	        "3-5": {
	            "description": (
	                "A curious, energetic female voice speaking at a comfortable pace, "
	                "with specific emphasis on key words and an upbeat coaching tone."
	            ),
	        },
	        "6-8": {
	            "description": (
	                "A calm, direct female voice speaking at a natural pace, "
	                "serious but approachable, validating feelings before coaching."
	            ),
	        },
	        "9-12": {
	            "description": (
	                "A collegiate, honest male voice speaking at a brisk conversational pace, "
	                "no hand-holding, multi-part reasoning, treating the listener as an adult."
	            ),
	        },
	    }

	    def synthesize(self, text: str, band: str, output_path: str) -> dict:
	        """Synthesize ``text`` in the voice described for ``band`` and save a WAV.

	        Args:
	            text: What to speak; tokenized and passed as the generation prompt.
	            band: Grade-band key handed to ``self.get_band_config`` (defined
	                outside this class, presumably on ``TTSEngine``); the config it
	                returns must contain a ``"description"`` entry.
	            output_path: Destination path without extension; ``".wav"`` is
	                appended.

	        Returns:
	            A dict with ``audio_path`` (the file written), ``latency_seconds``
	            (duration of ``model.generate`` only, rounded to milliseconds),
	            ``voice``, ``speed`` and ``engine``.
	        """
	        config = self.get_band_config(band)
	        description = config["description"]

	        model, tokenizer = _get_model()

	        # The voice description is the conditioning input; the text to speak
	        # is supplied separately as the prompt.
	        input_ids = tokenizer(description, return_tensors="pt").input_ids.to(_device)
	        prompt_ids = tokenizer(text, return_tensors="pt").input_ids.to(_device)

	        # Only generation is timed; tokenization and the file write are not.
	        # perf_counter is a monotonic, high-resolution clock, so the interval
	        # cannot be skewed by system clock adjustments the way time.time() can.
	        start = time.perf_counter()
	        with torch.no_grad():
	            generation = model.generate(
	                input_ids=input_ids,
	                prompt_input_ids=prompt_ids,
	            )
	        if _device == "cuda":
	            # CUDA kernels are queued asynchronously; wait for them to finish
	            # so the measured latency covers all of the generation work.
	            torch.cuda.synchronize()
	        latency = round(time.perf_counter() - start, 3)

	        # squeeze() drops size-1 dimensions before the array is written out.
	        audio = generation.cpu().numpy().squeeze()
	        full_path = output_path + ".wav"
	        sf.write(full_path, audio, model.config.sampling_rate)

	        return {
	            "audio_path": full_path,
	            "latency_seconds": latency,
	            "voice": f"described:{band}",
	            "speed": 1.0,
	            "engine": self.name,
	        }