Maxdorger29
/

f5-tts-hungarian

speech-synthesis

Model card Files Files and versions

f5-tts-hungarian / hungarian_preprocessing.py

Maxdorger29's picture

Upload folder using huggingface_hub

75fb62d verified 2 months ago

history blame contribute delete

2.89 kB

	"""
	Hungarian F5-TTS Preprocessing & Postprocessing Pipeline
	--------------------------------------------------------
	This module provides the necessary text normalization and audio cleaning
	required for the Hungarian F5-TTS model to achieve production-grade quality.

	It handles:
	1. Converting raw numbers and symbols to phonetic Hungarian text.
	2. Removing architectural warmup noise (trimming) from the generated audio.
	3. Collapsing redundant whitespace in the text passed to SwayamngdiT tokenizers
	   (no other punctuation normalization is implemented in this module).
	"""

	import re
	import numpy as np

	# Spelling of single digits and of the few symbols that are read aloud.
	# Non-digit keys are replaced verbatim (padded with spaces) wherever they occur.
	HUNGARIAN_NUMBERS = {
	    '0': 'nulla', '1': 'egy', '2': 'kettő', '3': 'három', '4': 'négy',
	    '5': 'öt', '6': 'hat', '7': 'hét', '8': 'nyolc', '9': 'kilenc',
	    '%': 'százalék', '+': 'meg', '=': 'egyenlő'
	}

	# Building blocks for Hungarian cardinal numbers (index == digit value).
	_HU_UNITS = ('', 'egy', 'kettő', 'három', 'négy', 'öt', 'hat', 'hét', 'nyolc', 'kilenc')
	# Tens when the units digit is zero (10, 20, 30, ...).
	_HU_ROUND_TENS = ('', 'tíz', 'húsz', 'harminc', 'negyven', 'ötven',
	                  'hatvan', 'hetven', 'nyolcvan', 'kilencven')
	# Tens when a units digit follows (11 -> "tizen"+"egy", 21 -> "huszon"+"egy").
	_HU_TENS_STEMS = ('', 'tizen', 'huszon', 'harminc', 'negyven', 'ötven',
	                  'hatvan', 'hetven', 'nyolcvan', 'kilencven')
	# Names of the three-digit groups, least significant first.
	_HU_SCALES = ('', 'ezer', 'millió', 'milliárd')
	# Longest digit run that is read as one cardinal (covers every entry of _HU_SCALES).
	_HU_MAX_DIGITS = 3 * len(_HU_SCALES)

	# ASCII digits only: other Unicode digits are left untouched, as before.
	_HU_DIGIT_RUN_RE = re.compile(r'[0-9]+')
	_HU_WHITESPACE_RE = re.compile(r'\s+')


	def _hu_below_thousand(value: int, attributive: bool = False) -> str:
	    """Spell 1..999; with ``attributive`` a trailing "kettő" becomes "két"."""
	    hundreds, remainder = divmod(value, 100)
	    tens, units = divmod(remainder, 10)

	    pieces = []
	    if hundreds:
	        # 100 is plain "száz"; 200 uses the short form "két".
	        prefix = {1: '', 2: 'két'}.get(hundreds, _HU_UNITS[hundreds])
	        pieces.append(prefix + 'száz')
	    if tens and units:
	        pieces.append(_HU_TENS_STEMS[tens] + _HU_UNITS[units])
	    elif tens:
	        pieces.append(_HU_ROUND_TENS[tens])
	    elif units:
	        pieces.append(_HU_UNITS[units])

	    word = ''.join(pieces)
	    if attributive and word.endswith('kettő'):
	        # Before "ezer"/"millió"/... the short form is used: "kétezer".
	        word = word[:-len('kettő')] + 'két'
	    return word


	def _hu_cardinal(number: int) -> str:
	    """Spell a non-negative integer below 10**12 as a Hungarian cardinal."""
	    if number == 0:
	        return HUNGARIAN_NUMBERS.get('0', 'nulla')

	    groups = []
	    remaining = number
	    for scale in _HU_SCALES:
	        if not remaining:
	            break
	        remaining, group = divmod(remaining, 1000)
	        if not group:
	            continue
	        if not scale:
	            groups.append(_hu_below_thousand(group))
	        elif group == 1 and scale == 'ezer':
	            # 1000 is "ezer", not "egyezer" (but 1 000 000 is "egymillió").
	            groups.append(scale)
	        else:
	            groups.append(_hu_below_thousand(group, attributive=True) + scale)

	    groups.reverse()
	    # Hungarian orthography: above 2000 the three-digit groups are hyphenated.
	    separator = '-' if number > 2000 else ''
	    return separator.join(groups)


	def _hu_spell_digit_run(digits: str) -> str:
	    """Spell one run of ASCII digits, as a cardinal when that is meaningful."""
	    read_digit_by_digit = (
	        len(digits) == 1
	        or digits[0] == '0'              # codes such as "007" are not cardinals
	        or len(digits) > _HU_MAX_DIGITS  # beyond the supported scale words
	    )
	    if read_digit_by_digit:
	        return ' '.join(HUNGARIAN_NUMBERS.get(d, d) for d in digits)
	    return _hu_cardinal(int(digits))


	def normalize_hungarian_text(text: str) -> str:
	    """
	    Pre-process text BEFORE sending it to the F5-TTS model.

	    The model is character-based, so digits and symbols have to be spelled
	    out as Hungarian words.

	    Steps, in order:
	      1. Every non-digit key of ``HUNGARIAN_NUMBERS`` (``%``, ``+``, ``=``) is
	         replaced by its word, padded with spaces.
	      2. Every run of ASCII digits is spelled out. Runs of up to 12 digits
	         without a leading zero are read as one cardinal number
	         ("95" -> "kilencvenöt"); any other run is read digit by digit with
	         spaces in between ("007" -> "nulla nulla hét").
	      3. Whitespace runs are collapsed to one space and the ends are stripped.

	    Limitations: each digit run is handled on its own, so decimals, ordinals,
	    dates and thousands separators are not interpreted.

	    Args:
	        text: Raw input text.

	    Returns:
	        The normalized text.
	    """
	    # Replace common symbols first so that "95%" becomes "95 százalék".
	    for symbol, word in HUNGARIAN_NUMBERS.items():
	        if not symbol.isdigit():
	            text = text.replace(symbol, f" {word} ")

	    # Spell whole numbers instead of gluing single digits together.
	    text = _HU_DIGIT_RUN_RE.sub(lambda m: _hu_spell_digit_run(m.group()), text)

	    # Collapse multiple whitespace
	    return _HU_WHITESPACE_RE.sub(' ', text).strip()

	def clean_generated_audio(audio_array: np.ndarray, sample_rate: int = 24000, trim_ms: int = 350) -> np.ndarray:
	    """
	    Post-process the generated waveform AFTER inference.

	    F5-TTS (DiT) often generates a short static noise/sigh during the
	    attention warmup phase at the start of the audio; this function cuts
	    the first ``trim_ms`` milliseconds off to remove it.

	    Args:
	        audio_array: Generated waveform. Trimming is applied along the first
	            axis, so samples are assumed to run along axis 0 (e.g. a 1-D
	            mono signal) - confirm for multi-channel data.
	        sample_rate: Samples per second of ``audio_array``.
	        trim_ms: Duration to remove from the start, in milliseconds.
	            Zero or negative values leave the audio untouched.

	    Returns:
	        ``audio_array[trim_samples:]`` when the audio is longer than the
	        trimmed span, otherwise ``audio_array`` unchanged, so a short clip
	        is never reduced to nothing.
	    """
	    # Integer arithmetic keeps the sample count exact for integer inputs
	    # instead of truncating a rounded float product.
	    trim_samples = int(sample_rate * trim_ms // 1000)

	    # A negative offset would make the slice below return only the *tail*
	    # of the audio, so treat "nothing to trim" as a no-op.
	    if trim_samples <= 0:
	        return audio_array

	    if len(audio_array) > trim_samples:
	        return audio_array[trim_samples:]
	    return audio_array

	# ------------------------------------------
	# Demo: run this file directly to try the text normalization
	# ------------------------------------------
	if __name__ == "__main__":
	    sample_text = "A processzor terhelés 95% volt, amit 1 teszt is igazol."

	    # Step 1 - spell out digits and symbols before the text reaches the model.
	    normalized_text = normalize_hungarian_text(sample_text)
	    print(f"Original: {sample_text}")
	    print(f"Normalized: {normalized_text}")
	    # In the printed result '%' reads "százalék" and the lone '1' reads "egy".

	    # Step 2 - inference (pseudo code, the model object is not part of this file)
	    # audio_output = f5tts_model.infer(text=normalized_text, ref_audio="ref.wav")

	    # Step 3 - trim the warmup noise from the generated waveform and save it
	    # final_audio = clean_generated_audio(audio_output, sample_rate=24000)
	    # save_wav(final_audio)