# f5-tts-hungarian / hungarian_preprocessing.py
# Maxdorger29's picture
# Upload folder using huggingface_hub
# 75fb62d verified
"""
Hungarian F5-TTS Preprocessing & Postprocessing Pipeline
--------------------------------------------------------
This module provides the necessary text normalization and audio cleaning
required for the Hungarian F5-TTS model to achieve production-grade quality.
It handles:
1. Converting raw numbers and symbols to phonetic Hungarian text.
2. Removing architectural warmup noise (trimming) from the generated audio.
3. General punctuation normalization for SwayamngdiT tokenizers.
"""
import re
import numpy as np
# Spoken Hungarian forms of the single digits and of the symbols that
# normalize_hungarian_text() expands. Kept as a public module constant.
HUNGARIAN_NUMBERS = {
    '0': 'nulla', '1': 'egy', '2': 'kettő', '3': 'három', '4': 'négy',
    '5': 'öt', '6': 'hat', '7': 'hét', '8': 'nyolc', '9': 'kilenc',
    '%': 'százalék', '+': 'meg', '=': 'egyenlő'
}

# Building blocks for spelling whole numbers. Index 0 is empty so a zero
# digit contributes nothing inside a larger number.
_HU_UNITS = ('',) + tuple(HUNGARIAN_NUMBERS[str(d)] for d in range(1, 10))
_HU_TENS_ROUND = (
    '', 'tíz', 'húsz', 'harminc', 'negyven', 'ötven',
    'hatvan', 'hetven', 'nyolcvan', 'kilencven',
)
# 10 and 20 change form when a unit follows (tizenegy, huszonegy).
_HU_TENS_COMPOUND = ('', 'tizen', 'huszon') + _HU_TENS_ROUND[3:]
# Name of each three-digit group, lowest group first.
_HU_SCALES = ('', 'ezer', 'millió', 'milliárd')
# Longest digit run that can be spelled as one cardinal number.
_HU_MAX_DIGITS = 3 * len(_HU_SCALES)

# ASCII digits only: other Unicode digits have no entry in HUNGARIAN_NUMBERS
# and are therefore left untouched.
_DIGIT_RUN_RE = re.compile(r'[0-9]+')
_WHITESPACE_RE = re.compile(r'\s+')


def _as_multiplier(word: str) -> str:
    """Return *word* in the form used before száz/ezer/millió (kettő -> két)."""
    if word.endswith('kettő'):
        return word[:-len('kettő')] + 'két'
    return word


def _spell_below_thousand(n: int) -> str:
    """Spell 1..999 as one Hungarian word; return '' for 0."""
    hundreds, rest = divmod(n, 100)
    tens, units = divmod(rest, 10)
    parts = []
    if hundreds == 1:
        parts.append('száz')
    elif hundreds:
        parts.append(_as_multiplier(_HU_UNITS[hundreds]) + 'száz')
    if tens and units:
        parts.append(_HU_TENS_COMPOUND[tens] + _HU_UNITS[units])
    elif tens:
        parts.append(_HU_TENS_ROUND[tens])
    elif units:
        parts.append(_HU_UNITS[units])
    return ''.join(parts)


def _spell_integer(n: int) -> str:
    """Spell a non-negative integer below 10**12 as a Hungarian cardinal."""
    if n == 0:
        return HUNGARIAN_NUMBERS['0']
    groups = []
    remaining, scale = n, 0
    while remaining:
        remaining, group = divmod(remaining, 1000)
        if group:
            groups.append((group, scale))
        scale += 1
    words = []
    for group, scale in reversed(groups):
        if scale == 0:
            words.append(_spell_below_thousand(group))
        elif scale == 1 and group == 1:
            # 1000 is plain "ezer", not "egyezer".
            words.append(_HU_SCALES[scale])
        else:
            words.append(
                _as_multiplier(_spell_below_thousand(group)) + _HU_SCALES[scale]
            )
    # Hungarian orthography writes numbers up to 2000 as one word and
    # hyphenates the three-digit groups of larger numbers.
    return ('-' if n > 2000 else '').join(words)


def _spell_digit_run(match) -> str:
    """Regex callback: turn one run of ASCII digits into Hungarian words."""
    run = match.group()
    if (len(run) > 1 and run[0] == '0') or len(run) > _HU_MAX_DIGITS:
        # Codes with leading zeros and over-long runs are read digit by digit;
        # the spaces keep the individual digit words apart.
        return ' '.join(HUNGARIAN_NUMBERS[d] for d in run)
    return _spell_integer(int(run))


def normalize_hungarian_text(text: str) -> str:
    """
    Pre-processes text BEFORE sending it to the F5-TTS model.

    The model is character-based (UTF-8), so digits and symbols
    must be spelled out phonetically for optimal prosody.

    Steps, in order:
    1. Every non-digit key of HUNGARIAN_NUMBERS (%, +, =) is replaced by its
       word, padded with spaces.
    2. Every run of ASCII digits is spelled as a whole number
       ("95" -> "kilencvenöt"). Runs with a leading zero or with more than
       12 digits are spelled digit by digit, separated by spaces.
    3. Whitespace runs are collapsed to one space and the ends are stripped.

    Not handled: decimals, thousands separators, ordinals and suffixed
    numbers; each digit run is converted independently, and no space is
    inserted between a number and adjacent letters or punctuation.

    Args:
        text: Raw input text.

    Returns:
        The normalized text.
    """
    # Replace common symbols
    for symbol, word in HUNGARIAN_NUMBERS.items():
        if not symbol.isdigit():
            text = text.replace(symbol, f" {word} ")
    # Spell out numbers run by run rather than digit by digit
    text = _DIGIT_RUN_RE.sub(_spell_digit_run, text)
    # Collapse multiple whitespace
    return _WHITESPACE_RE.sub(' ', text).strip()
def clean_generated_audio(audio_array: np.ndarray, sample_rate: int = 24000, trim_ms: int = 350) -> np.ndarray:
    """
    Post-processes the generated waveform AFTER inference.

    F5-TTS (DiT) often generates a short static noise/sigh during
    the attention warmup phase at the start of the audio.
    This function truncates the first `trim_ms` milliseconds to
    ensure a clean, studio-like start.

    Args:
        audio_array: Waveform to trim. Trimming is applied along the first
            axis, so samples are assumed to run along axis 0.
        sample_rate: Samples per second of `audio_array`.
        trim_ms: Duration to remove from the start, in milliseconds.

    Returns:
        `audio_array` without its first `trim_ms` milliseconds. The input is
        returned unchanged when `trim_ms` is zero or negative, or when the
        audio is not longer than the trim (so the result is never empty).
        For NumPy input the trimmed result is a slice of the input, not a copy.
    """
    # Multiply before dividing and floor the result: the float expression
    # int(sample_rate * (trim_ms / 1000.0)) can land just below the exact
    # value and drop one sample (e.g. 100 Hz * 290 ms gave 28, not 29).
    trim_samples = int(sample_rate * trim_ms // 1000)
    if trim_samples <= 0:
        # A negative start index would slice from the END of the array and
        # return only the tail, so non-positive trims are a no-op.
        return audio_array
    if len(audio_array) > trim_samples:
        return audio_array[trim_samples:]
    return audio_array
# ==========================================
# Example Usage
# ==========================================
def _run_example() -> None:
    """Print a sample prompt before and after text normalization."""
    raw_prompt = "A processzor terhelés 95% volt, amit 1 teszt is igazol."

    # Step 1: normalize the text. How the multi-digit "95" is spelled
    # depends on normalize_hungarian_text's handling of digit runs.
    clean_prompt = normalize_hungarian_text(raw_prompt)
    print(f"Original: {raw_prompt}")
    print(f"Normalized: {clean_prompt}")

    # Step 2: run inference (pseudo code)
    #   audio_output = f5tts_model.infer(text=clean_prompt, ref_audio="ref.wav")
    # Step 3: clean the generated audio (pseudo code)
    #   final_audio = clean_generated_audio(audio_output, sample_rate=24000)
    #   save_wav(final_audio)


if __name__ == "__main__":
    _run_example()