# hexa-tts-5b / src/text_encoder.py
# Uploaded via huggingface_hub by Hexa09 (commit e729286, verified)
import torch
from phonemizer import phonemize
from phonemizer.separator import Separator
class TextEncoder:
    """
    Handles text-to-phoneme conversion for the 15 supported languages.

    Uses the ``phonemizer`` library (eSpeak backend) to turn raw text into a
    space-separated phoneme string, then maps it to integer token IDs via a
    simple character-level vocabulary (placeholder until a real phoneme
    vocabulary is wired in).
    """
    def __init__(self, vocab_map=None):
        """
        Args:
            vocab_map: Optional symbol-to-id dict. If None, a placeholder
                character-level vocabulary (" abcdefghijklmnopqrstuvwxyz|")
                is used. An explicitly-passed empty dict is honored as-is.
        """
        # Phones separated by spaces, words by '|', no syllable marker.
        self.separator = Separator(phone=' ', word='|', syllable='')
        # Maps the 15 supported language codes to phonemizer/eSpeak codes.
        self.lang_map = {
            'en': 'en-us', 'zh': 'cmn', 'es': 'es', 'fr': 'fr-fr',
            'de': 'de', 'ja': 'ja', 'ko': 'ko', 'ru': 'ru',
            'pt': 'pt', 'it': 'it', 'hi': 'hi', 'ar': 'ar',
            'tr': 'tr', 'nl': 'nl', 'bn': 'bn'
        }
        # NOTE: `is not None` (not truthiness) so a caller-supplied empty
        # dict is not silently replaced by the placeholder vocabulary.
        self.vocab = vocab_map if vocab_map is not None else {
            c: i for i, c in enumerate(" abcdefghijklmnopqrstuvwxyz|")
        }

    def preprocess(self, text, lang_code='en'):
        """
        Convert ``text`` to a batch of phoneme token IDs.

        Args:
            text: Input string to phonemize.
            lang_code: One of the keys in ``self.lang_map``; unknown codes
                fall back to the English ('en-us') backend with a warning.

        Returns:
            torch.LongTensor of shape (1, T): token IDs with a leading
            batch dimension. Symbols missing from the vocab map to 0.
        """
        if lang_code not in self.lang_map:
            print(f"Warning: Language {lang_code} not fully supported, defaulting to English backend.")
            backend_lang = 'en-us'
        else:
            backend_lang = self.lang_map[lang_code]
        try:
            # Phonemize; returns a single string such as "h ə l oʊ|w ɜː l d".
            phonemes = phonemize(
                text,
                language=backend_lang,
                backend='espeak',
                separator=self.separator,
                strip=True,
                preserve_punctuation=True,
                njobs=1
            )
        except RuntimeError:
            # phonemizer raises RuntimeError when the eSpeak backend is missing.
            print("Warning: eSpeak not found. Falling back to character-level tokenization.")
            phonemes = list(text)  # Simple list of characters as fallback
        # Tokenize (simple lookup for now). Iterating a phoneme *string*
        # walks it char-by-char, including the ' '/'|' separators — this
        # matches the placeholder character-level vocab. NOTE(review):
        # a real phoneme vocab would need `.split()` here instead.
        token_ids = [self.vocab.get(p, 0) for p in phonemes]
        return torch.tensor(token_ids).unsqueeze(0)  # Add batch dim