Update tokenizer.py
tokenizer.py  CHANGED  (+750 -55)
@@ -1,25 +1,675 @@
-
- import …
  from functools import cached_property

-
- from transformers.tokenization_utils_base import TruncationStrategy, PaddingStrategy
- from tokenizers import Tokenizer, processors
- from tokenizers.pre_tokenizers import WhitespaceSplit
- from tokenizers.processors import TemplateProcessing
  import torch
  from hangul_romanize import Transliter
  from hangul_romanize.rule import academic
  import cutlet

-
-
-
  class XTTSTokenizerFast(PreTrainedTokenizerFast):
      """
      Fast Tokenizer implementation for XTTS model using HuggingFace's PreTrainedTokenizerFast
      """
      def __init__(
          self,
          vocab_file: str = None,
@@ -28,6 +678,7 @@ class XTTSTokenizerFast(PreTrainedTokenizerFast):
          pad_token: str = "[PAD]",
          bos_token: str = "[START]",
          eos_token: str = "[STOP]",
          clean_up_tokenization_spaces: bool = True,
          **kwargs
      ):
@@ -37,11 +688,6 @@ class XTTSTokenizerFast(PreTrainedTokenizerFast):
          if tokenizer_object is not None:
              # Configure the tokenizer
              tokenizer_object.pre_tokenizer = WhitespaceSplit()
-             tokenizer_object.enable_padding(
-                 direction='right',
-                 pad_id=tokenizer_object.token_to_id(pad_token) or 0,
-                 pad_token=pad_token
-             )
              tokenizer_object.post_processor = TemplateProcessing(
                  single=f"{bos_token} $A {eos_token}",
                  special_tokens=[
@@ -72,41 +718,89 @@ class XTTSTokenizerFast(PreTrainedTokenizerFast):
          self._katsu = None
          self._korean_transliter = Transliter(academic)

      @cached_property
      def katsu(self):
          if self._katsu is None:
              self._katsu = cutlet.Cutlet()
          return self._katsu

-     def check_input_length(self, text: str, lang: str):
-         """Check if input text length is within limits for language"""
-         lang = lang.split("-")[0]  # remove region
-         limit = self.char_limits.get(lang, 250)
-         if len(text) > limit:
-             print(f"Warning: Text length exceeds {limit} char limit for '{lang}', may cause truncation.")
-
      def preprocess_text(self, text: str, lang: str) -> str:
          """Apply text preprocessing for language"""
-         …
-         …
-         …
-         …
              text = chinese_transliterate(text)
-         if …
-             text = korean_transliterate(text)
-         elif …
              text = japanese_cleaners(text, self.katsu)
          else:
              text = basic_cleaners(text)
          return text
      def _batch_encode_plus(
          self,
          batch_text_or_text_pairs,
          add_special_tokens: bool = True,
-         padding_strategy …
-         truncation_strategy …
-         max_length: Optional[int] = …
          stride: int = 0,
          is_split_into_words: bool = False,
          pad_to_multiple_of: Optional[int] = None,
@@ -125,18 +819,26 @@ class XTTSTokenizerFast(PreTrainedTokenizerFast):
          """
          lang = kwargs.pop("lang", ["en"] * len(batch_text_or_text_pairs))
          if isinstance(lang, str):
-             lang = [lang]

          # Preprocess each text in the batch with its corresponding language
          processed_texts = []
          for text, text_lang in zip(batch_text_or_text_pairs, lang):
              if isinstance(text, str):
                  # Check length and preprocess
-                 self.check_input_length(text, text_lang)
                  processed_text = self.preprocess_text(text, text_lang)

                  # Format text with language tag and spaces
-                 …
                  processed_text = f"[{lang_code}]{processed_text}"
                  processed_text = processed_text.replace(" ", "[SPACE]")
@@ -165,47 +867,40 @@ class XTTSTokenizerFast(PreTrainedTokenizerFast):
              **kwargs
          )

      def __call__(
          self,
          text: Union[str, List[str]],
          lang: Union[str, List[str]] = "en",
          add_special_tokens: bool = True,
-         padding: Union[bool, str, PaddingStrategy] = …
-         truncation: Union[bool, str, TruncationStrategy] = …
-         max_length: Optional[int] = …
          stride: int = 0,
          return_tensors: Optional[str] = None,
          return_token_type_ids: Optional[bool] = None,
-         return_attention_mask: Optional[bool] = True,
          **kwargs
      ):
          """
          Main tokenization method
-         Args:
-             text: Text or list of texts to tokenize
-             lang: Language code or list of language codes corresponding to each text
-             add_special_tokens: Whether to add special tokens
-             padding: Padding strategy (default True)
-             truncation: Truncation strategy (default True)
-             max_length: Maximum length
-             stride: Stride for truncation
-             return_tensors: Format of output tensors ("pt" for PyTorch)
-             return_token_type_ids: Whether to return token type IDs
-             return_attention_mask: Whether to return attention mask (default True)
          """
          # Convert single string to list for batch processing
          if isinstance(text, str):
              text = [text]
-
-

          # Ensure text and lang lists have same length
          if len(text) != len(lang):
-             raise ValueError(f"Number of texts ({len(text)}) …

          # Convert padding strategy
          if isinstance(padding, bool):
-             padding_strategy = PaddingStrategy.…
          else:
              padding_strategy = PaddingStrategy(padding)
@@ -230,4 +925,4 @@ class XTTSTokenizerFast(PreTrainedTokenizerFast):
              **kwargs
          )

- return encoded
tokenizer.py (updated file content):

import re
from typing import List, Optional, Union, Dict, Any
from functools import cached_property

import pypinyin
import torch
from hangul_romanize import Transliter
from hangul_romanize.rule import academic
from num2words import num2words
from spacy.lang.ar import Arabic
from spacy.lang.en import English
from spacy.lang.es import Spanish
from spacy.lang.ja import Japanese
from spacy.lang.zh import Chinese
from transformers import PreTrainedTokenizerFast, BatchEncoding
from transformers.tokenization_utils_base import TruncationStrategy, PaddingStrategy
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import WhitespaceSplit
from tokenizers.processors import TemplateProcessing

from auralis.models.xttsv2.components.tts.layers.xtts.zh_num2words import TextNorm as zh_num2words

import cutlet
def get_spacy_lang(lang):
    if lang == "zh":
        return Chinese()
    elif lang == "ja":
        return Japanese()
    elif lang == "ar":
        return Arabic()
    elif lang == "es":
        return Spanish()
    else:
        # For most languages, English does the job
        return English()

def find_best_split_point(text: str, target_pos: int, window_size: int = 30) -> int:
    """
    Find best split point near target position considering punctuation and language markers.
    added for better sentence splitting in TTS.
    """
    # Define split markers by priority
    markers = [
        # Strong breaks (longest pause)
        (r'[.!?؟။။။]+[\s]*', 1.0),    # Periods, exclamation, question (multi-script)
        (r'[\n\r]+\s*[\n\r]+', 1.0),  # Multiple newlines
        (r'[:|;;:;][\s]*', 0.9),      # Colons, semicolons (multi-script)

        # Medium breaks
        (r'[,,،、][\s]*', 0.8),        # Commas (multi-script)
        (r'[)}\])】』»›》\s]+', 0.7),   # Closing brackets/parentheses
        (r'[-—−]+[\s]*', 0.7),        # Dashes

        # Weak breaks
        (r'\s+[&+=/\s]+\s+', 0.6),    # Special characters with spaces
        (r'[\s]+', 0.5),              # Any whitespace as last resort
    ]

    # Calculate window boundaries
    start = max(0, target_pos - window_size)
    end = min(len(text), target_pos + window_size)
    window = text[start:end]

    best_pos = target_pos
    best_score = 0

    for pattern, priority in markers:
        matches = list(re.finditer(pattern, window))
        for match in matches:
            # Calculate position score based on distance from target
            pos = start + match.end()
            distance = abs(pos - target_pos)
            distance_score = 1 - (distance / (window_size * 2))

            # Combine priority and position scores
            score = priority * distance_score

            if score > best_score:
                best_score = score
                best_pos = pos

    return best_pos

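A quick sketch of how the scorer above behaves (the sample string and target position are illustrative only):

# Illustrative sketch: ask for a split near position 40 of a made-up sentence.
# The comma match (priority 0.8) outranks nearby plain whitespace (0.5),
# so the returned index lands just after "dog, ".
sample = "The quick brown fox jumps over the lazy dog, then naps in the warm afternoon sun."
pos = find_best_split_point(sample, target_pos=40, window_size=30)
print(pos, repr(sample[:pos]), repr(sample[pos:]))
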
def split_sentence(text: str, lang: str, text_split_length: int = 250) -> List[str]:
    """
    Enhanced sentence splitting with language awareness and optimal breakpoints.

    Args:
        text: Input text to split
        lang: Language code
        text_split_length: Target length for splits

    Returns:
        List of text splits optimized for TTS
    """
    text = text.strip()
    if len(text) <= text_split_length:
        return [text]

    nlp = get_spacy_lang(lang)
    if "sentencizer" not in nlp.pipe_names:
        nlp.add_pipe("sentencizer")

    # Get base sentences using spaCy
    doc = nlp(text)
    sentences = list(doc.sents)

    splits = []
    current_split = []
    current_length = 0

    for sent in sentences:
        sentence_text = str(sent).strip()
        sentence_length = len(sentence_text)

        # If sentence fits in current split
        if current_length + sentence_length <= text_split_length:
            current_split.append(sentence_text)
            current_length += sentence_length + 1

        # Handle long sentences
        elif sentence_length > text_split_length:
            # Add current split if exists
            if current_split:
                splits.append(" ".join(current_split))
                current_split = []
                current_length = 0

            # Split long sentence at optimal points
            remaining = sentence_text
            while len(remaining) > text_split_length:
                split_pos = find_best_split_point(
                    remaining,
                    text_split_length,
                    window_size=30
                )

                # Add split and continue with remainder
                splits.append(remaining[:split_pos].strip())
                remaining = remaining[split_pos:].strip()

            # Handle remaining text
            if remaining:
                current_split = [remaining]
                current_length = len(remaining)

        # Start new split
        else:
            splits.append(" ".join(current_split))
            current_split = [sentence_text]
            current_length = sentence_length

    # Add final split if needed
    if current_split:
        splits.append(" ".join(current_split))

    # Clean up splits
    cleaned_sentences = [s[:-1] + ' ' if s.endswith('.') else s for s in splits if s]  # prevents annoying sounds in italian
    return cleaned_sentences

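A minimal usage sketch of the splitter above (the sample text is illustrative; spaCy must be installed since get_spacy_lang builds a blank pipeline):

# Illustrative sketch: split a long passage into TTS-friendly chunks of at most ~80 characters.
long_text = (
    "Auralis converts text into speech. It batches requests efficiently. "
    "Very long passages are split into chunks that respect punctuation, "
    "so each chunk stays within the per-language character limit."
)
for i, chunk in enumerate(split_sentence(long_text, lang="en", text_split_length=80)):
    print(i, len(chunk), chunk)
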
_whitespace_re = re.compile(r"\s+")

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = {
    "en": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("mrs", "misess"), ("mr", "mister"), ("dr", "doctor"), ("st", "saint"),
            ("co", "company"), ("jr", "junior"), ("maj", "major"), ("gen", "general"),
            ("drs", "doctors"), ("rev", "reverend"), ("lt", "lieutenant"), ("hon", "honorable"),
            ("sgt", "sergeant"), ("capt", "captain"), ("esq", "esquire"), ("ltd", "limited"),
            ("col", "colonel"), ("ft", "fort"),
        ]
    ],
    "es": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("sra", "señora"), ("sr", "señor"), ("dr", "doctor"), ("dra", "doctora"),
            ("st", "santo"), ("co", "compañía"), ("jr", "junior"), ("ltd", "limitada"),
        ]
    ],
    "fr": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("mme", "madame"), ("mr", "monsieur"), ("dr", "docteur"), ("st", "saint"),
            ("co", "compagnie"), ("jr", "junior"), ("ltd", "limitée"),
        ]
    ],
    "de": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("fr", "frau"), ("dr", "doktor"), ("st", "sankt"), ("co", "firma"), ("jr", "junior"),
        ]
    ],
    "pt": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("sra", "senhora"), ("sr", "senhor"), ("dr", "doutor"), ("dra", "doutora"),
            ("st", "santo"), ("co", "companhia"), ("jr", "júnior"), ("ltd", "limitada"),
        ]
    ],
    "it": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # ("sig.ra", "signora"),
            ("sig", "signore"), ("dr", "dottore"), ("st", "santo"), ("co", "compagnia"),
            ("jr", "junior"), ("ltd", "limitata"),
        ]
    ],
    "pl": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("p", "pani"), ("m", "pan"), ("dr", "doktor"), ("sw", "święty"), ("jr", "junior"),
        ]
    ],
    "ar": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # There are not many common abbreviations in Arabic as in English.
        ]
    ],
    "zh": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # Chinese doesn't typically use abbreviations in the same way as Latin-based scripts.
        ]
    ],
    "cs": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("dr", "doktor"),    # doctor
            ("ing", "inženýr"),  # engineer
            ("p", "pan"),        # Could also map to pani for woman but no easy way to do it
            # Other abbreviations would be specialized and not as common.
        ]
    ],
    "ru": [
        (re.compile("\\b%s\\b" % x[0], re.IGNORECASE), x[1])
        for x in [
            ("г-жа", "госпожа"),  # Mrs.
            ("г-н", "господин"),  # Mr.
            ("д-р", "доктор"),    # doctor
            # Other abbreviations are less common or specialized.
        ]
    ],
    "nl": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("dhr", "de heer"),   # Mr.
            ("mevr", "mevrouw"),  # Mrs.
            ("dr", "dokter"),     # doctor
            ("jhr", "jonkheer"),  # young lord or nobleman
            # Dutch uses more abbreviations, but these are the most common ones.
        ]
    ],
    "tr": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("b", "bay"),      # Mr.
            ("byk", "büyük"),  # büyük
            ("dr", "doktor"),  # doctor
            # Add other Turkish abbreviations here if needed.
        ]
    ],
    "hu": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            ("dr", "doktor"),  # doctor
            ("b", "bácsi"),    # Mr.
            ("nőv", "nővér"),  # nurse
            # Add other Hungarian abbreviations here if needed.
        ]
    ],
    "ko": [
        (re.compile("\\b%s\\." % x[0], re.IGNORECASE), x[1])
        for x in [
            # Korean doesn't typically use abbreviations in the same way as Latin-based scripts.
        ]
    ],
}

def expand_abbreviations_multilingual(text, lang="en"):
    if lang in _abbreviations:
        for regex, replacement in _abbreviations[lang]:
            text = re.sub(regex, replacement, text)
    return text

_symbols_multilingual = {
    "en": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " and "), ("@", " at "), ("%", " percent "), ("#", " hash "),
            ("$", " dollar "), ("£", " pound "), ("°", " degree "),
        ]
    ],
    "es": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " y "), ("@", " arroba "), ("%", " por ciento "), ("#", " numeral "),
            ("$", " dolar "), ("£", " libra "), ("°", " grados "),
        ]
    ],
    "fr": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " et "), ("@", " arobase "), ("%", " pour cent "), ("#", " dièse "),
            ("$", " dollar "), ("£", " livre "), ("°", " degrés "),
        ]
    ],
    "de": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " und "), ("@", " at "), ("%", " prozent "), ("#", " raute "),
            ("$", " dollar "), ("£", " pfund "), ("°", " grad "),
        ]
    ],
    "pt": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " e "), ("@", " arroba "), ("%", " por cento "), ("#", " cardinal "),
            ("$", " dólar "), ("£", " libra "), ("°", " graus "),
        ]
    ],
    "it": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " e "), ("@", " chiocciola "), ("%", " per cento "), ("#", " cancelletto "),
            ("$", " dollaro "), ("£", " sterlina "), ("°", " gradi "),
        ]
    ],
    "pl": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " i "), ("@", " małpa "), ("%", " procent "), ("#", " krzyżyk "),
            ("$", " dolar "), ("£", " funt "), ("°", " stopnie "),
        ]
    ],
    "ar": [
        # Arabic
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " و "), ("@", " على "), ("%", " في المئة "), ("#", " رقم "),
            ("$", " دولار "), ("£", " جنيه "), ("°", " درجة "),
        ]
    ],
    "zh": [
        # Chinese
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " 和 "), ("@", " 在 "), ("%", " 百分之 "), ("#", " 号 "),
            ("$", " 美元 "), ("£", " 英镑 "), ("°", " 度 "),
        ]
    ],
    "cs": [
        # Czech
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " a "), ("@", " na "), ("%", " procento "), ("#", " křížek "),
            ("$", " dolar "), ("£", " libra "), ("°", " stupně "),
        ]
    ],
    "ru": [
        # Russian
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " и "), ("@", " собака "), ("%", " процентов "), ("#", " номер "),
            ("$", " доллар "), ("£", " фунт "), ("°", " градус "),
        ]
    ],
    "nl": [
        # Dutch
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " en "), ("@", " bij "), ("%", " procent "), ("#", " hekje "),
            ("$", " dollar "), ("£", " pond "), ("°", " graden "),
        ]
    ],
    "tr": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " ve "), ("@", " at "), ("%", " yüzde "), ("#", " diyez "),
            ("$", " dolar "), ("£", " sterlin "), ("°", " derece "),
        ]
    ],
    "hu": [
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " és "), ("@", " kukac "), ("%", " százalék "), ("#", " kettőskereszt "),
            ("$", " dollár "), ("£", " font "), ("°", " fok "),
        ]
    ],
    "ko": [
        # Korean
        (re.compile(r"%s" % re.escape(x[0]), re.IGNORECASE), x[1])
        for x in [
            ("&", " 그리고 "), ("@", " 에 "), ("%", " 퍼센트 "), ("#", " 번호 "),
            ("$", " 달러 "), ("£", " 파운드 "), ("°", " 도 "),
        ]
    ],
}

def expand_symbols_multilingual(text, lang="en"):
    if lang in _symbols_multilingual:
        for regex, replacement in _symbols_multilingual[lang]:
            text = re.sub(regex, replacement, text)
        text = text.replace("  ", " ")  # Ensure there are no double spaces
    return text.strip()

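A small sketch of the two expansion helpers above (inputs and outputs are illustrative):

# Illustrative sketch of abbreviation and symbol expansion.
print(expand_abbreviations_multilingual("Dr. Smith lives on St. Mary's road.", lang="en"))
# -> "doctor Smith lives on saint Mary's road."
print(expand_symbols_multilingual("5% & 3°", lang="es"))
# Symbols become padded words ("por ciento", "y", "grados"); collapse_whitespace() below normalizes spacing.
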
_ordinal_re = {
    "en": re.compile(r"([0-9]+)(st|nd|rd|th)"),
    "es": re.compile(r"([0-9]+)(º|ª|er|o|a|os|as)"),
    "fr": re.compile(r"([0-9]+)(º|ª|er|re|e|ème)"),
    "de": re.compile(r"([0-9]+)(st|nd|rd|th|º|ª|\.(?=\s|$))"),
    "pt": re.compile(r"([0-9]+)(º|ª|o|a|os|as)"),
    "it": re.compile(r"([0-9]+)(º|°|ª|o|a|i|e)"),
    "pl": re.compile(r"([0-9]+)(º|ª|st|nd|rd|th)"),
    "ar": re.compile(r"([0-9]+)(ون|ين|ث|ر|ى)"),
    "cs": re.compile(r"([0-9]+)\.(?=\s|$)"),  # In Czech, a dot is often used after the number to indicate ordinals.
    "ru": re.compile(r"([0-9]+)(-й|-я|-е|-ое|-ье|-го)"),
    "nl": re.compile(r"([0-9]+)(de|ste|e)"),
    "tr": re.compile(r"([0-9]+)(\.|inci|nci|uncu|üncü|\.)"),
    "hu": re.compile(r"([0-9]+)(\.|adik|edik|odik|edik|ödik|ödike|ik)"),
    "ko": re.compile(r"([0-9]+)(번째|번|차|째)"),
}
_number_re = re.compile(r"[0-9]+")
# noinspection Annotator
_currency_re = {
    "USD": re.compile(r"((\$[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+\$))"),
    "GBP": re.compile(r"((£[0-9\.\,]*[0-9]+)|([0-9\.\,]*[0-9]+£))"),
    "EUR": re.compile(r"(([0-9\.\,]*[0-9]+€)|((€[0-9\.\,]*[0-9]+)))"),
}

_comma_number_re = re.compile(r"\b\d{1,3}(,\d{3})*(\.\d+)?\b")
_dot_number_re = re.compile(r"\b\d{1,3}(\.\d{3})*(\,\d+)?\b")
_decimal_number_re = re.compile(r"([0-9]+[.,][0-9]+)")

def _remove_commas(m):
    text = m.group(0)
    if "," in text:
        text = text.replace(",", "")
    return text

def _remove_dots(m):
    text = m.group(0)
    if "." in text:
        text = text.replace(".", "")
    return text

def _expand_decimal_point(m, lang="en"):
    amount = m.group(1).replace(",", ".")
    return num2words(float(amount), lang=lang if lang != "cs" else "cz")

def _expand_currency(m, lang="en", currency="USD"):
    amount = float((re.sub(r"[^\d.]", "", m.group(0).replace(",", "."))))
    full_amount = num2words(amount, to="currency", currency=currency, lang=lang if lang != "cs" else "cz")

    and_equivalents = {
        "en": ", ",
        "es": " con ",
        "fr": " et ",
        "de": " und ",
        "pt": " e ",
        "it": " e ",
        "pl": ", ",
        "cs": ", ",
        "ru": ", ",
        "nl": ", ",
        "ar": ", ",
        "tr": ", ",
        "hu": ", ",
        "ko": ", ",
    }

    if amount.is_integer():
        last_and = full_amount.rfind(and_equivalents.get(lang, ", "))
        if last_and != -1:
            full_amount = full_amount[:last_and]

    return full_amount

def _expand_ordinal(m, lang="en"):
    return num2words(int(m.group(1)), ordinal=True, lang=lang if lang != "cs" else "cz")

def _expand_number(m, lang="en"):
    return num2words(int(m.group(0)), lang=lang if lang != "cs" else "cz")

def expand_numbers_multilingual(text, lang="en"):
    if lang == "zh":
        text = zh_num2words()(text)
    else:
        if lang in ["en", "ru"]:
            text = re.sub(_comma_number_re, _remove_commas, text)
        else:
            text = re.sub(_dot_number_re, _remove_dots, text)
        try:
            text = re.sub(_currency_re["GBP"], lambda m: _expand_currency(m, lang, "GBP"), text)
            text = re.sub(_currency_re["USD"], lambda m: _expand_currency(m, lang, "USD"), text)
            text = re.sub(_currency_re["EUR"], lambda m: _expand_currency(m, lang, "EUR"), text)
        except Exception as e:
            pass
        if lang != "tr":
            text = re.sub(_decimal_number_re, lambda m: _expand_decimal_point(m, lang), text)
        if lang in _ordinal_re:
            text = re.sub(_ordinal_re[lang], lambda m: _expand_ordinal(m, lang), text)
        text = re.sub(_number_re, lambda m: _expand_number(m, lang), text)
    return text

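For instance, the number pipeline turns digits, ordinals and currency amounts into words via num2words (output shown approximately):

# Illustrative sketch of number expansion for English.
print(expand_numbers_multilingual("I paid $42 for 3 books on the 2nd of May.", lang="en"))
# -> approximately: "I paid forty-two dollars for three books on the second of May."
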
def lowercase(text):
    return text.lower()

def collapse_whitespace(text):
    return re.sub(_whitespace_re, " ", text)

def multilingual_cleaners(text, lang):
    text = text.replace('"', "")
    if lang == "tr":
        text = text.replace("İ", "i")
        text = text.replace("Ö", "ö")
        text = text.replace("Ü", "ü")
    text = lowercase(text)
    text = expand_numbers_multilingual(text, lang)
    text = expand_abbreviations_multilingual(text, lang)
    text = expand_symbols_multilingual(text, lang=lang)
    text = collapse_whitespace(text)
    return text

def basic_cleaners(text):
    """Basic pipeline that lowercases and collapses whitespace without transliteration."""
    text = lowercase(text)
    text = collapse_whitespace(text)
    return text

def chinese_transliterate(text):
    return "".join(
        [p[0] for p in pypinyin.pinyin(text, style=pypinyin.Style.TONE3, heteronym=False, neutral_tone_with_five=True)]
    )

def japanese_cleaners(text, katsu):
    text = katsu.romaji(text)
    text = lowercase(text)
    return text

def korean_transliterate(text, transliter):
    return transliter.translit(text)

# Fast Tokenizer Class

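Putting the helpers together, multilingual_cleaners is the full per-language pipeline that preprocess_text applies below (output shown approximately):

# Illustrative sketch of the complete cleaning pipeline.
print(multilingual_cleaners('Mr. Smith & Co. spent $1,000 on the 3rd floor.', lang="en"))
# -> approximately: "mister smith and company spent one thousand dollars on the third floor."
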
class XTTSTokenizerFast(PreTrainedTokenizerFast):
    """
    Fast Tokenizer implementation for XTTS model using HuggingFace's PreTrainedTokenizerFast
    """

    def __init__(
        self,
        vocab_file: str = None,
        pad_token: str = "[PAD]",
        bos_token: str = "[START]",
        eos_token: str = "[STOP]",
        auto_map: dict = {"AutoTokenizer": ["AstraMindAI/xtts2-gpt--tokenizer.XTTSTokenizerFast", None]},
        clean_up_tokenization_spaces: bool = True,
        **kwargs
    ):
        if tokenizer_object is not None:
            # Configure the tokenizer
            tokenizer_object.pre_tokenizer = WhitespaceSplit()
            tokenizer_object.post_processor = TemplateProcessing(
                single=f"{bos_token} $A {eos_token}",
                special_tokens=[
        self._katsu = None
        self._korean_transliter = Transliter(academic)

        # Ensure pad_token_id is set
        if self.pad_token_id is None:
            self.pad_token_id = self.tokenizer.token_to_id(self.pad_token)

    @cached_property
    def katsu(self):
        if self._katsu is None:
            self._katsu = cutlet.Cutlet()
        return self._katsu

    def preprocess_text(self, text: str, lang: str) -> str:
        """Apply text preprocessing for language"""
        base_lang = lang.split("-")[0]  # remove region
        if base_lang in {"ar", "cs", "de", "en", "es", "fr", "hu", "it",
                         "nl", "pl", "pt", "ru", "tr", "zh", "ko"}:
            text = multilingual_cleaners(text, base_lang)
            if base_lang == "zh":
                text = chinese_transliterate(text)
            if base_lang == "ko":
                text = korean_transliterate(text, self._korean_transliter)
        elif base_lang == "ja":
            text = japanese_cleaners(text, self.katsu)
        else:
            text = basic_cleaners(text)
        return text

    def batch_encode_with_split(self, texts: Union[str, List[str]], lang: Union[str, List[str]],
                                **kwargs) -> torch.Tensor:
        """
        Split texts into smaller chunks based on language character limits and encode them using the HuggingFace fast tokenizer.
        Strictly mimics the XTTSv2 tokenizer.
        """
        # Convert single inputs to lists
        if isinstance(texts, str):
            texts = [texts]
        if isinstance(lang, str):
            lang = [lang]
        # Ensure lang list matches texts list
        if len(lang) == 1 and len(texts) > 1:
            lang = lang * len(texts)

        # Check if texts and lang have the same length
        if len(texts) != len(lang):
            raise ValueError(f"Number of texts ({len(texts)}) does not match number of languages ({len(lang)}).")

        chunk_list = []
        max_splits = 0

        # For each text, split into chunks based on character limit
        for text, text_lang in zip(texts, lang):
            # Get language character limit
            base_lang = text_lang.split("-")[0]
            char_limit = self.char_limits.get(base_lang, 250)

            # Clean and preprocess
            text = self.preprocess_text(text, text_lang)

            # Split text into sentences/chunks based on language
            chunk_list = split_sentence(text, base_lang, text_split_length=char_limit)

        # Ensure the tokenizer is a fast tokenizer
        if not self.is_fast:
            raise ValueError("The tokenizer must be a fast tokenizer.")

        # Encode all chunks using the fast tokenizer
        encoding: BatchEncoding = self(
            chunk_list,
            lang=lang,
            add_special_tokens=False,
            padding=False,
            **kwargs
        )

        # The 'input_ids' tensor will have shape [total_chunks, max_sequence_length]
        return encoding['input_ids']  # Tensor of shape [total_chunks, sequence_length]

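A usage sketch for the chunked encoder above (loading details are assumptions; the repo id is taken from the auto_map default and may differ in practice):

# Sketch only: load the tokenizer and encode one long text as per-chunk rows of token ids.
tok = XTTSTokenizerFast.from_pretrained("AstraMindAI/xtts2-gpt")
rows = tok.batch_encode_with_split(
    "A very long passage that goes well past the per-language character limit ...",
    lang="en",
)
print(len(rows))  # one row of token ids per chunk
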
    def _batch_encode_plus(
        self,
        batch_text_or_text_pairs,
        add_special_tokens: bool = True,
        padding_strategy=PaddingStrategy.DO_NOT_PAD,
        truncation_strategy=TruncationStrategy.DO_NOT_TRUNCATE,
        max_length: Optional[int] = None,
        stride: int = 0,
        is_split_into_words: bool = False,
        pad_to_multiple_of: Optional[int] = None,

        """
        lang = kwargs.pop("lang", ["en"] * len(batch_text_or_text_pairs))
        if isinstance(lang, str):
            lang = [lang]
        # Ensure lang list matches texts list
        if len(lang) == 1 and len(batch_text_or_text_pairs) > 1:
            lang = lang * len(batch_text_or_text_pairs)

        # Check if batch_text_or_text_pairs and lang have the same length
        if len(batch_text_or_text_pairs) != len(lang):
            raise ValueError(f"Number of texts ({len(batch_text_or_text_pairs)}) does not match number of languages ({len(lang)}).")

        # Preprocess each text in the batch with its corresponding language
        processed_texts = []
        for text, text_lang in zip(batch_text_or_text_pairs, lang):
            if isinstance(text, str):
                # Check length and preprocess
                # self.check_input_length(text, text_lang)
                processed_text = self.preprocess_text(text, text_lang)

                # Format text with language tag and spaces
                base_lang = text_lang.split("-")[0]
                lang_code = "zh-cn" if base_lang == "zh" else base_lang
                processed_text = f"[{lang_code}]{processed_text}"
                processed_text = processed_text.replace(" ", "[SPACE]")

            **kwargs
        )

    def __call__(
        self,
        text: Union[str, List[str]],
        lang: Union[str, List[str]] = "en",
        add_special_tokens: bool = True,
        padding: Union[bool, str, PaddingStrategy] = False,
        truncation: Union[bool, str, TruncationStrategy] = False,
        max_length: Optional[int] = None,
        stride: int = 0,
        return_tensors: Optional[str] = None,
        return_token_type_ids: Optional[bool] = None,
        return_attention_mask: Optional[bool] = True,
        **kwargs
    ):
        """
        Main tokenization method
        """
        # Convert single string to list for batch processing
        if isinstance(text, str):
            text = [text]
        if isinstance(lang, str):
            lang = [lang]
        # Ensure lang list matches texts list
        if len(lang) == 1 and len(text) > 1:
            lang = lang * len(text)

        # Ensure text and lang lists have same length
        if len(text) != len(lang):
            raise ValueError(f"Number of texts ({len(text)}) does not match number of languages ({len(lang)}).")

        # Convert padding strategy
        if isinstance(padding, bool):
            padding_strategy = PaddingStrategy.LONGEST if padding else PaddingStrategy.DO_NOT_PAD
        else:
            padding_strategy = PaddingStrategy(padding)

            **kwargs
        )

        return encoded
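Finally, a usage sketch of the main entry point (the repo id, as above, is an assumption; exact token ids depend on the vocabulary):

# Sketch only: batch-tokenize two texts in different languages with padding.
tok = XTTSTokenizerFast.from_pretrained("AstraMindAI/xtts2-gpt")
batch = tok(
    ["Hello world!", "Bonjour tout le monde !"],
    lang=["en", "fr"],
    padding=True,
    return_tensors="pt",
)
print(batch["input_ids"].shape, batch["attention_mask"].shape)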