import re
import unicodedata

from modules.languages.constants import LANGUAGES

# U+200B-U+200D (zero-width space / non-joiner / joiner), U+FEFF (BOM) and
# U+2060 (word joiner). Compiled once instead of on every call.
_INVISIBLE_RE = re.compile(r'[\u200B-\u200D\uFEFF\u2060]')
# Runs of two or more ASCII spaces.
_MULTI_SPACE_RE = re.compile(r'[ ]{2,}')


def clean_text(text: str) -> str:
    """Remove invisible or non-standard Unicode characters from *text*.

    Steps, in order:

    1. Normalize to Unicode NFC.
    2. Delete U+200B-U+200D, U+FEFF and U+2060.
    3. Replace non-breaking spaces (U+00A0) with regular spaces.
    4. Drop every character for which ``str.isprintable()`` is false,
       except ``'\\n'`` (so tabs and carriage returns are removed too).
    5. Collapse runs of two or more spaces into a single space.
    6. Strip leading and trailing whitespace.

    Args:
        text: The string to sanitize.

    Returns:
        The sanitized string.
    """
    text = unicodedata.normalize("NFC", text)
    text = _INVISIBLE_RE.sub('', text)
    # Must happen before the isprintable() filter below: U+00A0 is not
    # printable, so it would otherwise be deleted instead of becoming a space.
    text = text.replace('\xa0', ' ')
    text = ''.join(ch for ch in text if ch.isprintable() or ch == '\n')
    text = _MULTI_SPACE_RE.sub(' ', text)
    return text.strip()


def fn_transliterate(input_text: str, input_language: str = 'autodetect') -> dict:
    """Transliterate *input_text* into the script of every entry in LANGUAGES.

    The text is sanitized with ``clean_text`` and then passed to
    ``aksharamukha.transliterate.process`` once per target script.

    Args:
        input_text: Text to transliterate. ``None`` is tolerated.
        input_language: Source script name handed to Aksharamukha as its
            first argument; defaults to ``'autodetect'``.

    Returns:
        A dict keyed by each language's ``"code"``. Values are:

        * the transliterated text on success;
        * the (empty) cleaned text when nothing remains after sanitizing;
        * ``None`` when *input_text* is ``None``;
        * ``'-'`` for every code when any exception occurs (including
          ``aksharamukha`` failing to import); the error is printed.
    """
    if input_text is None:
        # Nothing to sanitize or transliterate. Previously this path raised
        # inside clean_text and then crashed again in the error handler.
        return {lang["code"]: None for lang in LANGUAGES}

    try:
        # Imported lazily so a missing/broken aksharamukha install is reported
        # through the same fallback path as a transliteration failure.
        from aksharamukha import transliterate

        input_text = clean_text(input_text)  # <-- sanitize here

        target_scripts = {
            lang["code"]: lang["aksharamukha_name"] for lang in LANGUAGES
        }

        # clean_text() already strips, so an empty string means there is
        # nothing to transliterate; the check is the same for every script.
        if not input_text:
            return dict.fromkeys(target_scripts, input_text)

        return {
            code: transliterate.process(input_language, script_name, input_text)
            for code, script_name in target_scripts.items()
        }
    except Exception as e:
        # str() keeps the handler itself from raising on non-sliceable input.
        print(f"Error transliterating '{str(input_text)[:30]}...': {e}")
        return {lang["code"]: '-' for lang in LANGUAGES}