# ==================================================================================================
# DEEPFAKE AUDIO - synthesizer/utils/text.py (Linguistic Tokenization Engine)
# ==================================================================================================
#
# 📝 DESCRIPTION
# This module implements the text-to-sequence transformation logic. It handles
# cleaning, embedded ARPAbet segments (detected via curly braces), and numeric ID
# mapping, converting human-readable text into the integer token ID sequences
# consumed by the synthesizer's embedding layer.
#
# 👤 AUTHORS
# - Amey Thakur (https://github.com/Amey-Thakur)
# - Mega Satish (https://github.com/msatmod)
#
# 🤝🏻 CREDITS
# Original Real-Time Voice Cloning methodology by CorentinJ
# Repository: https://github.com/CorentinJ/Real-Time-Voice-Cloning
#
# 🔗 PROJECT LINKS
# Repository: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO
# Video Demo: https://youtu.be/i3wnBcbHDbs
# Research: https://github.com/Amey-Thakur/DEEPFAKE-AUDIO/blob/main/DEEPFAKE-AUDIO.ipynb
#
# 📜 LICENSE
# Released under the MIT License
# Release Date: 2021-02-06
# ==================================================================================================

from synthesizer.utils.symbols import symbols
from synthesizer.utils import cleaners
import re


# Neural Mapping: Bidirectional dictionaries for symbol/ID conversion
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}

# ARPAbet Detection: Matches phonetic sequences enclosed in curly braces
_curly_re = re.compile(r"(.*?)\{(.+?)\}(.*)")


def text_to_sequence(text, cleaner_names):
    """
    Symbolic Ingestion: Converts raw text into a sequence of token IDs.
    Supports hybrid text/ARPAbet inputs via curly-brace tagging.
    """
    sequence = []

    # Sequential Parsing: process the text segment by segment
    while len(text):
        m = _curly_re.match(text)
        if not m:
            sequence += _symbols_to_sequence(_clean_text(text, cleaner_names))
            break
        sequence += _symbols_to_sequence(_clean_text(m.group(1), cleaner_names))
        sequence += _arpabet_to_sequence(m.group(2))
        text = m.group(3)

    # Termination: Append EOS token
    sequence.append(_symbol_to_id["~"])
    return sequence


def sequence_to_text(sequence):
    """Linguistic Restoration: Decodes a sequence of token IDs back into human-readable text."""
    result = ""
    for symbol_id in sequence:
        if symbol_id in _id_to_symbol:
            s = _id_to_symbol[symbol_id]
            # Re-wrap ARPAbet symbols (stored with an "@" prefix) in curly braces
            if len(s) > 1 and s[0] == "@":
                s = "{%s}" % s[1:]
            result += s
    return result.replace("}{", " ")


def _clean_text(text, cleaner_names):
    """Pipeline Orchestration: Runs text through the specified normalization filters."""
    for name in cleaner_names:
        cleaner = getattr(cleaners, name, None)
        if not cleaner:
            raise Exception("Unknown cleaner: %s" % name)
        text = cleaner(text)
    return text


def _symbols_to_sequence(symbols):
    """ID Conversion: Maps a list of symbol tokens to their numeric IDs."""
    return [_symbol_to_id[s] for s in symbols if _should_keep_symbol(s)]


def _arpabet_to_sequence(text):
    """Phonetic Encoding: Converts space-delimited ARPAbet codes into an ID sequence."""
    return _symbols_to_sequence(["@" + s for s in text.split()])


def _should_keep_symbol(s):
    """Filter Logic: Keeps only valid symbols, excluding padding ("_") and EOS ("~")."""
    return s in _symbol_to_id and s not in ("_", "~")
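

# --------------------------------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the synthesis pipeline). It shows the round trip
# text -> token IDs -> text, including an embedded ARPAbet segment in curly braces. Assumptions:
# the "english_cleaners" pipeline exists in synthesizer/utils/cleaners (as in the upstream
# Real-Time Voice Cloning repository), and "@"-prefixed ARPAbet entries are present in the symbol
# set; if they are not, _should_keep_symbol simply drops the ARPAbet tokens from the sequence.
# --------------------------------------------------------------------------------------------------
if __name__ == "__main__":
    sample = "Turn left on {HH W AY1 T} Street."          # "{...}" marks an ARPAbet-encoded word
    ids = text_to_sequence(sample, ["english_cleaners"])  # cleaned text + phonemes -> token IDs
    print(ids)                                            # ends with the ID of the "~" EOS symbol
    print(sequence_to_text(ids))                          # ARPAbet symbols re-wrapped in "{...}"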