File size: 13,276 Bytes

9c737ff

# Copyright 2026 Jakub Sykała
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#                           Luna Tokenizer
#
# The 9 Features
#--------------------------------------------------------------------------
# 0: syllable_id     - Unique syllable identifier
# 1: onset_id        - Initial consonant cluster (e.g., "str" in "string")
# 2: nucleus_id      - Vowel core (e.g., "i" in "string")
# 3: coda_id         - Final consonants (e.g., "ng" in "string")
# 4: position        - Position in word (0=mid, 1=start, 2=end, 3=both)
# 5: is_capitalized  - Starts with uppercase? (0 or 1)
# 6: token_type      - 0=syllable, 1=number, 2=punctuation, 3=special
# 7: has_space_after - Space follows this token? (0 or 1)
# 8: is_word_end     - Last syllable of word? (0 or 1)
#--------------------------------------------------------------------------

import re
from typing import List, Dict, Tuple, Optional

try:
    import pyphen
    PYPHEN_AVAILABLE = True
except ImportError:
    PYPHEN_AVAILABLE = False
    print("Warning: pyphen not installed. Using basic syllabification.")

class LunaTokenizer:
    """
    Luna Tokenizer - Phonetically-aware tokenization.

    Converts text into 9-dimensional token representations based on
    syllable structure (Onset-Nucleus-Coda) plus metadata features.
    """

    # Regex patterns
    WORD_PATTERN = re.compile(r"([a-zA-Z]+|[0-9]|[^\w\s]|\s+)")

    def __init__(self):
        # Initialize pyphen for syllabification
        if PYPHEN_AVAILABLE:
            self.syllabifier = pyphen.Pyphen(lang='en_US')
        else:
            self.syllabifier = None

        # Vocabularies (built during encoding or loaded from vocab.json)
        self.syllable_to_id: Dict[str, int] = {'<pad>': 0, '<unk>': 1}
        self.id_to_syllable: Dict[int, str] = {0: '<pad>', 1:'<unk>'}

        # Phonetic component vocabularies
        self.onset_to_id: Dict[str, int] = {
            '<pad>':0, '': 1, '<num>': 2, '<punct>': 3, '<special>': 4
        }
        self.nucleus_to_id: Dict[str, int] = {'<pad>': 0, '': 1}
        self.coda_to_id: Dict[str, int] = {'<pad>': 0, '': 1}

    def get_feature_names(self) -> List[str]:
        """Return ordered list of feature names (9 features)"""
        return [
            'syllable_id',      # 0
            'onset_id',         # 1
            'nucleus_id',       # 2
            'coda_id',          # 3
            'position',         # 4
            'is_capitalized',   # 5
            'token_type',       # 6
            'has_space_after',  # 7
            'is_word_end',      # 8
        ]
    
    def _syllabify(self, word:str) -> List[str]:
        """Split word into syllables."""
        if not word:
            return []
        
        if self.syllabifier:
            hyphenated = self.syllabifier.inserted(word.lower())
            return hyphenated.split('-') if hyphenated else [word.lower()]
        else:
            # Basic fallback: Split on vowel boundries
            return self._basic_syllabify(word.lower())
        
    def _basic_syllabify(self, word: str) -> List[str]:
        """Basic syllabification fallback."""
        vowels = set('aeiouy')
        syllables = []
        current = ""

        for i, char in enumerate(word):
            current += char
            if char in vowels:
                # Look ahead -  if next char is consonant followed by vowel, split
                if i + 2 < len(word) and word[i+1] not in vowels and word[i+2] in vowels:
                    syllables.append(current)
                    current = ""
        
        if current:
            if syllables:
                syllables[-1] += current
            else:
                syllables.append(current)
        return syllables if syllables else [word]
    
    def _extract_onset_nucleus_coda(self, syllable: str) -> Tuple[str, str, str]:
        """
        Extract Onset-Nucles-Coda from syllable.

        Example: "string" -> onset="str", nucleus="i", coda="ng"
        """

        syllable = syllable.lower()
        vowels = set('aeiouy')

        # Find nucleus (first vowel sequence)
        nucleus_start = -1
        nucleus_end = -1

        for i, char in enumerate(syllable):
            if char in vowels:
                if nucleus_start == -1:
                    nucleus_start = i
                nucleus_end = i + 1
            elif nucleus_start != -1:
                break

        if nucleus_start == -1:
            # No vowel - treat whole thing as onset
            return syllable, '', ''
        
        onset = syllable[:nucleus_start]
        nucleus = syllable[nucleus_start:nucleus_end]
        coda = syllable[nucleus_end:]

        return onset, nucleus, coda
    
    def _get_or_add_syllable(self, syllable:str) -> int:
        """Get syllable ID, adding to vocab if new."""
        if syllable not in self.syllable_to_id:
            idx = len(self.syllable_to_id)
            self.syllable_to_id[syllable] = idx
            self.id_to_syllable[idx] = syllable
        return self.syllable_to_id[syllable]
    
    def _get_or_add_onset(self, onset: str) -> int:
        if onset not in self.onset_to_id:
            self.onset_to_id[onset] = len(self.onset_to_id)
        return self.onset_to_id[onset]
    
    def _get_or_add_nucleus(self, nucleus: str) -> int:
        if nucleus not in self.nucleus_to_id:
            self.nucleus_to_id[nucleus] = len(self.nucleus_to_id)
        return self.nucleus_to_id[nucleus]
    
    def _get_or_add_coda(self, coda: str) -> int:
        if coda not in self.coda_to_id:
            self.coda_to_id[coda] = len(self.coda_to_id)
        return self.coda_to_id[coda]
    
    def _determine_token_type(self, text: str) -> int:
        """Determine token type: 0=syllable, 1=number, 2=punct, 3=special."""
        if text.isdigit():
            return 1
        elif text in '.,!?;:\'"()-[]{}':
            return 2
        elif text.isalpha():
            return 0
        else:
            return 3
        
    def encode(self, text: str) -> List[Dict]:
        """
        Encode text into list of 9-feature token dictionaries.

        Return list of dicts with keys matching get_feature_names().
        """
        if not text:
            return[]
        
        tokens = []
        segments = self.WORD_PATTERN.findall(text)

        for seg_idx, segment in enumerate(segments):
            # Skip whitespace - encode as has_space_after on previous token
            if segment.isspace():
                if tokens:
                    tokens[-1]['has_space_after'] = 1
                    continue

            # Check if next segment is whitespace
            has_space = 0
            if seg_idx + 1 < len(segments) and segments [seg_idx + 1].isspace():
                has_space = 1

            # Determine token type
            token_type = self._determine_token_type(segment)
            is_cap = 1 if segment and segment[0].isupper() else 0

            if token_type == 0:     # Regular word -> syllabify
                syllables = self._syllabify(segment)
                n_syls = len(syllables)

                for i, syl in enumerate(syllables):
                    # Position encoding
                    if n_syls == 1:
                        position = 3 # both start and end
                    elif i == 0:
                        position = 1 # start
                    elif i == n_syls -1:
                        position = 2 # end
                    else:
                        postiion = 0 # middle

                    # Extract phonetic components
                    onset, nucleus, coda = self._extract_onset_nucleus_coda(syl)

                    # Get/create IDs
                    syl_id = self._get_or_add_syllable(syl.lower())
                    onset_id = self._get_or_add_onset(onset)
                    nucleus_id = self._get_or_add_nucleus(nucleus)
                    coda_id = self._get_or_add_coda(coda)

                    # Only first syllable inherits capitalization
                    syl_cap = is_cap if i == 0 else 0

                    # Space only after last syllable of word
                    syl_space = has_space if i == n_syls - 1 else 0

                    tokens.append({
                        'text': syl,
                        'syllable_id': syl_id,
                        'onset_id': onset_id,
                        'nucleus_id': nucleus_id,
                        'coda_id': coda_id,
                        'position': position,
                        'is_capitalized': syl_cap,
                        'token_type': token_type,
                        'has_space_after': syl_space,
                        'is_word_end': 1 if i == n_syls - 1 else 0,
                    })
            elif token_type == 1: # Number
                syl_key = f"<num_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)

                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<num>'],
                    'nucleus_id': self.nucleus_to_id.get(segment, 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })

            elif token_type == 2:   # Punctuation
                syl_key = f"<punct_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)

                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<punct>'],
                    'nucleus_id': self.nucleus_to_id.get(segment, 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })

            else:  # Special characters
                syl_key = f"<char_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)
                
                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<special>'],
                    'nucleus_id': self.nucleus_to_id.get('', 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })

        return tokens
    def decode(self, tokens: List[Dict]) -> str:
        """Decode token list back to text."""
        parts = []

        for token in tokens:
            syl_id = token.get('syllable_id', 0)
            syl = self.id_to_syllable.get(syl_id, '<unk>')

            # Handle special tokens
            if syl.startswith('<punct_') and syl.endswith('>'):
                text = syl[7:-1]
            elif syl.startswith('<num_') and syl.endswith('>'):
                text = syl[5:-1]
            elif syl.startswith('<char_') and syl.endswith('>'):
                text = syl[6:-1]
            elif syl in ('<pad>', '<unk>'):
                continue
            else:
                text = syl
                # Apply capitalization
                if token.get('is_capitalized', 0):
                    text = text[0].upper() + text[1:] if len(text) > 1 else text.upper()

            parts.appent(text)

            # Add space if has_space_after
            if token.get('has_space_after', 0):
                parts.append(' ')

        return ''.join(parts)
    
#-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Little Test

if __name__ == "__main__":
    print("=" * 70)
    print("SyllableLM v4 - Tokenizer Test")
    print("=" * 70)
    
    tokenizer = LunaTokenizer()
    
    tests = [
        "Hello World!",
        "The quick brown fox jumps over the lazy dog.",
        "Artificial intelligence is fascinating.",
    ]
    
    for text in tests:
        print(f"\nInput: '{text}'")
        encoded = tokenizer.encode(text)
        decoded = tokenizer.decode(encoded)
        print(f"Tokens: {len(encoded)}")
        print(f"Decoded: '{decoded}'")
        print(f"Match: {text == decoded}")
    
    print(f"\nFeatures ({len(tokenizer.get_feature_names())}): {tokenizer.get_feature_names()}")
    print(f"Vocab size: {len(tokenizer.syllable_to_id)}")