# Copyright 2026 Jakub SykaƂa
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Luna Tokenizer
#
# The 9 Features
# --------------------------------------------------------------------------
# 0: syllable_id     - Unique syllable identifier
# 1: onset_id        - Initial consonant cluster (e.g., "str" in "string")
# 2: nucleus_id      - Vowel core (e.g., "i" in "string")
# 3: coda_id         - Final consonants (e.g., "ng" in "string")
# 4: position        - Position in word (0=mid, 1=start, 2=end, 3=both)
# 5: is_capitalized  - Starts with uppercase? (0 or 1)
# 6: token_type      - 0=syllable, 1=number, 2=punctuation, 3=special
# 7: has_space_after - Space follows this token? (0 or 1)
# 8: is_word_end     - Last syllable of word? (0 or 1)
# --------------------------------------------------------------------------

import re
from typing import List, Dict, Tuple, Optional

try:
    import pyphen
    PYPHEN_AVAILABLE = True
except ImportError:
    PYPHEN_AVAILABLE = False
    print("Warning: pyphen not installed. Using basic syllabification.")


class LunaTokenizer:
    """
    Luna Tokenizer - Phonetically-aware tokenization.

    Converts text into 9-dimensional token representations based on
    syllable structure (Onset-Nucleus-Coda) plus metadata features.
    """

    # Segmenter: alphabetic words | single digits | punctuation | whitespace runs.
    # NOTE(review): "[0-9]" tokenizes multi-digit numbers one digit at a time,
    # and "_" matches no alternative (it is \w but not [a-zA-Z]) so underscores
    # are silently dropped -- confirm both are intended.
    WORD_PATTERN = re.compile(r"([a-zA-Z]+|[0-9]|[^\w\s]|\s+)")

    # Non-word token tags: token_type -> (syllable-key tag, onset vocab key).
    # decode() depends on the exact tag spellings/lengths below.
    _SPECIAL_TAGS = {
        1: ('num', '<num>'),
        2: ('punct', '<punct>'),
        3: ('spec', '<spec>'),
    }

    def __init__(self):
        # Initialize pyphen for syllabification (None -> basic fallback)
        if PYPHEN_AVAILABLE:
            self.syllabifier = pyphen.Pyphen(lang='en_US')
        else:
            self.syllabifier = None

        # Vocabularies (built during encoding or loaded from vocab.json).
        # FIX: the reserved entries previously used duplicate '' keys, which
        # collapsed each dict to a single entry; restored the distinct
        # <pad>/<unk>/<num>/<punct>/<spec> special tokens implied by decode().
        self.syllable_to_id: Dict[str, int] = {'<pad>': 0, '<unk>': 1}
        self.id_to_syllable: Dict[int, str] = {0: '<pad>', 1: '<unk>'}

        # Phonetic component vocabularies
        self.onset_to_id: Dict[str, int] = {
            '<pad>': 0, '<unk>': 1, '<num>': 2, '<punct>': 3, '<spec>': 4
        }
        self.nucleus_to_id: Dict[str, int] = {'<pad>': 0, '<unk>': 1}
        self.coda_to_id: Dict[str, int] = {'<pad>': 0, '<unk>': 1}

    def get_feature_names(self) -> List[str]:
        """Return ordered list of feature names (9 features)."""
        return [
            'syllable_id',      # 0
            'onset_id',         # 1
            'nucleus_id',       # 2
            'coda_id',          # 3
            'position',         # 4
            'is_capitalized',   # 5
            'token_type',       # 6
            'has_space_after',  # 7
            'is_word_end',      # 8
        ]

    def _syllabify(self, word: str) -> List[str]:
        """Split word into lowercase syllables."""
        if not word:
            return []
        if self.syllabifier:
            hyphenated = self.syllabifier.inserted(word.lower())
            return hyphenated.split('-') if hyphenated else [word.lower()]
        # Basic fallback: split on vowel boundaries
        return self._basic_syllabify(word.lower())

    def _basic_syllabify(self, word: str) -> List[str]:
        """Basic syllabification fallback (split after V when followed by CV)."""
        vowels = set('aeiouy')
        syllables: List[str] = []
        current = ""
        for i, char in enumerate(word):
            current += char
            if char in vowels:
                # Look ahead - if next char is consonant followed by vowel, split
                if i + 2 < len(word) and word[i + 1] not in vowels and word[i + 2] in vowels:
                    syllables.append(current)
                    current = ""
        if current:
            # Trailing leftovers attach to the last syllable
            if syllables:
                syllables[-1] += current
            else:
                syllables.append(current)
        return syllables if syllables else [word]

    def _extract_onset_nucleus_coda(self, syllable: str) -> Tuple[str, str, str]:
        """
        Extract Onset-Nucleus-Coda from syllable.

        Example: "string" -> onset="str", nucleus="i", coda="ng"
        """
        syllable = syllable.lower()
        vowels = set('aeiouy')

        # Find nucleus (first vowel sequence)
        nucleus_start = -1
        nucleus_end = -1
        for i, char in enumerate(syllable):
            if char in vowels:
                if nucleus_start == -1:
                    nucleus_start = i
                nucleus_end = i + 1
            elif nucleus_start != -1:
                break

        if nucleus_start == -1:
            # No vowel - treat whole thing as onset
            return syllable, '', ''

        onset = syllable[:nucleus_start]
        nucleus = syllable[nucleus_start:nucleus_end]
        coda = syllable[nucleus_end:]
        return onset, nucleus, coda

    def _get_or_add_syllable(self, syllable: str) -> int:
        """Get syllable ID, adding to vocab (both directions) if new."""
        if syllable not in self.syllable_to_id:
            idx = len(self.syllable_to_id)
            self.syllable_to_id[syllable] = idx
            self.id_to_syllable[idx] = syllable
        return self.syllable_to_id[syllable]

    def _get_or_add_onset(self, onset: str) -> int:
        """Get onset ID, adding to vocab if new."""
        if onset not in self.onset_to_id:
            self.onset_to_id[onset] = len(self.onset_to_id)
        return self.onset_to_id[onset]

    def _get_or_add_nucleus(self, nucleus: str) -> int:
        """Get nucleus ID, adding to vocab if new."""
        if nucleus not in self.nucleus_to_id:
            self.nucleus_to_id[nucleus] = len(self.nucleus_to_id)
        return self.nucleus_to_id[nucleus]

    def _get_or_add_coda(self, coda: str) -> int:
        """Get coda ID, adding to vocab if new."""
        if coda not in self.coda_to_id:
            self.coda_to_id[coda] = len(self.coda_to_id)
        return self.coda_to_id[coda]

    def _determine_token_type(self, text: str) -> int:
        """Determine token type: 0=syllable, 1=number, 2=punct, 3=special."""
        if text.isdigit():
            return 1
        elif text in '.,!?;:\'"()-[]{}':
            return 2
        elif text.isalpha():
            return 0
        else:
            return 3

    def _encode_word(self, segment: str, is_cap: int, has_space: int,
                     token_type: int) -> List[Dict]:
        """Encode one alphabetic word as one token dict per syllable."""
        tokens = []
        syllables = self._syllabify(segment)
        n_syls = len(syllables)
        for i, syl in enumerate(syllables):
            # Position encoding: 3=only syllable, 1=first, 2=last, 0=middle
            if n_syls == 1:
                position = 3
            elif i == 0:
                position = 1
            elif i == n_syls - 1:
                position = 2
            else:
                # FIX: was misspelled "postiion", so middle syllables silently
                # kept the position value of the previous iteration.
                position = 0

            # Extract phonetic components and get/create IDs
            onset, nucleus, coda = self._extract_onset_nucleus_coda(syl)
            tokens.append({
                'text': syl,
                'syllable_id': self._get_or_add_syllable(syl.lower()),
                'onset_id': self._get_or_add_onset(onset),
                'nucleus_id': self._get_or_add_nucleus(nucleus),
                'coda_id': self._get_or_add_coda(coda),
                'position': position,
                # Only first syllable inherits capitalization
                'is_capitalized': is_cap if i == 0 else 0,
                'token_type': token_type,
                # Space only after last syllable of word
                'has_space_after': has_space if i == n_syls - 1 else 0,
                'is_word_end': 1 if i == n_syls - 1 else 0,
            })
        return tokens

    def _encode_nonword(self, segment: str, token_type: int, has_space: int) -> Dict:
        """Encode a number/punctuation/special segment as a single token."""
        tag, onset_key = self._SPECIAL_TAGS[token_type]
        # Tagged key keeps non-word tokens distinct from real syllables and
        # lets decode() recover the original text from the vocab entry.
        syl_key = f"<{tag}:{segment}>"
        return {
            'text': segment,
            'syllable_id': self._get_or_add_syllable(syl_key),
            'onset_id': self.onset_to_id[onset_key],
            'nucleus_id': 1,  # <unk>: non-word tokens have no nucleus
            'coda_id': 1,     # <unk>: non-word tokens have no coda
            'position': 3,    # both start and end of its "word"
            'is_capitalized': 0,
            'token_type': token_type,
            'has_space_after': has_space,
            'is_word_end': 1,
        }

    def encode(self, text: str) -> List[Dict]:
        """
        Encode text into list of 9-feature token dictionaries.

        Returns list of dicts with keys matching get_feature_names(),
        plus a 'text' key holding the raw token text.
        """
        if not text:
            return []

        tokens: List[Dict] = []
        segments = self.WORD_PATTERN.findall(text)

        for seg_idx, segment in enumerate(segments):
            # Whitespace is not a token: record it on the previous token
            if segment.isspace():
                if tokens:
                    tokens[-1]['has_space_after'] = 1
                continue

            # Check if next segment is whitespace
            has_space = 0
            if seg_idx + 1 < len(segments) and segments[seg_idx + 1].isspace():
                has_space = 1

            token_type = self._determine_token_type(segment)
            is_cap = 1 if segment and segment[0].isupper() else 0

            if token_type == 0:
                # Regular word -> syllabify
                tokens.extend(self._encode_word(segment, is_cap, has_space, token_type))
            else:
                # Number / punctuation / special character
                tokens.append(self._encode_nonword(segment, token_type, has_space))

        return tokens

    def decode(self, tokens: List[Dict]) -> str:
        """Decode token list back to text."""
        parts = []
        for token in tokens:
            syl_id = token.get('syllable_id', 0)
            syl = self.id_to_syllable.get(syl_id, '')

            # Handle tagged non-word tokens: strip "<tag:" prefix and ">" suffix
            if syl.startswith('<punct:'):
                text = syl[7:-1]
            elif syl.startswith('<num:'):
                text = syl[5:-1]
            elif syl.startswith('<spec:'):
                text = syl[6:-1]
            elif syl in ('<pad>', '<unk>'):
                continue
            else:
                text = syl

            # Apply capitalization
            if token.get('is_capitalized', 0):
                text = text[0].upper() + text[1:] if len(text) > 1 else text.upper()

            # FIX: was parts.appent(text) -> AttributeError on every decode
            parts.append(text)

            # Add space if has_space_after
            if token.get('has_space_after', 0):
                parts.append(' ')

        return ''.join(parts)


# ---------------------------------------------------------------------------
# Little Test
if __name__ == "__main__":
    print("=" * 70)
    print("SyllableLM v4 - Tokenizer Test")
    print("=" * 70)

    tokenizer = LunaTokenizer()
    tests = [
        "Hello World!",
        "The quick brown fox jumps over the lazy dog.",
        "Artificial intelligence is fascinating.",
    ]
    for text in tests:
        print(f"\nInput: '{text}'")
        encoded = tokenizer.encode(text)
        decoded = tokenizer.decode(encoded)
        print(f"Tokens: {len(encoded)}")
        print(f"Decoded: '{decoded}'")
        print(f"Match: {text == decoded}")

    print(f"\nFeatures ({len(tokenizer.get_feature_names())}): {tokenizer.get_feature_names()}")
    print(f"Vocab size: {len(tokenizer.syllable_to_id)}")