# Luna-150M / tokenizer.py
# Uploaded by JMSykala (commit 9c737ff, "Upload 9 files")
# Copyright 2026 Jakub Sykała
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Luna Tokenizer
#
# The 9 Features
#--------------------------------------------------------------------------
# 0: syllable_id - Unique syllable identifier
# 1: onset_id - Initial consonant cluster (e.g., "str" in "string")
# 2: nucleus_id - Vowel core (e.g., "i" in "string")
# 3: coda_id - Final consonants (e.g., "ng" in "string")
# 4: position - Position in word (0=mid, 1=start, 2=end, 3=both)
# 5: is_capitalized - Starts with uppercase? (0 or 1)
# 6: token_type - 0=syllable, 1=number, 2=punctuation, 3=special
# 7: has_space_after - Space follows this token? (0 or 1)
# 8: is_word_end - Last syllable of word? (0 or 1)
#--------------------------------------------------------------------------
import re
from typing import List, Dict, Tuple, Optional
try:
import pyphen
PYPHEN_AVAILABLE = True
except ImportError:
PYPHEN_AVAILABLE = False
print("Warning: pyphen not installed. Using basic syllabification.")
class LunaTokenizer:
    """
    Luna Tokenizer - Phonetically-aware tokenization.

    Converts text into 9-dimensional token representations based on
    syllable structure (Onset-Nucleus-Coda) plus metadata features.
    Vocabularies grow on the fly during :meth:`encode` (or may be
    preloaded from vocab.json); see the module header for the meaning
    of each of the 9 feature slots.
    """

    # Segments text into: alphabetic words, single digits,
    # punctuation/symbol characters, or whitespace runs.
    WORD_PATTERN = re.compile(r"([a-zA-Z]+|[0-9]|[^\w\s]|\s+)")

    def __init__(self):
        # Initialize pyphen for syllabification; None selects the basic
        # vowel-boundary fallback in _syllabify().
        if PYPHEN_AVAILABLE:
            self.syllabifier = pyphen.Pyphen(lang='en_US')
        else:
            self.syllabifier = None
        # Vocabularies (built during encoding or loaded from vocab.json)
        self.syllable_to_id: Dict[str, int] = {'<pad>': 0, '<unk>': 1}
        self.id_to_syllable: Dict[int, str] = {0: '<pad>', 1: '<unk>'}
        # Phonetic component vocabularies
        self.onset_to_id: Dict[str, int] = {
            '<pad>': 0, '': 1, '<num>': 2, '<punct>': 3, '<special>': 4
        }
        self.nucleus_to_id: Dict[str, int] = {'<pad>': 0, '': 1}
        self.coda_to_id: Dict[str, int] = {'<pad>': 0, '': 1}

    def get_feature_names(self) -> List[str]:
        """Return the ordered list of feature names (9 features)."""
        return [
            'syllable_id',      # 0
            'onset_id',         # 1
            'nucleus_id',       # 2
            'coda_id',          # 3
            'position',         # 4
            'is_capitalized',   # 5
            'token_type',       # 6
            'has_space_after',  # 7
            'is_word_end',      # 8
        ]

    def _syllabify(self, word: str) -> List[str]:
        """Split a word into lowercase syllables."""
        if not word:
            return []
        if self.syllabifier:
            hyphenated = self.syllabifier.inserted(word.lower())
            return hyphenated.split('-') if hyphenated else [word.lower()]
        else:
            # Basic fallback: split on vowel boundaries
            return self._basic_syllabify(word.lower())

    def _basic_syllabify(self, word: str) -> List[str]:
        """Basic syllabification fallback (used when pyphen is missing).

        Splits after a vowel that is followed by a consonant + vowel;
        any trailing remainder is merged into the last syllable so no
        syllable is left without a vowel.
        """
        vowels = set('aeiouy')
        syllables = []
        current = ""
        for i, char in enumerate(word):
            current += char
            if char in vowels:
                # Look ahead - if next char is consonant followed by vowel, split
                if i + 2 < len(word) and word[i + 1] not in vowels and word[i + 2] in vowels:
                    syllables.append(current)
                    current = ""
        if current:
            if syllables:
                syllables[-1] += current
            else:
                syllables.append(current)
        return syllables if syllables else [word]

    def _extract_onset_nucleus_coda(self, syllable: str) -> Tuple[str, str, str]:
        """
        Extract Onset-Nucleus-Coda from a syllable.

        Example: "string" -> onset="str", nucleus="i", coda="ng"
        """
        syllable = syllable.lower()
        vowels = set('aeiouy')
        # Find nucleus (first contiguous vowel sequence)
        nucleus_start = -1
        nucleus_end = -1
        for i, char in enumerate(syllable):
            if char in vowels:
                if nucleus_start == -1:
                    nucleus_start = i
                nucleus_end = i + 1
            elif nucleus_start != -1:
                break
        if nucleus_start == -1:
            # No vowel - treat the whole syllable as onset
            return syllable, '', ''
        onset = syllable[:nucleus_start]
        nucleus = syllable[nucleus_start:nucleus_end]
        coda = syllable[nucleus_end:]
        return onset, nucleus, coda

    def _get_or_add_syllable(self, syllable: str) -> int:
        """Get syllable ID, adding it to the vocab if new."""
        if syllable not in self.syllable_to_id:
            idx = len(self.syllable_to_id)
            self.syllable_to_id[syllable] = idx
            self.id_to_syllable[idx] = syllable
        return self.syllable_to_id[syllable]

    def _get_or_add_onset(self, onset: str) -> int:
        """Get onset ID, adding it to the vocab if new."""
        if onset not in self.onset_to_id:
            self.onset_to_id[onset] = len(self.onset_to_id)
        return self.onset_to_id[onset]

    def _get_or_add_nucleus(self, nucleus: str) -> int:
        """Get nucleus ID, adding it to the vocab if new."""
        if nucleus not in self.nucleus_to_id:
            self.nucleus_to_id[nucleus] = len(self.nucleus_to_id)
        return self.nucleus_to_id[nucleus]

    def _get_or_add_coda(self, coda: str) -> int:
        """Get coda ID, adding it to the vocab if new."""
        if coda not in self.coda_to_id:
            self.coda_to_id[coda] = len(self.coda_to_id)
        return self.coda_to_id[coda]

    def _determine_token_type(self, text: str) -> int:
        """Determine token type: 0=syllable, 1=number, 2=punct, 3=special."""
        if text.isdigit():
            return 1
        elif text in '.,!?;:\'"()-[]{}':
            return 2
        elif text.isalpha():
            return 0
        else:
            return 3

    def encode(self, text: str) -> List[Dict]:
        """
        Encode text into a list of 9-feature token dictionaries.

        Returns a list of dicts with keys matching get_feature_names()
        plus a 'text' key holding the surface form of the token.
        """
        if not text:
            return []
        tokens = []
        segments = self.WORD_PATTERN.findall(text)
        for seg_idx, segment in enumerate(segments):
            # Skip whitespace - encode as has_space_after on previous token
            if segment.isspace():
                if tokens:
                    tokens[-1]['has_space_after'] = 1
                continue
            # Check if the next segment is whitespace
            has_space = 0
            if seg_idx + 1 < len(segments) and segments[seg_idx + 1].isspace():
                has_space = 1
            # Determine token type
            token_type = self._determine_token_type(segment)
            is_cap = 1 if segment and segment[0].isupper() else 0
            if token_type == 0:  # Regular word -> syllabify
                syllables = self._syllabify(segment)
                n_syls = len(syllables)
                for i, syl in enumerate(syllables):
                    # Position encoding (see module header for codes)
                    if n_syls == 1:
                        position = 3  # both start and end
                    elif i == 0:
                        position = 1  # start
                    elif i == n_syls - 1:
                        position = 2  # end
                    else:
                        # BUG FIX: was misspelled "postiion", leaving middle
                        # syllables with the previous iteration's position.
                        position = 0  # middle
                    # Extract phonetic components
                    onset, nucleus, coda = self._extract_onset_nucleus_coda(syl)
                    # Get/create IDs
                    syl_id = self._get_or_add_syllable(syl.lower())
                    onset_id = self._get_or_add_onset(onset)
                    nucleus_id = self._get_or_add_nucleus(nucleus)
                    coda_id = self._get_or_add_coda(coda)
                    # Only first syllable inherits capitalization
                    syl_cap = is_cap if i == 0 else 0
                    # Space only after last syllable of word
                    syl_space = has_space if i == n_syls - 1 else 0
                    tokens.append({
                        'text': syl,
                        'syllable_id': syl_id,
                        'onset_id': onset_id,
                        'nucleus_id': nucleus_id,
                        'coda_id': coda_id,
                        'position': position,
                        'is_capitalized': syl_cap,
                        'token_type': token_type,
                        'has_space_after': syl_space,
                        'is_word_end': 1 if i == n_syls - 1 else 0,
                    })
            elif token_type == 1:  # Number (single digit per WORD_PATTERN)
                syl_key = f"<num_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)
                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<num>'],
                    'nucleus_id': self.nucleus_to_id.get(segment, 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })
            elif token_type == 2:  # Punctuation
                syl_key = f"<punct_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)
                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<punct>'],
                    'nucleus_id': self.nucleus_to_id.get(segment, 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })
            else:  # Special characters
                syl_key = f"<char_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)
                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<special>'],
                    'nucleus_id': self.nucleus_to_id.get('', 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })
        return tokens

    def decode(self, tokens: List[Dict]) -> str:
        """Decode a token list back to text."""
        parts = []
        for token in tokens:
            syl_id = token.get('syllable_id', 0)
            syl = self.id_to_syllable.get(syl_id, '<unk>')
            # Handle special tokens: strip the "<kind_...>" wrapper
            if syl.startswith('<punct_') and syl.endswith('>'):
                text = syl[7:-1]
            elif syl.startswith('<num_') and syl.endswith('>'):
                text = syl[5:-1]
            elif syl.startswith('<char_') and syl.endswith('>'):
                text = syl[6:-1]
            elif syl in ('<pad>', '<unk>'):
                continue
            else:
                text = syl
            # Apply capitalization
            if token.get('is_capitalized', 0):
                text = text[0].upper() + text[1:] if len(text) > 1 else text.upper()
            # BUG FIX: was "parts.appent(text)", which raised AttributeError
            # and made decode() unusable.
            parts.append(text)
            # Add space if has_space_after
            if token.get('has_space_after', 0):
                parts.append(' ')
        return ''.join(parts)
#-=-=-=-=-=-=-=--=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
# Little Test
if __name__ == "__main__":
    # Smoke test: encode/decode a few sentences and report round-trip fidelity.
    banner = "=" * 70
    print(banner)
    print("SyllableLM v4 - Tokenizer Test")
    print(banner)

    tok = LunaTokenizer()
    samples = (
        "Hello World!",
        "The quick brown fox jumps over the lazy dog.",
        "Artificial intelligence is fascinating.",
    )
    for sample in samples:
        print(f"\nInput: '{sample}'")
        token_stream = tok.encode(sample)
        restored = tok.decode(token_stream)
        print(f"Tokens: {len(token_stream)}")
        print(f"Decoded: '{restored}'")
        print(f"Match: {sample == restored}")

    feature_names = tok.get_feature_names()
    print(f"\nFeatures ({len(feature_names)}): {feature_names}")
    print(f"Vocab size: {len(tok.syllable_to_id)}")