|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re |
|
|
from typing import List, Dict, Tuple, Optional |
|
|
|
|
|
# Optional dependency: pyphen provides dictionary-based hyphenation
# (en_US).  When it is missing, syllabification falls back to a simple
# vowel-based heuristic and a one-time warning is printed.
try:
    import pyphen

    # Module-level availability flag for the optional syllabifier.
    PYPHEN_AVAILABLE = True
except ImportError:
    PYPHEN_AVAILABLE = False
    print("Warning: pyphen not installed. Using basic syllabification.")
|
|
|
|
|
class LunaTokenizer:
    """
    Luna Tokenizer - Phonetically-aware tokenization.

    Converts text into 9-dimensional token representations based on
    syllable structure (Onset-Nucleus-Coda) plus metadata features.
    """

    # Segments text into: runs of ASCII letters, single digits, single
    # punctuation/symbol characters, or whitespace runs.
    # NOTE(review): digits are matched one at a time ([0-9], not [0-9]+) --
    # presumably deliberate digit-level tokenization; confirm with callers.
    WORD_PATTERN = re.compile(r"([a-zA-Z]+|[0-9]|[^\w\s]|\s+)")

    def __init__(self):
        # Prefer pyphen's dictionary-based hyphenation; fall back to the
        # heuristic in _basic_syllabify() when pyphen is unavailable.
        # The import is done locally so the class also works standalone,
        # independent of the module-level availability flag.
        try:
            import pyphen
            self.syllabifier = pyphen.Pyphen(lang='en_US')
        except ImportError:
            self.syllabifier = None

        # Syllable vocabulary; grows lazily as new syllables are seen.
        self.syllable_to_id: Dict[str, int] = {'<pad>': 0, '<unk>': 1}
        self.id_to_syllable: Dict[int, str] = {0: '<pad>', 1: '<unk>'}

        # Sub-syllable vocabularies.  The empty component '' maps to 1 in
        # each table; the onset table additionally reserves ids for the
        # non-word token types produced by encode().
        self.onset_to_id: Dict[str, int] = {
            '<pad>': 0, '': 1, '<num>': 2, '<punct>': 3, '<special>': 4
        }
        self.nucleus_to_id: Dict[str, int] = {'<pad>': 0, '': 1}
        self.coda_to_id: Dict[str, int] = {'<pad>': 0, '': 1}

    def get_feature_names(self) -> List[str]:
        """Return ordered list of feature names (9 features)."""
        return [
            'syllable_id',
            'onset_id',
            'nucleus_id',
            'coda_id',
            'position',
            'is_capitalized',
            'token_type',
            'has_space_after',
            'is_word_end',
        ]

    def _syllabify(self, word: str) -> List[str]:
        """Split a word into lowercase syllables."""
        if not word:
            return []

        if self.syllabifier:
            hyphenated = self.syllabifier.inserted(word.lower())
            return hyphenated.split('-') if hyphenated else [word.lower()]
        # No pyphen: use the vowel-group heuristic below.
        return self._basic_syllabify(word.lower())

    def _basic_syllabify(self, word: str) -> List[str]:
        """Basic syllabification fallback.

        Starts a new syllable after a vowel that is followed by a
        consonant-vowel pair (a V|CV split).  Any trailing leftover is
        attached to the last syllable.
        """
        vowels = set('aeiouy')
        syllables = []
        current = ""

        for i, char in enumerate(word):
            current += char
            if char in vowels:
                # Split between V and CV (e.g. "ba|na..." style).
                if i + 2 < len(word) and word[i + 1] not in vowels and word[i + 2] in vowels:
                    syllables.append(current)
                    current = ""

        if current:
            if syllables:
                # Attach trailing consonants/vowels to the last syllable.
                syllables[-1] += current
            else:
                syllables.append(current)
        return syllables if syllables else [word]

    def _extract_onset_nucleus_coda(self, syllable: str) -> Tuple[str, str, str]:
        """
        Extract Onset-Nucleus-Coda from syllable.

        Example: "string" -> onset="str", nucleus="i", coda="ng"
        """
        syllable = syllable.lower()
        vowels = set('aeiouy')

        # The nucleus is the first contiguous run of vowels.
        nucleus_start = -1
        nucleus_end = -1

        for i, char in enumerate(syllable):
            if char in vowels:
                if nucleus_start == -1:
                    nucleus_start = i
                nucleus_end = i + 1
            elif nucleus_start != -1:
                # First consonant after the vowel run ends the nucleus.
                break

        if nucleus_start == -1:
            # No vowel at all: treat the whole syllable as onset.
            return syllable, '', ''

        onset = syllable[:nucleus_start]
        nucleus = syllable[nucleus_start:nucleus_end]
        coda = syllable[nucleus_end:]

        return onset, nucleus, coda

    def _get_or_add_syllable(self, syllable: str) -> int:
        """Get syllable ID, adding to vocab (both directions) if new."""
        if syllable not in self.syllable_to_id:
            idx = len(self.syllable_to_id)
            self.syllable_to_id[syllable] = idx
            self.id_to_syllable[idx] = syllable
        return self.syllable_to_id[syllable]

    def _get_or_add_onset(self, onset: str) -> int:
        """Get onset ID, adding to vocab if new."""
        if onset not in self.onset_to_id:
            self.onset_to_id[onset] = len(self.onset_to_id)
        return self.onset_to_id[onset]

    def _get_or_add_nucleus(self, nucleus: str) -> int:
        """Get nucleus ID, adding to vocab if new."""
        if nucleus not in self.nucleus_to_id:
            self.nucleus_to_id[nucleus] = len(self.nucleus_to_id)
        return self.nucleus_to_id[nucleus]

    def _get_or_add_coda(self, coda: str) -> int:
        """Get coda ID, adding to vocab if new."""
        if coda not in self.coda_to_id:
            self.coda_to_id[coda] = len(self.coda_to_id)
        return self.coda_to_id[coda]

    def _determine_token_type(self, text: str) -> int:
        """Determine token type: 0=syllable, 1=number, 2=punct, 3=special."""
        if text.isdigit():
            return 1
        elif text in '.,!?;:\'"()-[]{}':
            return 2
        elif text.isalpha():
            return 0
        else:
            return 3

    def encode(self, text: str) -> List[Dict]:
        """
        Encode text into list of 9-feature token dictionaries.

        Returns a list of dicts with the keys from get_feature_names()
        plus 'text' (the token's surface form).  Position codes:
        1=first syllable, 0=middle, 2=last, 3=single-syllable token.
        """
        if not text:
            return []

        tokens = []
        segments = self.WORD_PATTERN.findall(text)

        for seg_idx, segment in enumerate(segments):
            # Whitespace is not emitted as a token; it marks the previous
            # token as space-followed instead.
            if segment.isspace():
                if tokens:
                    tokens[-1]['has_space_after'] = 1
                continue

            # Look ahead so the last syllable of this segment can carry
            # the has_space_after flag.
            has_space = 0
            if seg_idx + 1 < len(segments) and segments[seg_idx + 1].isspace():
                has_space = 1

            token_type = self._determine_token_type(segment)
            is_cap = 1 if segment and segment[0].isupper() else 0

            if token_type == 0:
                # Alphabetic word: one token per syllable.
                syllables = self._syllabify(segment)
                n_syls = len(syllables)

                for i, syl in enumerate(syllables):
                    if n_syls == 1:
                        position = 3
                    elif i == 0:
                        position = 1
                    elif i == n_syls - 1:
                        position = 2
                    else:
                        # BUG FIX: was misspelled 'postiion', so middle
                        # syllables silently kept the stale value from the
                        # previous iteration (1 instead of 0).
                        position = 0

                    onset, nucleus, coda = self._extract_onset_nucleus_coda(syl)

                    syl_id = self._get_or_add_syllable(syl.lower())
                    onset_id = self._get_or_add_onset(onset)
                    nucleus_id = self._get_or_add_nucleus(nucleus)
                    coda_id = self._get_or_add_coda(coda)

                    # Capitalization applies only to the first syllable;
                    # the trailing space only to the last one.
                    syl_cap = is_cap if i == 0 else 0
                    syl_space = has_space if i == n_syls - 1 else 0

                    tokens.append({
                        'text': syl,
                        'syllable_id': syl_id,
                        'onset_id': onset_id,
                        'nucleus_id': nucleus_id,
                        'coda_id': coda_id,
                        'position': position,
                        'is_capitalized': syl_cap,
                        'token_type': token_type,
                        'has_space_after': syl_space,
                        'is_word_end': 1 if i == n_syls - 1 else 0,
                    })
            elif token_type == 1:
                # Single digit: stored under a dedicated vocab key so
                # decode() can recover the exact character.
                syl_key = f"<num_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)

                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<num>'],
                    # Digits are never nuclei/codas, so these resolve to
                    # the empty-component id (1).
                    'nucleus_id': self.nucleus_to_id.get(segment, 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })

            elif token_type == 2:
                # Punctuation: dedicated vocab key, recoverable by decode().
                syl_key = f"<punct_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)

                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<punct>'],
                    'nucleus_id': self.nucleus_to_id.get(segment, 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })

            else:
                # Anything else (mixed/unicode/symbol): special token.
                syl_key = f"<char_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)

                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<special>'],
                    'nucleus_id': self.nucleus_to_id.get('', 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })

        return tokens

    def decode(self, tokens: List[Dict]) -> str:
        """Decode token list back to text.

        Inverts encode(): unwraps <num_*>/<punct_*>/<char_*> vocab keys,
        re-applies capitalization, re-inserts spaces, and skips
        <pad>/<unk> tokens.
        """
        parts = []

        for token in tokens:
            syl_id = token.get('syllable_id', 0)
            syl = self.id_to_syllable.get(syl_id, '<unk>')

            # Unwrap special vocab keys back to their surface form.
            if syl.startswith('<punct_') and syl.endswith('>'):
                text = syl[7:-1]
            elif syl.startswith('<num_') and syl.endswith('>'):
                text = syl[5:-1]
            elif syl.startswith('<char_') and syl.endswith('>'):
                text = syl[6:-1]
            elif syl in ('<pad>', '<unk>'):
                continue
            else:
                text = syl

            if token.get('is_capitalized', 0):
                text = text[0].upper() + text[1:] if len(text) > 1 else text.upper()

            # BUG FIX: was parts.appent(text), an AttributeError that made
            # decode() crash on every non-empty input.
            parts.append(text)

            if token.get('has_space_after', 0):
                parts.append(' ')

        return ''.join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Quick manual smoke test: encode a few sentences and verify that
    # decoding reproduces the original text.
    banner = "=" * 70
    print(banner)
    print("SyllableLM v4 - Tokenizer Test")
    print(banner)

    tokenizer = LunaTokenizer()

    samples = [
        "Hello World!",
        "The quick brown fox jumps over the lazy dog.",
        "Artificial intelligence is fascinating.",
    ]

    for sample in samples:
        print(f"\nInput: '{sample}'")
        token_seq = tokenizer.encode(sample)
        round_trip = tokenizer.decode(token_seq)
        print(f"Tokens: {len(token_seq)}")
        print(f"Decoded: '{round_trip}'")
        print(f"Match: {sample == round_trip}")

    feature_names = tokenizer.get_feature_names()
    print(f"\nFeatures ({len(feature_names)}): {feature_names}")
    print(f"Vocab size: {len(tokenizer.syllable_to_id)}")
|
|
|