|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import re |
|
|
from typing import List, Dict, Tuple, Optional |
|
|
|
|
|
# Optional dependency: pyphen provides dictionary-based hyphenation
# (en_US).  When it is missing, syllabification falls back to a simple
# vowel-based heuristic and a one-time warning is printed.
try:
    import pyphen

    # Module-level availability flag for the optional syllabifier.
    PYPHEN_AVAILABLE = True
except ImportError:
    PYPHEN_AVAILABLE = False
    print("Warning: pyphen not installed. Using basic syllabification.")
|
|
|
|
|
class LunaTokenizer:
    """
    Luna Tokenizer - Phonetically-aware tokenization.

    Converts text into 9-dimensional token representations based on
    syllable structure (Onset-Nucleus-Coda) plus metadata features.
    """

    # Segments text into: runs of ASCII letters, single digits, single
    # punctuation/symbol characters, or whitespace runs.
    # NOTE(review): digits are matched one at a time ([0-9], not [0-9]+) --
    # presumably deliberate digit-level tokenization; confirm with callers.
    WORD_PATTERN = re.compile(r"([a-zA-Z]+|[0-9]|[^\w\s]|\s+)")

    def __init__(self):
        # Prefer pyphen's dictionary-based hyphenation; fall back to the
        # heuristic in _basic_syllabify() when pyphen is unavailable.
        # The import is done locally so the class also works standalone,
        # independent of the module-level availability flag.
        try:
            import pyphen
            self.syllabifier = pyphen.Pyphen(lang='en_US')
        except ImportError:
            self.syllabifier = None

        # Syllable vocabulary; grows lazily as new syllables are seen.
        self.syllable_to_id: Dict[str, int] = {'<pad>': 0, '<unk>': 1}
        self.id_to_syllable: Dict[int, str] = {0: '<pad>', 1: '<unk>'}

        # Sub-syllable vocabularies.  The empty component '' maps to 1 in
        # each table; the onset table additionally reserves ids for the
        # non-word token types produced by encode().
        self.onset_to_id: Dict[str, int] = {
            '<pad>': 0, '': 1, '<num>': 2, '<punct>': 3, '<special>': 4
        }
        self.nucleus_to_id: Dict[str, int] = {'<pad>': 0, '': 1}
        self.coda_to_id: Dict[str, int] = {'<pad>': 0, '': 1}

    def get_feature_names(self) -> List[str]:
        """Return ordered list of feature names (9 features)."""
        return [
            'syllable_id',
            'onset_id',
            'nucleus_id',
            'coda_id',
            'position',
            'is_capitalized',
            'token_type',
            'has_space_after',
            'is_word_end',
        ]

    def _syllabify(self, word: str) -> List[str]:
        """Split a word into lowercase syllables."""
        if not word:
            return []

        if self.syllabifier:
            hyphenated = self.syllabifier.inserted(word.lower())
            return hyphenated.split('-') if hyphenated else [word.lower()]
        # No pyphen: use the vowel-group heuristic below.
        return self._basic_syllabify(word.lower())

    def _basic_syllabify(self, word: str) -> List[str]:
        """Basic syllabification fallback.

        Starts a new syllable after a vowel that is followed by a
        consonant-vowel pair (a V|CV split).  Any trailing leftover is
        attached to the last syllable.
        """
        vowels = set('aeiouy')
        syllables = []
        current = ""

        for i, char in enumerate(word):
            current += char
            if char in vowels:
                # Split between V and CV (e.g. "ba|na..." style).
                if i + 2 < len(word) and word[i + 1] not in vowels and word[i + 2] in vowels:
                    syllables.append(current)
                    current = ""

        if current:
            if syllables:
                # Attach trailing consonants/vowels to the last syllable.
                syllables[-1] += current
            else:
                syllables.append(current)
        return syllables if syllables else [word]

    def _extract_onset_nucleus_coda(self, syllable: str) -> Tuple[str, str, str]:
        """
        Extract Onset-Nucleus-Coda from syllable.

        Example: "string" -> onset="str", nucleus="i", coda="ng"
        """
        syllable = syllable.lower()
        vowels = set('aeiouy')

        # The nucleus is the first contiguous run of vowels.
        nucleus_start = -1
        nucleus_end = -1

        for i, char in enumerate(syllable):
            if char in vowels:
                if nucleus_start == -1:
                    nucleus_start = i
                nucleus_end = i + 1
            elif nucleus_start != -1:
                # First consonant after the vowel run ends the nucleus.
                break

        if nucleus_start == -1:
            # No vowel at all: treat the whole syllable as onset.
            return syllable, '', ''

        onset = syllable[:nucleus_start]
        nucleus = syllable[nucleus_start:nucleus_end]
        coda = syllable[nucleus_end:]

        return onset, nucleus, coda

    def _get_or_add_syllable(self, syllable: str) -> int:
        """Get syllable ID, adding to vocab (both directions) if new."""
        if syllable not in self.syllable_to_id:
            idx = len(self.syllable_to_id)
            self.syllable_to_id[syllable] = idx
            self.id_to_syllable[idx] = syllable
        return self.syllable_to_id[syllable]

    def _get_or_add_onset(self, onset: str) -> int:
        """Get onset ID, adding to vocab if new."""
        if onset not in self.onset_to_id:
            self.onset_to_id[onset] = len(self.onset_to_id)
        return self.onset_to_id[onset]

    def _get_or_add_nucleus(self, nucleus: str) -> int:
        """Get nucleus ID, adding to vocab if new."""
        if nucleus not in self.nucleus_to_id:
            self.nucleus_to_id[nucleus] = len(self.nucleus_to_id)
        return self.nucleus_to_id[nucleus]

    def _get_or_add_coda(self, coda: str) -> int:
        """Get coda ID, adding to vocab if new."""
        if coda not in self.coda_to_id:
            self.coda_to_id[coda] = len(self.coda_to_id)
        return self.coda_to_id[coda]

    def _determine_token_type(self, text: str) -> int:
        """Determine token type: 0=syllable, 1=number, 2=punct, 3=special."""
        if text.isdigit():
            return 1
        elif text in '.,!?;:\'"()-[]{}':
            return 2
        elif text.isalpha():
            return 0
        else:
            return 3

    def encode(self, text: str) -> List[Dict]:
        """
        Encode text into list of 9-feature token dictionaries.

        Returns a list of dicts with the keys from get_feature_names()
        plus 'text' (the token's surface form).  Position codes:
        1=first syllable, 0=middle, 2=last, 3=single-syllable token.
        """
        if not text:
            return []

        tokens = []
        segments = self.WORD_PATTERN.findall(text)

        for seg_idx, segment in enumerate(segments):
            # Whitespace is not emitted as a token; it marks the previous
            # token as space-followed instead.
            if segment.isspace():
                if tokens:
                    tokens[-1]['has_space_after'] = 1
                continue

            # Look ahead so the last syllable of this segment can carry
            # the has_space_after flag.
            has_space = 0
            if seg_idx + 1 < len(segments) and segments[seg_idx + 1].isspace():
                has_space = 1

            token_type = self._determine_token_type(segment)
            is_cap = 1 if segment and segment[0].isupper() else 0

            if token_type == 0:
                # Alphabetic word: one token per syllable.
                syllables = self._syllabify(segment)
                n_syls = len(syllables)

                for i, syl in enumerate(syllables):
                    if n_syls == 1:
                        position = 3
                    elif i == 0:
                        position = 1
                    elif i == n_syls - 1:
                        position = 2
                    else:
                        # BUG FIX: was misspelled 'postiion', so middle
                        # syllables silently kept the stale value from the
                        # previous iteration (1 instead of 0).
                        position = 0

                    onset, nucleus, coda = self._extract_onset_nucleus_coda(syl)

                    syl_id = self._get_or_add_syllable(syl.lower())
                    onset_id = self._get_or_add_onset(onset)
                    nucleus_id = self._get_or_add_nucleus(nucleus)
                    coda_id = self._get_or_add_coda(coda)

                    # Capitalization applies only to the first syllable;
                    # the trailing space only to the last one.
                    syl_cap = is_cap if i == 0 else 0
                    syl_space = has_space if i == n_syls - 1 else 0

                    tokens.append({
                        'text': syl,
                        'syllable_id': syl_id,
                        'onset_id': onset_id,
                        'nucleus_id': nucleus_id,
                        'coda_id': coda_id,
                        'position': position,
                        'is_capitalized': syl_cap,
                        'token_type': token_type,
                        'has_space_after': syl_space,
                        'is_word_end': 1 if i == n_syls - 1 else 0,
                    })
            elif token_type == 1:
                # Single digit: stored under a dedicated vocab key so
                # decode() can recover the exact character.
                syl_key = f"<num_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)

                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<num>'],
                    # Digits are never nuclei/codas, so these resolve to
                    # the empty-component id (1).
                    'nucleus_id': self.nucleus_to_id.get(segment, 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })

            elif token_type == 2:
                # Punctuation: dedicated vocab key, recoverable by decode().
                syl_key = f"<punct_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)

                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<punct>'],
                    'nucleus_id': self.nucleus_to_id.get(segment, 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })

            else:
                # Anything else (mixed/unicode/symbol): special token.
                syl_key = f"<char_{segment}>"
                syl_id = self._get_or_add_syllable(syl_key)

                tokens.append({
                    'text': segment,
                    'syllable_id': syl_id,
                    'onset_id': self.onset_to_id['<special>'],
                    'nucleus_id': self.nucleus_to_id.get('', 1),
                    'coda_id': self.coda_to_id.get('', 1),
                    'position': 3,
                    'is_capitalized': 0,
                    'token_type': token_type,
                    'has_space_after': has_space,
                    'is_word_end': 1,
                })

        return tokens

    def decode(self, tokens: List[Dict]) -> str:
        """Decode token list back to text.

        Inverts encode(): unwraps <num_*>/<punct_*>/<char_*> vocab keys,
        re-applies capitalization, re-inserts spaces, and skips
        <pad>/<unk> tokens.
        """
        parts = []

        for token in tokens:
            syl_id = token.get('syllable_id', 0)
            syl = self.id_to_syllable.get(syl_id, '<unk>')

            # Unwrap special vocab keys back to their surface form.
            if syl.startswith('<punct_') and syl.endswith('>'):
                text = syl[7:-1]
            elif syl.startswith('<num_') and syl.endswith('>'):
                text = syl[5:-1]
            elif syl.startswith('<char_') and syl.endswith('>'):
                text = syl[6:-1]
            elif syl in ('<pad>', '<unk>'):
                continue
            else:
                text = syl

            if token.get('is_capitalized', 0):
                text = text[0].upper() + text[1:] if len(text) > 1 else text.upper()

            # BUG FIX: was parts.appent(text), an AttributeError that made
            # decode() crash on every non-empty input.
            parts.append(text)

            if token.get('has_space_after', 0):
                parts.append(' ')

        return ''.join(parts)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Quick manual smoke test: encode a few sentences and verify that
    # decoding reproduces the original text.
    banner = "=" * 70
    print(banner)
    print("SyllableLM v4 - Tokenizer Test")
    print(banner)

    tokenizer = LunaTokenizer()

    samples = [
        "Hello World!",
        "The quick brown fox jumps over the lazy dog.",
        "Artificial intelligence is fascinating.",
    ]

    for sample in samples:
        print(f"\nInput: '{sample}'")
        token_seq = tokenizer.encode(sample)
        round_trip = tokenizer.decode(token_seq)
        print(f"Tokens: {len(token_seq)}")
        print(f"Decoded: '{round_trip}'")
        print(f"Match: {sample == round_trip}")

    feature_names = tokenizer.get_feature_names()
    print(f"\nFeatures ({len(feature_names)}): {feature_names}")
    print(f"Vocab size: {len(tokenizer.syllable_to_id)}")
|
|
|