# Vortex-7b-V1 / tokenizer / vortex_tokenizer.py
# (Hugging Face repo header — "Upload Vortex model", commit bf64b03, verified)
"""
VortexScienceTokenizer: A custom BPE tokenizer optimized for scientific text.
Trains on science corpus and extends vocabulary with domain-specific tokens.
"""
import os
import json
import re
from pathlib import Path
from typing import List, Dict, Optional, Tuple, Union
import torch
try:
from tokenizers import Tokenizer, models, pre_tokenizers, processors, trainers
from tokenizers.normalizers import Lowercase, NFD, StripAccents
except ImportError:
print("Please install tokenizers: pip install tokenizers")
raise
class VortexScienceTokenizer:
    """
    Science-optimized BPE tokenizer with domain extensions.

    Features:
    - Base BPE vocabulary (40,000 tokens) trained on scientific corpus
    - Extended science vocabulary (10,000 tokens) for LaTeX, chemistry, units, etc.
    - Special tokens for equation/citation/molecule spans
    - Domain tags for science areas
    - Digit-level number handling (optional, can be toggled)
    """

    def __init__(
        self,
        config: Dict,
        tokenizer_path: Optional[str] = None,
        vocab_size: int = 50000,
        base_vocab_size: int = 40000,
        extension_vocab_size: int = 10000,
    ):
        """
        Initialize the tokenizer.

        Args:
            config: Model configuration with special tokens
            tokenizer_path: Path to pre-trained tokenizer (if loading)
            vocab_size: Total (target) vocabulary size
            base_vocab_size: Size of base BPE vocabulary
            extension_vocab_size: Size of science extension vocabulary
        """
        self.config = config
        self.base_vocab_size = base_vocab_size
        self.extension_vocab_size = extension_vocab_size
        # Target size only; the authoritative size is the trained vocab
        # (see the `vocab_size` property).
        self._vocab_size = vocab_size
        # Mapping of special-token string -> id as declared in the config.
        self.special_tokens: Dict[str, int] = config.get("special_tokens", {})
        self.domain_tags: List[str] = config.get("domain_tags", [])
        if tokenizer_path and os.path.exists(tokenizer_path):
            self.tokenizer = Tokenizer.from_file(tokenizer_path)
            print(f"Loaded tokenizer from {tokenizer_path}")
        else:
            # Start from an empty BPE model; vocabulary comes from train().
            self.tokenizer = Tokenizer(models.BPE())
            self._setup_pre_tokenizer()
            print("Initialized empty BPE tokenizer")

    def _setup_pre_tokenizer(self):
        """Configure pre-tokenization rules."""
        # Byte-level pre-tokenization is robust to any input byte sequence.
        self.tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)
        # No normalizer: keep original casing, which is significant for
        # science terms (e.g. "pH", element symbols, amino-acid codes).
        self.tokenizer.normalizer = None

    def train(
        self,
        file_paths: List[str],
        min_frequency: int = 2,
        special_tokens: Optional[List[str]] = None,
    ):
        """
        Train the BPE tokenizer on scientific text files.

        Args:
            file_paths: List of text file paths for training
            min_frequency: Minimum token frequency to keep
            special_tokens: Additional special tokens to add; defaults to the
                config's special tokens plus the domain tags
        """
        if special_tokens is None:
            special_tokens = list(self.special_tokens.keys()) + self.domain_tags
        print(f"Training tokenizer on {len(file_paths)} files...")
        print(f"Base vocab size: {self.base_vocab_size}")
        print(f"Special tokens: {special_tokens}")
        trainer = trainers.BpeTrainer(
            vocab_size=self.base_vocab_size,
            min_frequency=min_frequency,
            special_tokens=special_tokens,
            show_progress=True,
        )
        self.tokenizer.train(file_paths, trainer=trainer)
        print(f"Training complete. Vocabulary size: {self.tokenizer.get_vocab_size()}")
        # Extend with science-specific tokens.
        self._extend_science_vocabulary()

    def _extend_science_vocabulary(self):
        """Add science-specific tokens (LaTeX, units, elements, ...) to the vocabulary."""
        new_tokens: List[str] = []
        # LaTeX math symbols (common ones)
        latex_symbols = [
            "\\alpha", "\\beta", "\\gamma", "\\delta", "\\epsilon", "\\zeta",
            "\\eta", "\\theta", "\\iota", "\\kappa", "\\lambda", "\\mu",
            "\\nu", "\\xi", "\\pi", "\\rho", "\\sigma", "\\tau",
            "\\upsilon", "\\phi", "\\chi", "\\psi", "\\omega",
            "\\Gamma", "\\Delta", "\\Theta", "\\Lambda", "\\Xi", "\\Pi",
            "\\Sigma", "\\Phi", "\\Psi", "\\Omega",
            "\\sum", "\\prod", "\\int", "\\partial", "\\nabla", "\\infty",
            "\\leq", "\\geq", "\\neq", "\\approx", "\\equiv", "\\sim",
            "\\in", "\\notin", "\\subset", "\\supset", "\\cup", "\\cap",
            "\\forall", "\\exists", "\\neg", "\\land", "\\lor", "\\rightarrow",
            "\\leftarrow", "\\Rightarrow", "\\Leftarrow", "\\leftrightarrow",
            "\\frac", "\\sqrt", "\\binom", "\\begin", "\\end", "\\mathbf",
            "\\mathcal", "\\mathrm", "\\mathbb", "\\mathfrak",
        ]
        new_tokens.extend(latex_symbols)
        # Greek letters (Unicode)
        greek_letters = [
            "α", "β", "γ", "δ", "ε", "ζ", "η", "θ", "ι", "κ", "λ", "μ",
            "ν", "ξ", "ο", "π", "ρ", "σ", "τ", "υ", "φ", "χ", "ψ", "ω",
            "Γ", "Δ", "Θ", "Λ", "Ξ", "Π", "Σ", "Φ", "Ψ", "Ω",
        ]
        new_tokens.extend(greek_letters)
        # SI units and derived units
        si_units = [
            "m", "kg", "s", "mol", "K", "A", "cd", "mol",
            "Hz", "N", "Pa", "J", "W", "C", "V", "F", "Ω", "S",
            "Wb", "T", "H", "lm", "lx", "Bq", "Gy", "Sv", "kat",
            "eV", "u", "Da", "Å", "°C", "%", "‰",
            "M", "mM", "μM", "nM", "pM",
            "g", "mg", "μg", "ng", "pg",
            "km", "m", "cm", "mm", "μm", "nm", "pm",
            "L", "mL", "μL", "nL",
            "h", "min", "s", "ms", "μs", "ns",
        ]
        new_tokens.extend(si_units)
        # Common scientific abbreviations
        sci_abbrevs = [
            "DNA", "RNA", "mRNA", "tRNA", "rRNA", "cDNA", "gDNA",
            "ATP", "ADP", "AMP", "NAD", "NADP", "FAD", "CoA",
            "pH", "pKa", "pKb", "pI",
            "PCR", "RT", "qPCR", "NGS", "WGS",
            "IC50", "EC50", "KD", "Ki",
            "XRD", "NMR", "IR", "UV", "VIS", "MS", "GC", "HPLC",
            "SEM", "TEM", "AFM", "STM",
            "S/N", "SNR", "RMS", "Std", "Var", "Cov",
            "et al.", "vs.", "cf.", "viz.",
            "Fig", "Eq", "Ref", "Tab", "Suppl",
        ]
        new_tokens.extend(sci_abbrevs)
        # Chemical element symbols
        elements = [
            "H", "He", "Li", "Be", "B", "C", "N", "O", "F", "Ne",
            "Na", "Mg", "Al", "Si", "P", "S", "Cl", "Ar",
            "K", "Ca", "Sc", "Ti", "V", "Cr", "Mn", "Fe", "Co", "Ni", "Cu", "Zn",
            "Ga", "Ge", "As", "Se", "Br", "Kr",
            "Rb", "Sr", "Y", "Zr", "Nb", "Mo", "Tc", "Ru", "Rh", "Pd", "Ag", "Cd",
            "In", "Sn", "Sb", "Te", "I", "Xe",
            "Cs", "Ba", "La", "Ce", "Pr", "Nd", "Pm", "Sm", "Eu", "Gd", "Tb",
            "Dy", "Ho", "Er", "Tm", "Yb", "Lu",
            "Hf", "Ta", "W", "Re", "Os", "Ir", "Pt", "Au", "Hg", "Tl", "Pb",
            "Bi", "Po", "At", "Rn",
            "Fr", "Ra", "Ac", "Th", "Pa", "U", "Np", "Pu", "Am", "Cm", "Bk",
            "Cf", "Es", "Fm", "Md", "No", "Lr",
            "Rf", "Db", "Sg", "Bh", "Hs", "Mt", "Ds", "Rg", "Cn", "Nh",
            "Fl", "Mc", "Lv", "Ts", "Og",
        ]
        new_tokens.extend(elements)
        # Amino acid single-letter codes
        amino_acids = ["A", "R", "N", "D", "C", "Q", "E", "G", "H", "I",
                       "L", "K", "M", "F", "P", "S", "T", "W", "Y", "V"]
        new_tokens.extend(amino_acids)
        # Mathematical operators (Unicode)
        math_ops = [
            "±", "∓", "×", "÷", "∈", "∉", "∋", "∏", "∑", "∧", "∨", "¬",
            "≤", "≥", "≠", "≈", "≡", "≅", "≆", "≇", "≉", "≊", "≋",
            "⊂", "⊃", "⊆", "⊇", "⊄", "⊅", "⊈", "⊉",
            "∞", "∂", "∇", "√", "∛", "∜",
            "∫", "∬", "∭", "∮", "∯", "∰",
            "∴", "∵", "∶", "∷", "∼", "∽", "≈", "≋",
            "⟨", "⟩", "|", "‖", "‵", "′", "″", "‴",
            "•", "·", "‣", "⁂", "※", "‼", "⁇", "⁈",
        ]
        new_tokens.extend(math_ops)
        # De-duplicate (the lists overlap: "m", "s", "mol", element symbols vs
        # amino-acid codes, ...), drop tokens already in the vocabulary, and
        # add the rest in ONE call instead of one Rust-boundary call per token.
        current_vocab = self.tokenizer.get_vocab()
        to_add = [t for t in dict.fromkeys(new_tokens) if t not in current_vocab]
        added = self.tokenizer.add_tokens(to_add) if to_add else 0
        # Report the number actually added (the original message counted the
        # attempted list, which includes duplicates and pre-existing tokens).
        print(f"Extended vocabulary with {added} science tokens")
        print(f"Final vocabulary size: {self.tokenizer.get_vocab_size()}")

    def save(self, path: str):
        """Save tokenizer to disk."""
        self.tokenizer.save(path)
        print(f"Tokenizer saved to {path}")

    def encode(
        self,
        text: str,
        add_special_tokens: bool = True,
        return_tensors: str = "pt",
    ) -> Union[Dict, torch.Tensor]:
        """
        Encode text to token IDs.

        Args:
            text: Input text
            add_special_tokens: Add special tokens (applied by the tokenizer's
                post-processor, if one is configured)
            return_tensors: "pt" for PyTorch tensors (with a leading batch
                dimension), "np" for numpy, anything else for plain lists

        Returns:
            Dictionary with input_ids and attention_mask
        """
        encoding = self.tokenizer.encode(text, add_special_tokens=add_special_tokens)
        result = {
            "input_ids": encoding.ids,
            "attention_mask": encoding.attention_mask,
        }
        if return_tensors == "pt":
            # unsqueeze(0) gives shape (1, seq_len) so the output is
            # model-ready as a batch of one.
            result = {k: torch.tensor(v).unsqueeze(0) for k, v in result.items()}
        elif return_tensors == "np":
            import numpy as np
            result = {k: np.array(v) for k, v in result.items()}
        return result

    def decode(self, token_ids: List[int], skip_special_tokens: bool = True) -> str:
        """Decode token IDs back to text."""
        return self.tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

    def batch_encode(
        self,
        texts: List[str],
        padding: bool = True,
        truncation: bool = True,
        max_length: Optional[int] = None,
        return_tensors: str = "pt",
    ) -> Dict:
        """
        Encode a batch of texts.

        Args:
            texts: List of input texts
            padding: Pad all sequences to the batch's longest length
            truncation: Truncate to max_length
            max_length: Maximum sequence length (defaults to config max_seq_len)
            return_tensors: "pt" for PyTorch tensors, anything else for lists

        Returns:
            Batch encoded dictionary with input_ids and attention_mask
        """
        if max_length is None:
            max_length = self.config.get("max_seq_len", 16384)
        encodings = self.tokenizer.encode_batch(
            texts,
            add_special_tokens=True,
        )
        # Manual truncation.
        input_ids: List[List[int]] = []
        attention_masks: List[List[int]] = []
        for enc in encodings:
            ids = enc.ids
            mask = enc.attention_mask
            if truncation and len(ids) > max_length:
                ids = ids[:max_length]
                mask = mask[:max_length]
            input_ids.append(ids)
            attention_masks.append(mask)
        # Pad to same length if requested.
        if padding:
            # Prefer the trained vocabulary's actual [PAD] id; the id declared
            # in the config can disagree with it if the tokenizer was trained
            # with a different special-token order.
            pad_id = self.tokenizer.token_to_id("[PAD]")
            if pad_id is None:
                pad_id = self.special_tokens["[PAD]"]
            # default=0 keeps an empty batch from raising ValueError on max().
            max_len = max((len(ids) for ids in input_ids), default=0)
            padded_ids = []
            padded_masks = []
            for ids, mask in zip(input_ids, attention_masks):
                pad_len = max_len - len(ids)
                padded_ids.append(ids + [pad_id] * pad_len)
                padded_masks.append(mask + [0] * pad_len)
            input_ids = padded_ids
            attention_masks = padded_masks
        result = {
            "input_ids": input_ids,
            "attention_mask": attention_masks,
        }
        if return_tensors == "pt":
            result = {k: torch.tensor(v) for k, v in result.items()}
        return result

    @property
    def vocab_size(self) -> int:
        """Get vocabulary size."""
        return self.tokenizer.get_vocab_size()

    def get_vocab(self) -> Dict[str, int]:
        """Get vocabulary dictionary."""
        return self.tokenizer.get_vocab()

    def token_to_id(self, token: str) -> Optional[int]:
        """Convert token to ID (None if the token is not in the vocabulary)."""
        return self.tokenizer.token_to_id(token)

    def id_to_token(self, id: int) -> str:
        """Convert ID to token."""
        return self.tokenizer.id_to_token(id)
def build_science_vocabulary_file(output_path: str):
    """
    Build a science vocabulary text file for BPE training.

    This file contains seed vocabulary terms (one per line) to ensure science
    tokens are present: LaTeX commands, chemical formulas, mathematical
    expressions, and unit expressions, in that order.
    """
    # Category name -> seed terms; insertion order fixes the output order.
    categories = {
        # LaTeX commands
        "latex": [
            "\\alpha", "\\beta", "\\gamma", "\\delta", "\\epsilon", "\\zeta",
            "\\eta", "\\theta", "\\iota", "\\kappa", "\\lambda", "\\mu",
            "\\nu", "\\xi", "\\pi", "\\rho", "\\sigma", "\\tau",
            "\\upsilon", "\\phi", "\\chi", "\\psi", "\\omega",
            "\\sum", "\\prod", "\\int", "\\partial", "\\nabla", "\\infty",
            "\\frac", "\\sqrt", "\\binom", "\\begin", "\\end",
            "\\mathbf", "\\mathcal", "\\mathrm", "\\mathbb",
            "\\in", "\\subset", "\\cup", "\\cap", "\\forall", "\\exists",
            "\\rightarrow", "\\leftarrow", "\\Rightarrow", "\\Leftarrow",
            "\\leq", "\\geq", "\\neq", "\\approx", "\\equiv",
        ],
        # Chemical formulas
        "chemistry": [
            "H2O", "CO2", "O2", "N2", "H2", "CH4", "C2H6", "C3H8",
            "C6H12O6", "C12H22O11", "HCl", "H2SO4", "HNO3", "H3PO4",
            "NaOH", "KOH", "CaCO3", "NaCl", "KCl", "MgCl2",
            "Fe2O3", "Fe3O4", "CuO", "Cu2O", "ZnO", "Al2O3",
            "SiO2", "TiO2", "MnO2", "NH3", "NO", "NO2", "N2O",
            "SO2", "SO3", "CO", "CH3COOH", "C2H5OH",
        ],
        # Mathematical expressions
        "math": [
            "x^2", "x^3", "e^x", "ln(x)", "log(x)", "sin(x)", "cos(x)",
            "tan(x)", "arcsin(x)", "arccos(x)", "arctan(x)",
            "f(x)", "g(x)", "h(x)", "F(x)", "G(x)",
            "dx", "dy", "dz", "dt", "∂x", "∂y", "∂z",
            "∫", "∬", "∭", "∮", "∑_{i=1}^{n}", "∏_{i=1}^{n}",
        ],
        # Units with numbers
        "units": [
            "10^6", "10^9", "10^12", "10^15", "10^18",
            "10^-3", "10^-6", "10^-9", "10^-12",
            "m/s", "km/h", "cm/s", "mm/s",
            "J/mol", "kJ/mol", "cal", "kcal",
            "eV", "MeV", "GeV", "TeV",
            "Hz", "kHz", "MHz", "GHz",
            "Pa", "kPa", "MPa", "GPa",
            "°C", "K", "°F",
        ],
    }
    science_terms = [term for terms in categories.values() for term in terms]
    # One term per line, UTF-8 (the seed list contains non-ASCII symbols).
    with open(output_path, "w", encoding="utf-8") as f:
        f.writelines(term + "\n" for term in science_terms)
    print(f"Science vocabulary seed file written to {output_path}")
    print(f"Total seed terms: {len(science_terms)}")
if __name__ == "__main__":
    # Example usage: train and save a tokenizer from the command line.
    import sys

    if len(sys.argv) < 2:
        print("Usage: python vortex_tokenizer.py <train_data.txt> [output_dir]")
        sys.exit(1)

    train_file = sys.argv[1]
    out_dir = sys.argv[2] if len(sys.argv) > 2 else "."

    # Load config (simplified for standalone)
    standalone_config = {
        "special_tokens": {
            "[PAD]": 0, "[UNK]": 1, "[BOS]": 2, "[EOS]": 3,
            "[EQUATION]": 4, "[/EQUATION]": 5,
            "[CITATION]": 6, "[/CITATION]": 7,
            "[MOLECULE]": 8, "[/MOLECULE]": 9,
            "[FIGURE]": 10, "[TABLE]": 11,
            "[MATH]": 12, "[CHEM]": 13, "[BIO]": 14,
            "[PHYS]": 15, "[EARTH]": 16, "[SPACE]": 17, "[ZOO]": 18,
        },
        "domain_tags": ["[MATH]", "[CHEM]", "[BIO]", "[PHYS]", "[EARTH]", "[SPACE]", "[ZOO]"],
        "max_seq_len": 16384,
    }

    # Build seed vocabulary
    seed_vocab_path = os.path.join(out_dir, "science_seed_vocab.txt")
    build_science_vocabulary_file(seed_vocab_path)

    # Initialize and train tokenizer
    science_tokenizer = VortexScienceTokenizer(standalone_config)
    science_tokenizer.train([train_file])

    # Save tokenizer
    tokenizer_path = os.path.join(out_dir, "vortex_tokenizer.json")
    science_tokenizer.save(tokenizer_path)
    print(f"Tokenizer saved to {tokenizer_path}")