# Copyright 2025 Genta Pramillean Bayu (@gbyuvd)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import json
import os
from typing import List, Union, Optional, Tuple
from transformers.tokenization_utils_base import BatchEncoding
from functools import lru_cache

class TrieNode:
__slots__ = ['children', 'token_id']
def __init__(self):
self.children = {}
self.token_id = None # If set, this node completes a valid token
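
# A rough sketch of the shared-prefix layout this trie gives (vocab entries are hypothetical):
# inserting "C" and "Cl" yields root -> 'C' (holding the ID of "C") -> 'l' (holding the ID of "Cl"),
# so the longest-match scan in the tokenizers below can walk past "C" and still fall back to it
# when the longer token does not complete.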
class FastChemTokenizer:
def __init__(self, token_to_id, model_max_length=512):
self.token_to_id = token_to_id
self.id_to_token = {v: k for k, v in token_to_id.items()}
# No more self.token_set — replaced by trie
self.model_max_length = model_max_length
# Precompute max token length for possible use & clarity
self.max_token_len = max(len(t) for t in token_to_id.keys())
# Build trie for fast longest-match lookup
self.trie_root = self._build_trie(token_to_id)
# Validate required special tokens
required_special_tokens = ["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
for tok in required_special_tokens:
if tok not in token_to_id:
raise KeyError(f"Required special token '{tok}' not found in vocab.")
# Special token IDs
self.bos_token_id = token_to_id["<s>"]
self.eos_token_id = token_to_id["</s>"]
self.pad_token_id = token_to_id["<pad>"]
self.unk_token_id = token_to_id["<unk>"]
self.mask_token_id = token_to_id["<mask>"]
# Special tokens for convenience
self.bos_token = "<s>"
self.eos_token = "</s>"
self.pad_token = "<pad>"
self.unk_token = "<unk>"
self.mask_token = "<mask>"
def _build_trie(self, token_to_id):
root = TrieNode()
for token, tid in token_to_id.items():
node = root
for char in token:
if char not in node.children:
node.children[char] = TrieNode()
node = node.children[char]
node.token_id = tid
return root
def __len__(self):
"""Return vocab size — REQUIRED for HF compatibility."""
return len(self.token_to_id)
    def __call__(self, text: Union[str, List[str]], text_pair: Optional[Union[str, List[str]]] = None, **kwargs) -> BatchEncoding:
        if isinstance(text, list):
            if text_pair is not None:
                # Pair texts element-wise; items whose pair is None stay as single texts
                batch = [(t, p) if p is not None else t for t, p in zip(text, text_pair)]
            else:
                batch = text
            return self.batch_encode_plus(batch, **kwargs)
        else:
            return self.encode_plus(text=text, text_pair=text_pair, **kwargs)
@lru_cache(maxsize=10000)
def _cached_encode_str(self, s: str) -> Tuple[int, ...]:
return tuple(self._encode_core(s))
    def _encode_core(self, text: str) -> List[int]:
        """Core encoding logic using the trie (greedy longest match); no caching."""
        result_ids = []
        i = 0
        n = len(text)
        while i < n:
            node = self.trie_root
            j = i
            last_match_id = None
            last_match_end = i
            # Walk the trie as far as the characters allow, remembering the last complete token
            while j < n and text[j] in node.children:
                node = node.children[text[j]]
                j += 1
                if node.token_id is not None:
                    last_match_id = node.token_id
                    last_match_end = j  # end of the longest valid token seen so far
            if last_match_id is not None:
                result_ids.append(last_match_id)
                i = last_match_end
            else:
                # Fallback: emit the single character, or <unk> if it is not in the vocab
                result_ids.append(self.token_to_id.get(text[i], self.unk_token_id))
                i += 1
        return result_ids
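    # Greedy longest-match sketch (hypothetical vocab): if "C", "Cl", and "Br" are all vocab
    # entries, "ClBr" encodes to the IDs of ["Cl", "Br"] rather than per-character IDs,
    # because the loop above keeps extending the match while a longer entry is still reachable.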
def encode(self, text: str) -> List[int]:
"""Public encode method — strips input and uses cache."""
return list(self._cached_encode_str(text.strip()))
def decode(self, token_ids: Union[List[int], torch.Tensor], skip_special_tokens: bool = False) -> str:
if isinstance(token_ids, torch.Tensor):
token_ids = token_ids.tolist()
if skip_special_tokens:
special_ids = {
self.bos_token_id,
self.eos_token_id,
self.pad_token_id,
self.mask_token_id,
}
else:
special_ids = set()
tokens = []
for tid in token_ids:
if tid in special_ids:
continue
token = self.id_to_token.get(tid, self.unk_token)
tokens.append(token)
return "".join(tokens)
def decode_with_trace(self, token_ids: List[int]) -> None:
print(f"\n Decoding {len(token_ids)} tokens:")
for i, tid in enumerate(token_ids):
token = self.id_to_token.get(tid, self.unk_token)
print(f" [{i:03d}] ID={tid:5d} → '{token}'")
def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
return [self.id_to_token.get(i, self.unk_token) for i in ids]
def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
return [self.token_to_id.get(t, self.unk_token_id) for t in tokens]
def encode_plus(
self,
text: str,
text_pair: Optional[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str] = False,
truncation: bool = False,
max_length: Optional[int] = None,
return_tensors: Optional[str] = None,
return_attention_mask: bool = True,
return_token_type_ids: bool = True,
) -> BatchEncoding:
if max_length is None:
max_length = self.model_max_length
ids_a = self.encode(text)
if text_pair is not None:
ids_b = self.encode(text_pair)
else:
ids_b = None
input_ids = []
token_type_ids = []
if add_special_tokens:
input_ids.append(self.bos_token_id)
token_type_ids.append(0)
if ids_b is not None:
input_ids.extend(ids_a)
token_type_ids.extend([0] * len(ids_a))
input_ids.append(self.eos_token_id)
token_type_ids.append(0)
input_ids.extend(ids_b)
token_type_ids.extend([1] * len(ids_b))
input_ids.append(self.eos_token_id)
token_type_ids.append(1)
else:
input_ids.extend(ids_a)
token_type_ids.extend([0] * len(ids_a))
input_ids.append(self.eos_token_id)
token_type_ids.append(0)
else:
input_ids = ids_a
token_type_ids = [0] * len(input_ids)
if ids_b is not None:
input_ids.extend(ids_b)
token_type_ids.extend([1] * len(ids_b))
if truncation and len(input_ids) > max_length:
input_ids = input_ids[:max_length]
token_type_ids = token_type_ids[:max_length]
if padding:
pad_len = max_length - len(input_ids)
if pad_len > 0:
input_ids.extend([self.pad_token_id] * pad_len)
token_type_ids.extend([0] * pad_len)
attention_mask = [1 if tid != self.pad_token_id else 0 for tid in input_ids]
encoded_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
if return_token_type_ids:
encoded_dict["token_type_ids"] = token_type_ids
if return_tensors == "pt":
output = {}
for k, v in encoded_dict.items():
                tensor = torch.tensor(v, dtype=torch.long)  # copy the Python list into a fresh LongTensor
if tensor.ndim == 1:
tensor = tensor.unsqueeze(0)
output[k] = tensor
else:
output = encoded_dict
return BatchEncoding(output, tensor_type=return_tensors)
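    # Hedged usage sketch (token IDs depend entirely on the loaded vocab.json):
    #   enc = tok.encode_plus("CCO", return_tensors="pt")
    #   enc["input_ids"]       -> tensor of shape (1, seq_len), wrapped as <s> ... </s>
    #   enc["attention_mask"]  -> 1 for real tokens, 0 for <pad> positions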
def batch_encode_plus(
self,
batch_text_or_text_pairs: List[Union[str, Tuple[str, str]]],
**kwargs
) -> BatchEncoding:
all_input_ids = []
all_attention_masks = []
all_token_type_ids = []
for item in batch_text_or_text_pairs:
if isinstance(item, tuple):
text, text_pair = item
else:
text, text_pair = item, None
encoded = self.encode_plus(
text=text,
text_pair=text_pair,
**kwargs
)
all_input_ids.append(encoded["input_ids"])
all_attention_masks.append(encoded["attention_mask"])
if "token_type_ids" in encoded:
all_token_type_ids.append(encoded["token_type_ids"])
batched = {
"input_ids": all_input_ids,
"attention_mask": all_attention_masks,
}
if all_token_type_ids:
batched["token_type_ids"] = all_token_type_ids
        if kwargs.get("return_tensors") == "pt":
            def to_tensor_list(lst):
                # encode_plus may already have produced (1, seq_len) tensors; flatten each
                # item to a 1-D LongTensor so pad_sequence can pad variable-length sequences
                return [item.clone().detach().view(-1) if isinstance(item, torch.Tensor)
                        else torch.tensor(item, dtype=torch.long) for item in lst]
            batched = {
                k: torch.nn.utils.rnn.pad_sequence(
                    to_tensor_list(v),
                    batch_first=True,
                    padding_value=self.pad_token_id if k == "input_ids" else 0
                )
                for k, v in batched.items()
            }
return BatchEncoding(batched, tensor_type=kwargs.get("return_tensors"))
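    # Hedged usage sketch: tok.batch_encode_plus(["CCO", "c1ccccc1"], return_tensors="pt")
    # pads input_ids to the longest sequence in the batch with <pad> and zeroes the
    # attention_mask over those padded positions.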
# Save vocab to directory
def save_pretrained(self, save_directory: str):
"""
Save tokenizer vocab as `vocab.json` in target directory.
Mimics Hugging Face convention.
"""
if not os.path.exists(save_directory):
os.makedirs(save_directory)
vocab_file = os.path.join(save_directory, "vocab.json")
# Keys are strings, values are ints — JSON-safe
with open(vocab_file, "w", encoding="utf-8") as f:
json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)
print(f" Tokenizer vocab saved to: {vocab_file}")
# Load from pretrained directory
@classmethod
def from_pretrained(cls, pretrained_directory: str, model_max_length=512):
"""
Load tokenizer from directory containing `vocab.json`.
"""
vocab_file = os.path.join(pretrained_directory, "vocab.json")
if not os.path.exists(vocab_file):
raise FileNotFoundError(f"Vocab file not found: {vocab_file}")
with open(vocab_file, "r", encoding="utf-8") as f:
token_to_id = json.load(f)
# Convert keys to str (JSON loads as str anyway), values to int
token_to_id = {str(k): int(v) for k, v in token_to_id.items()}
return cls(token_to_id=token_to_id, model_max_length=model_max_length)
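
# Hedged round-trip sketch for FastChemTokenizer (the directory name is an assumption):
#   tok = FastChemTokenizer.from_pretrained("./smiles_tokenizer")
#   ids = tok.encode("CCO")          # greedy longest-match IDs
#   smi = tok.decode(ids)            # tokens are concatenated back into the original string
#   tok.save_pretrained("./smiles_tokenizer_copy")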
class FastChemTokenizerSelfies:
def __init__(self, token_to_id, model_max_length=512):
self.token_to_id = token_to_id
self.id_to_token = {v: k for k, v in token_to_id.items()}
# No more self.token_set — replaced by trie
self.model_max_length = model_max_length
# Precompute max token length for possible use & clarity
self.max_token_len = max(len(t) for t in token_to_id.keys())
# Build trie for fast longest-match lookup
self.trie_root = self._build_trie(token_to_id)
# Validate required special tokens
required_special_tokens = ["<s>", "</s>", "<pad>", "<unk>", "<mask>"]
for tok in required_special_tokens:
if tok not in token_to_id:
raise KeyError(f"Required special token '{tok}' not found in vocab.")
# Special token IDs
self.bos_token_id = token_to_id["<s>"]
self.eos_token_id = token_to_id["</s>"]
self.pad_token_id = token_to_id["<pad>"]
self.unk_token_id = token_to_id["<unk>"]
self.mask_token_id = token_to_id["<mask>"]
# Special tokens for convenience
self.bos_token = "<s>"
self.eos_token = "</s>"
self.pad_token = "<pad>"
self.unk_token = "<unk>"
self.mask_token = "<mask>"
def _build_trie(self, token_to_id):
root = TrieNode()
for token, tid in token_to_id.items():
node = root
for char in token:
if char not in node.children:
node.children[char] = TrieNode()
node = node.children[char]
node.token_id = tid
return root
def __len__(self):
"""Return vocab size — REQUIRED for HF compatibility."""
return len(self.token_to_id)
    def __call__(self, text: Union[str, List[str]], text_pair: Optional[Union[str, List[str]]] = None, **kwargs) -> BatchEncoding:
        if isinstance(text, list):
            if text_pair is not None:
                # Pair texts element-wise; items whose pair is None stay as single texts
                batch = [(t, p) if p is not None else t for t, p in zip(text, text_pair)]
            else:
                batch = text
            return self.batch_encode_plus(batch, **kwargs)
        else:
            return self.encode_plus(text=text, text_pair=text_pair, **kwargs)
@lru_cache(maxsize=10000)
def _cached_encode_str(self, s: str) -> Tuple[int, ...]:
return tuple(self._encode_core(s))
def _encode_core(self, text: str) -> List[int]:
"""Core encoding logic using Trie — skips whitespace if not part of a token."""
result_ids = []
i = 0
n = len(text)
while i < n:
if text[i].isspace(): # ← Skip whitespace unless part of a token
i += 1
continue
node = self.trie_root
j = i
last_match_id = None
last_match_end = i
# Traverse trie while characters match
while j < n and text[j] in node.children:
node = node.children[text[j]]
j += 1
if node.token_id is not None:
last_match_id = node.token_id
last_match_end = j
if last_match_id is not None:
result_ids.append(last_match_id)
i = last_match_end
else:
# Fallback: encode single char
result_ids.append(self.token_to_id.get(text[i], self.unk_token_id))
i += 1
return result_ids
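    # Sketch (hypothetical SELFIES-style vocab): "[C] [O]" and "[C][O]" both encode to the IDs
    # of "[C]" and "[O]", since whitespace between tokens is skipped by the scan above.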
def encode(self, text: str) -> List[int]:
"""Public encode method — strips input and uses cache."""
return list(self._cached_encode_str(text.strip()))
def decode(self, token_ids: Union[List[int], torch.Tensor], skip_special_tokens: bool = False) -> str:
if isinstance(token_ids, torch.Tensor):
token_ids = token_ids.tolist()
if skip_special_tokens:
special_ids = {
self.bos_token_id,
self.eos_token_id,
self.pad_token_id,
self.mask_token_id,
}
else:
special_ids = set()
tokens = []
for tid in token_ids:
if tid in special_ids:
continue
token = self.id_to_token.get(tid, self.unk_token)
tokens.append(token)
# Join with SPACE between tokens — this reconstructs original format
return " ".join(tokens)
def decode_with_trace(self, token_ids: List[int]) -> None:
print(f"\n Decoding {len(token_ids)} tokens:")
for i, tid in enumerate(token_ids):
token = self.id_to_token.get(tid, self.unk_token)
print(f" [{i:03d}] ID={tid:5d} → '{token}'")
def convert_ids_to_tokens(self, ids: List[int]) -> List[str]:
return [self.id_to_token.get(i, self.unk_token) for i in ids]
def convert_tokens_to_ids(self, tokens: List[str]) -> List[int]:
return [self.token_to_id.get(t, self.unk_token_id) for t in tokens]
def encode_plus(
self,
text: str,
text_pair: Optional[str] = None,
add_special_tokens: bool = True,
padding: Union[bool, str] = False,
truncation: bool = False,
max_length: Optional[int] = None,
return_tensors: Optional[str] = None,
return_attention_mask: bool = True,
return_token_type_ids: bool = True,
) -> BatchEncoding:
if max_length is None:
max_length = self.model_max_length
ids_a = self.encode(text)
if text_pair is not None:
ids_b = self.encode(text_pair)
else:
ids_b = None
input_ids = []
token_type_ids = []
if add_special_tokens:
input_ids.append(self.bos_token_id)
token_type_ids.append(0)
if ids_b is not None:
input_ids.extend(ids_a)
token_type_ids.extend([0] * len(ids_a))
input_ids.append(self.eos_token_id)
token_type_ids.append(0)
input_ids.extend(ids_b)
token_type_ids.extend([1] * len(ids_b))
input_ids.append(self.eos_token_id)
token_type_ids.append(1)
else:
input_ids.extend(ids_a)
token_type_ids.extend([0] * len(ids_a))
input_ids.append(self.eos_token_id)
token_type_ids.append(0)
else:
input_ids = ids_a
token_type_ids = [0] * len(input_ids)
if ids_b is not None:
input_ids.extend(ids_b)
token_type_ids.extend([1] * len(ids_b))
if truncation and len(input_ids) > max_length:
input_ids = input_ids[:max_length]
token_type_ids = token_type_ids[:max_length]
if padding:
pad_len = max_length - len(input_ids)
if pad_len > 0:
input_ids.extend([self.pad_token_id] * pad_len)
token_type_ids.extend([0] * pad_len)
attention_mask = [1 if tid != self.pad_token_id else 0 for tid in input_ids]
encoded_dict = {
"input_ids": input_ids,
"attention_mask": attention_mask,
}
if return_token_type_ids:
encoded_dict["token_type_ids"] = token_type_ids
if return_tensors == "pt":
output = {}
for k, v in encoded_dict.items():
                tensor = torch.tensor(v, dtype=torch.long)  # copy the Python list into a fresh LongTensor
if tensor.ndim == 1:
tensor = tensor.unsqueeze(0)
output[k] = tensor
else:
output = encoded_dict
return BatchEncoding(output, tensor_type=return_tensors)
def batch_encode_plus(
self,
batch_text_or_text_pairs: List[Union[str, Tuple[str, str]]],
**kwargs
) -> BatchEncoding:
all_input_ids = []
all_attention_masks = []
all_token_type_ids = []
for item in batch_text_or_text_pairs:
if isinstance(item, tuple):
text, text_pair = item
else:
text, text_pair = item, None
encoded = self.encode_plus(
text=text,
text_pair=text_pair,
**kwargs
)
all_input_ids.append(encoded["input_ids"])
all_attention_masks.append(encoded["attention_mask"])
if "token_type_ids" in encoded:
all_token_type_ids.append(encoded["token_type_ids"])
batched = {
"input_ids": all_input_ids,
"attention_mask": all_attention_masks,
}
if all_token_type_ids:
batched["token_type_ids"] = all_token_type_ids
        if kwargs.get("return_tensors") == "pt":
            def to_tensor_list(lst):
                # encode_plus may already have produced (1, seq_len) tensors; flatten each
                # item to a 1-D LongTensor so pad_sequence can pad variable-length sequences
                return [item.clone().detach().view(-1) if isinstance(item, torch.Tensor)
                        else torch.tensor(item, dtype=torch.long) for item in lst]
            batched = {
                k: torch.nn.utils.rnn.pad_sequence(
                    to_tensor_list(v),
                    batch_first=True,
                    padding_value=self.pad_token_id if k == "input_ids" else 0
                )
                for k, v in batched.items()
            }
return BatchEncoding(batched, tensor_type=kwargs.get("return_tensors"))
# Save vocab to directory
def save_pretrained(self, save_directory: str):
"""
Save tokenizer vocab as `vocab.json` in target directory.
Mimics Hugging Face convention.
"""
if not os.path.exists(save_directory):
os.makedirs(save_directory)
vocab_file = os.path.join(save_directory, "vocab.json")
# Keys are strings, values are ints — JSON-safe
with open(vocab_file, "w", encoding="utf-8") as f:
json.dump(self.token_to_id, f, ensure_ascii=False, indent=2)
print(f"Tokenizer vocab saved to: {vocab_file}")
# Load from pretrained directory
@classmethod
def from_pretrained(cls, pretrained_directory: str, model_max_length=512):
"""
Load tokenizer from directory containing `vocab.json`.
"""
vocab_file = os.path.join(pretrained_directory, "vocab.json")
if not os.path.exists(vocab_file):
raise FileNotFoundError(f"Vocab file not found: {vocab_file}")
with open(vocab_file, "r", encoding="utf-8") as f:
token_to_id = json.load(f)
# Convert keys to str (JSON loads as str anyway), values to int
token_to_id = {str(k): int(v) for k, v in token_to_id.items()}
return cls(token_to_id=token_to_id, model_max_length=model_max_length)
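

# A minimal self-check sketch. The toy vocab below is an assumption made purely for
# illustration; real checkpoints ship their own vocab.json with different entries and IDs.
if __name__ == "__main__":
    toy_vocab = {
        "<s>": 0, "</s>": 1, "<pad>": 2, "<unk>": 3, "<mask>": 4,
        "C": 5, "Cl": 6, "Br": 7, "O": 8, "(": 9, ")": 10, "=": 11,
    }
    tok = FastChemTokenizer(toy_vocab, model_max_length=16)

    ids = tok.encode("ClC(=O)Br")
    print(ids)              # [6, 5, 9, 11, 8, 10, 7] with this toy vocab: "Cl" and "Br" stay whole
    print(tok.decode(ids))  # "ClC(=O)Br"

    batch = tok.batch_encode_plus(["CCO", "ClBr"], return_tensors="pt")
    print(batch["input_ids"].shape)  # (2, longest_sequence_in_batch)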