# miniembed-product / src / tokenizer.py
# Synced from GitHub Actions (commit c792dcb, verified) — surazbhandari
"""
Simple Word-Level Tokenizer
==============================
A basic tokenizer for demonstration purposes.
Converts text to token IDs with special tokens.
"""
import re
import json
import torch
from typing import Dict, List, Optional
from collections import Counter
from tqdm import tqdm
class SimpleTokenizer:
    """
    A simple word-level tokenizer with special tokens.

    Special Tokens:
        - [PAD]: Padding token (id=0)
        - [UNK]: Unknown token (id=1)
        - [CLS]: Classification token (id=2)
        - [SEP]: Separator token (id=3)
    """

    def __init__(self, vocab_size: int = 30000):
        """
        Args:
            vocab_size: Maximum vocabulary size, including the 4 special tokens.
        """
        self.vocab_size = vocab_size

        # Special tokens occupy the first four IDs.
        self.special_tokens = {
            '[PAD]': 0,
            '[UNK]': 1,
            '[CLS]': 2,
            '[SEP]': 3,
        }

        # Word <-> ID mappings, seeded with the special tokens.
        self.word_to_id: Dict[str, int] = dict(self.special_tokens)
        self.id_to_word: Dict[int, str] = {v: k for k, v in self.special_tokens.items()}

        # Convenience aliases for the special-token IDs.
        self.pad_token_id = 0
        self.unk_token_id = 1
        self.cls_token_id = 2
        self.sep_token_id = 3

    def _tokenize(self, text: str) -> List[str]:
        """
        Split text into tokens (simple word-level tokenization).

        Args:
            text: Input text string

        Returns:
            List of tokens (lowercased words and individual punctuation marks)
        """
        # Lowercase and basic cleaning
        text = text.lower().strip()
        # \b\w+\b matches word runs; [^\w\s] picks up each punctuation char on its own.
        return re.findall(r'\b\w+\b|[^\w\s]', text)

    def build_vocab(self, texts: List[str], min_freq: int = 2,
                    show_progress: bool = True):
        """
        Build vocabulary from a list of texts.

        Args:
            texts: List of text strings
            min_freq: Minimum frequency for a word to be included
            show_progress: Display a tqdm progress bar while counting
                (set False for quiet/library use; default preserves the
                original behavior).
        """
        # Count word frequencies across the corpus.
        word_counts = Counter()
        iterable = tqdm(texts, desc="Building vocabulary") if show_progress else texts
        for text in iterable:
            word_counts.update(self._tokenize(text))

        # Add words by descending frequency. Stop as soon as the frequency
        # drops below min_freq (the list is sorted, so nothing later qualifies)
        # or the vocabulary is full — capping against vocab_size directly keeps
        # the invariant len(self) <= vocab_size even across repeated calls.
        for word, count in word_counts.most_common():
            if count < min_freq or len(self.word_to_id) >= self.vocab_size:
                break
            if word not in self.word_to_id:
                idx = len(self.word_to_id)
                self.word_to_id[word] = idx
                self.id_to_word[idx] = word

        print(f"Vocabulary size: {len(self.word_to_id)}")

    def encode(self, text: str, max_length: int = 128) -> Dict:
        """
        Encode text into fixed-length ID and attention-mask tensors.

        The sequence is wrapped as [CLS] tokens... [SEP], truncated to fit,
        then right-padded with [PAD] to exactly max_length.

        Args:
            text: Input text string
            max_length: Fixed output length; must be >= 2 to hold [CLS]/[SEP]

        Returns:
            Dict with 'input_ids' and 'attention_mask', both LongTensors of
            shape (max_length,).

        Raises:
            ValueError: If max_length < 2 (would otherwise slice negatively
                and emit sequences longer than max_length).
        """
        if max_length < 2:
            raise ValueError("max_length must be >= 2 to fit [CLS] and [SEP]")

        tokens = self._tokenize(text)

        # [CLS] + token IDs (unknown words map to [UNK]) + [SEP].
        token_ids = [self.cls_token_id]
        for token in tokens[:max_length - 2]:  # reserve space for CLS and SEP
            token_ids.append(self.word_to_id.get(token, self.unk_token_id))
        token_ids.append(self.sep_token_id)

        # 1 for real tokens, 0 for padding.
        attention_mask = [1] * len(token_ids)

        # Right-pad up to max_length.
        padding_length = max_length - len(token_ids)
        token_ids.extend([self.pad_token_id] * padding_length)
        attention_mask.extend([0] * padding_length)

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
        }

    def decode(self, token_ids: List[int]) -> str:
        """
        Decode token IDs back to text.

        [PAD], [CLS], and [SEP] are dropped; unknown IDs render as '[UNK]'.

        Args:
            token_ids: List of token IDs

        Returns:
            Decoded text string (tokens joined with single spaces)
        """
        skip = (self.pad_token_id, self.cls_token_id, self.sep_token_id)
        tokens = [
            self.id_to_word.get(idx, '[UNK]')
            for idx in token_ids
            if idx not in skip
        ]
        return ' '.join(tokens)

    def save(self, path: str):
        """Save tokenizer vocabulary to a JSON file (UTF-8, non-ASCII kept)."""
        data = {
            'vocab_size': self.vocab_size,
            'word_to_id': self.word_to_id,
        }
        # Explicit encoding + ensure_ascii=False so non-ASCII vocabulary
        # round-trips readably regardless of platform default encoding.
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def load(self, path: str):
        """Load tokenizer vocabulary from a JSON file written by save()."""
        with open(path, 'r', encoding='utf-8') as f:
            data = json.load(f)
        self.vocab_size = data['vocab_size']
        self.word_to_id = data['word_to_id']
        # JSON object keys are strings, so rebuild the reverse map with int IDs.
        self.id_to_word = {int(v): k for k, v in self.word_to_id.items()}

    def __len__(self) -> int:
        """Current vocabulary size (including special tokens)."""
        return len(self.word_to_id)