# Ahadhassan-2003
# deploy: update HF Space
# dc4e6da
"""
Character-level tokenizer for handwriting generation.
Supports special tokens and can be saved/loaded for inference.
"""
import json
import os
from typing import List, Dict, Optional
import numpy as np
class CharTokenizer:
    """Character-level tokenizer with special tokens.

    Each distinct character is mapped to an integer id. Four special tokens
    (padding, unknown, start-of-sequence, end-of-sequence) occupy ids 0-3 in
    the default vocabulary. The mapping can be extended from data with
    ``build_vocab`` and persisted as JSON with ``save`` / ``load``.
    """

    # Special tokens
    PAD_TOKEN = "<PAD>"
    UNK_TOKEN = "<UNK>"
    SOS_TOKEN = "<SOS>"
    EOS_TOKEN = "<EOS>"

    def __init__(
        self,
        vocab: Optional[Dict[str, int]] = None,
        max_length: int = 128
    ):
        """
        Initialize tokenizer.

        Args:
            vocab: Character to index mapping. If None, the vocabulary starts
                with the four special tokens only and can be grown with
                ``build_vocab``.
            max_length: Maximum sequence length for padding/truncation.
        """
        self.max_length = max_length
        if vocab is None:
            # Initialize with special tokens only
            self.char_to_idx = {
                self.PAD_TOKEN: 0,
                self.UNK_TOKEN: 1,
                self.SOS_TOKEN: 2,
                self.EOS_TOKEN: 3,
            }
        else:
            # Copy so that a later build_vocab() never mutates the caller's dict.
            self.char_to_idx = dict(vocab)
        self._refresh_reverse_mapping()

    def _refresh_reverse_mapping(self) -> None:
        """Rebuild ``idx_to_char`` and ``vocab_size`` from ``char_to_idx``."""
        self.idx_to_char = {idx: char for char, idx in self.char_to_idx.items()}
        self.vocab_size = len(self.char_to_idx)

    def build_vocab(self, texts: List[str]) -> None:
        """
        Build vocabulary from list of texts.

        Characters already present in the vocabulary keep their ids; new
        characters are appended in sorted order.

        Args:
            texts: List of text strings to build vocabulary from.
        """
        # Collect all unique characters
        seen = set()
        for text in texts:
            seen.update(text)

        # Sort for deterministic ordering
        unique_chars = sorted(seen)

        # Continue after the highest id in use. For a contiguous vocabulary
        # this equals len(char_to_idx); for a vocabulary with gaps it avoids
        # handing out an id that is already taken.
        next_idx = max(self.char_to_idx.values(), default=-1) + 1
        for char in unique_chars:
            if char not in self.char_to_idx:
                self.char_to_idx[char] = next_idx
                next_idx += 1

        # Update reverse mapping
        self._refresh_reverse_mapping()
        print(f"Built vocabulary with {self.vocab_size} characters")
        print(f"Sample characters: {unique_chars[:20]}")

    def encode(
        self,
        text: str,
        add_special_tokens: bool = True,
        padding: bool = True,
        truncation: bool = True,
        return_attention_mask: bool = True
    ) -> Dict[str, np.ndarray]:
        """
        Encode text to token indices.

        Args:
            text: Input text string.
            add_special_tokens: Whether to add SOS/EOS tokens.
            padding: Whether to pad to max_length.
            truncation: Whether to truncate to max_length.
            return_attention_mask: Whether to return attention mask.

        Returns:
            Dictionary with 'input_ids' (int64 array) and optionally
            'attention_mask' (float32 array, 1 for real tokens, 0 for padding).

        Raises:
            KeyError: If a required special token is missing from the vocabulary.
        """
        unk_id = self.char_to_idx[self.UNK_TOKEN]

        # Convert characters to indices; unseen characters map to UNK
        token_ids = [self.char_to_idx.get(char, unk_id) for char in text]
        if add_special_tokens:
            token_ids.insert(0, self.char_to_idx[self.SOS_TOKEN])
            token_ids.append(self.char_to_idx[self.EOS_TOKEN])

        # Truncation
        if truncation and len(token_ids) > self.max_length:
            token_ids = token_ids[:self.max_length]
            # Keep the sequence terminated: the last surviving slot becomes EOS.
            # Guard against max_length == 0, where nothing survives.
            if add_special_tokens and token_ids:
                token_ids[-1] = self.char_to_idx[self.EOS_TOKEN]

        # Create attention mask (1 for real tokens, 0 for padding)
        attention_mask = [1] * len(token_ids)

        # Padding
        if padding and len(token_ids) < self.max_length:
            padding_length = self.max_length - len(token_ids)
            token_ids.extend([self.char_to_idx[self.PAD_TOKEN]] * padding_length)
            attention_mask.extend([0] * padding_length)

        result = {
            'input_ids': np.array(token_ids, dtype=np.int64)
        }
        if return_attention_mask:
            result['attention_mask'] = np.array(attention_mask, dtype=np.float32)
        return result

    def encode_batch(
        self,
        texts: List[str],
        add_special_tokens: bool = True,
        padding: bool = True,
        truncation: bool = True,
        return_attention_mask: bool = True
    ) -> Dict[str, np.ndarray]:
        """
        Encode batch of texts.

        Args:
            texts: List of text strings.
            add_special_tokens: Whether to add SOS/EOS tokens.
            padding: Whether to pad to max_length.
            truncation: Whether to truncate to max_length.
            return_attention_mask: Whether to return attention mask.

        Returns:
            Dictionary with batched 'input_ids' and optionally 'attention_mask'.
            An empty batch yields arrays with zero rows.

        Raises:
            ValueError: If the encoded sequences differ in length (possible
                when padding or truncation is disabled), as raised by np.stack.
        """
        batch_encoding = [
            self.encode(
                text,
                add_special_tokens=add_special_tokens,
                padding=padding,
                truncation=truncation,
                return_attention_mask=return_attention_mask
            )
            for text in texts
        ]

        if not batch_encoding:
            # np.stack rejects an empty list; return empty arrays instead.
            width = self.max_length if padding else 0
            result = {'input_ids': np.zeros((0, width), dtype=np.int64)}
            if return_attention_mask:
                result['attention_mask'] = np.zeros((0, width), dtype=np.float32)
            return result

        result = {
            'input_ids': np.stack([enc['input_ids'] for enc in batch_encoding])
        }
        if return_attention_mask:
            result['attention_mask'] = np.stack(
                [enc['attention_mask'] for enc in batch_encoding]
            )
        return result

    def decode(
        self,
        token_ids: List[int],
        skip_special_tokens: bool = True
    ) -> str:
        """
        Decode token indices to text.

        Args:
            token_ids: List of token indices.
            skip_special_tokens: Whether to skip special tokens
                (PAD, UNK, SOS, EOS) in output.

        Returns:
            Decoded text string. Indices that are not in the vocabulary are
            rendered as the UNK token text.
        """
        special_tokens = {
            self.char_to_idx[self.PAD_TOKEN],
            self.char_to_idx[self.UNK_TOKEN],
            self.char_to_idx[self.SOS_TOKEN],
            self.char_to_idx[self.EOS_TOKEN]
        }
        chars = []
        for idx in token_ids:
            if skip_special_tokens and idx in special_tokens:
                continue
            chars.append(self.idx_to_char.get(idx, self.UNK_TOKEN))
        return ''.join(chars)

    def save(self, save_path: str) -> None:
        """
        Save tokenizer to file.

        Args:
            save_path: Path to save tokenizer (JSON file). Missing parent
                directories are created.
        """
        # dirname is '' for a bare filename, and os.makedirs('') raises.
        directory = os.path.dirname(save_path)
        if directory:
            os.makedirs(directory, exist_ok=True)

        config = {
            'char_to_idx': self.char_to_idx,
            'max_length': self.max_length,
            'vocab_size': self.vocab_size
        }
        with open(save_path, 'w', encoding='utf-8') as f:
            json.dump(config, f, ensure_ascii=False, indent=2)
        print(f"Tokenizer saved to {save_path}")

    @classmethod
    def load(cls, load_path: str) -> "CharTokenizer":
        """
        Load tokenizer from file.

        Args:
            load_path: Path to load tokenizer from (JSON file).

        Returns:
            Loaded tokenizer instance.
        """
        with open(load_path, 'r', encoding='utf-8') as f:
            config = json.load(f)
        tokenizer = cls(
            vocab=config['char_to_idx'],
            max_length=config['max_length']
        )
        print(f"Tokenizer loaded from {load_path}")
        print(f"Vocabulary size: {tokenizer.vocab_size}")
        return tokenizer

    def __len__(self) -> int:
        """Return vocabulary size."""
        return self.vocab_size

    def __repr__(self) -> str:
        return f"CharTokenizer(vocab_size={self.vocab_size}, max_length={self.max_length})"
def build_tokenizer_from_csv(csv_path: str, max_length: int = 128) -> CharTokenizer:
    """
    Build tokenizer from IAM dataset CSV file.

    Args:
        csv_path: Path to dataset_metadata.csv. The file must contain a
            'text' column.
        max_length: Maximum sequence length

    Returns:
        Built tokenizer
    """
    import pandas as pd

    print(f"Loading texts from {csv_path}...")
    # Read 'text' verbatim. With pandas' default NA handling, empty cells and
    # literal strings such as "NA" or "null" become NaN, which astype(str)
    # would then turn into the string "nan" and feed into the vocabulary.
    df = pd.read_csv(csv_path, dtype={'text': str}, keep_default_na=False)
    texts = df['text'].astype(str).tolist()

    print(f"Building vocabulary from {len(texts)} samples...")
    tokenizer = CharTokenizer(max_length=max_length)
    tokenizer.build_vocab(texts)
    return tokenizer
if __name__ == "__main__":
    # Demo run: build a tokenizer from the IAM metadata, persist it, then
    # round-trip a sample string through encode/decode.
    metadata_csv = "../iam_dataset_processed/dataset_metadata.csv"
    output_json = "../training/tokenizer.json"

    # Build the vocabulary from the dataset and write it to disk
    tokenizer = build_tokenizer_from_csv(metadata_csv, max_length=128)
    tokenizer.save(output_json)

    # Encode a sample and show the first 20 positions of each array
    test_text = "Hello, World!"
    encoded = tokenizer.encode(test_text)
    print(f"\nTest encoding for: '{test_text}'")
    print(f"Input IDs: {encoded['input_ids'][:20]}")
    print(f"Attention mask: {encoded['attention_mask'][:20]}")

    # Decode the ids back to text
    decoded = tokenizer.decode(encoded['input_ids'])
    print(f"Decoded: '{decoded}'")