File size: 5,231 Bytes
adc0ea3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
"""
Simple Word-Level Tokenizer
==============================
A basic tokenizer for demonstration purposes.
Converts text to token IDs with special tokens.
"""

import re
import json
from typing import Dict, List, Optional
from collections import Counter
from tqdm import tqdm


class SimpleTokenizer:
    """
    A simple word-level tokenizer with special tokens.

    Vocabulary is built from corpus frequency counts; encoding wraps the
    token sequence in [CLS] ... [SEP] and pads/truncates to a fixed length.

    Special Tokens:
    - [PAD]: Padding token (id=0)
    - [UNK]: Unknown token (id=1)
    - [CLS]: Classification token (id=2)
    - [SEP]: Separator token (id=3)
    """

    # Compiled once instead of on every _tokenize() call: words OR single
    # punctuation characters (whitespace is discarded).
    _TOKEN_PATTERN = re.compile(r'\b\w+\b|[^\w\s]')

    def __init__(self, vocab_size: int = 30000):
        """
        Args:
            vocab_size: Maximum vocabulary size, including the 4 special tokens.
        """
        self.vocab_size = vocab_size

        # Special tokens (fixed IDs; must stay in sync with the *_token_id
        # attributes below).
        self.special_tokens = {
            '[PAD]': 0,
            '[UNK]': 1,
            '[CLS]': 2,
            '[SEP]': 3,
        }

        # Word to ID mapping (seeded with the special tokens) and its inverse.
        self.word_to_id: Dict[str, int] = dict(self.special_tokens)
        self.id_to_word: Dict[int, str] = {v: k for k, v in self.special_tokens.items()}

        # Special token IDs, exposed as attributes for convenience.
        self.pad_token_id = 0
        self.unk_token_id = 1
        self.cls_token_id = 2
        self.sep_token_id = 3

    def _tokenize(self, text: str) -> List[str]:
        """
        Split text into tokens (simple word-level tokenization).

        Lowercases and strips the input, then extracts words and individual
        punctuation marks.

        Args:
            text: Input text string

        Returns:
            List of tokens
        """
        text = text.lower().strip()
        return self._TOKEN_PATTERN.findall(text)

    def build_vocab(self, texts: List[str], min_freq: int = 2):
        """
        Build vocabulary from a list of texts.

        Words are ranked by corpus frequency; the most frequent words that
        meet `min_freq` are assigned IDs after the special tokens.

        Args:
            texts: List of text strings
            min_freq: Minimum frequency for a word to be included
        """
        # Count word frequencies across the whole corpus.
        word_counts = Counter()

        for text in tqdm(texts, desc="Building vocabulary"):
            tokens = self._tokenize(text)
            word_counts.update(tokens)

        # Reserve slots for the special tokens already in the vocab.
        max_words = self.vocab_size - len(self.special_tokens)

        sorted_words = sorted(
            word_counts.items(),
            key=lambda x: x[1],
            reverse=True
        )

        # Add words to vocabulary; counts are descending, so the min_freq
        # filter only drops a suffix of the slice.
        for word, count in sorted_words[:max_words]:
            if count >= min_freq and word not in self.word_to_id:
                idx = len(self.word_to_id)
                self.word_to_id[word] = idx
                self.id_to_word[idx] = word

        print(f"Vocabulary size: {len(self.word_to_id)}")

    def encode(self, text: str, max_length: int = 128) -> Dict:
        """
        Encode text to token IDs with attention mask.

        The sequence is [CLS] tokens... [SEP], truncated to fit max_length
        and right-padded with [PAD] (attention mask 0 over padding).

        Args:
            text: Input text string
            max_length: Maximum sequence length

        Returns:
            Dictionary with 'input_ids' and 'attention_mask' tensors
        """
        # Imported locally so the tokenizer itself has no hard torch
        # dependency at module import time.
        import torch

        tokens = self._tokenize(text)

        # Reserve space for CLS and SEP. Clamp at 0: with max_length < 2 a
        # raw `max_length - 2` slice bound would be negative and wrongly keep
        # almost all tokens (e.g. tokens[:-1]), overflowing max_length.
        budget = max(max_length - 2, 0)

        token_ids = [self.cls_token_id]
        for token in tokens[:budget]:
            token_ids.append(self.word_to_id.get(token, self.unk_token_id))
        token_ids.append(self.sep_token_id)

        # Real tokens get mask 1; padding (added below) gets mask 0.
        attention_mask = [1] * len(token_ids)

        # Pad to max_length (no-op if already full).
        padding_length = max_length - len(token_ids)
        token_ids.extend([self.pad_token_id] * padding_length)
        attention_mask.extend([0] * padding_length)

        return {
            'input_ids': torch.tensor(token_ids, dtype=torch.long),
            'attention_mask': torch.tensor(attention_mask, dtype=torch.long)
        }

    def decode(self, token_ids: List[int]) -> str:
        """
        Decode token IDs back to text.

        [PAD]/[CLS]/[SEP] are dropped; unknown IDs render as '[UNK]'.

        Args:
            token_ids: List of token IDs

        Returns:
            Decoded text string (tokens joined by single spaces)
        """
        skip = (self.pad_token_id, self.cls_token_id, self.sep_token_id)
        tokens = [
            self.id_to_word.get(idx, '[UNK]')
            for idx in token_ids
            if idx not in skip
        ]
        return ' '.join(tokens)

    def save(self, path: str):
        """Save tokenizer vocabulary to JSON file."""
        data = {
            'vocab_size': self.vocab_size,
            'word_to_id': self.word_to_id,
        }
        # ensure_ascii=False keeps non-ASCII vocabulary words readable in the
        # file instead of \uXXXX escapes; round-trip via load() is unchanged.
        with open(path, 'w') as f:
            json.dump(data, f, indent=2, ensure_ascii=False)

    def load(self, path: str):
        """Load tokenizer vocabulary from JSON file."""
        with open(path, 'r') as f:
            data = json.load(f)

        self.vocab_size = data['vocab_size']
        self.word_to_id = data['word_to_id']
        # JSON object keys are strings; rebuild the int-keyed inverse map.
        self.id_to_word = {int(v): k for k, v in self.word_to_id.items()}

    def __len__(self) -> int:
        """Current vocabulary size (including special tokens)."""
        return len(self.word_to_id)