Viclim committed on
Commit 9299fff · verified · 1 Parent(s): ce2f120

Upload 17 files

checkpoints/best_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1bf35017912c87c7681e3c232ffe5a2481c97ec4e166ef55e5a4f7f9e780c5a5
3
+ size 13068032
checkpoints/checkpoint_step_100.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a384cf7782c53e58fef5a5d5ba3ffa0c1724fa45e707fe94a9cb413620a99e68
3
+ size 13068032
checkpoints/checkpoint_step_25.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff3858e7c4ea74c99ee40ac94096a284f443274bcd5bb3c3c650fa4083a1f723
3
+ size 13068032
checkpoints/checkpoint_step_50.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b98d0bded63833efd902dcc514e0472352ae972912c4e763b17ced507c9b405f
3
+ size 13068032
checkpoints/final_model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:459b6ab5c6fe9c32084c28426145ed187e8b9c50e40fdecdedbdb2b170525672
3
+ size 13068016
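
The five `.safetensors` entries above are Git LFS pointer files (spec version, SHA-256 oid, byte size), not the weights themselves; the actual tensors are fetched with `git lfs pull`. A minimal inspection sketch, assuming the `safetensors` package is installed (it is not listed in requirements.txt):

```python
# Hedged sketch: list the tensors stored in one LFS-tracked checkpoint.
from safetensors.torch import load_file  # assumes `pip install safetensors`

state_dict = load_file("checkpoints/best_model.safetensors")
for name, tensor in state_dict.items():
    print(f"{name:40s} {tuple(tensor.shape)} {tensor.dtype}")
```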
config.json ADDED
@@ -0,0 +1,19 @@
1
+ {
2
+ "architectures": [
3
+ "VicAIModel"
4
+ ],
5
+ "vocab_size": 2000,
6
+ "dim": 128,
7
+ "n_layers": 4,
8
+ "n_heads": 4,
9
+ "n_kv_heads": 4,
10
+ "hidden_dim": 256,
11
+ "max_seq_len": 512,
12
+ "tie_weights": false,
13
+ "model_type": "vicai",
14
+ "tokenizer_class": "ByteLevelBPETokenizer",
15
+ "pad_token_id": 1,
16
+ "eos_token_id": 0,
17
+ "unk_token_id": 2,
18
+ "bos_token_id": 3
19
+ }
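
config.json records the hyperparameters these checkpoints were trained with, a much smaller configuration than the 5B defaults in model.py. A sketch of building a matching model from it, assuming the fields map one-to-one onto `VicAIConfig` (the tokenizer and token-id fields are ignored here):

```python
import json

from model import VicAIConfig, VicAIModel  # model.py from this upload

with open("config.json") as f:
    cfg = json.load(f)

# Only the architectural fields are consumed by VicAIConfig.
config = VicAIConfig(
    vocab_size=cfg["vocab_size"],    # 2000
    dim=cfg["dim"],                  # 128
    n_layers=cfg["n_layers"],        # 4
    n_heads=cfg["n_heads"],          # 4
    n_kv_heads=cfg["n_kv_heads"],    # 4
    hidden_dim=cfg["hidden_dim"],    # 256
    max_seq_len=cfg["max_seq_len"],  # 512
    tie_weights=cfg["tie_weights"],  # false
)
model = VicAIModel(config)
```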
dataset.py ADDED
@@ -0,0 +1,347 @@
1
+ """
2
+ VicAI Dataset
3
+ Dataset handling for training on Wikipedia and other text sources.
4
+ """
5
+
6
+ import os
7
+ import random
8
+ import re
9
+ from typing import Dict, Iterator, List, Optional
10
+
11
+ import requests
12
+ import torch
13
+ from torch.utils.data import Dataset, IterableDataset
14
+
15
+
16
+ class WikipediaDataset(IterableDataset):
17
+ """Stream Wikipedia articles for training."""
18
+
19
+ def __init__(
20
+ self,
21
+ tokenizer,
22
+ max_length: int = 2048,
23
+ languages: List[str] = ['en'],
24
+ min_article_length: int = 100,
25
+ ):
26
+ self.tokenizer = tokenizer
27
+ self.max_length = max_length
28
+ self.languages = languages
29
+ self.min_article_length = min_article_length
30
+ self.base_url = "https://en.wikipedia.org/w/api.php"
31
+
32
+ def _fetch_random_article(self) -> Optional[str]:
33
+ """Fetch a random Wikipedia article."""
34
+ try:
35
+ params = {
36
+ 'action': 'query',
37
+ 'format': 'json',
38
+ 'generator': 'random',
39
+ 'grnnamespace': 0,
40
+ 'grnlimit': 1,
41
+ 'prop': 'extracts',
42
+ 'explaintext': True,
43
+ 'exsentences': 50,
44
+ }
45
+ response = requests.get(self.base_url, params=params, timeout=10)
46
+ data = response.json()
47
+
48
+ pages = data['query']['pages']
49
+ for page_id, page_data in pages.items():
50
+ text = page_data.get('extract', '')
51
+ if len(text) > self.min_article_length:
52
+ return text
53
+ return None
54
+ except Exception as e:
55
+ print(f"Error fetching article: {e}")
56
+ return None
57
+
58
+ def _fetch_article_by_title(self, title: str) -> Optional[str]:
59
+ """Fetch a specific Wikipedia article by title."""
60
+ try:
61
+ params = {
62
+ 'action': 'query',
63
+ 'format': 'json',
64
+ 'titles': title,
65
+ 'prop': 'extracts',
66
+ 'explaintext': True,
67
+ 'exlimit': 1,
68
+ }
69
+ response = requests.get(self.base_url, params=params, timeout=10)
70
+ data = response.json()
71
+
72
+ pages = data['query']['pages']
73
+ for page_id, page_data in pages.items():
74
+ if page_id != '-1':
75
+ return page_data.get('extract', '')
76
+ return None
77
+ except Exception as e:
78
+ print(f"Error fetching article: {e}")
79
+ return None
80
+
81
+ def _clean_text(self, text: str) -> str:
82
+ """Clean Wikipedia text."""
83
+ # Remove citation markers
84
+ text = re.sub(r'\[\d+\]', '', text)
85
+ # Remove multiple spaces
86
+ text = re.sub(r'\s+', ' ', text)
87
+ # Remove special characters but keep basic punctuation
88
+ text = re.sub(r'[^\w\s.,!?;:\'\"()-]', ' ', text)
89
+ return text.strip()
90
+
91
+ def _tokenize_text(self, text: str) -> List[int]:
92
+ """Tokenize text and create chunks."""
93
+ cleaned = self._clean_text(text)
94
+ tokens = self.tokenizer.encode(cleaned, add_special_tokens=True)
95
+ return tokens
96
+
97
+ def __iter__(self):
98
+ """Iterate over Wikipedia articles."""
99
+ while True:
100
+ text = self._fetch_random_article()
101
+ if text:
102
+ tokens = self._tokenize_text(text)
103
+
104
+ # Create chunks of max_length
105
+ for i in range(0, len(tokens), self.max_length):
106
+ chunk = tokens[i:i + self.max_length]
107
+ if len(chunk) > 10: # Minimum chunk size
108
+ # Pad if necessary
109
+ if len(chunk) < self.max_length:
110
+ chunk.extend([self.tokenizer.pad_token_id] * (self.max_length - len(chunk)))
111
+
112
+ input_ids = torch.tensor(chunk[:-1])
113
+ labels = torch.tensor(chunk[1:])
114
+
115
+ yield {
116
+ 'input_ids': input_ids,
117
+ 'labels': labels,
118
+ 'attention_mask': (input_ids != self.tokenizer.pad_token_id).long(),
119
+ }
120
+
121
+
122
+ class TextFileDataset(Dataset):
123
+ """Dataset from local text files."""
124
+
125
+ def __init__(
126
+ self,
127
+ file_path: str,
128
+ tokenizer,
129
+ max_length: int = 2048,
130
+ stride: int = 512,
131
+ ):
132
+ self.tokenizer = tokenizer
133
+ self.max_length = max_length
134
+ self.stride = stride
135
+
136
+ print(f"Loading dataset from {file_path}...")
137
+ with open(file_path, 'r', encoding='utf-8') as f:
138
+ text = f.read()
139
+
140
+ # Tokenize full text
141
+ self.tokens = tokenizer.encode(text, add_special_tokens=False)
142
+
143
+ # Create chunks
144
+ self.chunks = []
145
+ for i in range(0, len(self.tokens) - max_length, stride):
146
+ chunk = self.tokens[i:i + max_length + 1]
147
+ if len(chunk) == max_length + 1:
148
+ self.chunks.append(chunk)
149
+
150
+ print(f"Created {len(self.chunks)} chunks from {len(self.tokens)} tokens")
151
+
152
+ def __len__(self):
153
+ return len(self.chunks)
154
+
155
+ def __getitem__(self, idx):
156
+ chunk = self.chunks[idx]
157
+ input_ids = torch.tensor(chunk[:-1])
158
+ labels = torch.tensor(chunk[1:])
159
+
160
+ return {
161
+ 'input_ids': input_ids,
162
+ 'labels': labels,
163
+ 'attention_mask': torch.ones_like(input_ids),
164
+ }
165
+
166
+
167
+ class MixedDataset(IterableDataset):
168
+ """Mix multiple data sources."""
169
+
170
+ def __init__(
171
+ self,
172
+ datasets: List[IterableDataset],
173
+ weights: Optional[List[float]] = None,
174
+ ):
175
+ self.datasets = datasets
176
+ self.weights = weights or [1.0] * len(datasets)
177
+ assert len(self.datasets) == len(self.weights)
178
+
179
+ # Normalize weights
180
+ total = sum(self.weights)
181
+ self.weights = [w / total for w in self.weights]
182
+
183
+ def __iter__(self):
184
+ """Sample from datasets according to weights."""
185
+ iterators = [iter(ds) for ds in self.datasets]
186
+
187
+ while True:
188
+ # Choose dataset based on weights
189
+ dataset_idx = random.choices(range(len(self.datasets)), weights=self.weights)[0]
190
+
191
+ try:
192
+ yield next(iterators[dataset_idx])
193
+ except StopIteration:
194
+ # Restart iterator if exhausted
195
+ iterators[dataset_idx] = iter(self.datasets[dataset_idx])
196
+ yield next(iterators[dataset_idx])
197
+
198
+
199
+ class PretokenizedDataset(Dataset):
200
+ """Dataset from pre-tokenized binary files."""
201
+
202
+ def __init__(self, data_dir: str, max_length: int = 2048):
203
+ self.data_dir = data_dir
204
+ self.max_length = max_length
205
+
206
+ # Load all .pt files
207
+ self.files = []
208
+ for fname in os.listdir(data_dir):
209
+ if fname.endswith('.pt'):
210
+ self.files.append(os.path.join(data_dir, fname))
211
+
212
+ self.files.sort()
213
+ print(f"Found {len(self.files)} pre-tokenized files")
214
+
215
+ # Load metadata
216
+ self.lengths = []
217
+ for f in self.files:
218
+ data = torch.load(f, map_location='cpu')
219
+ self.lengths.append(len(data) // max_length)
220
+
221
+ self.total_length = sum(self.lengths)
222
+
223
+ def __len__(self):
224
+ return self.total_length
225
+
226
+ def __getitem__(self, idx):
227
+ # Find which file contains this index
228
+ file_idx = 0
229
+ remaining = idx
230
+ for i, length in enumerate(self.lengths):
231
+ if remaining < length:
232
+ file_idx = i
233
+ break
234
+ remaining -= length
235
+
236
+ # Load data
237
+ data = torch.load(self.files[file_idx], map_location='cpu')
238
+ start = remaining * self.max_length
239
+ chunk = data[start:start + self.max_length + 1]
240
+
241
+ input_ids = chunk[:-1]
242
+ labels = chunk[1:]
243
+
244
+ return {
245
+ 'input_ids': input_ids,
246
+ 'labels': labels,
247
+ 'attention_mask': torch.ones_like(input_ids),
248
+ }
249
+
250
+
251
+ def download_wikipedia_dump(output_dir: str, language: str = 'en'):
252
+ """Download Wikipedia dump for offline processing."""
253
+ os.makedirs(output_dir, exist_ok=True)
254
+
255
+ # Wikipedia dump URLs
256
+ base_url = f"https://dumps.wikimedia.org/{language}wiki/latest/"
257
+ files = [
258
+ f"{language}wiki-latest-pages-articles-multistream.xml.bz2",
259
+ ]
260
+
261
+ for filename in files:
262
+ url = base_url + filename
263
+ output_path = os.path.join(output_dir, filename)
264
+
265
+ if os.path.exists(output_path):
266
+ print(f"{filename} already exists")
267
+ continue
268
+
269
+ print(f"Downloading {filename}...")
270
+ try:
271
+ response = requests.get(url, stream=True)
272
+ response.raise_for_status()
273
+
274
+ with open(output_path, 'wb') as f:
275
+ for chunk in response.iter_content(chunk_size=8192):
276
+ f.write(chunk)
277
+
278
+ print(f"Downloaded {filename}")
279
+ except Exception as e:
280
+ print(f"Error downloading {filename}: {e}")
281
+
282
+
283
+ def create_sample_corpus(output_file: str = "sample_corpus.txt", num_articles: int = 1000):
284
+ """Create a sample corpus by fetching Wikipedia articles."""
285
+ print(f"Creating sample corpus with {num_articles} articles...")
286
+
287
+ dataset = WikipediaDataset(
288
+ tokenizer=None, # We'll use raw text
289
+ max_length=100000, # Large to get full articles
290
+ )
291
+
292
+ articles = []
293
+ # Iterating the dataset here would call tokenizer.encode with tokenizer=None,
+ # so fetch raw articles directly instead.
+ for i in range(num_articles):
296
+
297
+ # Get raw text from the article fetch
298
+ text = dataset._fetch_random_article()
299
+ if text:
300
+ articles.append(text)
301
+
302
+ if (i + 1) % 100 == 0:
303
+ print(f" Fetched {i + 1}/{num_articles} articles")
304
+
305
+ # Write to file
306
+ with open(output_file, 'w', encoding='utf-8') as f:
307
+ for article in articles:
308
+ f.write(article + '\n\n<|endoftext|>\n\n')
309
+
310
+ print(f"Sample corpus saved to {output_file}")
311
+ return output_file
312
+
313
+
314
+ def prepare_openwebtext_data(output_dir: str, max_files: int = 100):
315
+ """
316
+ Download and prepare OpenWebText corpus.
317
+ Note: This is a placeholder - actual OpenWebText requires specific download.
318
+ """
319
+ os.makedirs(output_dir, exist_ok=True)
320
+ print(f"OpenWebText data preparation placeholder")
321
+ print(f"Please download OpenWebText from https://github.com/jcpeterson/openwebtext")
322
+ print(f"and place files in {output_dir}")
323
+
324
+
325
+ if __name__ == "__main__":
326
+ # Test dataset
327
+ from tokenizer import BPETokenizer
328
+
329
+ # Create sample tokenizer
330
+ sample_texts = [
331
+ "This is a sample text for testing.",
332
+ "Wikipedia contains many interesting articles.",
333
+ "Machine learning models need lots of data.",
334
+ ]
335
+ tokenizer = BPETokenizer(vocab_size=1000)
336
+ tokenizer.train(sample_texts)
337
+
338
+ # Test Wikipedia dataset
339
+ print("\nTesting Wikipedia dataset...")
340
+ wiki_dataset = WikipediaDataset(tokenizer, max_length=128)
341
+
342
+ for i, batch in enumerate(wiki_dataset):
343
+ if i >= 3:
344
+ break
345
+ print(f"\nBatch {i + 1}:")
346
+ print(f" Input shape: {batch['input_ids'].shape}")
347
+ print(f" Labels shape: {batch['labels'].shape}")
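
A minimal usage sketch for the classes above. `data/train.txt` is a placeholder path (the README's Quick Start creates it); `tokenizer.pkl` is the pickled tokenizer included in this upload:

```python
from torch.utils.data import DataLoader

from dataset import TextFileDataset
from tokenizer import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()
tokenizer.load("tokenizer.pkl")

# Chunk a local corpus into overlapping 512-token training windows.
dataset = TextFileDataset("data/train.txt", tokenizer, max_length=512, stride=256)
loader = DataLoader(dataset, batch_size=4, shuffle=True)

batch = next(iter(loader))
print(batch["input_ids"].shape, batch["labels"].shape)  # torch.Size([4, 512]) each
```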
generate.py ADDED
@@ -0,0 +1,330 @@
1
+ """
2
+ VicAI Text Generation
3
+ Interactive text generation and sampling utilities.
4
+ """
5
+
6
+ import argparse
7
+ import sys
8
+
9
+ import torch
10
+
11
+ from model import VicAIModel, VicAIConfig, create_vicai_5b
12
+ from tokenizer import ByteLevelBPETokenizer, BPETokenizer
13
+ from utils import get_logger
14
+
15
+
16
+ def generate_interactive(
17
+ model,
18
+ tokenizer,
19
+ device,
20
+ max_new_tokens: int = 256,
21
+ temperature: float = 0.8,
22
+ top_k: int = 50,
23
+ top_p: float = 0.9,
24
+ repetition_penalty: float = 1.1,
25
+ ):
26
+ """Interactive text generation loop."""
27
+ print("\n" + "=" * 60)
28
+ print("VicAI Interactive Generation")
29
+ print("=" * 60)
30
+ print("Commands:")
31
+ print(" /quit - Exit the program")
32
+ print(" /config - Show current generation settings")
33
+ print(" /temp X - Set temperature (0.1 - 2.0)")
34
+ print(" /topk X - Set top-k (1 - 100)")
35
+ print(" /topp X - Set top-p (0.0 - 1.0)")
36
+ print(" /reppen X - Set repetition penalty (1.0 - 2.0)")
37
+ print(" /maxlen X - Set max new tokens")
38
+ print("=" * 60 + "\n")
39
+
40
+ # Current settings
41
+ settings = {
42
+ 'temperature': temperature,
43
+ 'top_k': top_k,
44
+ 'top_p': top_p,
45
+ 'repetition_penalty': repetition_penalty,
46
+ 'max_new_tokens': max_new_tokens,
47
+ }
48
+
49
+ while True:
50
+ try:
51
+ # Get prompt
52
+ prompt = input("\nPrompt: ").strip()
53
+
54
+ # Handle commands
55
+ if prompt == '/quit':
56
+ print("Goodbye!")
57
+ break
58
+
59
+ if prompt == '/config':
60
+ print("\nCurrent settings:")
61
+ for key, value in settings.items():
62
+ print(f" {key}: {value}")
63
+ continue
64
+
65
+ if prompt.startswith('/temp '):
66
+ try:
67
+ settings['temperature'] = float(prompt.split()[1])
68
+ print(f"Temperature set to {settings['temperature']}")
69
+ except (ValueError, IndexError):
70
+ print("Invalid temperature value")
71
+ continue
72
+
73
+ if prompt.startswith('/topk '):
74
+ try:
75
+ settings['top_k'] = int(prompt.split()[1])
76
+ print(f"Top-k set to {settings['top_k']}")
77
+ except (ValueError, IndexError):
78
+ print("Invalid top-k value")
79
+ continue
80
+
81
+ if prompt.startswith('/topp '):
82
+ try:
83
+ settings['top_p'] = float(prompt.split()[1])
84
+ print(f"Top-p set to {settings['top_p']}")
85
+ except (ValueError, IndexError):
86
+ print("Invalid top-p value")
87
+ continue
88
+
89
+ if prompt.startswith('/reppen '):
90
+ try:
91
+ settings['repetition_penalty'] = float(prompt.split()[1])
92
+ print(f"Repetition penalty set to {settings['repetition_penalty']}")
93
+ except (ValueError, IndexError):
94
+ print("Invalid repetition penalty value")
95
+ continue
96
+
97
+ if prompt.startswith('/maxlen '):
98
+ try:
99
+ settings['max_new_tokens'] = int(prompt.split()[1])
100
+ print(f"Max new tokens set to {settings['max_new_tokens']}")
101
+ except (ValueError, IndexError):
102
+ print("Invalid max new tokens value")
103
+ continue
104
+
105
+ if not prompt:
106
+ continue
107
+
108
+ # Encode prompt
109
+ input_ids = torch.tensor([tokenizer.encode(prompt)], device=device)
110
+
111
+ # Generate
112
+ print("\nGenerating...")
113
+ with torch.no_grad():
114
+ output_ids = model.generate(
115
+ input_ids,
116
+ max_new_tokens=settings['max_new_tokens'],
117
+ temperature=settings['temperature'],
118
+ top_k=settings['top_k'],
119
+ top_p=settings['top_p'],
120
+ repetition_penalty=settings['repetition_penalty'],
121
+ eos_token_id=tokenizer.eos_token_id,
122
+ )
123
+
124
+ # Decode and print
125
+ generated_text = tokenizer.decode(output_ids[0].tolist())
126
+ # Remove the original prompt from output
127
+ prompt_text = tokenizer.decode(input_ids[0].tolist())
128
+ if generated_text.startswith(prompt_text):
129
+ generated_text = generated_text[len(prompt_text):].strip()
130
+
131
+ print("\n" + "-" * 60)
132
+ print("Generated:")
133
+ print("-" * 60)
134
+ print(generated_text)
135
+ print("-" * 60)
136
+
137
+ # Print token info
138
+ num_tokens = output_ids.shape[1] - input_ids.shape[1]
139
+ print(f"\nTokens generated: {num_tokens}")
140
+
141
+ except KeyboardInterrupt:
142
+ print("\n\nInterrupted by user. Type /quit to exit.")
143
+ except Exception as e:
144
+ print(f"\nError: {e}")
145
+
146
+
147
+ def generate_batch(
148
+ model,
149
+ tokenizer,
150
+ prompts: list,
151
+ device,
152
+ max_new_tokens: int = 256,
153
+ temperature: float = 0.8,
154
+ top_k: int = 50,
155
+ top_p: float = 0.9,
156
+ ):
157
+ """Generate completions for multiple prompts."""
158
+ results = []
159
+
160
+ for prompt in prompts:
161
+ input_ids = torch.tensor([tokenizer.encode(prompt)], device=device)
162
+
163
+ with torch.no_grad():
164
+ output_ids = model.generate(
165
+ input_ids,
166
+ max_new_tokens=max_new_tokens,
167
+ temperature=temperature,
168
+ top_k=top_k,
169
+ top_p=top_p,
170
+ eos_token_id=tokenizer.eos_token_id,
171
+ )
172
+
173
+ generated_text = tokenizer.decode(output_ids[0].tolist())
174
+ prompt_text = tokenizer.decode(input_ids[0].tolist())
175
+
176
+ if generated_text.startswith(prompt_text):
177
+ generated_text = generated_text[len(prompt_text):].strip()
178
+
179
+ results.append({
180
+ 'prompt': prompt,
181
+ 'completion': generated_text,
182
+ })
183
+
184
+ return results
185
+
186
+
187
+ def benchmark_generation(
188
+ model,
189
+ tokenizer,
190
+ device,
191
+ num_runs: int = 10,
192
+ max_new_tokens: int = 128,
193
+ prompt: str = "The future of artificial intelligence is",
194
+ ):
195
+ """Benchmark generation speed."""
196
+ import time
197
+
198
+ print(f"\nBenchmarking generation ({num_runs} runs)...")
199
+
200
+ input_ids = torch.tensor([tokenizer.encode(prompt)], device=device)
201
+
202
+ # Warmup
203
+ with torch.no_grad():
204
+ _ = model.generate(input_ids, max_new_tokens=10)
205
+
206
+ if device.type == 'cuda': torch.cuda.synchronize()  # skip the sync on CPU
207
+
208
+ # Benchmark
209
+ times = []
210
+ tokens_generated = []
211
+
212
+ for i in range(num_runs):
213
+ start = time.time()
214
+
215
+ with torch.no_grad():
216
+ output = model.generate(
217
+ input_ids,
218
+ max_new_tokens=max_new_tokens,
219
+ temperature=1.0,
220
+ )
221
+
222
+ if device.type == 'cuda': torch.cuda.synchronize()  # skip the sync on CPU
223
+ elapsed = time.time() - start
224
+
225
+ num_tokens = output.shape[1] - input_ids.shape[1]
226
+ times.append(elapsed)
227
+ tokens_generated.append(num_tokens)
228
+
229
+ print(f" Run {i+1}: {num_tokens} tokens in {elapsed:.2f}s ({num_tokens/elapsed:.1f} tok/s)")
230
+
231
+ avg_time = sum(times) / len(times)
232
+ avg_tokens = sum(tokens_generated) / len(tokens_generated)
233
+ avg_speed = avg_tokens / avg_time
234
+
235
+ print(f"\nAverage: {avg_tokens:.1f} tokens in {avg_time:.2f}s ({avg_speed:.1f} tok/s)")
236
+
237
+
238
+ def main():
239
+ parser = argparse.ArgumentParser(description='Generate text with VicAI')
240
+
241
+ parser.add_argument('--checkpoint', type=str, required=True, help='Path to model checkpoint')
242
+ parser.add_argument('--tokenizer', type=str, default='tokenizer.pkl', help='Path to tokenizer')
243
+ parser.add_argument('--prompt', type=str, default=None, help='Single prompt to generate from')
244
+ parser.add_argument('--interactive', action='store_true', help='Interactive mode')
245
+ parser.add_argument('--max-new-tokens', type=int, default=256, help='Maximum tokens to generate')
246
+ parser.add_argument('--temperature', type=float, default=0.8, help='Sampling temperature')
247
+ parser.add_argument('--top-k', type=int, default=50, help='Top-k sampling')
248
+ parser.add_argument('--top-p', type=float, default=0.9, help='Top-p (nucleus) sampling')
249
+ parser.add_argument('--repetition-penalty', type=float, default=1.1, help='Repetition penalty')
250
+ parser.add_argument('--benchmark', action='store_true', help='Run generation benchmark')
251
+ parser.add_argument('--device', type=str, default='cuda', help='Device to use')
252
+
253
+ args = parser.parse_args()
254
+
255
+ # Setup device
256
+ device = torch.device(args.device if torch.cuda.is_available() else 'cpu')
257
+ print(f"Using device: {device}")
258
+
259
+ # Load tokenizer
260
+ print(f"Loading tokenizer from {args.tokenizer}...")
261
+ # Use ByteLevelBPETokenizer by default (our trained tokenizer)
262
+ tokenizer = ByteLevelBPETokenizer()
263
+ tokenizer.load(args.tokenizer)
264
+ print(f"Tokenizer loaded: {len(tokenizer)} tokens")
265
+
266
+ # Load model
267
+ print(f"Loading model from {args.checkpoint}...")
268
+ checkpoint = torch.load(args.checkpoint, map_location=device)
269
+
270
+ # Create model (assuming 5B config)
271
+ model = create_vicai_5b(vocab_size=len(tokenizer))
272
+
273
+ # Load weights
274
+ state_dict = checkpoint.get('model', checkpoint)
275
+ model.load_state_dict(state_dict)
276
+ model = model.to(device)
277
+ model.eval()
278
+
279
+ print(f"Model loaded: ~{model.get_num_params() / 1e9:.2f}B parameters")
280
+
281
+ # Run benchmark if requested
282
+ if args.benchmark:
283
+ benchmark_generation(model, tokenizer, device)
284
+ return
285
+
286
+ # Interactive mode
287
+ if args.interactive or args.prompt is None:
288
+ generate_interactive(
289
+ model,
290
+ tokenizer,
291
+ device,
292
+ max_new_tokens=args.max_new_tokens,
293
+ temperature=args.temperature,
294
+ top_k=args.top_k,
295
+ top_p=args.top_p,
296
+ repetition_penalty=args.repetition_penalty,
297
+ )
298
+ else:
299
+ # Single prompt generation
300
+ print(f"\nPrompt: {args.prompt}")
301
+ print("-" * 60)
302
+
303
+ input_ids = torch.tensor([tokenizer.encode(args.prompt)], device=device)
304
+
305
+ with torch.no_grad():
306
+ output_ids = model.generate(
307
+ input_ids,
308
+ max_new_tokens=args.max_new_tokens,
309
+ temperature=args.temperature,
310
+ top_k=args.top_k,
311
+ top_p=args.top_p,
312
+ repetition_penalty=args.repetition_penalty,
313
+ eos_token_id=tokenizer.eos_token_id,
314
+ )
315
+
316
+ generated_text = tokenizer.decode(output_ids[0].tolist())
317
+ prompt_text = tokenizer.decode(input_ids[0].tolist())
318
+
319
+ if generated_text.startswith(prompt_text):
320
+ generated_text = generated_text[len(prompt_text):].strip()
321
+
322
+ print(generated_text)
323
+ print("-" * 60)
324
+
325
+ num_tokens = output_ids.shape[1] - input_ids.shape[1]
326
+ print(f"\nGenerated {num_tokens} tokens")
327
+
328
+
329
+ if __name__ == '__main__':
330
+ main()
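
A small API sketch for `generate_batch()` above. It builds an untrained model with the config.json dimensions so it runs without a checkpoint (the output is random tokens), and assumes `utils.py` from the same upload is importable since generate.py imports it:

```python
import torch

from generate import generate_batch
from model import VicAIConfig, VicAIModel
from tokenizer import ByteLevelBPETokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = ByteLevelBPETokenizer()
tokenizer.load("tokenizer.pkl")

config = VicAIConfig(vocab_size=len(tokenizer), dim=128, n_layers=4, n_heads=4,
                     n_kv_heads=4, hidden_dim=256, max_seq_len=512)
model = VicAIModel(config).to(device).eval()

results = generate_batch(model, tokenizer, ["The future of AI is"], device,
                         max_new_tokens=32, temperature=0.8)
print(results[0]["completion"])
```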
model.py ADDED
@@ -0,0 +1,403 @@
1
+ """
2
+ VicAI Model Architecture
3
+ A 5B parameter decoder-only transformer language model.
4
+ """
5
+
6
+ import math
7
+ from typing import Optional, Tuple
8
+
9
+ import torch
10
+ import torch.nn as nn
11
+ import torch.nn.functional as F
12
+
13
+
14
+ class RMSNorm(nn.Module):
15
+ """Root Mean Square Layer Normalization."""
16
+
17
+ def __init__(self, dim: int, eps: float = 1e-6):
18
+ super().__init__()
19
+ self.eps = eps
20
+ self.weight = nn.Parameter(torch.ones(dim))
21
+
22
+ def forward(self, x):
23
+ return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) * self.weight
24
+
25
+
26
+ class RotaryPositionalEmbedding(nn.Module):
27
+ """Rotary Position Embedding (RoPE)."""
28
+
29
+ def __init__(self, dim: int, max_seq_len: int = 8192, base: float = 10000.0):
30
+ super().__init__()
31
+ self.dim = dim
32
+ self.max_seq_len = max_seq_len
33
+ self.base = base
34
+
35
+ inv_freq = 1.0 / (self.base ** (torch.arange(0, dim, 2).float() / dim))
36
+ self.register_buffer("inv_freq", inv_freq)
37
+
38
+ t = torch.arange(max_seq_len, dtype=inv_freq.dtype)  # float positions for the einsum below
39
+ freqs = torch.einsum("i,j->ij", t, inv_freq)
40
+ emb = torch.cat((freqs, freqs), dim=-1)
41
+ self.register_buffer("cos_cached", emb.cos()[None, None, :, :])
42
+ self.register_buffer("sin_cached", emb.sin()[None, None, :, :])
43
+
44
+ def rotate_half(self, x):
45
+ x1, x2 = x.chunk(2, dim=-1)
46
+ return torch.cat((-x2, x1), dim=-1)
47
+
48
+ def apply_rotary_pos_emb(self, q, k, cos, sin):
49
+ q_embed = (q * cos) + (self.rotate_half(q) * sin)
50
+ k_embed = (k * cos) + (self.rotate_half(k) * sin)
51
+ return q_embed, k_embed
52
+
53
+ def forward(self, q, k, seq_len: int):
54
+ cos = self.cos_cached[:, :, :seq_len, :]
55
+ sin = self.sin_cached[:, :, :seq_len, :]
56
+ return self.apply_rotary_pos_emb(q, k, cos, sin)
57
+
58
+
59
+ class GroupedQueryAttention(nn.Module):
60
+ """Grouped Query Attention (GQA) for efficient inference."""
61
+
62
+ def __init__(
63
+ self,
64
+ dim: int,
65
+ n_heads: int,
66
+ n_kv_heads: int,
67
+ dropout: float = 0.0,
68
+ ):
69
+ super().__init__()
70
+ self.dim = dim
71
+ self.n_heads = n_heads
72
+ self.n_kv_heads = n_kv_heads
73
+ self.head_dim = dim // n_heads
74
+ self.n_rep = n_heads // n_kv_heads
75
+
76
+ self.wq = nn.Linear(dim, n_heads * self.head_dim, bias=False)
77
+ self.wk = nn.Linear(dim, n_kv_heads * self.head_dim, bias=False)
78
+ self.wv = nn.Linear(dim, n_kv_heads * self.head_dim, bias=False)
79
+ self.wo = nn.Linear(n_heads * self.head_dim, dim, bias=False)
80
+
81
+ self.attn_dropout = nn.Dropout(dropout)
82
+ self.resid_dropout = nn.Dropout(dropout)
83
+
84
+ self.rope = RotaryPositionalEmbedding(self.head_dim)
85
+
86
+ def forward(
87
+ self,
88
+ x: torch.Tensor,
89
+ mask: Optional[torch.Tensor] = None,
90
+ past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
91
+ ):
92
+ bsz, seq_len, _ = x.shape
93
+
94
+ q = self.wq(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
95
+ k = self.wk(x).view(bsz, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)
96
+ v = self.wv(x).view(bsz, seq_len, self.n_kv_heads, self.head_dim).transpose(1, 2)
97
+
98
+ q, k = self.rope(q, k, seq_len)
99
+
100
+ if past_key_value is not None:
101
+ past_k, past_v = past_key_value
102
+ k = torch.cat([past_k, k], dim=2)
103
+ v = torch.cat([past_v, v], dim=2)
104
+
105
+ past_key_value = (k, v)
106
+
107
+ # Repeat k/v for grouped query attention
108
+ k = k.repeat_interleave(self.n_rep, dim=1)
109
+ v = v.repeat_interleave(self.n_rep, dim=1)
110
+
111
+ scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
112
+
113
+ if mask is not None:
114
+ scores = scores + mask
115
+
116
+ attn = F.softmax(scores, dim=-1)
117
+ attn = self.attn_dropout(attn)
118
+
119
+ out = torch.matmul(attn, v)
120
+ out = out.transpose(1, 2).contiguous().view(bsz, seq_len, self.dim)
121
+ out = self.wo(out)
122
+ out = self.resid_dropout(out)
123
+
124
+ return out, past_key_value
125
+
126
+
127
+ class FeedForward(nn.Module):
128
+ """SwiGLU Feed-Forward Network."""
129
+
130
+ def __init__(self, dim: int, hidden_dim: int, dropout: float = 0.0):
131
+ super().__init__()
132
+ self.w1 = nn.Linear(dim, hidden_dim, bias=False)
133
+ self.w2 = nn.Linear(hidden_dim, dim, bias=False)
134
+ self.w3 = nn.Linear(dim, hidden_dim, bias=False)
135
+ self.dropout = nn.Dropout(dropout)
136
+
137
+ def forward(self, x):
138
+ return self.w2(F.silu(self.w1(x)) * self.w3(x))
139
+
140
+
141
+ class TransformerBlock(nn.Module):
142
+ """Single transformer block with pre-normalization."""
143
+
144
+ def __init__(
145
+ self,
146
+ dim: int,
147
+ n_heads: int,
148
+ n_kv_heads: int,
149
+ hidden_dim: int,
150
+ dropout: float = 0.0,
151
+ ):
152
+ super().__init__()
153
+ self.attention_norm = RMSNorm(dim)
154
+ self.attention = GroupedQueryAttention(dim, n_heads, n_kv_heads, dropout)
155
+ self.ffn_norm = RMSNorm(dim)
156
+ self.feed_forward = FeedForward(dim, hidden_dim, dropout)
157
+
158
+ def forward(
159
+ self,
160
+ x: torch.Tensor,
161
+ mask: Optional[torch.Tensor] = None,
162
+ past_key_value: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
163
+ ):
164
+ # Attention with residual
165
+ attn_out, past_key_value = self.attention(
166
+ self.attention_norm(x), mask, past_key_value
167
+ )
168
+ x = x + attn_out
169
+
170
+ # FFN with residual
171
+ x = x + self.feed_forward(self.ffn_norm(x))
172
+
173
+ return x, past_key_value
174
+
175
+
176
+ class VicAIConfig:
177
+ """Configuration for VicAI model."""
178
+
179
+ def __init__(
180
+ self,
181
+ vocab_size: int = 32000,
182
+ dim: int = 4096,
183
+ n_layers: int = 32,
184
+ n_heads: int = 32,
185
+ n_kv_heads: int = 8,
186
+ hidden_dim: int = 14336,
187
+ max_seq_len: int = 8192,
188
+ dropout: float = 0.0,
189
+ tie_weights: bool = False,
190
+ ):
191
+ self.vocab_size = vocab_size
192
+ self.dim = dim
193
+ self.n_layers = n_layers
194
+ self.n_heads = n_heads
195
+ self.n_kv_heads = n_kv_heads
196
+ self.hidden_dim = hidden_dim
197
+ self.max_seq_len = max_seq_len
198
+ self.dropout = dropout
199
+ self.tie_weights = tie_weights
200
+
201
+ @property
202
+ def num_parameters(self):
203
+ """Calculate approximate parameter count."""
204
+ # Embedding
205
+ params = self.vocab_size * self.dim
206
+ # Attention per layer
207
+ attn_params = 4 * self.dim * self.dim # q, k, v, o projections
208
+ # FFN per layer
209
+ ffn_params = 3 * self.dim * self.hidden_dim # w1, w2, w3
210
+ # Layers
211
+ params += self.n_layers * (attn_params + ffn_params)
212
+ # Output
213
+ params += self.vocab_size * self.dim
214
+ return params
215
+
216
+
217
+ class VicAIModel(nn.Module):
218
+ """
219
+ VicAI: A 5B parameter decoder-only transformer language model.
220
+
221
+ Architecture details:
222
+ - 32 layers
223
+ - 4096 model dimension
224
+ - 32 attention heads (8 key-value heads for GQA)
225
+ - SwiGLU FFN with 14336 hidden dimension
226
+ - RoPE positional embeddings
227
+ - RMSNorm pre-normalization
228
+ - ~5.1B total parameters
229
+ """
230
+
231
+ def __init__(self, config: VicAIConfig):
232
+ super().__init__()
233
+ self.config = config
234
+
235
+ self.token_embedding = nn.Embedding(config.vocab_size, config.dim)
236
+ self.dropout = nn.Dropout(config.dropout)
237
+
238
+ self.layers = nn.ModuleList([
239
+ TransformerBlock(
240
+ config.dim,
241
+ config.n_heads,
242
+ config.n_kv_heads,
243
+ config.hidden_dim,
244
+ config.dropout,
245
+ )
246
+ for _ in range(config.n_layers)
247
+ ])
248
+
249
+ self.norm = RMSNorm(config.dim)
250
+ self.lm_head = nn.Linear(config.dim, config.vocab_size, bias=False)
251
+
252
+ if config.tie_weights:
253
+ self.lm_head.weight = self.token_embedding.weight
254
+
255
+ self.apply(self._init_weights)
256
+
257
+ # Print model info
258
+ total_params = self.get_num_params()
259
+ print(f"VicAI Model initialized with {total_params / 1e9:.2f}B parameters")
260
+
261
+ def _init_weights(self, module):
262
+ if isinstance(module, nn.Linear):
263
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
264
+ if module.bias is not None:
265
+ torch.nn.init.zeros_(module.bias)
266
+ elif isinstance(module, nn.Embedding):
267
+ torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
268
+
269
+ def get_num_params(self, non_embedding=True):
270
+ n_params = sum(p.numel() for p in self.parameters())
271
+ if non_embedding:
272
+ n_params -= self.token_embedding.weight.numel()
273
+ return n_params
274
+
275
+ def forward(
276
+ self,
277
+ input_ids: torch.Tensor,
278
+ targets: Optional[torch.Tensor] = None,
279
+ past_key_values: Optional[list] = None,
280
+ ):
281
+ bsz, seq_len = input_ids.shape
282
+
283
+ # Create causal mask
284
+ mask = torch.triu(
285
+ torch.ones(seq_len, seq_len, device=input_ids.device),
286
+ diagonal=1
287
+ ).bool()
288
+ mask = mask.unsqueeze(0).unsqueeze(0)
289
+ mask = mask.to(input_ids.device)
290
+ mask = torch.where(mask, float('-inf'), 0.0)
291
+
292
+ x = self.token_embedding(input_ids)
293
+ x = self.dropout(x)
294
+
295
+ new_key_values = []
296
+ for i, layer in enumerate(self.layers):
297
+ past_kv = past_key_values[i] if past_key_values is not None else None
298
+ x, kv = layer(x, mask, past_kv)
299
+ new_key_values.append(kv)
300
+
301
+ x = self.norm(x)
302
+ logits = self.lm_head(x)
303
+
304
+ loss = None
305
+ if targets is not None:
306
+ loss = F.cross_entropy(
307
+ logits.view(-1, logits.size(-1)),
308
+ targets.view(-1),
309
+ ignore_index=-100
310
+ )
311
+
312
+ return {
313
+ 'logits': logits,
314
+ 'loss': loss,
315
+ 'past_key_values': new_key_values,
316
+ }
317
+
318
+ @torch.no_grad()
319
+ def generate(
320
+ self,
321
+ input_ids: torch.Tensor,
322
+ max_new_tokens: int = 100,
323
+ temperature: float = 1.0,
324
+ top_k: int = 50,
325
+ top_p: float = 0.9,
326
+ repetition_penalty: float = 1.0,
327
+ eos_token_id: Optional[int] = None,
328
+ ):
329
+ """Generate text autoregressively."""
330
+ self.eval()
331
+
332
+ batch_size = input_ids.shape[0]
333
+ device = input_ids.device
334
+
+ for _ in range(max_new_tokens):
+ # Note: the KV cache returned by forward() is not position-aware (the RoPE
+ # offset and causal mask ignore past length), so the full sequence is
+ # re-encoded each step instead of feeding past_key_values back in.
+ outputs = self(input_ids)
+ logits = outputs['logits']
340
+
341
+ # Get logits for last token
342
+ logits = logits[:, -1, :] / temperature
343
+
344
+ # Apply repetition penalty
345
+ if repetition_penalty != 1.0:
346
+ for i in range(batch_size):
347
+ for token_id in set(input_ids[i].tolist()):
348
+ logits[i, token_id] /= repetition_penalty
349
+
350
+ # Top-k filtering
351
+ if top_k > 0:
352
+ indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
353
+ logits[indices_to_remove] = float('-inf')
354
+
355
+ # Top-p (nucleus) filtering
356
+ if top_p < 1.0:
357
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True)
358
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
359
+ sorted_indices_to_remove = cumulative_probs > top_p
360
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
361
+ sorted_indices_to_remove[..., 0] = 0
362
+ indices_to_remove = sorted_indices_to_remove.scatter(
363
+ 1, sorted_indices, sorted_indices_to_remove
364
+ )
365
+ logits[indices_to_remove] = float('-inf')
366
+
367
+ probs = F.softmax(logits, dim=-1)
368
+ next_token = torch.multinomial(probs, num_samples=1)
369
+
370
+ input_ids = torch.cat([input_ids, next_token], dim=1)
371
+
372
+ # Early stopping if EOS token generated
373
+ if eos_token_id is not None and (next_token == eos_token_id).all():
374
+ break
375
+
376
+ return input_ids
377
+
378
+
379
+ def create_vicai_5b(vocab_size: int = 32000) -> VicAIModel:
380
+ """Create a 5B parameter VicAI model."""
381
+ config = VicAIConfig(
382
+ vocab_size=vocab_size,
383
+ dim=4096,
384
+ n_layers=32,
385
+ n_heads=32,
386
+ n_kv_heads=8,
387
+ hidden_dim=14336,
388
+ max_seq_len=8192,
389
+ dropout=0.0,
390
+ )
391
+ return VicAIModel(config)
392
+
393
+
394
+ if __name__ == "__main__":
395
+ # Test model creation
396
+ model = create_vicai_5b()
397
+ print(f"Total parameters: {model.get_num_params() / 1e9:.2f}B")
398
+
399
+ # Test forward pass
400
+ x = torch.randint(0, 32000, (2, 128))
401
+ outputs = model(x)
402
+ print(f"Output shape: {outputs['logits'].shape}")
403
+ print(f"Loss: {outputs['loss']}")
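
A quick sketch comparing the rough estimate from `VicAIConfig.num_parameters` with the exact count from `VicAIModel.get_num_params()`, using the small config.json dimensions so it instantiates quickly (the estimate ignores the norm weights and the GQA reduction, so the numbers differ slightly):

```python
from model import VicAIConfig, VicAIModel

config = VicAIConfig(vocab_size=2000, dim=128, n_layers=4, n_heads=4,
                     n_kv_heads=4, hidden_dim=256, max_seq_len=512)
model = VicAIModel(config)

print(f"estimated (formula):       {config.num_parameters:,}")
print(f"exact, with embeddings:    {model.get_num_params(non_embedding=False):,}")
print(f"exact, without embeddings: {model.get_num_params():,}")
```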
readme.md ADDED
@@ -0,0 +1,303 @@
1
+ # VicAI
2
+
3
+ A 5B parameter decoder-only transformer language model built from scratch in PyTorch.
4
+
5
+ ## Overview
6
+
7
+ VicAI is a state-of-the-art language model featuring:
8
+ - **5.1B parameters** with 32 transformer layers
9
+ - **Grouped Query Attention (GQA)** for efficient inference
10
+ - **Rotary Position Embeddings (RoPE)** for better long-context modeling
11
+ - **SwiGLU activation** in feed-forward layers
12
+ - **RMSNorm** pre-normalization
13
+ - **Byte-level BPE tokenization** (32K vocabulary)
14
+
15
+ ## Architecture
16
+
17
+ | Component | Specification |
18
+ |-----------|---------------|
19
+ | Parameters | ~5.1B |
20
+ | Layers | 32 |
21
+ | Hidden Dim | 4096 |
22
+ | FFN Dim | 14336 |
23
+ | Attention Heads | 32 |
24
+ | KV Heads | 8 (GQA) |
25
+ | Context Length | 8192 |
26
+ | Vocabulary | 32,000 |
27
+
28
+ ## File Structure
29
+
30
+ ```
31
+ vicai/
32
+ ├── model.py # Model architecture and VicAI 5B config
33
+ ├── tokenizer.py # BPE tokenizer implementation
34
+ ├── dataset.py # Data loading (Wikipedia + custom sources)
35
+ ├── train.py # Distributed training script
36
+ ├── utils.py # Training utilities and helpers
37
+ ├── generate.py # Text generation and inference
38
+ ├── requirements.txt # Dependencies
39
+ └── README.md # This file
40
+ ```
41
+
42
+ ## Installation
43
+
44
+ ```bash
45
+ # Clone the repository
46
+ git clone https://github.com/yourusername/vicai.git
47
+ cd vicai
48
+
49
+ # Create virtual environment
50
+ python -m venv venv
51
+ source venv/bin/activate # On Windows: venv\Scripts\activate
52
+
53
+ # Install dependencies
54
+ pip install -r requirements.txt
55
+ ```
56
+
57
+ ## Quick Start
58
+
59
+ ### 1. Prepare Training Data
60
+
61
+ Option A: Create sample corpus from Wikipedia
62
+ ```bash
63
+ python -c "from dataset import create_sample_corpus; create_sample_corpus('data/train.txt', num_articles=10000)"
64
+ ```
65
+
66
+ Option B: Use your own text files
67
+ ```bash
68
+ # Place your text files in data/ directory
69
+ # Format: plain text with <|endoftext|> markers between documents
70
+ ```
71
+
72
+ ### 2. Train Tokenizer
73
+
74
+ ```python
75
+ from tokenizer import ByteLevelBPETokenizer
76
+ from dataset import create_sample_corpus
77
+
78
+ # Create corpus
79
+ corpus = create_sample_corpus('data/train.txt', num_articles=1000)
80
+
81
+ # Read texts
82
+ with open(corpus, 'r') as f:
83
+ texts = f.read().split('<|endoftext|>')
84
+
85
+ # Train tokenizer
86
+ tokenizer = ByteLevelBPETokenizer(vocab_size=32000)
87
+ tokenizer.train([t for t in texts if t.strip()])
88
+ tokenizer.save('tokenizer.pkl')
89
+ ```
90
+
91
+ ### 3. Train Model
92
+
93
+ Single GPU:
94
+ ```bash
95
+ python train.py \
96
+ --train-data data/train.txt \
97
+ --val-data data/val.txt \
98
+ --tokenizer tokenizer.pkl \
99
+ --batch-size 4 \
100
+ --max-steps 100000 \
101
+ --output-dir checkpoints
102
+ ```
103
+
104
+ Multi-GPU (DDP):
105
+ ```bash
106
+ torchrun --nproc_per_node=4 train.py \
107
+ --train-data data/train.txt \
108
+ --val-data data/val.txt \
109
+ --batch-size 1 \
110
+ --max-steps 100000 \
111
+ --output-dir checkpoints
112
+ ```
113
+
114
+ Multi-GPU (FSDP):
115
+ ```bash
116
+ torchrun --nproc_per_node=8 train.py \
117
+ --use-fsdp \
118
+ --train-data data/train.txt \
119
+ --batch-size 1 \
120
+ --output-dir checkpoints
121
+ ```
122
+
123
+ ### 4. Generate Text
124
+
125
+ Interactive mode:
126
+ ```bash
127
+ python generate.py \
128
+ --checkpoint checkpoints/best_model.pt \
129
+ --tokenizer tokenizer.pkl \
130
+ --interactive
131
+ ```
132
+
133
+ Single prompt:
134
+ ```bash
135
+ python generate.py \
136
+ --checkpoint checkpoints/best_model.pt \
137
+ --tokenizer tokenizer.pkl \
138
+ --prompt "The future of AI is" \
139
+ --max-new-tokens 256
140
+ ```
141
+
142
+ ## Training Configuration
143
+
144
+ ### Default Hyperparameters
145
+
146
+ | Parameter | Value |
147
+ |-----------|-------|
148
+ | Learning Rate | 3e-4 |
149
+ | Min LR | 3e-5 |
150
+ | Warmup Steps | 2,000 |
151
+ | Weight Decay | 0.1 |
152
+ | Batch Size | 4 (per device) |
153
+ | Max Steps | 100,000 |
154
+ | Beta1 | 0.9 |
155
+ | Beta2 | 0.95 |
156
+
157
+ ### Training Tips
158
+
159
+ - **Memory constrained?** Reduce batch size or use gradient accumulation (see the sketch after this list)
160
+ - **Longer context?** Increase `--max-seq-len` (up to 8192)
161
+ - **Faster training?** Enable `--compile` for torch.compile optimization
162
+ - **Better quality?** Train longer or use larger dataset
163
+
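
A hedged sketch of the gradient-accumulation tip above, reusing the pieces defined in this repo; the file path, accumulation factor, and the small model dimensions are illustrative choices, not values fixed by train.py:

```python
import torch
from torch.utils.data import DataLoader

from dataset import TextFileDataset
from model import VicAIConfig, VicAIModel
from tokenizer import ByteLevelBPETokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

tokenizer = ByteLevelBPETokenizer()
tokenizer.load("tokenizer.pkl")
dataset = TextFileDataset("data/train.txt", tokenizer, max_length=512, stride=512)
loader = DataLoader(dataset, batch_size=1, shuffle=True)

model = VicAIModel(VicAIConfig(vocab_size=len(tokenizer), dim=128, n_layers=4,
                               n_heads=4, n_kv_heads=4, hidden_dim=256,
                               max_seq_len=512)).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, betas=(0.9, 0.95),
                              weight_decay=0.1)

accum_steps = 8  # effective batch size = batch_size * accum_steps
optimizer.zero_grad()
for step, batch in enumerate(loader):
    out = model(batch["input_ids"].to(device), targets=batch["labels"].to(device))
    (out["loss"] / accum_steps).backward()  # scale so accumulated grads average
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
```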
164
+ ## Generation Parameters
165
+
166
+ | Parameter | Default | Description |
167
+ |-----------|---------|-------------|
168
+ | temperature | 0.8 | Lower = more focused, higher = more random |
169
+ | top_k | 50 | Consider only top-k tokens |
170
+ | top_p | 0.9 | Nucleus sampling threshold |
171
+ | repetition_penalty | 1.1 | Penalize repeated tokens |
172
+ | max_new_tokens | 256 | Maximum tokens to generate |
173
+
174
+ ## Data Sources
175
+
176
+ The model can be trained on:
177
+
178
+ 1. **Wikipedia** (streaming via API)
179
+ 2. **OpenWebText** (Common Crawl filtered)
180
+ 3. **Custom text files** (your own data)
181
+ 4. **Mixed datasets** (combine multiple sources; see the sketch after this list)
182
+
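
For the mixed-source option, a sketch using `MixedDataset` from dataset.py; the 70/30 weights and the file path are illustrative, and note that `WikipediaDataset` streams articles from the live Wikipedia API:

```python
from dataset import MixedDataset, TextFileDataset, WikipediaDataset
from tokenizer import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer()
tokenizer.load("tokenizer.pkl")

wiki = WikipediaDataset(tokenizer, max_length=512)                # streamed via the API
local = TextFileDataset("data/train.txt", tokenizer, max_length=512)
mixed = MixedDataset([wiki, local], weights=[0.7, 0.3])           # sampling probabilities

sample = next(iter(mixed))
print(sample["input_ids"].shape, sample["labels"].shape)
```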
183
+ ## Hardware Requirements
184
+
185
+ ### Training
186
+
187
+ | GPUs | VRAM per GPU | Config |
188
+ |------|--------------|--------|
189
+ | 1x A100 (80GB) | 80GB | batch_size=4, compile=True |
190
+ | 4x A100 (40GB) | 40GB | batch_size=1, DDP |
191
+ | 8x A100 (40GB) | 40GB | batch_size=1, FSDP |
192
+ | 1x RTX 4090 | 24GB | batch_size=1, smaller model |
193
+
194
+ ### Inference
195
+
196
+ - Minimum: 1x GPU with 16GB VRAM (with quantization)
197
+ - Recommended: 1x GPU with 24GB+ VRAM
198
+
199
+ ## Model Architecture Details
200
+
201
+ ### Grouped Query Attention
202
+
203
+ Uses 8 key-value heads instead of 32, reducing memory bandwidth during inference while maintaining quality.
204
+
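
A toy shape check of the key/value head expansion performed in `GroupedQueryAttention` (the `repeat_interleave` step in model.py), using the 5B head counts:

```python
import torch

n_heads, n_kv_heads, head_dim, seq_len = 32, 8, 128, 16
n_rep = n_heads // n_kv_heads                         # each KV head serves 4 query heads

k = torch.randn(1, n_kv_heads, seq_len, head_dim)     # keys as cached: 8 heads
k_expanded = k.repeat_interleave(n_rep, dim=1)        # keys as attended: 32 heads
print(k.shape, "->", k_expanded.shape)                # (1, 8, 16, 128) -> (1, 32, 16, 128)
```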
205
+ ### Rotary Position Embeddings
206
+
207
+ Rotary embeddings are applied to queries and keys, providing better relative position encoding than absolute embeddings.
208
+
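
A standalone sketch of the rotation used in `RotaryPositionalEmbedding` (same base of 10000 and half-rotation trick as model.py):

```python
import torch

def rotate_half(x):
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

head_dim, seq_len = 128, 16
inv_freq = 1.0 / (10000.0 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.einsum("i,j->ij", torch.arange(seq_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)        # (seq_len, head_dim)
cos, sin = emb.cos(), emb.sin()

q = torch.randn(seq_len, head_dim)
q_rot = q * cos + rotate_half(q) * sin         # position-dependent rotation of q
print(q_rot.shape)                             # torch.Size([16, 128])
```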
209
+ ### SwiGLU Feed-Forward
210
+
211
+ ```python
212
+ FFN(x) = (silu(W1 @ x) * (W3 @ x)) @ W2
213
+ ```
214
+
215
+ This has been shown to improve training stability and performance.
216
+
217
+ ## Example Usage
218
+
219
+ ```python
220
+ from model import create_vicai_5b
221
+ from tokenizer import ByteLevelBPETokenizer
222
+ import torch
223
+
224
+ # Load tokenizer
225
+ tokenizer = ByteLevelBPETokenizer()
226
+ tokenizer.load('tokenizer.pkl')
227
+
228
+ # Create model
229
+ model = create_vicai_5b(vocab_size=len(tokenizer))
230
+
231
+ # Load checkpoint
232
+ checkpoint = torch.load('checkpoints/best_model.pt')
233
+ model.load_state_dict(checkpoint['model'])
234
+ model = model.cuda()
235
+
236
+ # Generate
237
+ text = "Artificial intelligence will"
238
+ input_ids = torch.tensor([tokenizer.encode(text)]).cuda()
239
+
240
+ with torch.no_grad():
241
+ output = model.generate(
242
+ input_ids,
243
+ max_new_tokens=100,
244
+ temperature=0.8,
245
+ top_k=50,
246
+ top_p=0.9,
247
+ )
248
+
249
+ generated = tokenizer.decode(output[0].tolist())
250
+ print(generated)
251
+ ```
252
+
253
+ ## Citation
254
+
255
+ If you use VicAI in your research, please cite:
256
+
257
+ ```bibtex
258
+ @software{vicai2024,
259
+ title = {VicAI: A 5B Parameter Language Model from Scratch},
260
+ author = {Your Name},
261
+ year = {2024},
262
+ url = {https://github.com/yourusername/vicai}
263
+ }
264
+ ```
265
+
266
+ ## License
267
+
268
+ This project is licensed under the MIT License.
269
+
270
+ ## Acknowledgments
271
+
272
+ - Transformer architecture based on "Attention Is All You Need"
273
+ - RoPE embeddings from RoFormer
274
+ - GQA from the Llama 2 paper
275
+ - SwiGLU from the PaLM paper
276
+
277
+ ## Contributing
278
+
279
+ Contributions are welcome! Please feel free to submit a Pull Request.
280
+
281
+ ## Troubleshooting
282
+
283
+ ### CUDA Out of Memory
284
+ - Reduce batch size
285
+ - Enable gradient checkpointing
286
+ - Use FSDP for multi-GPU training
287
+ - Reduce sequence length
288
+
289
+ ### Slow Training
290
+ - Enable `--compile` flag
291
+ - Use mixed precision (AMP)
292
+ - Ensure data is on fast storage (SSD)
293
+ - Use DataLoader `num_workers > 0`
294
+
295
+ ### Poor Generation Quality
296
+ - Train longer
297
+ - Use larger, higher quality dataset
298
+ - Adjust sampling parameters (temperature, top_p)
299
+ - Check tokenizer was trained on similar data
300
+
301
+ ## Contact
302
+
303
+ For questions or issues, please open a GitHub issue or contact the maintainers.
requirements.txt ADDED
@@ -0,0 +1,22 @@
1
+ torch>=2.0.0
2
+ torchvision>=0.15.0
3
+ torchaudio>=2.0.0
4
+ numpy>=1.24.0
5
+ tqdm>=4.65.0
6
+ requests>=2.28.0
7
+ transformers>=4.30.0
8
+ datasets>=2.12.0
9
+ accelerate>=0.20.0
10
+ sentencepiece>=0.1.99
11
+ protobuf>=3.20.0
12
+ wandb>=0.15.0
13
+ tensorboard>=2.13.0
14
+ matplotlib>=3.7.0
15
+ scipy>=1.10.0
16
+ scikit-learn>=1.2.0
17
+ pandas>=2.0.0
18
+ pyyaml>=6.0
19
+ regex>=2023.0.0
20
+ filelock>=3.12.0
21
+ packaging>=23.0
22
+ psutil>=5.9.0
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer.pkl ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:862358e10e9e9f7c70f593dd3e8d2aa9da1ceca56947cff0545204d943c27baf
3
+ size 71877
tokenizer.py ADDED
@@ -0,0 +1,396 @@
1
+ """
2
+ VicAI Tokenizer
3
+ Byte-Pair Encoding (BPE) tokenizer implementation.
4
+ """
5
+
6
+ import json
7
+ import pickle
8
+ import regex as re  # the 'regex' package (in requirements.txt) supports the \p{L}/\p{N} classes used in _pre_tokenize
9
+ from collections import defaultdict
10
+ from typing import Dict, List, Optional, Union
11
+
12
+
13
+ class BPETokenizer:
14
+ """Byte-Pair Encoding Tokenizer."""
15
+
16
+ def __init__(self, vocab_size: int = 32000):
17
+ self.vocab_size = vocab_size
18
+ self.vocab = {}
19
+ self.merges = []
20
+ self.special_tokens = {
21
+ '<pad>': 0,
22
+ '<unk>': 1,
23
+ '<s>': 2,
24
+ '</s>': 3,
25
+ '<mask>': 4,
26
+ }
27
+ self.pad_token_id = 0
28
+ self.unk_token_id = 1
29
+ self.bos_token_id = 2
30
+ self.eos_token_id = 3
31
+ self.mask_token_id = 4
32
+
33
+ def _get_stats(self, vocab):
34
+ """Get counts of all symbol pairs."""
35
+ pairs = defaultdict(int)
36
+ for word, freq in vocab.items():
37
+ symbols = word.split()
38
+ for i in range(len(symbols) - 1):
39
+ pairs[(symbols[i], symbols[i + 1])] += freq
40
+ return pairs
41
+
42
+ def _merge_vocab(self, pair, vocab):
43
+ """Merge all occurrences of pair in vocab."""
44
+ bigram = re.escape(' '.join(pair))
45
+ pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
46
+ new_vocab = {}
47
+ for word in vocab:
48
+ new_word = pattern.sub(''.join(pair), word)
49
+ new_vocab[new_word] = vocab[word]
50
+ return new_vocab
51
+
52
+ def _pre_tokenize(self, text: str) -> List[str]:
53
+ """Pre-tokenize text into words."""
54
+ # Simple whitespace and punctuation tokenization
55
+ pattern = r"'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"
56
+ return re.findall(pattern, text)
57
+
58
+ def train(self, texts: List[str]):
59
+ """Train BPE on a list of texts."""
60
+ print(f"Training BPE tokenizer with vocab_size={self.vocab_size}")
61
+
62
+ # Initialize vocabulary with special tokens
63
+ self.vocab = {token: i for token, i in self.special_tokens.items()}
64
+
65
+ # Build word frequency dictionary
66
+ vocab = defaultdict(int)
67
+ for text in texts:
68
+ words = self._pre_tokenize(text.lower())
69
+ for word in words:
70
+ # End word with </w>
71
+ word = ' '.join(list(word)) + ' </w>'
72
+ vocab[tuple(word.split())] += 1
73
+
74
+ # Convert to string format
75
+ vocab = {' '.join(k): v for k, v in vocab.items()}
76
+
77
+ # Add individual characters to vocab
78
+ for word in vocab:
79
+ for char in word.split():
80
+ if char not in self.vocab:
81
+ self.vocab[char] = len(self.vocab)
82
+
83
+ # BPE training
84
+ num_merges = self.vocab_size - len(self.vocab)
85
+ for i in range(num_merges):
86
+ pairs = self._get_stats(vocab)
87
+ if not pairs:
88
+ break
89
+
90
+ best = max(pairs, key=pairs.get)
91
+ vocab = self._merge_vocab(best, vocab)
92
+ self.merges.append(best)
93
+
94
+ # Add merged token to vocab
95
+ merged_token = ''.join(best)
96
+ if merged_token not in self.vocab:
97
+ self.vocab[merged_token] = len(self.vocab)
98
+
99
+ if (i + 1) % 1000 == 0:
100
+ print(f" Completed {i + 1}/{num_merges} merges")
101
+
102
+ print(f"Final vocabulary size: {len(self.vocab)}")
103
+
104
+ def encode(self, text: str, add_special_tokens: bool = True) -> List[int]:
105
+ """Encode text to token IDs."""
106
+ words = self._pre_tokenize(text)
107
+ token_ids = []
108
+
109
+ if add_special_tokens:
110
+ token_ids.append(self.bos_token_id)
111
+
112
+ for word in words:
113
+ word = word.lower()
114
+ word_tokens = ' '.join(list(word)) + ' </w>'
115
+
116
+ # Apply BPE merges
117
+ for merge in self.merges:
118
+ bigram = re.escape(' '.join(merge))
119
+ pattern = re.compile(r'(?<!\S)' + bigram + r'(?!\S)')
120
+ word_tokens = pattern.sub(''.join(merge), word_tokens)
121
+
122
+ # Convert to IDs
123
+ for token in word_tokens.split():
124
+ token_ids.append(self.vocab.get(token, self.unk_token_id))
125
+
126
+ if add_special_tokens:
127
+ token_ids.append(self.eos_token_id)
128
+
129
+ return token_ids
130
+
131
+ def decode(self, token_ids: List[int], skip_special_tokens: bool = True) -> str:
132
+ """Decode token IDs to text."""
133
+ # Build reverse vocab
134
+ reverse_vocab = {v: k for k, v in self.vocab.items()}
135
+
136
+ tokens = []
137
+ for token_id in token_ids:
138
+ if token_id in self.special_tokens.values() and skip_special_tokens:
139
+ continue
140
+ tokens.append(reverse_vocab.get(token_id, '<unk>'))
141
+
142
+ text = ''.join(tokens)
143
+ text = text.replace('</w>', ' ')
144
+ return text.strip()
145
+
146
+ def save(self, path: str):
147
+ """Save tokenizer to file."""
148
+ data = {
149
+ 'vocab': self.vocab,
150
+ 'merges': self.merges,
151
+ 'special_tokens': self.special_tokens,
152
+ 'vocab_size': self.vocab_size,
153
+ }
154
+ with open(path, 'wb') as f:
155
+ pickle.dump(data, f)
156
+ print(f"Tokenizer saved to {path}")
157
+
158
+ def load(self, path: str):
159
+ """Load tokenizer from file."""
160
+ with open(path, 'rb') as f:
161
+ data = pickle.load(f)
162
+ self.vocab = data['vocab']
163
+ self.merges = data['merges']
164
+ self.special_tokens = data['special_tokens']
165
+ self.vocab_size = data['vocab_size']
166
+
167
+ self.pad_token_id = self.special_tokens['<pad>']
168
+ self.unk_token_id = self.special_tokens['<unk>']
169
+ self.bos_token_id = self.special_tokens['<s>']
170
+ self.eos_token_id = self.special_tokens['</s>']
171
+ self.mask_token_id = self.special_tokens['<mask>']
172
+ print(f"Tokenizer loaded from {path}")
173
+
174
+ def batch_encode(
175
+ self,
176
+ texts: List[str],
177
+ max_length: int = 512,
178
+ padding: bool = True,
179
+ truncation: bool = True,
180
+ ) -> Dict[str, List]:
181
+ """Batch encode texts."""
182
+ encoded = [self.encode(text) for text in texts]
183
+
184
+ if truncation:
185
+ encoded = [seq[:max_length] for seq in encoded]
186
+
187
+ if padding:
188
+ max_len = min(max(len(seq) for seq in encoded), max_length)
189
+ attention_mask = []
190
+ for seq in encoded:
191
+ mask = [1] * len(seq) + [0] * (max_len - len(seq))
192
+ seq.extend([self.pad_token_id] * (max_len - len(seq)))
193
+ attention_mask.append(mask[:max_len])
194
+ else:
195
+ attention_mask = [[1] * len(seq) for seq in encoded]
196
+
197
+ return {
198
+ 'input_ids': encoded,
199
+ 'attention_mask': attention_mask,
200
+ }
201
+
202
+ def __len__(self):
203
+ return len(self.vocab)
204
+
205
+
206
+ class ByteLevelBPETokenizer:
207
+ """Byte-level BPE tokenizer (similar to GPT-2/3)."""
208
+
209
+ def __init__(self, vocab_size: int = 32000):
210
+ self.vocab_size = vocab_size
211
+ self.vocab = {}
212
+ self.merges = []
213
+ self.byte_encoder = {i: chr(i + 128) for i in range(256)} # Shift to printable range
214
+ self.byte_decoder = {chr(i + 128): i for i in range(256)}
215
+
216
+ self.special_tokens = {
217
+ '<|endoftext|>': 0,
218
+ '<|pad|>': 1,
219
+ }
220
+ self.eos_token_id = 0
221
+ self.pad_token_id = 1
222
+
223
+ def _bytes_to_unicode(self, text: str) -> str:
224
+ """Convert string to byte-level representation."""
225
+ return ''.join(self.byte_encoder[b] for b in text.encode('utf-8'))
226
+
227
+ def _unicode_to_bytes(self, text: str) -> str:
228
+ """Convert byte-level representation back to string."""
229
+ return bytes(self.byte_decoder[c] for c in text).decode('utf-8', errors='replace')
230
+
231
+ def train(self, texts: List[str]):
232
+ """Train byte-level BPE."""
233
+ print(f"Training byte-level BPE tokenizer with vocab_size={self.vocab_size}")
234
+
235
+ # Initialize vocab with special tokens and all bytes
236
+ self.vocab = {token: i for token, i in self.special_tokens.items()}
237
+ for i in range(256):
238
+ byte_char = self.byte_encoder[i]
239
+ if byte_char not in self.vocab:
240
+ self.vocab[byte_char] = len(self.vocab)
241
+
242
+ # Build corpus as byte sequences
243
+ corpus = []
244
+ for text in texts:
245
+ byte_text = self._bytes_to_unicode(text)
246
+ corpus.extend(list(byte_text))
247
+
248
+ # Get initial word frequencies
249
+ vocab = defaultdict(int)
250
+ for text in texts:
251
+ byte_text = self._bytes_to_unicode(text)
252
+ # Add end token
253
+ byte_text += '<|endoftext|>'
254
+ vocab[tuple(byte_text)] += 1
255
+
256
+ # BPE training
257
+ num_merges = self.vocab_size - len(self.vocab)
258
+
259
+ for i in range(num_merges):
260
+ pairs = self._get_stats(vocab)
261
+ if not pairs:
262
+ break
263
+
264
+ best = max(pairs, key=pairs.get)
265
+ vocab = self._merge_vocab(best, vocab)
266
+ self.merges.append(best)
267
+
268
+ merged = ''.join(best)
269
+ if merged not in self.vocab:
270
+ self.vocab[merged] = len(self.vocab)
271
+
272
+ if (i + 1) % 1000 == 0:
273
+ print(f" Completed {i + 1}/{num_merges} merges")
274
+
275
+ print(f"Final vocabulary size: {len(self.vocab)}")
276
+
277
+ def _get_stats(self, vocab):
278
+ pairs = defaultdict(int)
279
+ for word, freq in vocab.items():
280
+ symbols = list(word)
281
+ for i in range(len(symbols) - 1):
282
+ pairs[(symbols[i], symbols[i + 1])] += freq
283
+ return pairs
284
+
285
+ def _merge_vocab(self, pair, vocab):
286
+ new_vocab = {}
287
+ bigram = pair[0] + pair[1]
288
+ for word in vocab:
289
+ new_word = []
290
+ i = 0
291
+ while i < len(word):
292
+ if i < len(word) - 1 and word[i] == pair[0] and word[i + 1] == pair[1]:
293
+ new_word.append(bigram)
294
+ i += 2
295
+ else:
296
+ new_word.append(word[i])
297
+ i += 1
298
+ new_vocab[tuple(new_word)] = vocab[word]
299
+ return new_vocab
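A tiny worked example of one merge step built from the two helpers above (made-up words, purely illustrative):

t = ByteLevelBPETokenizer()
word_freqs = {('l', 'o', 'w'): 5, ('l', 'o', 'w', 'e', 'r'): 2}
pairs = t._get_stats(word_freqs)        # both ('l','o') and ('o','w') occur 7 times
best = max(pairs, key=pairs.get)
word_freqs = t._merge_vocab(best, word_freqs)
# e.g. {('lo', 'w'): 5, ('lo', 'w', 'e', 'r'): 2} once ('l', 'o') wins the tie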
300
+
301
+ def encode(self, text: str, add_special_tokens: bool = True) -> List[int]:
302
+ """Encode text to token IDs."""
303
+ byte_text = self._bytes_to_unicode(text)
304
+ if add_special_tokens:
305
+ byte_text += '<|endoftext|>'
306
+
307
+ # Apply merges
308
+ word = list(byte_text)
309
+ for merge in self.merges:
310
+ new_word = []
311
+ i = 0
312
+ while i < len(word):
313
+ if i < len(word) - 1 and word[i] == merge[0] and word[i + 1] == merge[1]:
314
+ new_word.append(merge[0] + merge[1])
315
+ i += 2
316
+ else:
317
+ new_word.append(word[i])
318
+ i += 1
319
+ word = new_word
320
+
321
+ # Convert to IDs; symbols missing from the vocab fall back to the pad id (no dedicated unk token)
322
+ return [self.vocab.get(token, self.special_tokens['<|pad|>']) for token in word]
323
+
324
+ def decode(self, token_ids: List[int]) -> str:
325
+ """Decode token IDs to text."""
326
+ reverse_vocab = {v: k for k, v in self.vocab.items()}
327
+ text = ''.join(reverse_vocab.get(id, '') for id in token_ids)
328
+ text = text.replace('<|endoftext|>', '').replace('<|pad|>', '')  # strip special tokens before undoing the byte shift
329
+ return self._unicode_to_bytes(text)
330
+
331
+ def save(self, path: str):
332
+ """Save tokenizer to file."""
333
+ data = {
334
+ 'vocab': self.vocab,
335
+ 'merges': self.merges,
336
+ 'special_tokens': self.special_tokens,
337
+ 'vocab_size': self.vocab_size,
338
+ 'byte_encoder': self.byte_encoder,
339
+ 'byte_decoder': self.byte_decoder,
340
+ }
341
+ with open(path, 'wb') as f:
342
+ pickle.dump(data, f)
343
+ print(f"Tokenizer saved to {path}")
344
+
345
+ def load(self, path: str):
346
+ """Load tokenizer from file."""
347
+ with open(path, 'rb') as f:
348
+ data = pickle.load(f)
349
+ self.vocab = data['vocab']
350
+ self.merges = data['merges']
351
+ self.special_tokens = data['special_tokens']
352
+ self.vocab_size = data['vocab_size']
353
+ self.byte_encoder = data.get('byte_encoder', self.byte_encoder)
354
+ self.byte_decoder = data.get('byte_decoder', self.byte_decoder)
355
+
356
+ # Ensure all special tokens exist
357
+ if '<|endoftext|>' not in self.special_tokens:
358
+ self.special_tokens['<|endoftext|>'] = 0
359
+ if '<|pad|>' not in self.special_tokens:
360
+ self.special_tokens['<|pad|>'] = 1
361
+
362
+ self.eos_token_id = self.special_tokens.get('<|endoftext|>', 0)
363
+ self.pad_token_id = self.special_tokens.get('<|pad|>', 1)
364
+ print(f"Tokenizer loaded from {path}")
365
+
366
+ def __len__(self):
367
+ return len(self.vocab)
368
+
369
+
370
+ def create_and_train_tokenizer(texts: List[str], vocab_size: int = 32000, output_path: str = "tokenizer.pkl"):
371
+ """Create and train a tokenizer on the given texts."""
372
+ tokenizer = ByteLevelBPETokenizer(vocab_size=vocab_size)
373
+ tokenizer.train(texts)
374
+ tokenizer.save(output_path)
375
+ return tokenizer
376
+
377
+
378
+ if __name__ == "__main__":
379
+ # Test tokenizer
380
+ sample_texts = [
381
+ "Hello, world! This is a test.",
382
+ "The quick brown fox jumps over the lazy dog.",
383
+ "Machine learning is fascinating.",
384
+ "Artificial intelligence will change the world.",
385
+ ]
386
+
387
+ tokenizer = BPETokenizer(vocab_size=1000)
388
+ tokenizer.train(sample_texts)
389
+
390
+ test_text = "Hello world!"
391
+ encoded = tokenizer.encode(test_text)
392
+ decoded = tokenizer.decode(encoded)
393
+
394
+ print(f"\nOriginal: {test_text}")
395
+ print(f"Encoded: {encoded}")
396
+ print(f"Decoded: {decoded}")
train.py ADDED
@@ -0,0 +1,402 @@
1
+ """
2
+ VicAI Training Script
3
+ Distributed training with FSDP/DDP support.
4
+ """
5
+
6
+ import argparse
7
+ import os
8
+ import time
9
+ from contextlib import nullcontext
10
+ from pathlib import Path
11
+
12
+ import torch
13
+ import torch.distributed as dist
14
+ from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
15
+ from torch.distributed.fsdp.wrap import transformer_auto_wrap_policy
16
+ from torch.nn.parallel import DistributedDataParallel as DDP
17
+ from torch.utils.data import DataLoader
18
+ from torch.utils.data.distributed import DistributedSampler
19
+
20
+ from model import VicAIModel, VicAIConfig, create_vicai_5b
21
+ from tokenizer import ByteLevelBPETokenizer, BPETokenizer
22
+ from dataset import (
23
+ WikipediaDataset,
24
+ TextFileDataset,
25
+ MixedDataset,
26
+ create_sample_corpus,
27
+ )
28
+ from utils import (
29
+ get_logger,
30
+ load_checkpoint,
31
+ save_checkpoint,
32
+ get_lr_scheduler,
33
+ estimate_loss,
34
+ configure_optimizers,
35
+ )
36
+
37
+
38
+ def setup_distributed():
39
+ """Initialize distributed training."""
40
+ if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
41
+ rank = int(os.environ['RANK'])
42
+ world_size = int(os.environ['WORLD_SIZE'])
43
+ local_rank = int(os.environ.get('LOCAL_RANK', 0))
44
+ else:
45
+ rank = 0
46
+ world_size = 1
47
+ local_rank = 0
48
+
49
+ if world_size > 1:
50
+ dist.init_process_group(backend='nccl', rank=rank, world_size=world_size)
51
+ torch.cuda.set_device(local_rank)
52
+
53
+ return rank, world_size, local_rank
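setup_distributed only reads environment variables, so it is the launcher's job to set them; a small sketch of the expected setup (torchrun here is an assumption about how the script is meant to be launched, not something the script enforces):

import os
# torchrun --nproc_per_node=8 train.py --batch-size 4
# exports RANK, WORLD_SIZE and LOCAL_RANK for every worker process.
print(os.environ.get('RANK'), os.environ.get('WORLD_SIZE'), os.environ.get('LOCAL_RANK'))
# A plain `python train.py` leaves them unset, and the function falls back
# to rank 0 / world size 1 single-process training.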
54
+
55
+
56
+ def cleanup_distributed():
57
+ """Cleanup distributed training."""
58
+ if dist.is_initialized():
59
+ dist.destroy_process_group()
60
+
61
+
62
+ def get_data_loader(dataset, batch_size, world_size, rank, shuffle=True):
63
+ """Create distributed data loader."""
64
+ if world_size > 1:
65
+ sampler = DistributedSampler(
66
+ dataset,
67
+ num_replicas=world_size,
68
+ rank=rank,
69
+ shuffle=shuffle,
70
+ )
71
+ else:
72
+ sampler = None
73
+
74
+ loader = DataLoader(
75
+ dataset,
76
+ batch_size=batch_size,
77
+ sampler=sampler,
78
+ num_workers=4,
79
+ pin_memory=True,
80
+ drop_last=True,
81
+ )
82
+
83
+ return loader, sampler
84
+
85
+
86
+ def train_step(model, batch, optimizer, scaler, device, use_amp):
87
+ """Single training step."""
88
+ model.train()
89
+
90
+ input_ids = batch['input_ids'].to(device)
91
+ labels = batch['labels'].to(device)
92
+
93
+ optimizer.zero_grad()
94
+
95
+ with torch.cuda.amp.autocast(enabled=use_amp):
96
+ outputs = model(input_ids, targets=labels)
97
+ loss = outputs['loss']
98
+
99
+ if use_amp:
100
+ scaler.scale(loss).backward()
101
+ scaler.step(optimizer)
102
+ scaler.update()
103
+ else:
104
+ loss.backward()
105
+ optimizer.step()
106
+
107
+ return loss.item()
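Note that --grad-clip is parsed below but train_step as written never applies it; a hedged sketch of how clipping could be slotted in under AMP (unscale first, then clip; max_grad_norm is a hypothetical extra argument, not part of the current signature):

if use_amp:
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)                                   # grads back to their true scale
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    scaler.step(optimizer)
    scaler.update()
else:
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
    optimizer.step()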
108
+
109
+
110
+ def train(
111
+ model,
112
+ train_loader,
113
+ val_loader,
114
+ optimizer,
115
+ lr_scheduler,
116
+ scaler,
117
+ device,
118
+ args,
119
+ logger,
120
+ ):
121
+ """Main training loop."""
122
+ best_val_loss = float('inf')
123
+ step = 0
124
+
125
+ model.train()
126
+ train_iterator = iter(train_loader)
127
+
128
+ for epoch in range(args.max_epochs):
129
+ if hasattr(train_loader.sampler, 'set_epoch'):
130
+ train_loader.sampler.set_epoch(epoch)
131
+
132
+ epoch_start_time = time.time()
133
+
134
+ while step < args.max_steps:
135
+ try:
136
+ batch = next(train_iterator)
137
+ except StopIteration:
138
+ train_iterator = iter(train_loader)
139
+ batch = next(train_iterator)
140
+
141
+ # Training step
142
+ loss = train_step(model, batch, optimizer, scaler, device, args.use_amp)
143
+ lr_scheduler.step()
144
+
145
+ step += 1
146
+
147
+ # Logging
148
+ if step % args.log_interval == 0 and args.rank == 0:
149
+ lr = optimizer.param_groups[0]['lr']
150
+ logger.info(
151
+ f"Step {step}/{args.max_steps} | "
152
+ f"Loss: {loss:.4f} | LR: {lr:.2e}"
153
+ )
154
+
155
+ # Evaluation
156
+ if step % args.eval_interval == 0:
157
+ val_loss = evaluate(model, val_loader, device, args.use_amp)
158
+
159
+ if args.rank == 0:
160
+ logger.info(f"Validation loss: {val_loss:.4f}")
161
+
162
+ # Save best model
163
+ if val_loss < best_val_loss:
164
+ best_val_loss = val_loss
165
+ save_checkpoint(
166
+ model,
167
+ optimizer,
168
+ scaler,
169
+ step,
170
+ val_loss,
171
+ args.output_dir / 'best_model.pt',
172
+ )
173
+ logger.info(f"Saved best model with loss {val_loss:.4f}")
174
+
175
+ model.train()
176
+
177
+ # Regular checkpointing
178
+ if step % args.save_interval == 0 and args.rank == 0:
179
+ save_checkpoint(
180
+ model,
181
+ optimizer,
182
+ scaler,
183
+ step,
184
+ loss,
185
+ args.output_dir / f'checkpoint_step_{step}.pt',
186
+ )
187
+ logger.info(f"Saved checkpoint at step {step}")
188
+
189
+ if step >= args.max_steps:
190
+ break
191
+
192
+ epoch_time = time.time() - epoch_start_time
193
+ if args.rank == 0:
194
+ logger.info(f"Epoch {epoch + 1} completed in {epoch_time:.2f}s")
195
+
196
+ return step
197
+
198
+
199
+ def evaluate(model, data_loader, device, use_amp):
200
+ """Evaluate model on validation set."""
201
+ model.eval()
202
+ total_loss = 0
203
+ num_batches = 0
204
+
205
+ with torch.no_grad():
206
+ for batch in data_loader:
207
+ input_ids = batch['input_ids'].to(device)
208
+ labels = batch['labels'].to(device)
209
+
210
+ with torch.cuda.amp.autocast(enabled=use_amp):
211
+ outputs = model(input_ids, targets=labels)
212
+ loss = outputs['loss']
213
+
214
+ total_loss += loss.item()
215
+ num_batches += 1
216
+
217
+ if num_batches >= 100: # Limit eval batches
218
+ break
219
+
220
+ # Average across all processes
221
+ avg_loss = total_loss / num_batches
222
+
223
+ if dist.is_initialized():
224
+ loss_tensor = torch.tensor([avg_loss], device=device)
225
+ dist.all_reduce(loss_tensor, op=dist.ReduceOp.AVG)
226
+ avg_loss = loss_tensor.item()
227
+
228
+ return avg_loss
229
+
230
+
231
+ def main():
232
+ parser = argparse.ArgumentParser(description='Train VicAI')
233
+
234
+ # Model args
235
+ parser.add_argument('--vocab-size', type=int, default=32000)
236
+ parser.add_argument('--dim', type=int, default=4096)
237
+ parser.add_argument('--n-layers', type=int, default=32)
238
+ parser.add_argument('--n-heads', type=int, default=32)
239
+ parser.add_argument('--n-kv-heads', type=int, default=8)
240
+ parser.add_argument('--hidden-dim', type=int, default=14336)
241
+
242
+ # Training args
243
+ parser.add_argument('--batch-size', type=int, default=4)
244
+ parser.add_argument('--max-seq-len', type=int, default=2048)
245
+ parser.add_argument('--max-steps', type=int, default=100000)
246
+ parser.add_argument('--max-epochs', type=int, default=10)
247
+ parser.add_argument('--learning-rate', type=float, default=3e-4)
248
+ parser.add_argument('--min-lr', type=float, default=3e-5)
249
+ parser.add_argument('--warmup-steps', type=int, default=2000)
250
+ parser.add_argument('--weight-decay', type=float, default=0.1)
251
+ parser.add_argument('--grad-clip', type=float, default=1.0)
252
+ parser.add_argument('--beta1', type=float, default=0.9)
253
+ parser.add_argument('--beta2', type=float, default=0.95)
254
+
255
+ # Data args
256
+ parser.add_argument('--train-data', type=str, default='data/train.txt')
257
+ parser.add_argument('--val-data', type=str, default='data/val.txt')
258
+ parser.add_argument('--tokenizer-path', type=str, default='tokenizer.pkl')
259
+
260
+ # System args
261
+ parser.add_argument('--output-dir', type=str, default='checkpoints')
262
+ parser.add_argument('--resume', type=str, default=None)
263
+ parser.add_argument('--eval-interval', type=int, default=1000)
264
+ parser.add_argument('--save-interval', type=int, default=5000)
265
+ parser.add_argument('--log-interval', type=int, default=100)
266
+ parser.add_argument('--use-amp', action='store_true', default=True)
267
+ parser.add_argument('--use-fsdp', action='store_true', default=False)
268
+ parser.add_argument('--compile', action='store_true', default=False)
269
+
270
+ args = parser.parse_args()
271
+
272
+ # Setup
273
+ args.rank, args.world_size, args.local_rank = setup_distributed()
274
+ args.is_distributed = args.world_size > 1
275
+
276
+ # Create output directory
277
+ args.output_dir = Path(args.output_dir)
278
+ if args.rank == 0:
279
+ args.output_dir.mkdir(parents=True, exist_ok=True)
280
+
281
+ # Logger
282
+ logger = get_logger('vicai_train', args.output_dir / 'train.log' if args.rank == 0 else None)
283
+
284
+ if args.rank == 0:
285
+ logger.info(f"Starting VicAI training with {args.world_size} GPUs")
286
+ logger.info(f"Arguments: {args}")
287
+
288
+ # Device
289
+ device = torch.device(f'cuda:{args.local_rank}' if torch.cuda.is_available() else 'cpu')
290
+
291
+ # Load tokenizer
292
+ if os.path.exists(args.tokenizer_path):
293
+ logger.info(f"Loading tokenizer from {args.tokenizer_path}")
294
+ tokenizer = ByteLevelBPETokenizer()
295
+ tokenizer.load(args.tokenizer_path)
296
+ else:
297
+ logger.warning(f"Tokenizer not found at {args.tokenizer_path}, creating default")
298
+ tokenizer = ByteLevelBPETokenizer(vocab_size=args.vocab_size)
299
+ # Train on sample data
300
+ if args.rank == 0:
301
+ sample_file = create_sample_corpus(num_articles=100)
302
+ with open(sample_file, 'r') as f:
303
+ texts = f.read().split('<|endoftext|>')
304
+ tokenizer.train([t for t in texts if t.strip()])
305
+ tokenizer.save(args.tokenizer_path)
306
+
307
+ if args.is_distributed:
308
+ dist.barrier()
309
+
310
+ if args.rank != 0:
311
+ tokenizer.load(args.tokenizer_path)
312
+
313
+ # Create model
314
+ logger.info("Creating model...")
315
+ config = VicAIConfig(
316
+ vocab_size=len(tokenizer),
317
+ dim=args.dim,
318
+ n_layers=args.n_layers,
319
+ n_heads=args.n_heads,
320
+ n_kv_heads=args.n_kv_heads,
321
+ hidden_dim=args.hidden_dim,
322
+ max_seq_len=args.max_seq_len,
323
+ dropout=0.0,
324
+ )
325
+
326
+ if args.rank == 0:
327
+ logger.info(f"Model config: {config.__dict__}")
328
+ logger.info(f"Model parameters: ~{config.num_parameters / 1e9:.2f}B")
329
+
330
+ model = VicAIModel(config)
331
+
332
+ if args.use_fsdp and args.is_distributed:
333
+ model = FSDP(model, device_id=device)
334
+ elif args.is_distributed:
335
+ model = DDP(model.to(device), device_ids=[args.local_rank])  # DDP expects the module to already live on its device
336
+ else:
337
+ model = model.to(device)
338
+
339
+ if args.compile and hasattr(torch, 'compile'):
340
+ logger.info("Compiling model...")
341
+ model = torch.compile(model)
342
+
343
+ # Create datasets
344
+ logger.info("Creating datasets...")
345
+
346
+ if os.path.exists(args.train_data):
347
+ train_dataset = TextFileDataset(args.train_data, tokenizer, args.max_seq_len)
348
+ val_dataset = TextFileDataset(args.val_data, tokenizer, args.max_seq_len) if os.path.exists(args.val_data) else train_dataset
349
+ else:
350
+ logger.warning("Training data not found, using Wikipedia streaming dataset")
351
+ train_dataset = WikipediaDataset(tokenizer, max_length=args.max_seq_len)
352
+ val_dataset = WikipediaDataset(tokenizer, max_length=args.max_seq_len)
353
+
354
+ train_loader, train_sampler = get_data_loader(train_dataset, args.batch_size, args.world_size, args.rank)
355
+ val_loader, _ = get_data_loader(val_dataset, args.batch_size, args.world_size, args.rank, shuffle=False)
356
+
357
+ # Optimizer
358
+ optimizer = configure_optimizers(model, args)
359
+
360
+ # Learning rate scheduler
361
+ lr_scheduler = get_lr_scheduler(optimizer, args)
362
+
363
+ # Gradient scaler for AMP
364
+ scaler = torch.cuda.amp.GradScaler(enabled=args.use_amp)
365
+
366
+ # Resume from checkpoint
367
+ start_step = 0
368
+ if args.resume:
369
+ logger.info(f"Resuming from {args.resume}")
370
+ start_step = load_checkpoint(model, optimizer, scaler, args.resume, device)
371
+
372
+ # Training
373
+ logger.info("Starting training...")
374
+ final_step = train(
375
+ model,
376
+ train_loader,
377
+ val_loader,
378
+ optimizer,
379
+ lr_scheduler,
380
+ scaler,
381
+ device,
382
+ args,
383
+ logger,
384
+ )
385
+
386
+ # Save final model
387
+ if args.rank == 0:
388
+ save_checkpoint(
389
+ model,
390
+ optimizer,
391
+ scaler,
392
+ final_step,
393
+ 0.0,
394
+ args.output_dir / 'final_model.pt',
395
+ )
396
+ logger.info("Training completed!")
397
+
398
+ cleanup_distributed()
399
+
400
+
401
+ if __name__ == '__main__':
402
+ main()
utils.py ADDED
@@ -0,0 +1,359 @@
1
+ """
2
+ VicAI Utilities
3
+ Helper functions for training and evaluation.
4
+ """
5
+
6
+ import json
7
+ import logging
8
+ import math
9
+ import os
10
+ import sys
11
+ from pathlib import Path
12
+ from typing import Dict, Optional
13
+
14
+ import torch
15
+ import torch.distributed as dist
16
+ from torch.optim import AdamW
17
+
18
+
19
+ def get_logger(name: str, log_file: Optional[Path] = None) -> logging.Logger:
20
+ """Create a logger with file and console handlers."""
21
+ logger = logging.getLogger(name)
22
+ logger.setLevel(logging.INFO)
23
+
24
+ # Clear existing handlers
25
+ logger.handlers = []
26
+
27
+ # Formatter
28
+ formatter = logging.Formatter(
29
+ '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
30
+ datefmt='%Y-%m-%d %H:%M:%S'
31
+ )
32
+
33
+ # Console handler
34
+ console_handler = logging.StreamHandler(sys.stdout)
35
+ console_handler.setLevel(logging.INFO)
36
+ console_handler.setFormatter(formatter)
37
+ logger.addHandler(console_handler)
38
+
39
+ # File handler
40
+ if log_file:
41
+ log_file.parent.mkdir(parents=True, exist_ok=True)
42
+ file_handler = logging.FileHandler(log_file)
43
+ file_handler.setLevel(logging.INFO)
44
+ file_handler.setFormatter(formatter)
45
+ logger.addHandler(file_handler)
46
+
47
+ return logger
48
+
49
+
50
+ def save_checkpoint(
51
+ model,
52
+ optimizer,
53
+ scaler,
54
+ step: int,
55
+ loss: float,
56
+ path: Path,
57
+ ):
58
+ """Save model checkpoint."""
59
+ path.parent.mkdir(parents=True, exist_ok=True)
60
+
61
+ # Unwrap model if using DDP/FSDP
62
+ state_dict = model.state_dict()
63
+ if hasattr(model, 'module'):
64
+ state_dict = model.module.state_dict()
65
+
66
+ checkpoint = {
67
+ 'model': state_dict,
68
+ 'optimizer': optimizer.state_dict(),
69
+ 'scaler': scaler.state_dict() if scaler else None,
70
+ 'step': step,
71
+ 'loss': loss,
72
+ }
73
+
74
+ torch.save(checkpoint, path)
75
+
76
+
77
+ def load_checkpoint(
78
+ model,
79
+ optimizer,
80
+ scaler,
81
+ path: str,
82
+ device,
83
+ ):
84
+ """Load model checkpoint."""
85
+ checkpoint = torch.load(path, map_location=device)
86
+
87
+ # Handle both wrapped and unwrapped models
88
+ state_dict = checkpoint['model']
89
+ if hasattr(model, 'module'):
90
+ model.module.load_state_dict(state_dict)
91
+ else:
92
+ model.load_state_dict(state_dict)
93
+
94
+ optimizer.load_state_dict(checkpoint['optimizer'])
95
+
96
+ if scaler and checkpoint.get('scaler'):
97
+ scaler.load_state_dict(checkpoint['scaler'])
98
+
99
+ return checkpoint.get('step', 0)
100
+
101
+
102
+ def get_lr_scheduler(optimizer, args):
103
+ """Create learning rate scheduler with warmup and cosine decay."""
104
+
105
+ def lr_lambda(current_step):
106
+ if current_step < args.warmup_steps:
107
+ # Linear warmup
108
+ return current_step / args.warmup_steps
109
+ else:
110
+ # Cosine decay
111
+ progress = (current_step - args.warmup_steps) / (args.max_steps - args.warmup_steps)
112
+ progress = min(1.0, progress)
113
+ cosine_decay = 0.5 * (1 + math.cos(math.pi * progress))
114
+ return args.min_lr / args.learning_rate + (1 - args.min_lr / args.learning_rate) * cosine_decay
115
+
116
+ from torch.optim.lr_scheduler import LambdaLR
117
+ return LambdaLR(optimizer, lr_lambda)
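A quick sanity check of the schedule's shape with the script defaults (a standalone sketch mirroring lr_lambda above, not code the module exposes):

import math

def ratio(step, warmup=2000, max_steps=100_000, lr=3e-4, min_lr=3e-5):
    if step < warmup:
        return step / warmup
    progress = min(1.0, (step - warmup) / (max_steps - warmup))
    cosine = 0.5 * (1 + math.cos(math.pi * progress))
    return min_lr / lr + (1 - min_lr / lr) * cosine

for s in (0, 1000, 2000, 51_000, 100_000):
    print(s, round(ratio(s), 3))   # 0.0, 0.5, 1.0, 0.55, 0.1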
118
+
119
+
120
+ def configure_optimizers(model, args):
121
+ """Configure optimizer with weight decay."""
122
+ # Separate parameters that should and shouldn't have weight decay
123
+ decay_params = []
124
+ no_decay_params = []
125
+
126
+ for name, param in model.named_parameters():
127
+ if not param.requires_grad:
128
+ continue
129
+
130
+ # Don't apply weight decay to bias and normalization parameters
131
+ if 'bias' in name or 'norm' in name or 'embedding' in name:
132
+ no_decay_params.append(param)
133
+ else:
134
+ decay_params.append(param)
135
+
136
+ param_groups = [
137
+ {'params': decay_params, 'weight_decay': args.weight_decay},
138
+ {'params': no_decay_params, 'weight_decay': 0.0},
139
+ ]
140
+
141
+ optimizer = AdamW(
142
+ param_groups,
143
+ lr=args.learning_rate,
144
+ betas=(args.beta1, args.beta2),
145
+ eps=1e-8,
146
+ )
147
+
148
+ return optimizer
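A small sketch for verifying the decay/no-decay split (model and args stand for whatever instances the caller already has):

opt = configure_optimizers(model, args)
for i, group in enumerate(opt.param_groups):
    print(f"group {i}: weight_decay={group['weight_decay']}, tensors={len(group['params'])}")
# Group 0 should hold the matmul weights; group 1 the biases, norms and embeddings.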
149
+
150
+
151
+ def estimate_loss(model, data_loader, device, num_batches=10):
152
+ """Estimate loss on a data loader."""
153
+ model.eval()
154
+ total_loss = 0
155
+
156
+ with torch.no_grad():
157
+ for i, batch in enumerate(data_loader):
158
+ if i >= num_batches:
159
+ break
160
+
161
+ input_ids = batch['input_ids'].to(device)
162
+ labels = batch['labels'].to(device)
163
+
164
+ outputs = model(input_ids, targets=labels)
165
+ total_loss += outputs['loss'].item()
166
+
167
+ model.train()
168
+ return total_loss / num_batches
169
+
170
+
171
+ def get_grad_norm(model):
172
+ """Calculate gradient norm."""
173
+ total_norm = 0.0
174
+ for p in model.parameters():
175
+ if p.grad is not None:
176
+ total_norm += p.grad.data.norm(2).item() ** 2
177
+ return total_norm ** 0.5
178
+
179
+
180
+ def clip_gradients(model, max_norm):
181
+ """Clip gradients by norm."""
182
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
183
+
184
+
185
+ class AverageMeter:
186
+ """Track running average of metrics."""
187
+
188
+ def __init__(self):
189
+ self.reset()
190
+
191
+ def reset(self):
192
+ self.val = 0
193
+ self.avg = 0
194
+ self.sum = 0
195
+ self.count = 0
196
+
197
+ def update(self, val, n=1):
198
+ self.val = val
199
+ self.sum += val * n
200
+ self.count += n
201
+ self.avg = self.sum / self.count
202
+
203
+
204
+ class EarlyStopping:
205
+ """Early stopping to prevent overfitting."""
206
+
207
+ def __init__(self, patience=5, min_delta=0.0):
208
+ self.patience = patience
209
+ self.min_delta = min_delta
210
+ self.counter = 0
211
+ self.best_loss = None
212
+ self.early_stop = False
213
+
214
+ def __call__(self, val_loss):
215
+ if self.best_loss is None:
216
+ self.best_loss = val_loss
217
+ elif val_loss > self.best_loss - self.min_delta:
218
+ self.counter += 1
219
+ if self.counter >= self.patience:
220
+ self.early_stop = True
221
+ else:
222
+ self.best_loss = val_loss
223
+ self.counter = 0
224
+
225
+ return self.early_stop
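A minimal usage sketch (the loop in train.py does not currently wire this in; the loss values are invented so the counter trips after patience=3 non-improving checks):

stopper = EarlyStopping(patience=3, min_delta=0.01)
for val_loss in [2.0, 1.5, 1.51, 1.52, 1.50]:
    if stopper(val_loss):
        print("stopping early at", val_loss)
        break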
226
+
227
+
228
+ def count_parameters(model):
229
+ """Count trainable parameters."""
230
+ return sum(p.numel() for p in model.parameters() if p.requires_grad)
231
+
232
+
233
+ def format_num_parameters(num_params):
234
+ """Format parameter count for display."""
235
+ if num_params >= 1e9:
236
+ return f"{num_params / 1e9:.2f}B"
237
+ elif num_params >= 1e6:
238
+ return f"{num_params / 1e6:.2f}M"
239
+ elif num_params >= 1e3:
240
+ return f"{num_params / 1e3:.2f}K"
241
+ else:
242
+ return str(num_params)
243
+
244
+
245
+ def get_device_info():
246
+ """Get information about available GPUs."""
247
+ if not torch.cuda.is_available():
248
+ return "No CUDA available"
249
+
250
+ info = []
251
+ for i in range(torch.cuda.device_count()):
252
+ props = torch.cuda.get_device_properties(i)
253
+ info.append(
254
+ f"GPU {i}: {props.name} ({props.total_memory / 1e9:.1f} GB)"
255
+ )
256
+
257
+ return "\n".join(info)
258
+
259
+
260
+ def print_model_summary(model):
261
+ """Print a summary of the model architecture."""
262
+ print("\n" + "=" * 60)
263
+ print("MODEL SUMMARY")
264
+ print("=" * 60)
265
+
266
+ total_params = 0
267
+ trainable_params = 0
268
+
269
+ print(f"\n{'Layer':<40} {'Parameters':>15} {'Trainable':>10}")
270
+ print("-" * 70)
271
+
272
+ for name, param in model.named_parameters():
273
+ num_params = param.numel()
274
+ total_params += num_params
275
+ if param.requires_grad:
276
+ trainable_params += num_params
277
+ trainable = "Yes"
278
+ else:
279
+ trainable = "No"
280
+
281
+ print(f"{name:<40} {num_params:>15,} {trainable:>10}")
282
+
283
+ print("-" * 70)
284
+ print(f"{'Total':<40} {total_params:>15,}")
285
+ print(f"{'Trainable':<40} {trainable_params:>15,}")
286
+ print(f"{'Non-trainable':<40} {total_params - trainable_params:>15,}")
287
+ print("=" * 60 + "\n")
288
+
289
+
290
+ def save_training_config(args, output_path: Path):
291
+ """Save training configuration to JSON."""
292
+ config = vars(args)
293
+ with open(output_path, 'w') as f:
294
+ json.dump(config, f, indent=2)
295
+
296
+
297
+ def load_training_config(config_path: Path):
298
+ """Load training configuration from JSON."""
299
+ with open(config_path, 'r') as f:
300
+ return json.load(f)
301
+
302
+
303
+ def all_reduce_dict(data: Dict, device):
304
+ """All reduce dictionary values across processes."""
305
+ if not dist.is_initialized():
306
+ return data
307
+
308
+ reduced_data = {}
309
+ for key, value in data.items():
310
+ if isinstance(value, (int, float)):
311
+ tensor = torch.tensor([value], device=device)
312
+ dist.all_reduce(tensor, op=dist.ReduceOp.AVG)
313
+ reduced_data[key] = tensor.item()
314
+ else:
315
+ reduced_data[key] = value
316
+
317
+ return reduced_data
318
+
319
+
320
+ def set_seed(seed: int):
321
+ """Set random seed for reproducibility."""
322
+ import random
323
+ import numpy as np
324
+
325
+ random.seed(seed)
326
+ np.random.seed(seed)
327
+ torch.manual_seed(seed)
328
+ torch.cuda.manual_seed_all(seed)
329
+
330
+ # For deterministic operations (may be slower)
331
+ torch.backends.cudnn.deterministic = True
332
+ torch.backends.cudnn.benchmark = False
333
+
334
+
335
+ def get_memory_usage():
336
+ """Get current memory usage."""
337
+ if torch.cuda.is_available():
338
+ allocated = torch.cuda.memory_allocated() / 1e9
339
+ reserved = torch.cuda.memory_reserved() / 1e9
340
+ max_allocated = torch.cuda.max_memory_allocated() / 1e9
341
+ return {
342
+ 'allocated_gb': allocated,
343
+ 'reserved_gb': reserved,
344
+ 'max_allocated_gb': max_allocated,
345
+ }
346
+ return {'allocated_gb': 0, 'reserved_gb': 0, 'max_allocated_gb': 0}
347
+
348
+
349
+ if __name__ == "__main__":
350
+ # Test utilities
351
+ logger = get_logger("test")
352
+ logger.info("Testing logger")
353
+
354
+ print(get_device_info())
355
+
356
+ meter = AverageMeter()
357
+ for i in range(10):
358
+ meter.update(i)
359
+ print(f"Average: {meter.avg}")
vocab.json ADDED
@@ -0,0 +1,2002 @@
1
+ {
2
+ "<|endoftext|>": 0,
3
+ "<|pad|>": 1,
4
+ "€": 2,
5
+ "": 3,
6
+ "‚": 4,
7
+ "ƒ": 5,
8
+ "„": 6,
9
+ "…": 7,
10
+ "†": 8,
11
+ "‡": 9,
12
+ "ˆ": 10,
13
+ "‰": 11,
14
+ "Š": 12,
15
+ "‹": 13,
16
+ "Œ": 14,
17
+ "": 15,
18
+ "Ž": 16,
19
+ "": 17,
20
+ "": 18,
21
+ "‘": 19,
22
+ "’": 20,
23
+ "“": 21,
24
+ "”": 22,
25
+ "•": 23,
26
+ "–": 24,
27
+ "—": 25,
28
+ "˜": 26,
29
+ "™": 27,
30
+ "š": 28,
31
+ "›": 29,
32
+ "œ": 30,
33
+ "": 31,
34
+ "ž": 32,
35
+ "Ÿ": 33,
36
+ " ": 34,
37
+ "¡": 35,
38
+ "¢": 36,
39
+ "£": 37,
40
+ "¤": 38,
41
+ "¥": 39,
42
+ "¦": 40,
43
+ "§": 41,
44
+ "¨": 42,
45
+ "©": 43,
46
+ "ª": 44,
47
+ "«": 45,
48
+ "¬": 46,
49
+ "­": 47,
50
+ "®": 48,
51
+ "¯": 49,
52
+ "°": 50,
53
+ "±": 51,
54
+ "²": 52,
55
+ "³": 53,
56
+ "´": 54,
57
+ "µ": 55,
58
+ "¶": 56,
59
+ "·": 57,
60
+ "¸": 58,
61
+ "¹": 59,
62
+ "º": 60,
63
+ "»": 61,
64
+ "¼": 62,
65
+ "½": 63,
66
+ "¾": 64,
67
+ "¿": 65,
68
+ "À": 66,
69
+ "Á": 67,
70
+ "Â": 68,
71
+ "Ã": 69,
72
+ "Ä": 70,
73
+ "Å": 71,
74
+ "Æ": 72,
75
+ "Ç": 73,
76
+ "È": 74,
77
+ "É": 75,
78
+ "Ê": 76,
79
+ "Ë": 77,
80
+ "Ì": 78,
81
+ "Í": 79,
82
+ "Î": 80,
83
+ "Ï": 81,
84
+ "Ð": 82,
85
+ "Ñ": 83,
86
+ "Ò": 84,
87
+ "Ó": 85,
88
+ "Ô": 86,
89
+ "Õ": 87,
90
+ "Ö": 88,
91
+ "×": 89,
92
+ "Ø": 90,
93
+ "Ù": 91,
94
+ "Ú": 92,
95
+ "Û": 93,
96
+ "Ü": 94,
97
+ "Ý": 95,
98
+ "Þ": 96,
99
+ "ß": 97,
100
+ "à": 98,
101
+ "á": 99,
102
+ "â": 100,
103
+ "ã": 101,
104
+ "ä": 102,
105
+ "å": 103,
106
+ "æ": 104,
107
+ "ç": 105,
108
+ "è": 106,
109
+ "é": 107,
110
+ "ê": 108,
111
+ "ë": 109,
112
+ "ì": 110,
113
+ "í": 111,
114
+ "î": 112,
115
+ "ï": 113,
116
+ "ð": 114,
117
+ "ñ": 115,
118
+ "ò": 116,
119
+ "ó": 117,
120
+ "ô": 118,
121
+ "õ": 119,
122
+ "ö": 120,
123
+ "÷": 121,
124
+ "ø": 122,
125
+ "ù": 123,
126
+ "ú": 124,
127
+ "û": 125,
128
+ "ü": 126,
129
+ "ý": 127,
130
+ "þ": 128,
131
+ "ÿ": 129,
132
+ "Ā": 130,
133
+ "ā": 131,
134
+ "Ă": 132,
135
+ "ă": 133,
136
+ "Ą": 134,
137
+ "ą": 135,
138
+ "Ć": 136,
139
+ "ć": 137,
140
+ "Ĉ": 138,
141
+ "ĉ": 139,
142
+ "Ċ": 140,
143
+ "ċ": 141,
144
+ "Č": 142,
145
+ "č": 143,
146
+ "Ď": 144,
147
+ "ď": 145,
148
+ "Đ": 146,
149
+ "đ": 147,
150
+ "Ē": 148,
151
+ "ē": 149,
152
+ "Ĕ": 150,
153
+ "ĕ": 151,
154
+ "Ė": 152,
155
+ "ė": 153,
156
+ "Ę": 154,
157
+ "ę": 155,
158
+ "Ě": 156,
159
+ "ě": 157,
160
+ "Ĝ": 158,
161
+ "ĝ": 159,
162
+ "Ğ": 160,
163
+ "ğ": 161,
164
+ "Ġ": 162,
165
+ "ġ": 163,
166
+ "Ģ": 164,
167
+ "ģ": 165,
168
+ "Ĥ": 166,
169
+ "ĥ": 167,
170
+ "Ħ": 168,
171
+ "ħ": 169,
172
+ "Ĩ": 170,
173
+ "ĩ": 171,
174
+ "Ī": 172,
175
+ "ī": 173,
176
+ "Ĭ": 174,
177
+ "ĭ": 175,
178
+ "Į": 176,
179
+ "į": 177,
180
+ "İ": 178,
181
+ "ı": 179,
182
+ "IJ": 180,
183
+ "ij": 181,
184
+ "Ĵ": 182,
185
+ "ĵ": 183,
186
+ "Ķ": 184,
187
+ "ķ": 185,
188
+ "ĸ": 186,
189
+ "Ĺ": 187,
190
+ "ĺ": 188,
191
+ "Ļ": 189,
192
+ "ļ": 190,
193
+ "Ľ": 191,
194
+ "ľ": 192,
195
+ "Ŀ": 193,
196
+ "ŀ": 194,
197
+ "Ł": 195,
198
+ "ł": 196,
199
+ "Ń": 197,
200
+ "ń": 198,
201
+ "Ņ": 199,
202
+ "ņ": 200,
203
+ "Ň": 201,
204
+ "ň": 202,
205
+ "ʼn": 203,
206
+ "Ŋ": 204,
207
+ "ŋ": 205,
208
+ "Ō": 206,
209
+ "ō": 207,
210
+ "Ŏ": 208,
211
+ "ŏ": 209,
212
+ "Ő": 210,
213
+ "ő": 211,
214
+ "Œ": 212,
215
+ "œ": 213,
216
+ "Ŕ": 214,
217
+ "ŕ": 215,
218
+ "Ŗ": 216,
219
+ "ŗ": 217,
220
+ "Ř": 218,
221
+ "ř": 219,
222
+ "Ś": 220,
223
+ "ś": 221,
224
+ "Ŝ": 222,
225
+ "ŝ": 223,
226
+ "Ş": 224,
227
+ "ş": 225,
228
+ "Š": 226,
229
+ "š": 227,
230
+ "Ţ": 228,
231
+ "ţ": 229,
232
+ "Ť": 230,
233
+ "ť": 231,
234
+ "Ŧ": 232,
235
+ "ŧ": 233,
236
+ "Ũ": 234,
237
+ "ũ": 235,
238
+ "Ū": 236,
239
+ "ū": 237,
240
+ "Ŭ": 238,
241
+ "ŭ": 239,
242
+ "Ů": 240,
243
+ "ů": 241,
244
+ "Ű": 242,
245
+ "ű": 243,
246
+ "Ų": 244,
247
+ "ų": 245,
248
+ "Ŵ": 246,
249
+ "ŵ": 247,
250
+ "Ŷ": 248,
251
+ "ŷ": 249,
252
+ "Ÿ": 250,
253
+ "Ź": 251,
254
+ "ź": 252,
255
+ "Ż": 253,
256
+ "ż": 254,
257
+ "Ž": 255,
258
+ "ž": 256,
259
+ "ſ": 257,
260
+ "  ": 258,
261
+ "    ": 259,
262
+ "        ": 260,
263
+ "Š        ": 261,
264
+ "éî": 262,
265
+ "åî": 263,
266
+ "Š    ": 264,
267
+ "óå": 265,
268
+ "åò": 266,
269
+ "½ ": 267,
270
+ "áô": 268,
271
+ "ôï": 269,
272
+ " ½ ": 270,
273
+ "¬ ": 271,
274
+ "áò": 272,
275
+ "äå": 273,
276
+ "ïò": 274,
277
+ "ìæ": 275,
278
+ "óåìæ": 276,
279
+ "Š            ": 277,
280
+ "ôå": 278,
281
+ "¢¢": 279,
282
+ "òå": 280,
283
+ "áì": 281,
284
+ "óåìæ®": 282,
285
+ "ëåî": 283,
286
+ "ïî": 284,
287
+ "ãè": 285,
288
+ "óô": 286,
289
+ "ôïëåî": 287,
290
+ "ìï": 288,
291
+ "éú": 289,
292
+ "ôé": 290,
293
+ "©Š        ": 291,
294
+ "ìå": 292,
295
+ "æ ": 293,
296
+ "ô¨": 294,
297
+ "º ": 295,
298
+ "©Š    ": 296,
299
+ "áä": 297,
300
+ "ó®": 298,
301
+ "äé": 299,
302
+ "ºŠ        ": 300,
303
+ "õô": 301,
304
+ "íð": 302,
305
+ "ô ": 303,
306
+ "áòç": 304,
307
+ "òá": 305,
308
+ "ðò": 306,
309
+ "áâ": 307,
310
+ "ïäå": 308,
311
+ "¬Š        ": 309,
312
+ "¢¢¢": 310,
313
+ "éä": 311,
314
+ "ó ": 312,
315
+ "íá": 313,
316
+ "éîç": 314,
317
+ "ðáò": 315,
318
+ "çå": 316,
319
+ "ðå": 317,
320
+ "éúåò": 318,
321
+ "áôá": 319,
322
+ "ôåø": 320,
323
+ "ãå": 321,
324
+ "ó ½ ": 322,
325
+ "áí": 323,
326
+ "ìåî": 324,
327
+ "õí": 325,
328
+ "��": 326,
329
+ "áôå": 327,
330
+ "éæ ": 328,
331
+ "ïã": 329,
332
+ "éô": 330,
333
+ "öïã": 331,
334
+ "öïãáâ": 332,
335
+ "£ ": 333,
336
+ "ßéä": 334,
337
+ "ôåøô": 335,
338
+ "ïäåì": 336,
339
+ "æïò": 337,
340
+ "éîô": 338,
341
+ "éîô¨": 339,
342
+ "äß": 340,
343
+ "äáôá": 341,
344
+ "áòçó®": 342,
345
+ "ðõô": 343,
346
+ "òãè": 344,
347
+ "©ºŠ        ": 345,
348
+ "©Š    Š    ": 346,
349
+ "ôïòãè": 347,
350
+ "æé": 348,
351
+ "íïäåì": 349,
352
+ "ßôïëåî": 350,
353
+ "ðòéîô¨": 351,
354
+ "òï": 352,
355
+ "ôïòãè®": 353,
356
+ "äåæ ": 354,
357
+ "íáø": 355,
358
+ "éî ": 356,
359
+ "õå": 357,
360
+ "åì": 358,
361
+ "ôïëåîéúåò": 359,
362
+ "õò": 360,
363
+ "öáì": 361,
364
+ "éúå": 362,
365
+ "ä ": 363,
366
+ "ßó": 364,
367
+ "ðáòáí": 365,
368
+ "ßß": 366,
369
+ "æïò ": 367,
370
+ "© ": 368,
371
+ "ïõô": 369,
372
+ "äéí": 370,
373
+ "ºŠ            ": 371,
374
+ "óôò": 372,
375
+ "ôéïî": 373,
376
+ "©Š            ": 374,
377
+ "öé": 375,
378
+ "æ¢": 376,
379
+ "óåô": 377,
380
+ "Š        óåìæ®": 378,
381
+ "ôòá": 379,
382
+ "õì": 380,
383
+ "õî": 381,
384
+ "áî": 382,
385
+ "íáøß": 383,
386
+ "÷ïò": 384,
387
+ "ìïç": 385,
388
+ "ôåò": 386,
389
+ "î ": 387,
390
+ "§¬ ": 388,
391
+ "òåô": 389,
392
+ "òåôõò": 390,
393
+ "§º ": 391,
394
+ "òåôõòî ": 392,
395
+ "¬Š    ": 393,
396
+ "ðá": 394,
397
+ "ðòï": 395,
398
+ "éîç ": 396,
399
+ "Š    Š    ": 397,
400
+ " éî ": 398,
401
+ "ôòáéî": 399,
402
+ "òáî": 400,
403
+ "öéãå": 401,
404
+ "            ": 402,
405
+ "èå": 403,
406
+ "ðåî": 404,
407
+ "ôù": 405,
408
+ "ðï": 406,
409
+ "ßë": 407,
410
+ "ðòéîô¨æ¢": 408,
411
+ "ðõôßéä": 409,
412
+ "äåöéãå": 410,
413
+ "ãï": 411,
414
+ "áôè": 412,
415
+ "öå": 413,
416
+ "ãïî": 414,
417
+ "éíð": 415,
418
+ "ó¨": 416,
419
+ "ìïó": 417,
420
+ "ºŠ    ": 418,
421
+ "©Š        Š        ": 419,
422
+ "îõí": 420,
423
+ "óÛ": 421,
424
+ "§Ý": 422,
425
+ "ôéí": 423,
426
+ "éíðïò": 424,
427
+ "éíðïòô ": 425,
428
+ "ŠŠŠ": 426,
429
+ "åä": 427,
430
+ "ôïð": 428,
431
+ "îå": 429,
432
+ "óåò": 430,
433
+ "èåáä": 431,
434
+ "éîðõôßéä": 432,
435
+ "åîåò": 433,
436
+ "í ": 434,
437
+ "°°": 435,
438
+ "ùôå": 436,
439
+ "ìïáä": 437,
440
+ "ô¨§": 438,
441
+ "éã": 439,
442
+ "íåò": 440,
443
+ "éóô": 441,
444
+ "®¢¢¢": 442,
445
+ "áôé": 443,
446
+ "éôè": 444,
447
+ "ßóéúå": 445,
448
+ "ºŠ    ¢¢¢": 446,
449
+ "ä¨": 447,
450
+ "÷éôè": 448,
451
+ "õìô": 449,
452
+ "ª ": 450,
453
+ "« ": 451,
454
+ "óß": 452,
455
+ "æéç": 453,
456
+ "¬Š            ": 454,
457
+ "ý¢": 455,
458
+ "åø": 456,
459
+ "ìïçç": 457,
460
+ "ôåð": 458,
461
+ "éç": 459,
462
+ "áíð": 460,
463
+ "ðáôè": 461,
464
+ "áääß": 462,
465
+ "ìåî¨": 463,
466
+ "ôè": 464,
467
+ "æòï": 465,
468
+ "ìá": 466,
469
+ "ßðáòáí": 467,
470
+ "îå÷": 468,
471
+ "ðáòóåò": 469,
472
+ "æá": 470,
473
+ "íåòçå": 471,
474
+ "ãïäå": 472,
475
+ "æòïí ": 473,
476
+ "º éîô": 474,
477
+ "ïîå": 475,
478
+ "­ ": 476,
479
+ "¬Š                ": 477,
480
+ "çåîåò": 478,
481
+ "õíåî": 479,
482
+ "äåæá": 480,
483
+ "äåæáõìô": 481,
484
+ "÷ïòä": 482,
485
+ "ðòïíð": 483,
486
+ "áôéïî": 484,
487
+ "ó½": 485,
488
+ "ïõôðõô": 486,
489
+ "ïð": 487,
490
+ "ðáòóåò®": 488,
491
+ "æéìå": 489,
492
+ "ôåî": 490,
493
+ "ßäé": 491,
494
+ "ìåîç": 492,
495
+ "ìåîçôè": 493,
496
+ "âùôå": 494,
497
+ "©ºŠ    ¢¢¢": 495,
498
+ "óåìæ¬ ": 496,
499
+ "óåñ": 497,
500
+ "óïò": 498,
501
+ "öïãáâßóéúå": 499,
502
+ "ãìå": 500,
503
+ "ãèå": 501,
504
+ "ðáòóåò®áääß": 502,
505
+ "ðáòóåò®áääßáòç": 503,
506
+ "ðáòóåò®áääßáòçõíåî": 504,
507
+ "ðáòóåò®áääßáòçõíåîô¨§": 505,
508
+ "ðáòóåò®áääßáòçõíåîô¨§­": 506,
509
+ "ðáòóåò®áääßáòçõíåîô¨§­­": 507,
510
+ "õð": 508,
511
+ "ãé": 509,
512
+ "ôß": 510,
513
+ "Îïîå": 511,
514
+ "ßèåáä": 512,
515
+ "ãë": 513,
516
+ "÷éôè ": 514,
517
+ "ãïîæéç": 515,
518
+ "®ó": 516,
519
+ "õîë": 517,
520
+ "ìïççåò": 518,
521
+ "äåæáõìô½": 519,
522
+ "ôï ": 520,
523
+ "֌": 521,
524
+ "öáìõå": 522,
525
+ "ºŠ                ": 523,
526
+ "©Š    Š    £ ": 524,
527
+ "äáôáóåô": 525,
528
+ "äõ": 526,
529
+ "Š    Š    äåæ ": 527,
530
+ "æìï": 528,
531
+ "æìïáô": 529,
532
+ "òõå": 530,
533
+ " « ": 531,
534
+ "§¬ ôù": 532,
535
+ "§¬ ôùðå": 533,
536
+ "§¬ ôùðå½": 534,
537
+ "©Š    ðáòóåò®áääßáòçõíåîô¨§­­": 535,
538
+ "ðòéîô¨¢": 536,
539
+ "îî": 537,
540
+ "÷åéç": 538,
541
+ "÷åéçè": 539,
542
+ "¯ ": 540,
543
+ "óã": 541,
544
+ "âáô": 542,
545
+ "©Š                ": 543,
546
+ "ŠŠŠäåæ ": 544,
547
+ "¬ äåæáõìô½": 545,
548
+ "ðåãé": 546,
549
+ "íïäåì®": 547,
550
+ "áíå": 548,
551
+ "íâ": 549,
552
+ "áð": 550,
553
+ "çåîåòáôå": 551,
554
+ "áòôé": 552,
555
+ "îî®": 553,
556
+ "ïòí": 554,
557
+ "åò¨": 555,
558
+ " ª ": 556,
559
+ "Š        Š        ": 557,
560
+ "ùß": 558,
561
+ " ½ óåìæ®": 559,
562
+ "Š                ": 560,
563
+ "ßôïëåîó": 561,
564
+ "åòáô": 562,
565
+ "½½ ": 563,
566
+ "åóô": 564,
567
+ "óôòé": 565,
568
+ "áòôéãìå": 566,
569
+ "óðåãé": 567,
570
+ "õòå": 568,
571
+ "ù ": 569,
572
+ "®°": 570,
573
+ "ãá": 571,
574
+ "ãèõîë": 572,
575
+ "óë": 573,
576
+ "îï": 574,
577
+ "ìå ": 575,
578
+ "ãèåãë": 576,
579
+ "ãèåãëðï": 577,
580
+ "ßìïó": 578,
581
+ "ïðôéí": 579,
582
+ "ôïëåîéúåò®": 580,
583
+ "ßߨ": 581,
584
+ "Ôòõå": 582,
585
+ "ó¬ ": 583,
586
+ "éîæ": 584,
587
+ "ßôïëåîßéä": 585,
588
+ "Äáôá": 586,
589
+ "òáîë": 587,
590
+ "óðåãéáì": 588,
591
+ "åôåò": 589,
592
+ "º éîô ½ ": 590,
593
+ "íáóë": 591,
594
+ "óÛ§": 592,
595
+ "®®": 593,
596
+ "º û": 594,
597
+ "éë": 595,
598
+ "ïðôéíéúåò": 596,
599
+ "ý ": 597,
600
+ "éîéô": 598,
601
+ "÷åéçèô": 599,
602
+ " ½ °": 600,
603
+ "áöå": 601,
604
+ "åô": 602,
605
+ "áôå ": 603,
606
+ "±Ý": 604,
607
+ "áîä ": 605,
608
+ " ½½ ": 606,
609
+ "Ôï": 607,
610
+ "Û§": 608,
611
+ "íáøßìåîçôè": 609,
612
+ "Öéã": 610,
613
+ "ÖéãÁ": 611,
614
+ "ÖéãÁÉ": 612,
615
+ "©Š        óåìæ®": 613,
616
+ "ïó": 614,
617
+ "óåñß": 615,
618
+ "óåñßìåî": 616,
619
+ "éëé": 617,
620
+ "éò": 618,
621
+ "âùôåß": 619,
622
+ "ðòïíðô": 620,
623
+ "éôå": 621,
624
+ "ÐÅ": 622,
625
+ "òåó": 623,
626
+ "íï": 624,
627
+ "âáôãè": 625,
628
+ "©Š            Š            ": 626,
629
+ "á ": 627,
630
+ "Ôïëåî": 628,
631
+ "áã": 629,
632
+ "ìáâ": 630,
633
+ "Üî": 631,
634
+ "áó ": 632,
635
+ "©®": 633,
636
+ "ßäéí": 634,
637
+ "ðáóô": 635,
638
+ "ôéïîß": 636,
639
+ "éîæï": 637,
640
+ "çåôß": 638,
641
+ "åìóå": 639,
642
+ "ôéíå": 640,
643
+ "½§": 641,
644
+ "ðáéò": 642,
645
+ "÷áò": 643,
646
+ "òåñ": 644,
647
+ "âõ": 645,
648
+ "èá": 646,
649
+ "äòï": 647,
650
+ "ôáì": 648,
651
+ "¬Š            §": 649,
652
+ "ôåíð": 650,
653
+ "ÂÐÅ": 651,
654
+ "ãèåãëðïéîô": 652,
655
+ "îõíß": 653,
656
+ "ìïççåò®": 654,
657
+ "äï": 655,
658
+ "óåìæ®öïãáâ": 656,
659
+ "åîãïäå": 657,
660
+ "©Š    Š    äåæ ": 658,
661
+ " ½ ôïòãè®": 659,
662
+ "éî¨": 660,
663
+ "Ý ½ ": 661,
664
+ "ðáóôßë": 662,
665
+ "«½ ": 663,
666
+ "áððåî": 664,
667
+ "áððåîä¨": 665,
668
+ "çò": 666,
669
+ "åòáôõòå": 667,
670
+ "¾ ": 668,
671
+ "®¢¢¢Š    ": 669,
672
+ "îáíå": 670,
673
+ "ìé": 671,
674
+ "ìåò": 672,
675
+ "Ôïëåîéúåò": 673,
676
+ "ìïã": 674,
677
+ "äá": 675,
678
+ "óãáì": 676,
679
+ "óãáìåò": 677,
680
+ "óôåð": 678,
681
+ "áö": 679,
682
+ "º óôò": 680,
683
+ "ãïä": 681,
684
+ "ðôéïî": 682,
685
+ "ßßéîéô": 683,
686
+ "ßßéîéôßߨ": 684,
687
+ "åùß": 685,
688
+ "åùßöáìõå": 686,
689
+ "ôïôáì": 687,
690
+ "©ºŠ            ": 688,
691
+ "éîðõôßéäó": 689,
692
+ "ó ½ Û": 690,
693
+ "ôåíðåòáôõòå": 691,
694
+ "Ìï": 692,
695
+ "éëéðå": 693,
696
+ "éëéðåäé": 694,
697
+ "Éî": 695,
698
+ "óåß": 696,
699
+ "ôéîç": 697,
700
+ "Šæòïí ": 698,
701
+ "çå¨": 699,
702
+ "äòïð": 700,
703
+ "òåðå": 701,
704
+ "îïòí": 702,
705
+ "ìïçéô": 703,
706
+ "óéúå": 704,
707
+ "çòáä": 705,
708
+ "íáøßîå÷": 706,
709
+ "ßð": 707,
710
+ "ðåîáì": 708,
711
+ "ðåîáìôù": 709,
712
+ "óôòéâõ": 710,
713
+ "óôòéâõôå": 711,
714
+ "ãõ": 712,
715
+ "óáíð": 713,
716
+ "󮢢¢": 714,
717
+ "ôåøô ½ ": 715,
718
+ "áîä": 716,
719
+ "óåôôéîç": 717,
720
+ "ãô": 718,
721
+ "ðáòáíåôåò": 719,
722
+ "äòïðïõô": 720,
723
+ "îïô ": 721,
724
+ "îõíßðáòáí": 722,
725
+ "¨§": 723,
726
+ "Š            Š            ": 724,
727
+ "©ºŠ                ": 725,
728
+ "×éëéðåäé": 726,
729
+ "ìïáäåò": 727,
730
+ "ßóôåð": 728,
731
+ "ìáâåì": 729,
732
+ "âåóô": 730,
733
+ "éîæï¨": 731,
734
+ "äõìå": 732,
735
+ "ùåò": 733,
736
+ "íâåä": 734,
737
+ "±°°": 735,
738
+ "¨óåìæ¬ ": 736,
739
+ "ðáóôßëåùßöáìõå": 737,
740
+ "º®": 738,
741
+ "Ãòå": 739,
742
+ "ôò": 740,
743
+ "æïò é": 741,
744
+ "òåðåôé": 742,
745
+ "ôåøô ": 743,
746
+ "ãõäá": 744,
747
+ "éôåí": 745,
748
+ "¬ §": 746,
749
+ "ßäéò": 747,
750
+ "öåò": 748,
751
+ "ßðáôè": 749,
752
+ "¢©Š    ": 750,
753
+ "ãïäåò": 751,
754
+ "óó ": 752,
755
+ "îßèåáä": 753,
756
+ "îßë": 754,
757
+ "îßëö": 755,
758
+ "îßëößèåáä": 756,
759
+ "îß": 757,
760
+ "éîå": 758,
761
+ "éó ": 759,
762
+ "äåî": 760,
763
+ "áôéïî ": 761,
764
+ "ïì": 762,
765
+ "󊠠  ": 763,
766
+ "Ìéóô": 764,
767
+ "òáîçå¨": 765,
768
+ "¬ äåöéãå": 766,
769
+ "ôïðßë": 767,
770
+ "ôïðßð": 768,
771
+ "ïóßôïëåîßéä": 769,
772
+ "öåì": 770,
773
+ "ôïß": 771,
774
+ "Ìïáä": 772,
775
+ "ìò": 773,
776
+ "÷ïòì": 774,
777
+ "áâìå": 775,
778
+ "ôòáéîß": 776,
779
+ "óôáò": 777,
780
+ "ìïççåò®éîæï¨": 778,
781
+ "äåãá": 779,
782
+ "óðåãéáìßôïëåî": 780,
783
+ "®ê": 781,
784
+ " ­ ": 782,
785
+ "Û±Ý": 783,
786
+ "óôáôå": 784,
787
+ "óôáôåßäé": 785,
788
+ "áçå": 786,
789
+ "Šéíðïòô ": 787,
790
+ "óº ": 788,
791
+ "±å": 789,
792
+ "äéîç": 790,
793
+ "èéä": 791,
794
+ "èéääåî": 792,
795
+ " éî òáîçå¨": 793,
796
+ "ìéóô": 794,
797
+ "°Ý": 795,
798
+ "ðõ": 796,
799
+ "©ŠŠŠäåæ ": 797,
800
+ " ½½ °": 798,
801
+ "éæ áòçó®": 799,
802
+ "ôåîóïò": 800,
803
+ "§¬ ôùðå½éîô": 801,
804
+ "§¬ ôùðå½éîô¬ äåæáõìô½": 802,
805
+ "äåãáù": 803,
806
+ "½áòçó®": 804,
807
+ "îå÷ß": 805,
808
+ "íåòçåó": 806,
809
+ "Ïðôéïî": 807,
810
+ "Ïðôéïîáì": 808,
811
+ "ŠŠŠã": 809,
812
+ "ŠŠŠãìá": 810,
813
+ "ŠŠŠãìáóó ": 811,
814
+ "Š    Š    äåæ ßßéîéôßߨ": 812,
815
+ "­±": 813,
816
+ "óõí": 814,
817
+ "Æáì": 815,
818
+ "Æáìóå": 816,
819
+ "èáðå": 817,
820
+ "³²": 818,
821
+ "ìáùåò": 819,
822
+ "ãïîæéç®": 820,
823
+ "ó ½ ÛÝ": 821,
824
+ "ó®áððåîä¨": 822,
825
+ "åîô": 823,
826
+ " áîä ": 824,
827
+ "Ôòá": 825,
828
+ " éíðïòô ": 826,
829
+ "áöç": 827,
830
+ " §": 828,
831
+ " ½ û": 829,
832
+ "é « ": 830,
833
+ "®êï": 831,
834
+ "®êïéî¨": 832,
835
+ "òïò": 833,
836
+ "óåôôéîçóÛ§": 834,
837
+ "çåîåòáôåäß": 835,
838
+ " ½ îî®": 836,
839
+ "äéí½": 837,
840
+ "íáøßóåñßìåî": 838,
841
+ "âá": 839,
842
+ "¨óåìæ®": 840,
843
+ "ó ½ ôïòãè®": 841,
844
+ "ãèåä": 842,
845
+ "áôôåî": 843,
846
+ "èéääåîßäéí": 844,
847
+ "Ãïî": 845,
848
+ "®¢¢¢Š        ": 846,
849
+ "Ãòåáôå ": 847,
850
+ "®ôï": 848,
851
+ "íáøßîå÷ßôïëåîó": 849,
852
+ "òåðåôéôéïîß": 850,
853
+ "òåðåôéôéïîßðåîáìôù": 851,
854
+ "åïóßôïëåîßéä": 852,
855
+ "ôïëåîßéä": 853,
856
+ "óßôïß": 854,
857
+ "íïöå": 855,
858
+ "áôôåò": 856,
859
+ "© ­": 857,
860
+ "© ­¾ ": 858,
861
+ "ÂÐÅÔïëåîéúåò": 859,
862
+ "ìåß": 860,
863
+ "óáöå": 861,
864
+ "ïó®": 862,
865
+ "÷ïòìäß": 863,
866
+ "÷ïòìäßóéúå": 864,
867
+ "ãõäá®": 865,
868
+ "¬ áòçó®": 866,
869
+ "óôáòô": 867,
870
+ "ðáä": 868,
871
+ "ãô¨": 869,
872
+ "é ": 870,
873
+ "èáîä": 871,
874
+ "ðáçå": 872,
875
+ "Åò": 873,
876
+ "Åòòïò": 874,
877
+ "áì ": 875,
878
+ "󊠠      óåìæ®": 876,
879
+ "íâåääéîç": 877,
880
+ "º æìïáô": 878,
881
+ "æòåñ": 879,
882
+ "éîß": 880,
883
+ "󊠠      ": 881,
884
+ "âé": 882,
885
+ "éîôåò": 883,
886
+ "ó§Ý": 884,
887
+ "åô ": 885,
888
+ "æïò é éî òáîçå¨": 886,
889
+ "óåô¨": 887,
890
+ "éîäé": 888,
891
+ "®çå": 889,
892
+ "öáìßìïó": 890,
893
+ "ðô ": 891,
894
+ "ºŠ                    ": 892,
895
+ "ïõôðõôßäéò": 893,
896
+ "§©Š    ðáòóåò®áääßáòçõíåîô¨§­­": 894,
897
+ "×éëéðåäéá ": 895,
898
+ "ÌéóôÛ": 896,
899
+ "íáò": 897,
900
+ "èáîäìåò": 898,
901
+ "¢ ª ": 899,
902
+ "Íïäåì": 900,
903
+ "íå": 901,
904
+ "ëå": 902,
905
+ "ó© ": 903,
906
+ "ææ": 904,
907
+ "¬ º": 905,
908
+ "ßå": 906,
909
+ "ó ½ óåìæ®": 907,
910
+ "Š        Š        £ ": 908,
911
+ "©ºŠ        ¢¢¢": 909,
912
+ "ãïõî": 910,
913
+ "íïäåì ": 911,
914
+ "ó¢": 912,
915
+ "áìß": 913,
916
+ "½Ôòõå": 914,
917
+ "ìéóô¨": 915,
918
+ "éîäéãå": 916,
919
+ "®®®": 917,
920
+ "áìì": 918,
921
+ "ãòå": 919,
922
+ "©Š    ðòéîô¨æ¢": 920,
923
+ "äéóôòéâõôå": 921,
924
+ "äéóô": 922,
925
+ "ãïò": 923,
926
+ "¨íïäåì": 924,
927
+ "ðïãè": 925,
928
+ "åøãå": 926,
929
+ "åøãåðô ": 927,
930
+ "ïðåî": 928,
931
+ "óù": 929,
932
+ "ºŠ        ¢¢¢": 930,
933
+ "äåãïäå": 931,
934
+ "ãïîô": 932,
935
+ "¬Š                §": 933,
936
+ "åìð": 934,
937
+ "ÖéãÁÉ ": 935,
938
+ "ŠŠéíðïòô ": 936,
939
+ "÷áòä¨": 937,
940
+ "±®°": 938,
941
+ " ¯ ": 939,
942
+ "äéí¬ ": 940,
943
+ "ó©": 941,
944
+ "Š        óåìæ": 942,
945
+ "Š        óåìæ¬Š        ": 943,
946
+ "òåð": 944,
947
+ "ôïòãè®Ô": 945,
948
+ "ôïòãè®Ôåî": 946,
949
+ "ôïòãè®Ôåîóïò": 947,
950
+ "ÏðôéïîáìÛ": 948,
951
+ "¨Û": 949,
952
+ "©Š        Š        £ ": 950,
953
+ "°°°": 951,
954
+ "©ºŠ        óåìæ®": 952,
955
+ "󺊠           ": 953,
956
+ "¯ ±å": 954,
957
+ "º®²": 955,
958
+ "íïäõìå": 956,
959
+ "°®°": 957,
960
+ "󧺠": 958,
961
+ "îïß": 959,
962
+ "Š            éæ ": 960,
963
+ "ë ": 961,
964
+ "¼ ": 962,
965
+ "©Š            Š            £ ": 963,
966
+ "Ìå": 964,
967
+ "óåô ": 965,
968
+ "Äáôáóåô": 966,
969
+ "ãïòðõ": 967,
970
+ "éóß": 968,
971
+ "ôòù": 969,
972
+ "¬Š            ��       ": 970,
973
+ "åøéóô": 971,
974
+ "Ìïáä ": 972,
975
+ "ôïëåîéúåò ": 973,
976
+ "§© ": 974,
977
+ "éôåíó¨": 975,
978
+ "§ÝŠ        óåìæ®": 976,
979
+ "òáîäï": 977,
980
+ "¢ ª ¶": 978,
981
+ "¢ ª ¶°": 979,
982
+ "èåìð": 980,
983
+ "èåìð½§": 981,
984
+ "ñõ": 982,
985
+ "®¢¢¢Š    Š    äåæ ßßéîéôßߨ": 983,
986
+ "ôéïî ": 984,
987
+ "¬ ë": 985,
988
+ "èåáäßäéí": 986,
989
+ "Ìéîå": 987,
990
+ "Ìéîåáò": 988,
991
+ "âéá": 989,
992
+ "áôô": 990,
993
+ "¨ø": 991,
994
+ "Òå": 992,
995
+ "áôôåîôéïîß": 993,
996
+ "º®²æ": 994,
997
+ "º®²æý": 995,
998
+ "óèáðå": 996,
999
+ "¨éîðõôßéä": 997,
1000
+ "Š            §": 998,
1001
+ "¬Š        ý": 999,
1002
+ "åöáì": 1000,
1003
+ " ôïëåî": 1001,
1004
+ "éîäéãåóßôïß": 1002,
1005
+ "éîäéãåóßôïßòå": 1003,
1006
+ "éîäéãåóßôïßòåíïöå": 1004,
1007
+ "óïòôå": 1005,
1008
+ "óïòôåäß": 1006,
1009
+ "âòå": 1007,
1010
+ "âòåá": 1008,
1011
+ "âòåáë": 1009,
1012
+ "íáéî": 1010,
1013
+ "Ôòáéî": 1011,
1014
+ "Âùôå": 1012,
1015
+ "Ìåöåì": 1013,
1016
+ "éìå": 1014,
1017
+ "ìïáäß": 1015,
1018
+ "ãèåäõì": 1016,
1019
+ "åìó床        ": 1017,
1020
+ "äéóô®": 1018,
1021
+ "©Š    Š    òåôõòî ": 1019,
1022
+ "òáîë ½½ °": 1020,
1023
+ "ìïççåò®éîæï¨æ¢": 1021,
1024
+ "©Š                    ": 1022,
1025
+ "ãïíð": 1023,
1026
+ "±°": 1024,
1027
+ "¨áòçó®": 1025,
1028
+ "÷éôè ïðåî": 1026,
1029
+ "÷éôè ïðåî¨": 1027,
1030
+ "§© áó ": 1028,
1031
+ "§© áó æ": 1029,
1032
+ "ðìé": 1030,
1033
+ "¼ü": 1031,
1034
+ "ü¾": 1032,
1035
+ "Äáôáóåô¨": 1033,
1036
+ "îå÷ß÷ïòä": 1034,
1037
+ "ãïîôéî": 1035,
1038
+ "ãïîôéîõå": 1036,
1039
+ "äáôáÛ§": 1037,
1040
+ "åîãïäåä": 1038,
1041
+ "íéîß": 1039,
1042
+ "åôãè": 1040,
1043
+ "¢©Š    ðòéîô¨¢": 1041,
1044
+ "åîãè": 1042,
1045
+ "åîãèíáò": 1043,
1046
+ "áììïã": 1044,
1047
+ "áììïãáôå": 1045,
1048
+ " ðáòáíåôåò": 1046,
1049
+ "äåãïäåò": 1047,
1050
+ "ôòáî": 1048,
1051
+ "ôòáîó": 1049,
1052
+ "çõ": 1050,
1053
+ "¬ äéí½": 1051,
1054
+ "áôåß": 1052,
1055
+ "¬ óåñßìåî": 1053,
1056
+ "åîãå": 1054,
1057
+ "º æìïáô ½ °": 1055,
1058
+ "ó½Æáìóå": 1056,
1059
+ "Ý ½ Îïîå": 1057,
1060
+ "´°": 1058,
1061
+ "¯ ±å¹": 1059,
1062
+ "©ºŠ        éæ ": 1060,
1063
+ "ð®": 1061,
1064
+ "֏": 1062,
1065
+ "éÝ": 1063,
1066
+ "çòáä¨": 1064,
1067
+ "Çåîåò": 1065,
1068
+ "©ºŠ                    ": 1066,
1069
+ "ð ": 1067,
1070
+ "Ðáôè": 1068,
1071
+ "Ôåø": 1069,
1072
+ "ôïòãè®ãõäá®": 1070,
1073
+ "õóåß": 1071,
1074
+ "ôòáéî¨": 1072,
1075
+ "öáìß": 1073,
1076
+ "éôåòáô": 1074,
1077
+ "éôåòáôïò": 1075,
1078
+ "åðïãè": 1076,
1079
+ "¾½ ": 1077,
1080
+ "½äåöéãå": 1078,
1081
+ "§¬ ôùð彿ìïáô": 1079,
1082
+ "§¬ ôùð彿ìïáô¬ äåæáõìô½": 1080,
1083
+ "§¬ ôùðå½óôò": 1081,
1084
+ "ôïëåîéúåò ½ ": 1082,
1085
+ "®óðìé": 1083,
1086
+ "©Ý": 1084,
1087
+ "óïî": 1085,
1088
+ "öïãáâ ½ û": 1086,
1089
+ "óùíâ": 1087,
1090
+ "óùíâïì": 1088,
1091
+ "󮢢¢Š        ": 1089,
1092
+ "õòò": 1090,
1093
+ "ôåøô®": 1091,
1094
+ "ìåî¨óåìæ®öïãáâ": 1092,
1095
+ "é « ±": 1093,
1096
+ "óðåãéáìßôïëåîó": 1094,
1097
+ "èå ": 1095,
1098
+ "éîç®": 1096,
1099
+ "ðòéîô¨æ¢Üî": 1097,
1100
+ "òáîäïí": 1098,
1101
+ "óº": 1099,
1102
+ "ó÷éôè": 1100,
1103
+ "óôáôåßäéãô¨": 1101,
1104
+ "û§": 1102,
1105
+ "óååä": 1103,
1106
+ "áãôé": 1104,
1107
+ "áãôéöå": 1105,
1108
+ "ðòïíðôß": 1106,
1109
+ "ìù ": 1107,
1110
+ "ìáî": 1108,
1111
+ "ìáîçõ": 1109,
1112
+ "ìáîçõáçå": 1110,
1113
+ "¨© ": 1111,
1114
+ "±¬ ": 1112,
1115
+ "óéî": 1113,
1116
+ "Š        òåôõòî ": 1114,
1117
+ "åîô ": 1115,
1118
+ " ½ îî®Ìéîåáò": 1116,
1119
+ " ½ îî®Ìéîåáò¨": 1117,
1120
+ "óåìæ®èåáäßäéí": 1118,
1121
+ "âéáó½Æáìóå": 1119,
1122
+ "©Š        Š        éæ ": 1120,
1123
+ " æïò ": 1121,
1124
+ "éì": 1122,
1125
+ "ðòå": 1123,
1126
+ "Ãïîæéç": 1124,
1127
+ "îßìáùåò": 1125,
1128
+ "ôéå": 1126,
1129
+ "âï": 1127,
1130
+ "éí": 1128,
1131
+ "ëåù": 1129,
1132
+ "éôé": 1130,
1133
+ "áìéúå": 1131,
1134
+ "ûŠ            §": 1132,
1135
+ "ôïòãè®îïß": 1133,
1136
+ "ôïòãè®îïßçòáä¨": 1134,
1137
+ "Û°Ý": 1135,
1138
+ "Š                    ": 1136,
1139
+ "öïãáâßóéúå½": 1137,
1140
+ "Ôåóô": 1138,
1141
+ "ôòáéîéîç ": 1139,
1142
+ "ÄÐ": 1140,
1143
+ "õôé": 1141,
1144
+ "¬Š": 1142,
1145
+ "ìòßó": 1143,
1146
+ "ìòßóãèåäõì": 1144,
1147
+ "ôòáéîéîç": 1145,
1148
+ "®çåô¨§": 1146,
1149
+ "©Š    åìó床        ": 1147,
1150
+ "õóåßáíð": 1148,
1151
+ "âáôãèÛ§": 1149,
1152
+ "¨äåöéãå": 1150,
1153
+ "áòçó": 1151,
1154
+ "éìå ": 1152,
1155
+ "Éôåò": 1153,
1156
+ "ûáòçó®": 1154,
1157
+ "éæ ��òçó®òáîë ½½ °": 1155,
1158
+ "Óáöå": 1156,
1159
+ "âáôãèå": 1157,
1160
+ "Š            Š            éæ ": 1158,
1161
+ "òåäõ": 1159,
1162
+ "òåäõãå": 1160,
1163
+ "ìåáò": 1161,
1164
+ "íõð": 1162,
1165
+ "§¬ ôùðå½óôò¬ äåæáõìô½": 1163,
1166
+ "ïæ ": 1164,
1167
+ "®óðìéô¨": 1165,
1168
+ "÷ïòä ": 1166,
1169
+ " ôåøô ": 1167,
1170
+ "óº ÌéóôÛ": 1168,
1171
+ "æïòí": 1169,
1172
+ "ãèáò": 1170,
1173
+ " ôï ": 1171,
1174
+ "§§": 1172,
1175
+ "ä û": 1173,
1176
+ "ôåøô©Š        ": 1174,
1177
+ "§º óåìæ®": 1175,
1178
+ "ôï û": 1176,
1179
+ "åîãïäå¨": 1177,
1180
+ " ½ Û": 1178,
1181
+ "áôôåîôéïîßíáóë": 1179,
1182
+ "âùôåßåî": 1180,
1183
+ "âùôåßåîãïäåò": 1181,
1184
+ "óï": 1182,
1185
+ "æåôãè": 1183,
1186
+ "ó ½ ôïòãè®ôåîóïò": 1184,
1187
+ "ìåî¨óåìæ®": 1185,
1188
+ "ßéäø": 1186,
1189
+ "֔": 1187,
1190
+ "ßæéìå": 1188,
1191
+ "óôáôåßäéãô": 1189,
1192
+ "¬Š        §": 1190,
1193
+ "ôòáéîáâìå": 1191,
1194
+ "åîãèíáòë": 1192,
1195
+ "óôáòôó÷éôè": 1193,
1196
+ "ïõôðõôßéä": 1194,
1197
+ "îî®Í": 1195,
1198
+ "îî®Íï": 1196,
1199
+ "îî®Íïäõìå": 1197,
1200
+ "óõð": 1198,
1201
+ "æïò÷áòä¨": 1199,
1202
+ "©ºŠ        òåôõòî ": 1200,
1203
+ "֬": 1201,
1204
+ "ôáò": 1202,
1205
+ "ñ¬ ë": 1203,
1206
+ "Ûº": 1204,
1207
+ "º éîô¬Š        ": 1205,
1208
+ "©Š        Š        óåìæ®": 1206,
1209
+ "ðïõ": 1207,
1210
+ "ðïõô¨": 1208,
1211
+ "¬ óåìæ®": 1209,
1212
+ "éó îïô ": 1210,
1213
+ "éó îïô Îïîå": 1211,
1214
+ "¬ ö": 1212,
1215
+ "ßéîôåò": 1213,
1216
+ " ÷éôè ": 1214,
1217
+ "©¬ ": 1215,
1218
+ "º éîô ½ ³²": 1216,
1219
+ "ðòïð": 1217,
1220
+ "îßðáòáí": 1218,
1221
+ "  £ ": 1219,
1222
+ "ó «½ ": 1220,
1223
+ "󩊠       ": 1221,
1224
+ "©Š        ðòéîô¨æ¢": 1222,
1225
+ "éôéáìéúå": 1223,
1226
+ "½°®°": 1224,
1227
+ "©Š            éæ ": 1225,
1228
+ " éî óåìæ®": 1226,
1229
+ "¨©Š        ": 1227,
1230
+ "éîðõôßéäó®": 1228,
1231
+ "äåø": 1229,
1232
+ "âáôãèßóéúå": 1230,
1233
+ "ïõôðõôóÛ§": 1231,
1234
+ "Çåô ": 1232,
1235
+ "®ôïìéóô¨": 1233,
1236
+ "®ôïìéóô¨©": 1234,
1237
+ "îåø": 1235,
1238
+ "Š    íïäåì": 1236,
1239
+ "Ôòáéîéîç ": 1237,
1240
+ "Šæòïí ôïòãè®": 1238,
1241
+ "ì ": 1239,
1242
+ "ðß": 1240,
1243
+ "ÂùôåÌåöåì": 1241,
1244
+ "ÂùôåÌåöåìÂÐÅÔïëåîéúåò": 1242,
1245
+ "òïî": 1243,
1246
+ "ìïãáìß": 1244,
1247
+ "ìïãáìßòáîë": 1245,
1248
+ "äáôáß": 1246,
1249
+ "íåí": 1247,
1250
+ "íåíïò": 1248,
1251
+ "ôòáéîßìïáäåò": 1249,
1252
+ "óôåð ": 1250,
1253
+ "óÛ°Ý": 1251,
1254
+ "öáìßìïóó": 1252,
1255
+ "¬Š                            ": 1253,
1256
+ "ý¢©Š                ": 1254,
1257
+ "ïî ": 1255,
1258
+ "ßìïóó ½ ": 1256,
1259
+ "îõíßâáôãèå": 1257,
1260
+ "÷éôè ôïòãè®îïßçòáä¨": 1258,
1261
+ "󧬠ôùðå½éîô¬ äåæáõìô½": 1259,
1262
+ "ìåáòî": 1260,
1263
+ "÷áòíõð": 1261,
1264
+ "°®": 1262,
1265
+ "äéò": 1263,
1266
+ "áãôéïî": 1264,
1267
+ "óáíðìåß": 1265,
1268
+ "¼üåî": 1266,
1269
+ "¼üåîäï": 1267,
1270
+ "¼üåîäïæ": 1268,
1271
+ "¼üåîäïæôåøô": 1269,
1272
+ "¼üåîäïæôåøôü¾": 1270,
1273
+ "óôòéð": 1271,
1274
+ "óôòéð¨": 1272,
1275
+ "¢©Š    Š    ": 1273,
1276
+ "Åî": 1274,
1277
+ "ðáäßôïëåîßéä": 1275,
1278
+ "ãõòò": 1276,
1279
+ "ò§": 1277,
1280
+ "Š        æïò ": 1278,
1281
+ "ôåøôº óôò": 1279,
1282
+ "óôòÝ": 1280,
1283
+ "ÂÐÅ ": 1281,
1284
+ "©ý": 1282,
1285
+ "ó®çå": 1283,
1286
+ " ½ óåìæ®ß": 1284,
1287
+ "äßôïëåî": 1285,
1288
+ "äõíð": 1286,
1289
+ " ½ óåìæ®óðåãéáìßôïëåî": 1287,
1290
+ "®áððåîä¨": 1288,
1291
+ "âùôåßäåãïäåò": 1289,
1292
+ "ïõôðõôßðáôè": 1290,
1293
+ "ôèå ": 1291,
1294
+ "󮢢¢Š    ": 1292,
1295
+ "ôïëåîéúåò®äåãïäå": 1293,
1296
+ "õòì": 1294,
1297
+ "§º §": 1295,
1298
+ "§¬Š                §": 1296,
1299
+ "òåóð": 1297,
1300
+ "òåóðïî": 1298,
1301
+ "òåóðïîóå": 1299,
1302
+ "æéìåî": 1300,
1303
+ "æéìåîáíå": 1301,
1304
+ "ðòéîô¨¢Üî": 1302,
1305
+ "ìïççéîç®": 1303,
1306
+ "Éîôåò": 1304,
1307
+ "¢©Š    ðòéîô¨¢  ": 1305,
1308
+ "¢©Š    ðòéîô¨¢  ¯": 1306,
1309
+ "ðòïíðô ": 1307,
1310
+ "¢©Š                ": 1308,
1311
+ "¢¢¢Š": 1309,
1312
+ "¢¢¢ŠÖéãÁÉ ": 1310,
1313
+ "æïòíåò": 1311,
1314
+ "Š¢¢¢": 1312,
1315
+ "Š¢¢¢ŠŠéíðïòô ": 1313,
1316
+ "ðéîç ": 1314,
1317
+ "õðìå": 1315,
1318
+ "ãôéïî": 1316,
1319
+ "Îïòí": 1317,
1320
+ "îî®Íïäõì婺Š    ¢¢¢": 1318,
1321
+ "Òï": 1319,
1322
+ "®¢¢¢Š    Š    äåæ ßßéîéôßߨóåìæ¬ ": 1320,
1323
+ "åð": 1321,
1324
+ "©ºŠ        óõð": 1322,
1325
+ "©ºŠ        ó��ðåò¨": 1323,
1326
+ "©ºŠ        óõðåò¨©®": 1324,
1327
+ "©ºŠ        óõðåò¨©®ßßéîéôßߨ": 1325,
1328
+ "©ºŠ        óõðåò¨©®ßßéîéôßߨ©Š        óåìæ®": 1326,
1329
+ "Ðáò": 1327,
1330
+ "óõí¨": 1328,
1331
+ "åòù": 1329,
1332
+ "¬ âéáó½Æáìóå": 1330,
1333
+ "º ÏðôéïîáìÛ": 1331,
1334
+ "âó": 1332,
1335
+ "âóú": 1333,
1336
+ "âóú¬ óåñßìåî": 1334,
1337
+ "öéå": 1335,
1338
+ "öéå÷¨": 1336,
1339
+ "çòï": 1337,
1340
+ "íõì": 1338,
1341
+ "íáø¨": 1339,
1342
+ "£ Á": 1340,
1343
+ "ÖéãÁÉÃïîæéç": 1341,
1344
+ "ß÷åéçèô": 1342,
1345
+ "âïïì": 1343,
1346
+ " ª óåìæ®": 1344,
1347
+ "ÖéãÁÉÍïäåì": 1345,
1348
+ "ßåíâåääéîç": 1346,
1349
+ "ìù": 1347,
1350
+ "ôïôáìßðáòáí": 1348,
1351
+ "íïäõìå®": 1349,
1352
+ "ðáòáíåôåòó¨": 1350,
1353
+ "ôáòç": 1351,
1354
+ "ôáòçåô": 1352,
1355
+ "õó": 1353,
1356
+ "å¨": 1354,
1357
+ "ìïóó": 1355,
1358
+ "ôïòå": 1356,
1359
+ "Ôïð": 1357,
1360
+ "Ôïð­": 1358,
1361
+ "óïòôåäßéîäéãåóßôïßòåíïöå": 1359,
1362
+ "çåîåòáôåä": 1360,
1363
+ "ãòåáôåß": 1361,
1364
+ "öéãá": 1362,
1365
+ "öéãáé": 1363,
1366
+ "öéãáéß": 1364,
1367
+ "â¨": 1365,
1368
+ "ŠŠŠéæ ": 1366,
1369
+ "ŠŠŠéæ ßß": 1367,
1370
+ "ŠŠŠéæ ßßîáíå": 1368,
1371
+ "ŠŠŠéæ ßßîáíåßß": 1369,
1372
+ "ŠŠŠéæ ßßîáíåßß ½½ ": 1370,
1373
+ "ßßíáéî": 1371,
1374
+ "ßßíáéîßß": 1372,
1375
+ "Ôåóô ": 1373,
1376
+ "©Š": 1374,
1377
+ "©Š<": 1375,
1378
+ "©Š<|": 1376,
1379
+ "©Š<|e": 1377,
1380
+ "©Š<|en": 1378,
1381
+ "©Š<|end": 1379,
1382
+ "©Š<|endo": 1380,
1383
+ "©Š<|endof": 1381,
1384
+ "©Š<|endoft": 1382,
1385
+ "©Š<|endofte": 1383,
1386
+ "©Š<|endoftex": 1384,
1387
+ "©Š<|endoftext": 1385,
1388
+ "©Š<|endoftext|": 1386,
1389
+ "©Š<|endoftext|>": 1387,
1390
+ "äéóôòéâõôåä ": 1388,
1391
+ "äéóôòéâõôåä": 1389,
1392
+ "õôéì": 1390,
1393
+ "×éëéðåäéá": 1391,
1394
+ "óáöåß": 1392,
1395
+ "ìòßóãèåäõìåò": 1393,
1396
+ "§ éî ": 1394,
1397
+ "ãåó": 1395,
1398
+ "ãìåáî": 1396,
1399
+ "óè": 1397,
1400
+ "ïðôéíéúåò¬ ": 1398,
1401
+ "ó ½ âáôãèÛ§": 1399,
1402
+ "®ôï¨äåöéãå": 1400,
1403
+ "Š    íïäå쬊    ": 1401,
1404
+ "¬Š©ºŠ    ¢¢¢": 1402,
1405
+ "÷èéìå ": 1403,
1406
+ "«½ ±": 1404,
1407
+ "¥ ": 1405,
1408
+ "åý¢": 1406,
1409
+ "Öáì": 1407,
1410
+ "Óáöå ": 1408,
1411
+ "ãèåãëðïéîô¨": 1409,
1412
+ "áòçó®ïõôðõôßäéò": 1410,
1413
+ "öáìéä": 1411,
1414
+ "¬ äåöéãå½äåöéãå": 1412,
1415
+ "§¬ ôùðå½óôò¬ äåæáõìô½§": 1413,
1416
+ "±°°°": 1414,
1417
+ "§ ": 1415,
1418
+ "Äå": 1416,
1419
+ "áöá": 1417,
1420
+ "áöáé": 1418,
1421
+ "áöáéìáâ": 1419,
1422
+ "ïó®ðáôè": 1420,
1423
+ "ôïëåîéúåòßðáôè": 1421,
1424
+ "Ôïëåîéúåò ": 1422,
1425
+ "¢©Š        ": 1423,
1426
+ "ìïççåò®éîæï¨¢": 1424,
1427
+ "íïäåì ½ ": 1425,
1428
+ "äáôáóåô ½ ": 1426,
1429
+ "êóïî": 1427,
1430
+ "§º °": 1428,
1431
+ "¼¯": 1429,
1432
+ " ½ °Š        óåìæ®": 1430,
1433
+ "®éôåíó¨": 1431,
1434
+ "Š    Š    äåæ ß": 1432,
1435
+ "âéç": 1433,
1436
+ "âéçòá": 1434,
1437
+ "ðáôôåò": 1435,
1438
+ "ðáôôåòî": 1436,
1439
+ "öïãáâÛ": 1437,
1440
+ "ôåøôº óôò© ­¾ ": 1438,
1441
+ "ü§": 1439,
1442
+ "®óðìéô¨©": 1440,
1443
+ " ½ Ôòõå": 1441,
1444
+ "ÉÄ": 1442,
1445
+ "ìáãå": 1443,
1446
+ "ó ½ äáôáÛ§": 1444,
1447
+ "åìó床            ": 1445,
1448
+ "ìåöåì": 1446,
1449
+ "²µ": 1447,
1450
+ "²µ¶": 1448,
1451
+ "õîéã": 1449,
1452
+ "õîéãïäå": 1450,
1453
+ "÷ïòäÛ": 1451,
1454
+ "۱ݩŠ                    ": 1452,
1455
+ " ½ ¢": 1453,
1456
+ "ôåóô": 1454,
1457
+ "ôïëåîéúåò®åîãïäå¨": 1455,
1458
+ "ßìåîçôè": 1456,
1459
+ "éîðõôßéäó ½ ôïòãè®ôåîóïò": 1457,
1460
+ "ºÝ": 1458,
1461
+ "÷îìïáä": 1459,
1462
+ "ó ­ ": 1460,
1463
+ "Èáîä": 1461,
1464
+ "äåãáùßðáòáí": 1462,
1465
+ "îõíßðáòáíó ": 1463,
1466
+ "½¢ ª ¶°": 1464,
1467
+ "§º": 1465,
1468
+ "ðòéîô¨¢­": 1466,
1469
+ "Óåô ": 1467,
1470
+ "âåîãèíáòë": 1468,
1471
+ "äßç": 1469,
1472
+ "äßçâ": 1470,
1473
+ "çåîåòáôéïî ": 1471,
1474
+ "ôïð­": 1472,
1475
+ "ãïîôéîõ加           Š            éæ ": 1473,
1476
+ "íáøßîå÷ßôïëåî": 1474,
1477
+ "íáøßîå÷ßôïëåîó½": 1475,
1478
+ "çåîåòáôåäßôåøô ½ ": 1476,
1479
+ "óÛ°Ý®ôïìéóô¨©": 1477,
1480
+ "ðòïíðôßôåøô": 1478,
1481
+ "¬ èåìð½§": 1479,
1482
+ "Íïäåì ": 1480,
1483
+ " ðáòáíåôåò": 1481,
1484
+ "íáôè": 1482,
1485
+ "ôïòãè®îî®": 1483,
1486
+ "ÒÍ": 1484,
1487
+ "ÒÍÓ": 1485,
1488
+ "¨îî®Íïäõì婺Š    ¢¢¢": 1486,
1489
+ "Åíâåääéîç": 1487,
1490
+ "²¬ ": 1488,
1491
+ "âáóå": 1489,
1492
+ "¢¬ ": 1490,
1493
+ "åíâ": 1491,
1494
+ "ãáô": 1492,
1495
+ "¬ ºÝ": 1493,
1496
+ "©Š        òåôõòî ": 1494,
1497
+ "©ŠŠŠãìáóó ": 1495,
1498
+ "ôôåî": 1496,
1499
+ "© æïò ": 1497,
1500
+ "îßèåáäó": 1498,
1501
+ "îßëößèåáäó": 1499,
1502
+ " ½ îî®Ìéîåáò¨äéí¬ ": 1500,
1503
+ "ó ª ": 1501,
1504
+ "¬ âéáó½Æáìó婊        óåìæ®": 1502,
1505
+ "Ý ½ Îïî嬊    ": 1503,
1506
+ "®óèáðå": 1504,
1507
+ "ôòáîóðï": 1505,
1508
+ "ôòáîóðïóå": 1506,
1509
+ "ôòáîóðïóå¨": 1507,
1510
+ "ïòå": 1508,
1511
+ "©Š        Š        òåôõòî ": 1509,
1512
+ "öïãáâßóéú庠éîô ½ ³²": 1510,
1513
+ "öïãáâßóéú庠éîô ½ ³²°°°": 1511,
1514
+ "öïãáâßóéúå ½ ": 1512,
1515
+ "ãïõîô": 1513,
1516
+ " ª óåìæ®äéí": 1514,
1517
+ "ôá": 1515,
1518
+ "ó æïò ": 1516,
1519
+ "¨Š                ": 1517,
1520
+ "¯ ±å¹º®²æý": 1518,
1521
+ "ó¨óåìæ¬ ": 1519,
1522
+ "éîðõôßéäó®óèáðå": 1520,
1523
+ "õîó": 1521,
1524
+ "æïò é¬ ": 1522,
1525
+ " éî åî": 1523,
1526
+ " éî åîõí": 1524,
1527
+ " éî åîõíåò": 1525,
1528
+ " éî åîõíåòáôå": 1526,
1529
+ "ìïóó ½ ": 1527,
1530
+ "º éîô ½ µ": 1528,
1531
+ "º æìïáô ½ °®": 1529,
1532
+ "çòåó": 1530,
1533
+ "ïõôðõôó ½ ": 1531,
1534
+ "¨éîðõôßéäó¬ ": 1532,
1535
+ "ôéïî ðåîáìôù": 1533,
1536
+ "ìïçéôó¬ ": 1534,
1537
+ "îõ": 1535,
1538
+ "ßóáíð": 1536,
1539
+ "ãòåáôåßöéãáéß": 1537,
1540
+ "ãòåáôåßöéãáéßµ": 1538,
1541
+ "±²": 1539,
1542
+ "ý¢©Š    ðòéîô¨æ¢": 1540,
1543
+ "ðô": 1541,
1544
+ "÷òá": 1542,
1545
+ "Óáíð": 1543,
1546
+ "Æéìå": 1544,
1547
+ "çåôßìïççåò": 1545,
1548
+ "®¢¢¢Š    éæ ": 1546,
1549
+ "ïó®åî": 1547,
1550
+ "ïó®åîöé": 1548,
1551
+ "ïó®åîöéòïî": 1549,
1552
+ " ½ éîô¨": 1550,
1553
+ "ðòïãåó": 1551,
1554
+ "çòïõð": 1552,
1555
+ "äáôá ": 1553,
1556
+ "óáíðìåò": 1554,
1557
+ "ìåò¨": 1555,
1558
+ "¨íïäå쬠": 1556,
1559
+ "¬Š    äåöéãå": 1557,
1560
+ "Íá": 1558,
1561
+ "®óåô": 1559,
1562
+ " û": 1560,
1563
+ "ü ": 1561,
1564
+ "æý ": 1562,
1565
+ "Š                        ": 1563,
1566
+ "󺊠               ": 1564,
1567
+ "ßôéíå": 1565,
1568
+ "ìáâåìó": 1566,
1569
+ "öåòáçå": 1567,
1570
+ "áìì ": 1568,
1571
+ "§¬ ôùð彿ìïáô¬ äåæáõìô½°®": 1569,
1572
+ "âåô": 1570,
1573
+ "âåôá": 1571,
1574
+ "§¬ áãôéïî": 1572,
1575
+ "§¬ áãôéïî½§": 1573,
1576
+ "§¬ áãôéïî½§ó": 1574,
1577
+ "§¬ áãôéïî½§óôïòå": 1575,
1578
+ "§¬ áãôéïî½§óôïòåß": 1576,
1579
+ "§¬ áãôéïî½§óôïòåßô": 1577,
1580
+ "§¬ áãôéïî½§óôïòåßôòõå": 1578,
1581
+ "§¬ áãôéïî½§óôïòåßôòõ姬 ": 1579,
1582
+ "ãïíðéìå": 1580,
1583
+ "ðáòåîô": 1581,
1584
+ "¬ åøéóô": 1582,
1585
+ "¬ åøéóôß": 1583,
1586
+ "¬ åøéóôßï": 1584,
1587
+ "¬ åøéóôßïë": 1585,
1588
+ "¬ åøéóôßïë½Ôòõå": 1586,
1589
+ "¨æ": 1587,
1590
+ "áöáéìáâìå": 1588,
1591
+ "óáíðìå ": 1589,
1592
+ "îõíßáòôéãìå": 1590,
1593
+ "¬ §ò": 1591,
1594
+ "éî ôåøô": 1592,
1595
+ "Ãï": 1593,
1596
+ "òåáí": 1594,
1597
+ "×éëéðåäéáÄáôáóåô¨": 1595,
1598
+ "ó¨íïäåì": 1596,
1599
+ "æéî": 1597,
1600
+ "ðéã": 1598,
1601
+ "ðéãë": 1599,
1602
+ "ðéãëìå": 1600,
1603
+ "äåæáõìôäé": 1601,
1604
+ "Äéã": 1602,
1605
+ "Äéãô": 1603,
1606
+ "óôáô": 1604,
1607
+ " ½ òå": 1605,
1608
+ " §®êïéî¨": 1606,
1609
+ "¨¿": 1607,
1610
+ "¡Ü": 1608,
1611
+ "¡ÜÓ": 1609,
1612
+ "¡ÜÓ©": 1610,
1613
+ "÷ïòä éî ": 1611,
1614
+ "®óõ": 1612,
1615
+ "ßôïëåîéúå": 1613,
1616
+ "áòù ": 1614,
1617
+ "ó®éôåíó¨": 1615,
1618
+ "Ãïîöåò": 1616,
1619
+ "Ãïîöåòô ": 1617,
1620
+ "§§®êïéî¨": 1618,
1621
+ "¨é « ±": 1619,
1622
+ "ðòéîô¨æ¢  ": 1620,
1623
+ "ý¢©Š    Š    äåæ ": 1621,
1624
+ "éîôÝ": 1622,
1625
+ "Š            æïò ": 1623,
1626
+ "óôòºŠ        ¢¢¢": 1624,
1627
+ "òåöåò": 1625,
1628
+ "ðáô躠óôò": 1626,
1629
+ "¬ §÷": 1627,
1630
+ "â§© áó æ": 1628,
1631
+ " ½ óåìæ®óðåãéáìßôïëåîóÛ§": 1629,
1632
+ "ìïá": 1630,
1633
+ "ìïáäå": 1631,
1634
+ "ìåî¨óåñ": 1632,
1635
+ " ½ ÛÝ": 1633,
1636
+ "Š    Š    äåæ ßß": 1634,
1637
+ "ßߨóåìæ": 1635,
1638
+ "­ìåöåì": 1636,
1639
+ "­ìåöåì ": 1637,
1640
+ "­¸": 1638,
1641
+ "âùôåßôåøô": 1639,
1642
+ "¢¬Š        ": 1640,
1643
+ "¢¬Š        ¢": 1641,
1644
+ "éîå ": 1642,
1645
+ "¢Š    ": 1643,
1646
+ "òåñõå": 1644,
1647
+ " ×éëéðåäéá ": 1645,
1648
+ "âáóåß": 1646,
1649
+ "âáóåßõòì": 1647,
1650
+ "ßäáôá": 1648,
1651
+ "Åø": 1649,
1652
+ "ôéô": 1650,
1653
+ "ôéôìå": 1651,
1654
+ "¨ãèõîë": 1652,
1655
+ "äáôáóåôßéäø": 1653,
1656
+ "äáôáßäéò": 1654,
1657
+ "÷éëé": 1655,
1658
+ "×å": 1656,
1659
+ "Ôåøô ": 1657,
1660
+ "òåñõ": 1658,
1661
+ "òåñõé": 1659,
1662
+ "òåñõéòå": 1660,
1663
+ "ãïîóï": 1661,
1664
+ "Æïòí": 1662,
1665
+ "ìïáäßóôáôåßäéãô¨": 1663,
1666
+ " « ¢": 1664,
1667
+ "¼´°": 1665,
1668
+ "¼´°ý ": 1666,
1669
+ "¾±": 1667,
1670
+ "¾±µ": 1668,
1671
+ "¨óååä": 1669,
1672
+ "òåóåò": 1670,
1673
+ "òåóåòöå": 1671,
1674
+ "Éîôåòáãôéöå": 1672,
1675
+ "ãïîôéîõ加           Š            éæ ðòïíðô": 1673,
1676
+ "ãïîôéîõ加           Š            éæ ðòïíðô®": 1674,
1677
+ "ãïîôéîõ加           Š            éæ ðòïíðô®óôáòôó÷éôè": 1675,
1678
+ "ãïîôéîõ加           Š            éæ ðòïíðô®óôáòôó÷éô訧": 1676,
1679
+ "ãïîôéîõ加           Š            éæ ðòïíðô®óôáòôó÷éô訧¯": 1677,
1680
+ " §©ºŠ                ": 1678,
1681
+ " §©ºŠ                ôòù": 1679,
1682
+ " §©ºŠ                ôòùºŠ                    ": 1680,
1683
+ " §©ºŠ                ôòùºŠ                    óåôôéîçóÛ§": 1681,
1684
+ "ðòïíðô®óðìéô¨©": 1682,
1685
+ "ðòïíðô®óðìéô¨©Û±Ý©Š                    ": 1683,
1686
+ "ðòïíðô®óðìéô¨©Û±Ý©Š                    ðòéîô¨æ¢": 1684,
1687
+ "óåô ôï û": 1685,
1688
+ "óåô ôï ûóåôôéîçóÛ§": 1686,
1689
+ "§Ýý¢©Š                ": 1687,
1690
+ "§Ýý¢©Š                åøãåðô ": 1688,
1691
+ "§Ýý¢©Š                åøãåðô ¨": 1689,
1692
+ "§Ýý¢©Š                åøãåðô ¨Öáì": 1690,
1693
+ "§Ýý¢©Š                åøãåðô ¨Öáìõå": 1691,
1694
+ "§Ýý¢©Š                åøãåðô ¨ÖáìõåÅòòïò": 1692,
1695
+ "§Ýý¢©Š                åøãåðô ¨ÖáìõåÅòòïò¬ ": 1693,
1696
+ "§Ýý¢©Š                åøãåðô ¨ÖáìõåÅòòïò¬ Éî": 1694,
1697
+ "§Ýý¢©Š                åøãåðô ¨ÖáìõåÅòòïò¬ Éîäåø": 1695,
1698
+ "§Ýý¢©Š                åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò": 1696,
1699
+ "§Ýý¢©Š                åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò©ºŠ                    ": 1697,
1700
+ "§Ýý¢©Š                åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò©ºŠ                    ðòéîô¨¢": 1698,
1701
+ "§Ýý¢©Š                åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò©ºŠ                    ðòéîô¨¢Éî": 1699,
1702
+ "§Ýý¢©Š                åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò©ºŠ                    ðòéîô¨¢Éîöáìéä": 1700,
1703
+ "§Ýý¢©Š                åøãåðô ¨ÖáìõåÅòòïò¬ ÉîäåøÅòòïò©ºŠ                    ðòéîô¨¢Éîöáìéä ": 1701,
1704
+ "öáìõ墩Š                ": 1702,
1705
+ "íïäåì®çåîåòáôå": 1703,
1706
+ "§Ý¬Š                    ": 1704,
1707
+ "çåîåòáôåäßôåøô": 1705,
1708
+ "îõíßôïëåîó": 1706,
1709
+ "ôòáîóæïòíåò": 1707,
1710
+ "Šæòïí ôù": 1708,
1711
+ "Šæòïí ôùðéîç ": 1709,
1712
+ "Šæòïí ôùðéîç éíðïòô ": 1710,
1713
+ "ŠŠéíðïòô ôïòãè": 1711,
1714
+ "ÒÍÓÎïòí": 1712,
1715
+ "å ": 1713,
1716
+ "áìéú": 1714,
1717
+ "º éîô¬ ": 1715,
1718
+ "ôïòãè®ïîå": 1716,
1719
+ "²©®": 1717,
1720
+ "© « ": 1718,
1721
+ "¸±": 1719,
1722
+ "¸±¹": 1720,
1723
+ "º æìïáô ½ ": 1721,
1724
+ " ½ äéí": 1722,
1725
+ "éîö": 1723,
1726
+ "éîöß": 1724,
1727
+ "éîößæòåñ": 1725,
1728
+ "°¬ ": 1726,
1729
+ "åòß": 1727,
1730
+ "¬ äéí½­±": 1728,
1731
+ "ãáãèåä": 1729,
1732
+ " ½ ø": 1730,
1733
+ "áðð": 1731,
1734
+ "ßåíâåä": 1732,
1735
+ "© ª ": 1733,
1736
+ "õðå": 1734,
1737
+ "®¢¢¢Š    Š    äåæ ßßéîéôßߨŠ        óåìæ¬Š        ": 1735,
1738
+ "äòïðïõôº æìïáô ½ °": 1736,
1739
+ "äòïðïõôº æìïáô ½ °®°": 1737,
1740
+ "ó ª óåìæ®èåáäßäéí": 1738,
1741
+ "¬ âéáó½Æáìó婊        óåìæ®÷": 1739,
1742
+ "áôôîß": 1740,
1743
+ "äòïðïõô ½ îî®": 1741,
1744
+ "äòïðïõô ½ îî®Ä": 1742,
1745
+ "äòïðïõô ½ îî®Äòï": 1743,
1746
+ "äòïðïõô ½ îî®Äòïðïõô¨": 1744,
1747
+ "òåóéä": 1745,
1748
+ "º ôïòãè®Ôåîóïò": 1746,
1749
+ "º ôïòãè®Ôåîóïò¬Š        ": 1747,
1750
+ " ½ óåìæ®÷": 1748,
1751
+ "©®öéå÷¨": 1749,
1752
+ "©®öéå÷¨âóú¬ óåñßìåî": 1750,
1753
+ "©®öéå÷¨âóú¬ óåñßìåóåìæ®": 1751,
1754
+ "ôòáîóðïó娱¬ ": 1752,
1755
+ " éó îïô Îïîå": 1753,
1756
+ "ö ½ ": 1754,
1757
+ "묠ö": 1755,
1758
+ "áô ": 1756,
1759
+ "óãïòå": 1757,
1760
+ "­±©": 1758,
1761
+ "áôôî": 1759,
1762
+ "Æ®ó": 1760,
1763
+ "¬ ðáóôßëåùßöáìõå": 1761,
1764
+ "Æïò": 1762,
1765
+ "Óéîç": 1763,
1766
+ "Óéîçìå ": 1764,
1767
+ "ðòå­": 1765,
1768
+ "äéí©Š        óåìæ®": 1766,
1769
+ "ä ½ ": 1767,
1770
+ "´°¹": 1768,
1771
+ "´°¹¶": 1769,
1772
+ "¸¬Š        ": 1770,
1773
+ "±´": 1771,
1774
+ "±´³": 1772,
1775
+ "±´³³": 1773,
1776
+ "±´³³¶": 1774,
1777
+ "ôéåß÷åéçèô": 1775,
1778
+ "óé": 1776,
1779
+ "ó ¨": 1777,
1780
+ "ãïîæéç ½ ": 1778,
1781
+ "Š        Š        óåìæ®": 1779,
1782
+ "ôïëåîßåíâåääéîç": 1780,
1783
+ "¬Š                ãïîæéç®": 1781,
1784
+ "éîô ": 1782,
1785
+ "çåôßîõíßðáòáí": 1783,
1786
+ "éîéôéáìéúå": 1784,
1787
+ "åìéæ ": 1785,
1788
+ "îõíåì": 1786,
1789
+ "æìïáô¨§": 1787,
1790
+ "éîæ§": 1788,
1791
+ "óÛéÝ": 1789,
1792
+ " éæ ": 1790,
1793
+ "åìóå ": 1791,
1794
+ "íáøßîå÷ßôïëåîóº éîô ½ ": 1792,
1795
+ "áõ": 1793,
1796
+ "åöáì¨": 1794,
1797
+ "Š            Š            £ ": 1795,
1798
+ "ìïçéôóÛ": 1796,
1799
+ "¡½ ": 1797,
1800
+ "éî犠           éæ ": 1798,
1801
+ " ¾ ": 1799,
1802
+ "Û®®®": 1800,
1803
+ "ðòïâ": 1801,
1804
+ "­±Ý": 1802,
1805
+ "áôôåò¨": 1803,
1806
+ "Š        Š        òåôõòî ": 1804,
1807
+ "©Š    òåôõòî ": 1805,
1808
+ "¨ãïîæéç": 1806,
1809
+ "©ŠŠŠéæ ßßîáíåßß ½½ ": 1807,
1810
+ "¢ßßíáéîßß": 1808,
1811
+ "¢ßßíáéîßߢ": 1809,
1812
+ "¢ßßíáéîßߢºŠ    ": 1810,
1813
+ "¢ßßíáéîßߢºŠ    £ ": 1811,
1814
+ "¢ßßíáéîßߢºŠ    £ Ôåóô ": 1812,
1815
+ "óº û": 1813,
1816
+ "ïõôðõôó ½ íïäåì": 1814,
1817
+ "ïõôðõôóÛ§ìïó": 1815,
1818
+ "ïõôðõôóÛ§ìïóó§Ý": 1816,
1819
+ "ý¢©Š<|endoftext|>": 1817,
1820
+ "òé": 1818,
1821
+ "Äé": 1819,
1822
+ "Äéóôòéâõôå": 1820,
1823
+ "ÆÓ": 1821,
1824
+ "ÆÓÄÐ": 1822,
1825
+ "ÄÄÐ": 1823,
1826
+ "áòçðáò": 1824,
1827
+ "áòçðáòóå": 1825,
1828
+ "æó": 1826,
1829
+ "æóä": 1827,
1830
+ "õôéìó®": 1828,
1831
+ "Ôåøô": 1829,
1832
+ "ÔåøôÆéìå": 1830,
1833
+ "õðßäé": 1831,
1834
+ "õðßäéóôòéâõôå": 1832,
1835
+ "õðßäéóôòéâõôåä¨": 1833,
1836
+ "òáî묠": 1834,
1837
+ "óèõ": 1835,
1838
+ "óèõææ": 1836,
1839
+ "óèõææìå": 1837,
1840
+ "©ºŠ    ¢¢¢Ãòåáôå ": 1838,
1841
+ "íïäåì®ôòáéî¨": 1839,
1842
+ "éîðõôßéäó§Ý": 1840,
1843
+ "ìáâåìó§Ý": 1841,
1844
+ "ïðôéíéúåò®": 1842,
1845
+ "äáôå": 1843,
1846
+ "éôåí¨": 1844,
1847
+ "öáìßìïáäåò": 1845,
1848
+ "¬Š    äåöéã嬊    ": 1846,
1849
+ "âåóôß": 1847,
1850
+ "ôòáéîßéôåòáôïò": 1848,
1851
+ "éôåò¨": 1849,
1852
+ "èáó": 1850,
1853
+ "èáóáô": 1851,
1854
+ "èáóáôôò": 1852,
1855
+ "ôéíå®": 1853,
1856
+ "ôéíå®ôéíå": 1854,
1857
+ "íáøßóôåð": 1855,
1858
+ "îåøô¨": 1856,
1859
+ "Ìïç": 1857,
1860
+ "Ìïçç": 1858,
1861
+ "ý¯": 1859,
1862
+ "óáöåßãèåãëðïéîô¨": 1860,
1863
+ "áòçó®ïõôðõôßäéò ¯ ": 1861,
1864
+ "ðô§": 1862,
1865
+ "©Š                        ": 1863,
1866
+ "󺊠               âòåáë": 1864,
1867
+ "ãïíðìå": 1865,
1868
+ "éî û": 1866,
1869
+ "äáôáßìïáäåò": 1867,
1870
+ "ôïôáìßìïó": 1868,
1871
+ "óÝ": 1869,
1872
+ "íáéî¨": 1870,
1873
+ "Áò": 1871,
1874
+ "Ôòáéî ": 1872,
1875
+ "áòç󊠠  ": 1873,
1876
+ "áòç󊠠  ðáòóåò®áääßáòçõíåîô¨§­­": 1874,
1877
+ "°°°©Š    ðáòóåò®áääßáòçõíåîô¨§­­": 1875,
1878
+ "íáø­": 1876,
1879
+ "²°": 1877,
1880
+ "²°´": 1878,
1881
+ "ìåáòîéîç": 1879,
1882
+ "òáôå": 1880,
1883
+ "òåóõí": 1881,
1884
+ "õóå": 1882,
1885
+ "éóßäéóôòéâõôåä": 1883,
1886
+ "Š    Š    £ ": 1884,
1887
+ "éæ áòçó®òáîë ½½ °ºŠ        ": 1885,
1888
+ "äéò¨": 1886,
1889
+ "ó½Ôòõå": 1887,
1890
+ "ÇÐ": 1888,
1891
+ "ôïòãè®ãõäá®éóß": 1889,
1892
+ "ôïòãè®ãõäá®éóßáöáéìáâìå": 1890,
1893
+ "ãðõ": 1891,
1894
+ "ãðõ§": 1892,
1895
+ "©Š    Š    £ Ìïáä ": 1893,
1896
+ "ôïëåîéúåòŠ    ": 1894,
1897
+ "ïó®ðáôè®": 1895,
1898
+ "ïó®ðáôè®åøéóô": 1896,
1899
+ "ïó®ðáôè®åøéóôó¨": 1897,
1900
+ "Ìïáäéîç ": 1898,
1901
+ "ôïëåîéúåò æòïí ": 1899,
1902
+ "ý¢©Š        ": 1900,
1903
+ "ôïëåîéúåò ½ ÂùôåÌåöåìÂÐÅÔïëåîéúåò": 1901,
1904
+ "ìïáä¨áòçó®": 1902,
1905
+ "áôéîç ": 1903,
1906
+ "æ®": 1904,
1907
+ "®óôòéð¨": 1905,
1908
+ "§©ºŠ        ": 1906,
1909
+ "äáôᬠ": 1907,
1910
+ "äáôáóåô ½ ×éëéðåäéáÄáôáóåô¨": 1908,
1911
+ "¬ íáøßìåîçôè": 1909,
1912
+ "ó¨íïäåì¬��": 1910,
1913
+ " ½ ôïòãè®ãõäá®": 1911,
1914
+ "®®®¢©Š    ": 1912,
1915
+ "öïãáâ ½ ûý": 1913,
1916
+ "¾§º ": 1914,
1917
+ "ßçåôß": 1915,
1918
+ "ßçåôßóôáô": 1916,
1919
+ "ó ïæ ": 1917,
1920
+ "äåæáõìôäéãô¨": 1918,
1921
+ "äåæáõìôäéãô¨éîô": 1919,
1922
+ "äåæáõìôäéãô¨éîô©Š        ": 1920,
1923
+ "äåæáõìôäéãô¨éîô©Š        æïò ": 1921,
1924
+ " éî öïãáâ": 1922,
1925
+ "­ ±": 1923,
1926
+ "é « ±Ý": 1924,
1927
+ "©Ý ": 1925,
1928
+ "íåòçåß": 1926,
1929
+ "íåòçåßöïãáâ": 1927,
1930
+ "¬ öïãáâ": 1928,
1931
+ "âéçòáí": 1929,
1932
+ " ½ òå®": 1930,
1933
+ "򧨿": 1931,
1934
+ "¡ÜÓ©§": 1932,
1935
+ "ôïëåîéúå": 1933,
1936
+ "Üð": 1934,
1937
+ "Üðû": 1935,
1938
+ "Üó": 1936,
1939
+ "ôåøôóº ÌéóôÛ": 1937,
1940
+ "ôåøôóº ÌéóôÛóôòÝ": 1938,
1941
+ "öïãáâ ½ ": 1939,
1942
+ "ôåøô éî ôåøô": 1940,
1943
+ "ìï÷": 1941,
1944
+ "¼¯÷": 1942,
1945
+ "¼¯÷¾": 1943,
1946
+ "÷ïòä ½ ": 1944,
1947
+ "æïòíáô": 1945,
1948
+ "Áä": 1946,
1949
+ "ó ôï ": 1947,
1950
+ "©ºŠ                éæ ": 1948,
1951
+ "îïô éî ": 1949,
1952
+ "îïô éî óåìæ®öïãáâ": 1950,
1953
+ "óåìæ®öïãáâÛ": 1951,
1954
+ "Ý ½ ìåî¨óåìæ®öïãáâ": 1952,
1955
+ "îõíßíåòçåó": 1953,
1956
+ "º âïïì": 1954,
1957
+ "º âïïì ½ Ôòõå": 1955,
1958
+ "Åîãïäå": 1956,
1959
+ " ÉÄ": 1957,
1960
+ "÷ïòäßôïëåî": 1958,
1961
+ "®çåô¨": 1959,
1962
+ "Äåãïäå": 1960,
1963
+ "òåöåòóåß": 1961,
1964
+ "ó ½ Û݊        æïò ": 1962,
1965
+ "¨óåìæ¬ ðáô躠óôò": 1963,
1966
+ "¨óåìæ¬ ðáô躠óôò©ºŠ        ¢¢¢": 1964,
1967
+ "æéì客¢¢Š        ": 1965,
1968
+ "÷éôè ïðåî¨ðáôè": 1966,
1969
+ "â§© áó æºŠ            ": 1967,
1970
+ "ðéãëìå®": 1968,
1971
+ "äáôá ½ ": 1969,
1972
+ "ìïá䍿": 1970,
1973
+ " ½ óåìæ®óðåãéáìßôïëåîóÛ§¼": 1971,
1974
+ "¾§ÝŠ        óåìæ®": 1972,
1975
+ "æòïí û": 1973,
1976
+ "óåñ éî ": 1974,
1977
+ "óåñ éî åîãïäåä": 1975,
1978
+ "íáøßìåî": 1976,
1979
+ "éîðõôßéä󧺠": 1977,
1980
+ "áôôåîôéïîßíáó맺 ": 1978,
1981
+ "âùôåóßôïß": 1979,
1982
+ "âùôåóßôïßõîéãïäå": 1980,
1983
+ "âùôå­ìåöåì ": 1981,
1984
+ "õôæ": 1982,
1985
+ "õôæ­¸": 1983,
1986
+ "ãïòðõó ": 1984,
1987
+ "é ¼ ": 1985,
1988
+ "é ¼ ìåî¨": 1986,
1989
+ "é ¼ ìåî¨÷ïòä": 1987,
1990
+ " áîä ÷ïòäÛ": 1988,
1991
+ "ºŠ                    îå÷ß÷ïòä": 1989,
1992
+ "ºŠ                    îå÷ß÷ïòä®áððåîä¨": 1990,
1993
+ "©Š    ôïëåîéúåò®": 1991,
1994
+ "óáíðìåßôåøô": 1992,
1995
+ "󠽠ۊ        ": 1993,
1996
+ "¬Š    Ý": 1994,
1997
+ "ôïëåîéúåò®äåãïäå¨": 1995,
1998
+ "òåñõåóô": 1996,
1999
+ "Éôåòáâìå": 1997,
2000
+ "ÉôåòáâìåÄáôáóåô": 1998,
2001
+ "íéîßáòôéãìå": 1999
2002
+ }