teszenofficial committed on
Commit
fc3b75f
·
verified ·
1 Parent(s): 027d58e

Delete tokenizer.py

Browse files
Files changed (1) hide show
  1. tokenizer.py +0 -138
tokenizer.py DELETED
@@ -1,138 +0,0 @@
1
- import sentencepiece as spm
2
- import os
3
- import json
4
-
5
-
6
class MTPTokenizer:
    """Tokenizer backed by a SentencePiece BPE model.

    A tokenizer can be created empty and trained with :meth:`train`, or
    pointed at an existing ``.model`` file, which is loaded immediately if
    the path exists. Models produced by :meth:`train` use the fixed special
    token IDs pad=0, unk=1, bos=2, eos=3.
    """

    # Trainer settings shared by the first attempt and the reduced-vocab
    # retry, kept in one place so the two calls cannot drift apart.
    _TRAINER_OPTIONS = {
        'model_type': 'bpe',
        'pad_id': 0,
        'unk_id': 1,
        'bos_id': 2,
        'eos_id': 3,
        'character_coverage': 1.0,
        'normalization_rule_name': 'identity',
        'num_threads': 4,
        'split_digits': True,
        'allow_whitespace_only_pieces': False,
        'byte_fallback': False,
        'max_sentencepiece_length': 16,
    }

    _NOT_LOADED_MESSAGE = "Tokenizer not loaded. Train or load a model first."

    def __init__(self, model_path=None):
        """Create a tokenizer, loading ``model_path`` if it exists on disk.

        Args:
            model_path: Optional path to a trained SentencePiece model. A
                missing or non-existent path leaves the tokenizer unloaded.
        """
        self.sp = None
        self.model_path = model_path

        if model_path and os.path.exists(model_path):
            self.load(model_path)

    def train(self, corpus_path, vocab_size=4000, model_prefix='mtp_tokenizer'):
        """Train a SentencePiece BPE tokenizer on a JSONL corpus and load it.

        Every non-blank line of the corpus is parsed as JSON; the values of
        its ``instruction`` and ``response`` keys (when present) become the
        training text. The requested vocabulary size is capped at roughly
        15% of the corpus character count, and if the trainer still reports
        it as too high, training is retried once with the limit the trainer
        suggests.

        Args:
            corpus_path: Path to the UTF-8 JSONL corpus.
            vocab_size: Upper bound on the vocabulary size.
            model_prefix: Output prefix; the model is written to
                ``{model_prefix}.model``.

        Raises:
            ValueError: If the corpus yields no training text.
            RuntimeError: If SentencePiece training fails for a reason other
                than a recoverable "vocabulary size too high" error.
        """
        import tempfile

        texts = self._read_corpus_texts(corpus_path)
        total_chars = sum(len(text) for text in texts)
        if total_chars == 0:
            # Fail early with a clear message instead of handing the trainer
            # an empty file and a vocabulary size of zero.
            raise ValueError(
                f"No 'instruction' or 'response' text found in {corpus_path}"
            )

        # Heuristic: cap the vocabulary at ~15% of the character count.
        max_vocab = min(vocab_size, int(total_chars * 0.15))

        print(f" → Corpus stats: {len(texts)} texts, {total_chars} characters")
        print(f" → Adjusted vocab size: {max_vocab} (requested: {vocab_size})")

        temp_file = None
        try:
            # A uniquely named temp file avoids clobbering files in the
            # working directory and collisions between concurrent runs.
            with tempfile.NamedTemporaryFile(
                mode='w', encoding='utf-8', suffix='.txt', delete=False
            ) as f:
                temp_file = f.name
                f.write('\n'.join(texts))
            self._train_with_retry(temp_file, model_prefix, max_vocab)
        finally:
            # Clean up even when training fails.
            if temp_file is not None and os.path.exists(temp_file):
                os.remove(temp_file)

        # Load the trained model
        self.model_path = f"{model_prefix}.model"
        self.load(self.model_path)

        print(f"✓ Tokenizer trained: {self.vocab_size()} tokens")
        print(f"✓ Model saved: {self.model_path}")

    @staticmethod
    def _read_corpus_texts(corpus_path):
        """Return the instruction/response strings found in a JSONL corpus."""
        texts = []
        with open(corpus_path, 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    # Blank lines (e.g. a trailing newline) are not records.
                    continue
                record = json.loads(line)
                for key in ('instruction', 'response'):
                    if key in record:
                        texts.append(record[key])
        return texts

    def _run_trainer(self, input_file, model_prefix, vocab_size):
        """Invoke the SentencePiece trainer with the shared option set."""
        spm.SentencePieceTrainer.train(
            input=input_file,
            model_prefix=model_prefix,
            vocab_size=vocab_size,
            **self._TRAINER_OPTIONS,
        )

    def _train_with_retry(self, input_file, model_prefix, vocab_size):
        """Train once, retrying with the trainer's suggested vocab limit."""
        import re

        try:
            self._run_trainer(input_file, model_prefix, vocab_size)
        except RuntimeError as e:
            message = str(e)
            match = None
            if "Vocabulary size too high" in message:
                # The trainer's message names the largest acceptable size.
                match = re.search(r'value <= (\d+)', message)
            if match is None:
                raise
            suggested_max = int(match.group(1))
            print(f" → Retrying with vocab size: {suggested_max}")
            self._run_trainer(input_file, model_prefix, suggested_max)

    def _require_loaded(self):
        """Return the loaded processor, or raise ValueError if there is none."""
        if self.sp is None:
            raise ValueError(self._NOT_LOADED_MESSAGE)
        return self.sp

    def load(self, model_path):
        """Load a trained SentencePiece model from ``model_path``.

        The processor is only installed on the instance after loading
        succeeds, so a failed load leaves any previous state untouched.
        """
        processor = spm.SentencePieceProcessor()
        processor.load(model_path)
        self.sp = processor
        self.model_path = model_path

    def encode(self, text):
        """Encode text to a list of token IDs.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        return self._require_loaded().encode_as_ids(text)

    def decode(self, ids):
        """Decode a sequence of token IDs back to text.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        return self._require_loaded().decode_ids(ids)

    def vocab_size(self):
        """Return the vocabulary size, or 0 when no model is loaded."""
        if self.sp is None:
            return 0
        return self.sp.get_piece_size()

    def bos_id(self):
        """Return the beginning-of-sentence token ID.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        return self._require_loaded().bos_id()

    def eos_id(self):
        """Return the end-of-sentence token ID.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        return self._require_loaded().eos_id()

    def pad_id(self):
        """Return the padding token ID.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        return self._require_loaded().pad_id()

    def unk_id(self):
        """Return the unknown token ID.

        Raises:
            ValueError: If no model has been trained or loaded.
        """
        return self._require_loaded().unk_id()