ChaitraSaiK committed on
Commit
b3797cd
·
1 Parent(s): 4984d4a

removed 350_file, third commit

Browse files
Files changed (2) hide show
  1. app.py +152 -75
  2. bpe_vocab_350_merges.pkl +0 -0
app.py CHANGED
@@ -1,8 +1,54 @@
1
  import gradio as gr
2
- import pickle
3
  from typing import List, Dict, Tuple
4
  import numpy as np
5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  class OptimizedBPETokenizer:
7
  def __init__(self, merges: Dict[Tuple[int, int], int]):
8
  self.merges = merges
@@ -19,98 +65,129 @@ class OptimizedBPETokenizer:
19
  if not isinstance(text, str):
20
  return []
21
 
22
- ids = np.array(list(text.encode('utf-8')), dtype=np.uint16)
 
23
 
24
- result = []
25
- for i in range(0, len(ids), chunk_size):
26
- chunk = ids[i:i + chunk_size]
27
- processed_chunk = self._encode_chunk(chunk)
28
- result.extend(processed_chunk)
29
-
30
- return result
31
-
32
- def _encode_chunk(self, ids: np.ndarray) -> List[int]:
33
- output = []
34
- i = 0
35
- while i < len(ids):
36
- if i < len(ids) - 1:
37
- first, second = ids[i], ids[i + 1]
38
- if first in self.merge_lookup and second in self.merge_lookup[first]:
39
- output.append(self.merge_lookup[first][second])
40
- i += 2
41
- continue
42
- output.append(ids[i])
43
- i += 1
44
- return output
45
-
46
- def decode(self, ids: List[int], chunk_size: int = 1000000) -> str:
47
- byte_tokens = []
48
- for i in range(0, len(ids), chunk_size):
49
- chunk = ids[i:i + chunk_size]
50
- decoded_chunk = self._decode_chunk(chunk)
51
- byte_tokens.extend(decoded_chunk)
52
-
53
- return bytes(byte_tokens).decode('utf-8')
54
 
55
- def _decode_chunk(self, ids: List[int]) -> List[int]:
56
  result = []
57
  for token in ids:
58
  if token < 256:
59
  result.append(token)
60
  else:
61
- result.extend(self._expand_token(token))
62
- return result
 
 
 
63
 
64
  def _expand_token(self, token: int) -> List[int]:
65
  if token < 256:
66
  return [token]
67
-
68
  pair = self.idx_to_pair[token]
69
- expanded = []
70
- for t in pair:
71
- expanded.extend(self._expand_token(t))
72
- return expanded
73
-
74
- # Load the pre-trained merges
75
- with open("bpe_vocab_350_merges.pkl", "rb") as f:
76
- merges = pickle.load(f)
77
 
 
78
  tokenizer = OptimizedBPETokenizer(merges)
79
 
80
- def process_text(text: str, operation: str) -> str:
81
- if operation == "Encode":
 
 
 
82
  tokens = tokenizer.encode(text)
83
  return f"Encoded tokens: {tokens}\nToken count: {len(tokens)}"
84
- else: # Decode
85
- try:
86
- # Convert string of numbers to list of integers
87
- tokens = [int(x) for x in text.strip('[]').split(',')]
88
- decoded_text = tokenizer.decode(tokens)
89
- return f"Decoded text: {decoded_text}"
90
- except:
91
- return "Error: Please provide a valid list of integers for decoding"
92
 
93
- # Create the Gradio interface
94
- iface = gr.Interface(
95
- fn=process_text,
96
- inputs=[
97
- gr.Textbox(label="Input Text", placeholder="Enter text to encode or tokens to decode..."),
98
- gr.Radio(["Encode", "Decode"], label="Operation", value="Encode")
99
- ],
100
- outputs=gr.Textbox(label="Output"),
101
- title="Telugu BPE Tokenizer",
102
- description="A byte-pair encoding tokenizer trained on Telugu text. For encoding, enter Telugu text. For decoding, enter a list of integers (e.g., [256, 257, 258])."
103
- )
104
 
105
- # if __name__ == "__main__":
106
- # # Test encoding
107
- # test_text = "నమస్కారం" # Telugu "Hello"
108
- # encoded = tokenizer.encode(test_text)
109
- # print(f"Test Encode: '{test_text}' -> {encoded}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
 
111
- # # Test decoding
112
- # decoded = tokenizer.decode(encoded)
113
- # print(f"Test Decode: {encoded} -> '{decoded}'")
 
 
 
114
 
115
- # Launch the interface
116
- iface.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
 
2
  from typing import List, Dict, Tuple
3
  import numpy as np
4
 
5
def get_stats(ids):
    """Count how often each adjacent token pair occurs in *ids*.

    Returns a dict mapping (left, right) token-id pairs to their frequency.
    """
    pair_counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        if key in pair_counts:
            pair_counts[key] += 1
        else:
            pair_counts[key] = 1
    return pair_counts
10
+
11
def merge(ids, pair, idx):
    """Return a copy of *ids* with every adjacent occurrence of *pair* replaced by *idx*."""
    first, second = pair
    out = []
    pos = 0
    n = len(ids)
    while pos < n:
        # Only attempt a pair match when a right-hand neighbour exists.
        if pos + 1 < n and ids[pos] == first and ids[pos + 1] == second:
            out.append(idx)
            pos += 2
        else:
            out.append(ids[pos])
            pos += 1
    return out
22
+
23
# Read the Telugu text file and train BPE
def train_bpe(vocab_size: int = 350,
              corpus_path: str = 'telugu_preprocessed_file.txt') -> Dict[Tuple[int, int], int]:
    """Train a byte-level BPE merge table on a UTF-8 text corpus.

    Args:
        vocab_size: Target vocabulary size; ``vocab_size - 256`` merges are
            learned on top of the 256 raw byte values.
        corpus_path: Path to the UTF-8 text file to train on. Defaults to the
            preprocessed Telugu corpus this app ships with (previously
            hard-coded; now a parameter so other corpora can be used).

    Returns:
        Mapping from a (left, right) token-id pair to the merged token id.
    """
    with open(corpus_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Start from the raw UTF-8 byte stream of the corpus.
    ids = list(text.encode('utf-8'))

    num_merges = vocab_size - 256
    merges = {}

    for i in range(num_merges):
        stats = get_stats(ids)
        if not stats:  # Corpus exhausted: no adjacent pairs left to merge.
            break
        # Greedily merge the most frequent remaining pair.
        pair = max(stats, key=stats.get)
        idx = 256 + i
        print(f"merging {pair} into a new token {idx}")  # Optional: for monitoring training
        ids = merge(ids, pair, idx)
        merges[pair] = idx

    return merges
48
+
49
# Learn the merge table once at import time.
merges = train_bpe()
51
+
52
  class OptimizedBPETokenizer:
53
  def __init__(self, merges: Dict[Tuple[int, int], int]):
54
  self.merges = merges
 
65
  if not isinstance(text, str):
66
  return []
67
 
68
+ # Convert to regular integers instead of numpy types
69
+ ids = [int(x) for x in text.encode('utf-8')]
70
 
71
+ # Apply merges
72
+ while True:
73
+ stats = get_stats(ids)
74
+ if not stats:
75
+ break
76
+ pair = max(stats, key=stats.get)
77
+ if pair not in self.merges:
78
+ break
79
+ ids = merge(ids, pair, self.merges[pair])
80
+
81
+ return ids
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
+ def decode(self, ids: List[int]) -> str:
84
  result = []
85
  for token in ids:
86
  if token < 256:
87
  result.append(token)
88
  else:
89
+ # Expand merged tokens
90
+ pair = self.idx_to_pair[token]
91
+ result.extend(self._expand_token(pair[0]))
92
+ result.extend(self._expand_token(pair[1]))
93
+ return bytes(result).decode('utf-8')
94
 
95
  def _expand_token(self, token: int) -> List[int]:
96
  if token < 256:
97
  return [token]
 
98
  pair = self.idx_to_pair[token]
99
+ result = []
100
+ result.extend(self._expand_token(pair[0]))
101
+ result.extend(self._expand_token(pair[1]))
102
+ return result
 
 
 
 
103
 
104
# Build the tokenizer from the freshly trained merge table.
tokenizer = OptimizedBPETokenizer(merges)
106
 
107
def encode_text(text: str) -> str:
    """Gradio handler: encode *text* and report the token ids and count."""
    if not text:
        return "Please enter text to encode"
    try:
        tokens = tokenizer.encode(text)
    except Exception as e:
        return f"Encoding error: {str(e)}"
    return f"Encoded tokens: {tokens}\nToken count: {len(tokens)}"
 
 
 
 
 
 
116
 
117
def decode_tokens(text: str) -> str:
    """Gradio handler: parse comma-separated token ids and decode them to text."""
    if not text:
        return "Please enter tokens to decode"
    try:
        # Accept optional surrounding brackets, e.g. "[256,257]".
        token_ids = [int(piece) for piece in text.strip('[]').split(',')]
        decoded_text = tokenizer.decode(token_ids)
        return f"Decoded text: {decoded_text}"
    except Exception as e:
        return f"Error: Please provide valid integers for decoding. Details: {str(e)}"
 
127
 
128
# Create the Gradio interface
with gr.Blocks(title="Telugu BPE Tokenizer") as iface:
    gr.Markdown("# Telugu BPE Tokenizer")
    gr.Markdown("A byte-pair encoding tokenizer trained on Telugu text.")

    with gr.Row():
        # Encoding Section
        with gr.Column():
            gr.Markdown("### Encode Text")
            text_in = gr.Textbox(
                label="Input Text",
                placeholder="Enter Telugu text to encode...",
            )
            btn_encode = gr.Button("Encode")
            out_encode = gr.Textbox(label="Encoding Result")

        # Decoding Section
        with gr.Column():
            gr.Markdown("### Decode Tokens")
            tokens_in = gr.Textbox(
                label="Input Tokens",
                placeholder="Enter comma-separated tokens (e.g., 256,257,258)",
            )
            btn_decode = gr.Button("Decode")
            out_decode = gr.Textbox(label="Decoding Result")

    # Wire each button to its handler.
    btn_encode.click(fn=encode_text, inputs=text_in, outputs=out_encode)
    btn_decode.click(fn=decode_tokens, inputs=tokens_in, outputs=out_decode)

    # Clickable examples for both directions.
    with gr.Row():
        with gr.Column():
            gr.Examples(
                examples=[
                    ["నమస్కారం"],
                    ["తెలుగు భాష"],
                ],
                inputs=text_in,
                outputs=out_encode,
                fn=encode_text,
                label="Encoding Examples",
            )

        with gr.Column():
            gr.Examples(
                examples=[
                    ["256,257,258"],  # Example tokens
                ],
                inputs=tokens_in,
                outputs=out_decode,
                fn=decode_tokens,
                label="Decoding Examples",
            )

if __name__ == "__main__":
    iface.launch()
bpe_vocab_350_merges.pkl DELETED
Binary file (984 Bytes)