ChaitraSaiK committed on
Commit
21a6d06
·
1 Parent(s): bb20c6e

first commit

Browse files
Files changed (3) hide show
  1. app.py +115 -0
  2. bpe_vocab_350_merges.pkl +3 -0
  3. requirements.txt +2 -0
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import numpy as np
3
+ import pickle
4
+ from typing import List, Dict, Tuple
5
+
6
class OptimizedBPETokenizer:
    """Byte-level BPE tokenizer driven by a pre-trained merge table.

    ``merges`` maps a pair of token ids to the id of the merged token.
    Ids 0-255 are raw UTF-8 byte values; merged ids start at 256.
    Encoding is a single greedy left-to-right pass over the byte stream.
    """

    def __init__(self, merges: Dict[Tuple[int, int], int]):
        self.merges = merges
        # Reverse map (merged id -> constituent pair), used by decode.
        self.idx_to_pair = {idx: pair for pair, idx in merges.items()}

        # Two-level lookup {first: {second: merged_id}} so encode can test
        # a candidate pair with two O(1) dict probes.
        self.merge_lookup: Dict[int, Dict[int, int]] = {}
        for (first, second), idx in merges.items():
            self.merge_lookup.setdefault(first, {})[second] = idx

    def encode(self, text: str, chunk_size: int = 1000000) -> List[int]:
        """Encode ``text`` into a list of BPE token ids.

        Non-string input returns ``[]`` (original contract, kept).
        Fix: the original appended numpy ``uint16`` scalars for unmerged
        bytes; under NumPy 2 their repr ("np.uint16(224)") leaked into the
        UI output and broke the copy-paste decode round-trip. The result
        is now plain Python ints.
        """
        if not isinstance(text, str):
            return []

        # Raw UTF-8 bytes as plain ints; the numpy array added no benefit.
        ids = list(text.encode('utf-8'))

        result: List[int] = []
        # Chunking bounds per-call work on huge inputs.
        # NOTE(review): a pair straddling a chunk boundary is never merged;
        # negligible at the default 1e6 chunk size, kept from the original.
        for i in range(0, len(ids), chunk_size):
            result.extend(self._encode_chunk(ids[i:i + chunk_size]))

        return result

    def _encode_chunk(self, ids: List[int]) -> List[int]:
        """Single greedy left-to-right merge pass over ``ids``.

        NOTE(review): unlike canonical BPE this does not re-apply merges
        to freshly merged tokens or honor merge priority; kept as-is to
        stay consistent with how the shipped merge table behaves --
        confirm before changing.
        """
        output: List[int] = []
        i = 0
        n = len(ids)
        while i < n:
            if i + 1 < n:
                first, second = ids[i], ids[i + 1]
                merged = self.merge_lookup.get(first, {}).get(second)
                if merged is not None:
                    output.append(merged)
                    i += 2
                    continue
            # int() guards against numpy scalars if an ndarray is passed.
            output.append(int(ids[i]))
            i += 1
        return output

    def decode(self, ids: List[int], chunk_size: int = 1000000) -> str:
        """Decode token ids back to text.

        Raises KeyError on ids absent from the merge table and
        UnicodeDecodeError if the expanded bytes are not valid UTF-8.
        """
        byte_tokens: List[int] = []
        for i in range(0, len(ids), chunk_size):
            byte_tokens.extend(self._decode_chunk(ids[i:i + chunk_size]))

        return bytes(byte_tokens).decode('utf-8')

    def _decode_chunk(self, ids: List[int]) -> List[int]:
        """Expand each token id in ``ids`` into raw byte values (0-255)."""
        result: List[int] = []
        for token in ids:
            if token < 256:
                result.append(token)  # already a raw byte
            else:
                result.extend(self._expand_token(token))
        return result

    def _expand_token(self, token: int) -> List[int]:
        """Recursively expand a merged token into its constituent bytes."""
        if token < 256:
            return [token]

        pair = self.idx_to_pair[token]  # KeyError => unknown token id
        expanded: List[int] = []
        for t in pair:
            expanded.extend(self._expand_token(t))
        return expanded
73
+
74
# Load the pre-trained BPE merge table from disk.
def load_tokenizer():
    """Build an ``OptimizedBPETokenizer`` from ``bpe_vocab_350_merges.pkl``.

    Raises FileNotFoundError (was a generic ``Exception``, which hid the
    error category from callers) when the merge file is missing; the
    chained cause is suppressed for a clean message.
    NOTE(review): ``pickle.load`` executes arbitrary code from the file;
    acceptable only because the .pkl ships with the app -- never point
    this at untrusted input.
    """
    try:
        with open("bpe_vocab_350_merges.pkl", "rb") as f:
            merges = pickle.load(f)
    except FileNotFoundError:
        raise FileNotFoundError("Tokenizer merges file not found!") from None
    return OptimizedBPETokenizer(merges)
82
# Module-level singleton: load the merge table once at import time.
# NOTE(review): import fails hard if the .pkl is absent -- intentional,
# since the app is useless without it.
tokenizer = load_tokenizer()
85
def process_text(text, mode):
    """Gradio callback: encode text to tokens or decode tokens to text.

    Parameters
    ----------
    text : str
        Telugu text (Encode mode) or a comma-separated token list,
        with or without surrounding brackets (Decode mode).
    mode : str
        ``"Encode"``; any other value selects the decode path.

    Returns
    -------
    str
        Formatted token list, decoded text, or an error message.
    """
    if mode == "Encode":
        encoded = tokenizer.encode(text)
        return f"Encoded tokens: {encoded}\nToken count: {len(encoded)}"
    else:  # Decode
        try:
            # Convert string of numbers to list of integers.
            tokens = [int(t) for t in text.strip('[]').split(',')]
            return tokenizer.decode(tokens)
        except (ValueError, KeyError, UnicodeDecodeError):
            # ValueError: non-numeric input; KeyError: unknown token id;
            # UnicodeDecodeError: ids that don't form valid UTF-8.
            # (Was a bare ``except:``, which also swallowed
            # KeyboardInterrupt/SystemExit.)
            return "Error: Please provide tokens as comma-separated numbers"
97
+
98
# Build the web UI: one text box in, one text box out, plus a mode toggle.
text_box = gr.Textbox(label="Input Text", lines=5)
mode_radio = gr.Radio(["Encode", "Decode"], label="Mode", value="Encode")
result_box = gr.Textbox(label="Output", lines=5)

iface = gr.Interface(
    fn=process_text,
    inputs=[text_box, mode_radio],
    outputs=result_box,
    title="Telugu BPE Tokenizer",
    description="Encode Telugu text into BPE tokens or decode tokens back to text.",
    examples=[
        ["నమస్కారం", "Encode"],
        ["[224, 176, 184, 224, 176, 184]", "Decode"],
    ],
)
113
+
114
# Launch the Gradio server only when run as a script (not on import).
if __name__ == "__main__":
    iface.launch()
bpe_vocab_350_merges.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d98eabc47c17d13488f9cd411089290a90f72ea2a3514ac734816fa823faa10
3
+ size 981
requirements.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ gradio
2
+ numpy