anveshplus committed on
Commit
5f574bd
·
1 Parent(s): 96e2c6c
__pycache__/consecutive_tokens.cpython-312.pyc CHANGED
Binary files a/__pycache__/consecutive_tokens.cpython-312.pyc and b/__pycache__/consecutive_tokens.cpython-312.pyc differ
 
__pycache__/tokenizer.cpython-312.pyc CHANGED
Binary files a/__pycache__/tokenizer.cpython-312.pyc and b/__pycache__/tokenizer.cpython-312.pyc differ
 
app.py CHANGED
@@ -32,7 +32,7 @@ def decode(text):
32
  # Expand vocabulary
33
  decoder_map = tokenizer.expand_vocab(inverted_vocab)
34
  decoded_tokens = [decoder_map.get(int(token)) for token in toks_li]
35
- decoded_tokens = [token[0] for token in decoded_tokens]
36
  tokens = [token.decode('utf-8') for token in decoded_tokens]
37
  decoded_tokens = b''.join(decoded_tokens)
38
  decoded_tokens = decoded_tokens.decode('utf-8')
 
32
  # Expand vocabulary
33
  decoder_map = tokenizer.expand_vocab(inverted_vocab)
34
  decoded_tokens = [decoder_map.get(int(token)) for token in toks_li]
35
+ decoded_tokens = [item for token in decoded_tokens for item in token]
36
  tokens = [token.decode('utf-8') for token in decoded_tokens]
37
  decoded_tokens = b''.join(decoded_tokens)
38
  decoded_tokens = decoded_tokens.decode('utf-8')
consecutive_tokens.py CHANGED
@@ -38,24 +38,61 @@ def search_consecutive_tokens(ordered_dict, encoded_token_dict):
38
  return final_encoded_tokens
39
 
40
  if __name__ == "__main__":
41
- text = "తెలుగు భాష ఒక ద్రావిడ భాష."
42
- op_li = get_consecutive_tokens([1,2,3,4,5])
43
- print(op_li)
44
-
45
- dict = {(1,2):9,(3,):10, (4,5):11}
46
- opp = search_consecutive_tokens(op_li, dict)
47
- print(opp)
48
- text = "9,10,11"
 
 
 
 
 
 
 
 
 
 
 
 
 
49
  toks_li = [token for token in text.split(',')]
50
  # Reading vocabulary from file
51
- import tokenizer
52
  formatted_vocab = tokenizer.read_vocab_from_file()
53
  # Invert vocabulary
54
  inverted_vocab = {v: k for k, v in formatted_vocab.items()}
55
  # Expand vocabulary
56
  decoder_map = tokenizer.expand_vocab(inverted_vocab)
57
  decoded_tokens = [decoder_map.get(int(token)) for token in toks_li]
 
 
 
 
58
  print(decoded_tokens)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
59
  # encoded_tokens = encode_tokens_parallel(text, chunk_size=1_000_000, max_workers=2)
60
  # encoded_tokens = [token.encode('utf-8') for token in text]
61
  # decoded_tokens = [i.decode('utf-8') for i in encoded_tokens]
 
38
  return final_encoded_tokens
39
 
40
  if __name__ == "__main__":
41
+ ## Encoder
42
+ # import tokenizer
43
+ # text = "తెలుగు భాష ఒక ద్రావిడ భాష."
44
+ # encoded_tokens = [token.encode('utf-8') for token in text]
45
+ # consective_tokens = get_consecutive_tokens(encoded_tokens,window_size=4)
46
+ # # Reading vocabulary from file
47
+ # formatted_vocab = tokenizer.read_vocab_from_file()
48
+ # # Invert vocabulary
49
+ # inverted_vocab = {v: k for k, v in formatted_vocab.items()}
50
+ # # Expand vocabulary
51
+ # decoder_map = tokenizer.expand_vocab(inverted_vocab)
52
+ # # Invert back again after expansion
53
+ # re_inverted_vocab = {k: v for v, k in decoder_map.items()}
54
+
55
+ # # encoded_tokens = [re_inverted_vocab.get(token) for token in consective_tokens]
56
+ # encoded_tokens = search_consecutive_tokens(consective_tokens, re_inverted_vocab)
57
+ # print(encoded_tokens)
58
+
59
+ ## decoder:
60
+ import tokenizer
61
+ text = "573, 312, 255, 255, 419, 55, 255, 255, 394, 255, 255, 624, 62, 291, 33, 255, 255, 419, 55, 254"
62
  toks_li = [token for token in text.split(',')]
63
  # Reading vocabulary from file
 
64
  formatted_vocab = tokenizer.read_vocab_from_file()
65
  # Invert vocabulary
66
  inverted_vocab = {v: k for k, v in formatted_vocab.items()}
67
  # Expand vocabulary
68
  decoder_map = tokenizer.expand_vocab(inverted_vocab)
69
  decoded_tokens = [decoder_map.get(int(token)) for token in toks_li]
70
+ decoded_tokens = [item for token in decoded_tokens for item in token]
71
+ tokens = [token.decode('utf-8') for token in decoded_tokens]
72
+ decoded_tokens = b''.join(decoded_tokens)
73
+ decoded_tokens = decoded_tokens.decode('utf-8')
74
  print(decoded_tokens)
75
+
76
+
77
+ #op_li = get_consecutive_tokens([1,2,3,4,5])
78
+ #print(op_li)
79
+
80
+ # dict = {(1,2):9,(3,):10, (4,5):11}
81
+ # opp = search_consecutive_tokens(op_li, dict)
82
+ # print(opp)
83
+
84
+
85
+ # text = "9,10,11"
86
+ # toks_li = [token for token in text.split(',')]
87
+ # # Reading vocabulary from file
88
+ # import tokenizer
89
+ # formatted_vocab = tokenizer.read_vocab_from_file()
90
+ # # Invert vocabulary
91
+ # inverted_vocab = {v: k for k, v in formatted_vocab.items()}
92
+ # # Expand vocabulary
93
+ # decoder_map = tokenizer.expand_vocab(inverted_vocab)
94
+ # decoded_tokens = [decoder_map.get(int(token)) for token in toks_li]
95
+ # print(decoded_tokens)
96
  # encoded_tokens = encode_tokens_parallel(text, chunk_size=1_000_000, max_workers=2)
97
  # encoded_tokens = [token.encode('utf-8') for token in text]
98
  # decoded_tokens = [i.decode('utf-8') for i in encoded_tokens]
tokenizer.py CHANGED
@@ -121,7 +121,7 @@ if __name__ == "__main__":
121
  # 1. Load and encode tokens
122
  encoded_tokens = load_and_encode_tokens()
123
  # 2. Process BPE
124
- merges = bpe_process(encoded_tokens,vocab_size=1000, encoded_tokens_length=20_00_000)
125
  # 3. Build vocabulary
126
  build_vocabulary(merges)
127
  # 4. Read vocabulary from file
 
121
  # 1. Load and encode tokens
122
  encoded_tokens = load_and_encode_tokens()
123
  # 2. Process BPE
124
+ merges = bpe_process(encoded_tokens,vocab_size=1000, encoded_tokens_length=10_00_000)
125
  # 3. Build vocabulary
126
  build_vocabulary(merges)
127
  # 4. Read vocabulary from file