from collections import OrderedDict


def get_consecutive_tokens(li, window_size=4):
    # For each start index i, collect the candidate n-grams li[i:i+1] .. li[i:i+window_size].
    if len(li) == 0:
        return OrderedDict()  # keep the return type consistent with the non-empty case
    final_token_dict = OrderedDict((idx, []) for idx in range(len(li)))
    i = 0
    while i <= len(li) - 1:
        j = 1
        while j <= window_size:
            final_token_dict[i].append(tuple(li[i:i + j]))
            j += 1
        i += 1
    # Reverse each candidate list so the longest n-gram is tried first (greedy longest match).
    reversed_token_dict = {key: list(reversed(value)) for key, value in final_token_dict.items()}
    return reversed_token_dict
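
# A worked sketch of the output shape (illustrative values, not from the real
# pipeline): for li = [1, 2, 3] and window_size = 2, the function returns the
# candidate n-grams per start index, longest first:
#   {0: [(1, 2), (1,)],
#    1: [(2, 3), (2,)],
#    2: [(3,), (3,)]}
# Slices past the end of the list are truncated, so the last index repeats its
# 1-gram; the duplicates are harmless for the vocabulary lookup below.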


def search_consecutive_tokens(ordered_dict, encoded_token_dict):
    # Greedy longest-match encoder: at each position, take the first (longest)
    # candidate n-gram found in the vocabulary and jump ahead by its length.
    final_encoded_tokens = []
    printer_dict = OrderedDict()
    keys = list(ordered_dict.keys())
    i = 0
    while i < len(keys):
        key = keys[i]
        j = 0
        jump = False
        while j < len(ordered_dict[key]):
            candidate = ordered_dict[key][j]
            if candidate in encoded_token_dict:
                final_encoded_tokens.append(encoded_token_dict[candidate])
                printer_dict[candidate] = encoded_token_dict[candidate]
                i += len(candidate)  # skip past the matched n-gram
                jump = True
                break
            j += 1
        if not jump:
            i += 1  # no match at this position; the token is skipped, not encoded
    return final_encoded_tokens, printer_dict
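
# A worked sketch of the greedy match (toy vocabulary with hypothetical ids,
# not one produced by the tokenizer module):
#   candidates = get_consecutive_tokens([1, 2, 3, 4, 5], window_size=4)
#   vocab = {(1, 2, 3): 7, (4,): 8, (5,): 9}
#   search_consecutive_tokens(candidates, vocab)
# returns ([7, 8, 9], OrderedDict([((1, 2, 3), 7), ((4,), 8), ((5,), 9)])):
# (1, 2, 3) matches at position 0 and the cursor jumps three tokens ahead, then
# (4,) and (5,) each match as 1-grams. A position whose candidates are all
# missing from the vocabulary is skipped, so its token is silently dropped.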


if __name__ == "__main__":
    ## Encoder
    import tokenizer

    text = "తెలుగు భాష ఒక ద్రావిడ భాష."  # Telugu: "Telugu is a Dravidian language."
    encoded_tokens = [token.encode('utf-8') for token in text]
    consecutive_tokens = get_consecutive_tokens(encoded_tokens, window_size=4)
    # Read the vocabulary from file
    formatted_vocab = tokenizer.read_vocab_from_file()
    # Invert the vocabulary
    inverted_vocab = {v: k for k, v in formatted_vocab.items()}
    # Expand the vocabulary
    decoder_map = tokenizer.expand_vocab(inverted_vocab)
    # Invert back again after expansion: tuple of byte tokens -> token id
    re_inverted_vocab = {k: v for v, k in decoder_map.items()}
    token_ids, printer_dict = search_consecutive_tokens(consecutive_tokens, re_inverted_vocab)
    print(token_ids)
    print([(b''.join(key).decode('utf-8'), value) for key, value in printer_dict.items()])
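    # Assumed shapes of the external `tokenizer` module, inferred from its use
    # here and in the decoder below (an inference, not a confirmed API):
    #   read_vocab_from_file() -> dict whose inversion maps token id -> vocab entry
    #   expand_vocab(...)      -> dict mapping token id -> tuple of UTF-8 byte
    #                             tokens, so re-inverting it yields the
    #                             tuple-of-bytes -> id lookup used above.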

    ## Decoder
    # import tokenizer
    # text = "573, 312, 255, 255, 419, 55, 255, 255, 394, 255, 255, 624, 62, 291, 33, 255, 255, 419, 55, 254"
    # toks_li = [token for token in text.split(',')]
    # # Read the vocabulary from file
    # formatted_vocab = tokenizer.read_vocab_from_file()
    # # Invert the vocabulary
    # inverted_vocab = {v: k for k, v in formatted_vocab.items()}
    # # Expand the vocabulary
    # decoder_map = tokenizer.expand_vocab(inverted_vocab)
    # # Map each id back to its tuple of byte tokens, then flatten, join, and decode
    # decoded_tokens = [decoder_map.get(int(token)) for token in toks_li]
    # decoded_tokens = [item for token in decoded_tokens for item in token]
    # decoded_tokens = b''.join(decoded_tokens)
    # decoded_tokens = decoded_tokens.decode('utf-8')
    # print(decoded_tokens)
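    # A self-contained decode sketch with a toy decoder_map (hypothetical ids
    # and byte tuples, not from the real vocabulary file):
    #   toy_map = {9: (b'\xe0', b'\xb0'), 10: (b'\xa4',)}
    #   flat = [b for tok_id in [9, 10] for b in toy_map[tok_id]]
    #   b''.join(flat).decode('utf-8')  # -> 'త' (U+0C24)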

    # Quick self-test with plain integers:
    # op_li = get_consecutive_tokens([1, 2, 3, 4, 5])
    # print(op_li)
    # vocab = {(1, 2): 9, (3,): 10, (4, 5): 11}  # renamed to avoid shadowing the builtin `dict`
    # opp = search_consecutive_tokens(op_li, vocab)
    # print(opp)

    # Decode a short id string through the real vocabulary:
    # text = "9,10,11"
    # toks_li = [token for token in text.split(',')]
    # formatted_vocab = tokenizer.read_vocab_from_file()
    # inverted_vocab = {v: k for k, v in formatted_vocab.items()}
    # decoder_map = tokenizer.expand_vocab(inverted_vocab)
    # decoded_tokens = [decoder_map.get(int(token)) for token in toks_li]
    # print(decoded_tokens)

    # Round-trip scratch (encode_tokens_parallel is defined elsewhere):
    # encoded_tokens = encode_tokens_parallel(text, chunk_size=1_000_000, max_workers=2)
    # encoded_tokens = [token.encode('utf-8') for token in text]
    # decoded_tokens = [i.decode('utf-8') for i in encoded_tokens]
    # print(get_consecutive_tokens(decoded_tokens))