import streamlit as st import encoder_parallel_telugu as encode_parallel from consecutive_tokens import get_consecutive_tokens, search_consecutive_tokens import tokenizer def encode(text): if text == "": return "Enter text to encode..." encoded_tokens = [token.encode('utf-8') for token in text] consective_tokens = get_consecutive_tokens(encoded_tokens,window_size=4) # Reading vocabulary from file formatted_vocab = tokenizer.read_vocab_from_file() # Invert vocabulary inverted_vocab = {v: k for k, v in formatted_vocab.items()} # Expand vocabulary decoder_map = tokenizer.expand_vocab(inverted_vocab) # Invert back again after expansion re_inverted_vocab = {k: v for v, k in decoder_map.items()} # encoded_tokens = [re_inverted_vocab.get(token) for token in consective_tokens] encoded_tokens, printer_dict = search_consecutive_tokens(consective_tokens, re_inverted_vocab) print(encoded_tokens) printer = [(b''.join(key).decode('utf-8'), value) for key, value in printer_dict.items()] return f"Encoded: {encoded_tokens} , Printer: {printer}" def decode(text): # Placeholder for decoding logic toks_li = [token for token in text.split(',')] # Reading vocabulary from file formatted_vocab = tokenizer.read_vocab_from_file() # Invert vocabulary inverted_vocab = {v: k for k, v in formatted_vocab.items()} # Expand vocabulary decoder_map = tokenizer.expand_vocab(inverted_vocab) decoded_tokens = [decoder_map.get(int(token)) for token in toks_li] decoded_tokens = [item for token in decoded_tokens for item in token] tokens = [token.decode('utf-8') for token in decoded_tokens] decoded_tokens = b''.join(decoded_tokens) decoded_tokens = decoded_tokens.decode('utf-8') return f"->Decoded: {decoded_tokens} " st.set_page_config(page_title="Telugu BPE Tokenizer", layout="centered", initial_sidebar_state="expanded") st.markdown("