Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import encoder_parallel_telugu as encode_parallel | |
| from consecutive_tokens import get_consecutive_tokens, search_consecutive_tokens | |
| import tokenizer | |
def encode(text) -> str:
    """Encode ``text`` into token ids using the vocabulary stored on disk.

    Each character of ``text`` is UTF-8 encoded, the resulting byte strings
    are passed to ``get_consecutive_tokens`` with ``window_size=4``, and the
    result is looked up with ``search_consecutive_tokens`` against the
    inverse of the expanded vocabulary.

    Args:
        text: The string to encode. An empty string or ``None`` is treated
            as "nothing entered".

    Returns:
        ``"Enter text to encode..."`` when there is no input; otherwise a
        display string of the form
        ``"Encoded: <ids> , Printer: <(text, value) pairs>"``.
    """
    # Guard first so a missing value never reaches the per-character loop.
    if not text:
        return "Enter text to encode..."

    char_bytes = [char.encode('utf-8') for char in text]
    consecutive_tokens = get_consecutive_tokens(char_bytes, window_size=4)

    # Vocabulary round trip: file -> inverted -> expanded -> inverted again,
    # so the final mapping goes from expanded entries back to their ids.
    formatted_vocab = tokenizer.read_vocab_from_file()
    inverted_vocab = {v: k for k, v in formatted_vocab.items()}
    decoder_map = tokenizer.expand_vocab(inverted_vocab)
    encoder_map = {value: key for key, value in decoder_map.items()}

    encoded_tokens, printer_dict = search_consecutive_tokens(consecutive_tokens, encoder_map)

    # printer_dict keys are sequences of byte strings; join and decode them
    # so the pairs are human-readable.
    # NOTE(review): presumably the values are the matched token ids — confirm
    # against search_consecutive_tokens.
    printer = [(b''.join(key).decode('utf-8'), value) for key, value in printer_dict.items()]
    return f"Encoded: {encoded_tokens} , Printer: {printer}"
def decode(text) -> str:
    """Decode a comma-separated list of token ids back into text.

    The ids are parsed and validated first; only then is the vocabulary read
    with ``tokenizer.read_vocab_from_file``, inverted, and expanded with
    ``tokenizer.expand_vocab``. Each id is looked up in the expanded map, the
    byte pieces are concatenated, and the result is decoded as UTF-8.

    Args:
        text: Token ids separated by commas, e.g. ``"51,32,63"``. Whitespace
            around ids and empty fields (such as a trailing comma) are
            ignored.

    Returns:
        ``"->Decoded: <text> "`` on success. If the input is empty, contains
        a non-integer field, or references an id missing from the vocabulary,
        a short message describing the problem is returned instead of
        raising.
    """
    empty_prompt = "Enter comma-separated token ids to decode..."
    if not text or not text.strip():
        return empty_prompt

    token_ids = []
    for piece in text.split(','):
        piece = piece.strip()
        if not piece:
            # Tolerate stray or trailing commas such as "51,32,".
            continue
        try:
            token_id = int(piece)
        except ValueError:
            return f"Invalid token id: {piece!r}. Enter comma-separated integers, e.g. 51,32,63"
        token_ids.append(token_id)

    if not token_ids:
        return empty_prompt

    # Load the vocabulary only after the input is known to be well-formed.
    formatted_vocab = tokenizer.read_vocab_from_file()
    inverted_vocab = {v: k for k, v in formatted_vocab.items()}
    decoder_map = tokenizer.expand_vocab(inverted_vocab)

    # Look every id up once; report unknown ids rather than failing on None.
    looked_up = [(token_id, decoder_map.get(token_id)) for token_id in token_ids]
    unknown_ids = [token_id for token_id, pieces in looked_up if pieces is None]
    if unknown_ids:
        return f"Unknown token id(s): {unknown_ids}"

    # Each vocabulary entry is a sequence of byte strings; flatten and join
    # before decoding so multi-piece tokens are decoded as one byte stream.
    byte_pieces = [item for _, pieces in looked_up for item in pieces]
    decoded_text = b''.join(byte_pieces).decode('utf-8')
    return f"->Decoded: {decoded_text} "
# Page chrome: browser-tab title, centered layout, and the green heading.
st.set_page_config(
    page_title="Telugu BPE Tokenizer",
    layout="centered",
    initial_sidebar_state="expanded",
)
st.markdown(
    "<h1 style='color: #2ECC40; text-align: center;'>Telugu BPE Tokenizer</h1>",
    unsafe_allow_html=True,
)

# Stylesheet injected into the page as raw HTML. The content is kept flush
# left because it is passed through st.markdown.
# NOTE(review): only the `.subheader` class appears in the markup visible in
# this file; `.title`, `.text-area` and `.orange-button` may be unused.
_CUSTOM_CSS = """
<style>
.title {
color: #FFFFFF;
background-color: #2C3E50;
font-family: "Arial", sans-serif;
font-size: 2.5em;
padding: 20px;
text-align: center;
}
.subheader {
color: #2980B9;
font-size: 1.5em;
}
.text-area {
background-color: #ECF0F1;
border: 1px solid #BDC3C7;
border-radius: 5px;
}
.orange-button {
background-color: #FFA500; /* Bright orange color */
color: white;
border: none;
border-radius: 5px;
padding: 10px 20px;
cursor: pointer;
}
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Side-by-side layout: encoder in the first column, decoder in the second.
col1, col2 = st.columns(2)

with col1:
    st.markdown(
        "<div class='subheader' style='color: #FFA500;'>Encoder</div>",
        unsafe_allow_html=True,
    )
    encoder_input = st.text_area(
        "Input Text for Encoding",
        placeholder="Enter text to encode...",
        key="encoder_input",
        height=100,
    )
    # The read-only output box is only rendered on the run triggered by the
    # button click.
    encode_clicked = st.button("Encode", key="encode_button")
    if encode_clicked:
        encoder_output = encode(encoder_input)
        st.text_area(
            "Encoded Output",
            value=encoder_output,
            height=100,
            disabled=True,
            key="encoder_output",
        )

with col2:
    st.markdown(
        "<div class='subheader' style='color: #FFA500;'>Decoder</div>",
        unsafe_allow_html=True,
    )
    decoder_input = st.text_area(
        "Input Text for Decoding",
        placeholder="51,32,63,94,15",
        key="decoder_input",
        height=100,
    )
    # Same pattern as the encoder: show the result only after a click.
    decode_clicked = st.button("Decode", key="decode_button")
    if decode_clicked:
        decoder_output = decode(decoder_input)
        st.text_area(
            "Decoded Output",
            value=decoder_output,
            height=100,
            disabled=True,
            key="decoder_output",
        )
# Footer: a grey horizontal rule, a "Sample Texts" heading, a spacer, and
# three Telugu example sentences users can copy into the encoder.
_FOOTER_HTML = (
    "<hr style='border: 1px solid #BDC3C7;'>",
    "<div class='subheader'>Sample Texts</div>",
    "<br>",
    "<div style='margin-bottom: 10px;'> <span style='font-weight: bold;'>తెలుగు భాష ఒక ద్రావిడ భాష.</span></div>",
    "<div style='margin-bottom: 10px;'> <span style='font-weight: bold;'>మోదీ మార్కు రాజకీయం.</span></div>",
    "<div style='margin-bottom: 10px;'> <span style='font-weight: bold;'>రెండు విధాలా ఆలోచిస్తా.</span></div>",
)
# Rendered one snippet per call, in order, exactly as separate markdown
# elements.
for _snippet in _FOOTER_HTML:
    st.markdown(_snippet, unsafe_allow_html=True)
# NOTE(review): Streamlit appears to execute the script as `__main__`, so
# these messages would render on every page load under `streamlit run` —
# confirm that is intended.
if __name__ == "__main__":
    for _message in (
        "Streamlit app is running...",
        "To view this page in your browser, run the command: `streamlit run app.py` and open the provided local URL.",
    ):
        st.write(_message)