Spaces:

anveshplus
/

BPE-Tokenizer

Sleeping

App Files Files Community

BPE-Tokenizer / app.py

anveshplus

updated

976bbab about 1 year ago

raw

history blame contribute delete

4.89 kB

	import streamlit as st
	import encoder_parallel_telugu as encode_parallel
	from consecutive_tokens import get_consecutive_tokens, search_consecutive_tokens
	import tokenizer

	def encode(text):
	    """Encode *text* with the stored BPE vocabulary and return a display string.

	    The input is split into one UTF-8 byte string per character, grouped by
	    ``get_consecutive_tokens`` (called with ``window_size=4``), and looked up
	    in a bytes-sequence -> token-id map derived from the vocabulary file.

	    Args:
	        text: Text entered by the user.

	    Returns:
	        ``"Enter text to encode..."`` when *text* is empty; otherwise a string
	        of the form ``"Encoded: <ids> , Printer: <pairs>"`` where ``<pairs>``
	        is a list of ``(decoded text, value)`` tuples built from the second
	        value returned by ``search_consecutive_tokens``.
	    """
	    if not text:
	        return "Enter text to encode..."

	    # One UTF-8 byte string per character of the input.
	    char_bytes = [char.encode('utf-8') for char in text]
	    consecutive_tokens = get_consecutive_tokens(char_bytes, window_size=4)

	    # Vocabulary pipeline: read from file, invert, expand, then invert the
	    # expanded map again so it can be used for encoding lookups.
	    # NOTE(review): presumably the expanded map is token-id -> byte sequence;
	    # confirm against tokenizer.expand_vocab.
	    formatted_vocab = tokenizer.read_vocab_from_file()
	    inverted_vocab = {v: k for k, v in formatted_vocab.items()}
	    decoder_map = tokenizer.expand_vocab(inverted_vocab)
	    encoder_map = {value: key for key, value in decoder_map.items()}

	    encoded_tokens, printer_dict = search_consecutive_tokens(consecutive_tokens, encoder_map)

	    # Keys of printer_dict are sequences of byte strings; join and decode them
	    # so the matched pieces are shown as readable text.
	    printer = [(b''.join(key).decode('utf-8'), value) for key, value in printer_dict.items()]
	    return f"Encoded: {encoded_tokens} , Printer: {printer}"

	def decode(text):
	    """Decode a comma-separated list of token IDs back into text.

	    Input is validated before the vocabulary is loaded, so malformed input
	    produces a readable message instead of an unhandled exception in the UI.
	    Blank pieces (for example from a trailing comma) are ignored.

	    Args:
	        text: Comma-separated integer token IDs, e.g. ``"51,32,63"``.

	    Returns:
	        ``"->Decoded: <text> "`` on success; otherwise a message describing
	        what is wrong with the input (no IDs given, a non-integer piece, or
	        IDs that are absent from the decoder map).
	    """
	    pieces = [piece.strip() for piece in (text or "").split(',')]
	    pieces = [piece for piece in pieces if piece]
	    if not pieces:
	        return "Enter comma-separated token IDs to decode..."

	    token_ids = []
	    for piece in pieces:
	        try:
	            token_ids.append(int(piece))
	        except ValueError:
	            return f"Invalid token ID {piece!r}: expected comma-separated integers."

	    # Vocabulary pipeline: read from file, invert, then expand.
	    formatted_vocab = tokenizer.read_vocab_from_file()
	    inverted_vocab = {v: k for k, v in formatted_vocab.items()}
	    decoder_map = tokenizer.expand_vocab(inverted_vocab)

	    # Look every ID up first; a missing entry would otherwise surface as a
	    # TypeError when the None result is iterated below.
	    looked_up = [(token_id, decoder_map.get(token_id)) for token_id in token_ids]
	    unknown_ids = [token_id for token_id, entry in looked_up if entry is None]
	    if unknown_ids:
	        return f"Unknown token IDs: {unknown_ids}"

	    # Each map entry is a sequence of byte strings; flatten and join them
	    # before decoding the whole result as UTF-8.
	    byte_pieces = [item for _, entry in looked_up for item in entry]
	    decoded_tokens = b''.join(byte_pieces).decode('utf-8')
	    return f"->Decoded: {decoded_tokens} "

	# Page setup and the green, centred page title.
	st.set_page_config(
	    page_title="Telugu BPE Tokenizer",
	    layout="centered",
	    initial_sidebar_state="expanded",
	)
	st.markdown(
	    "<h1 style='color: #2ECC40; text-align: center;'>Telugu BPE Tokenizer</h1>",
	    unsafe_allow_html=True,
	)

	# Custom CSS injected into the page as raw HTML.
	page_css = """
	<style>
	.title {
	color: #FFFFFF;
	background-color: #2C3E50;
	font-family: "Arial", sans-serif;
	font-size: 2.5em;
	padding: 20px;
	text-align: center;
	}
	.subheader {
	color: #2980B9;
	font-size: 1.5em;
	}
	.text-area {
	background-color: #ECF0F1;
	border: 1px solid #BDC3C7;
	border-radius: 5px;
	}
	.orange-button {
	background-color: #FFA500; /* Bright orange color */
	color: white;
	border: none;
	border-radius: 5px;
	padding: 10px 20px;
	cursor: pointer;
	}
	</style>
	"""
	st.markdown(page_css, unsafe_allow_html=True)

	# Orange subheader used above each of the two panels.
	panel_header_html = "<div class='subheader' style='color: #FFA500;'>{}</div>"

	# Side-by-side layout: encoder on the left, decoder on the right.
	col1, col2 = st.columns(2)

	with col1:
	    st.markdown(panel_header_html.format("Encoder"), unsafe_allow_html=True)
	    encoder_input = st.text_area(
	        "Input Text for Encoding",
	        placeholder="Enter text to encode...",
	        key="encoder_input",
	        height=100,
	    )
	    encode_clicked = st.button("Encode", key="encode_button")
	    if encode_clicked:
	        # The read-only output box is only rendered after the button is pressed.
	        encoder_output = encode(encoder_input)
	        st.text_area(
	            "Encoded Output",
	            value=encoder_output,
	            height=100,
	            disabled=True,
	            key="encoder_output",
	        )

	with col2:
	    st.markdown(panel_header_html.format("Decoder"), unsafe_allow_html=True)
	    decoder_input = st.text_area(
	        "Input Text for Decoding",
	        placeholder="51,32,63,94,15",
	        key="decoder_input",
	        height=100,
	    )
	    decode_clicked = st.button("Decode", key="decode_button")
	    if decode_clicked:
	        # The read-only output box is only rendered after the button is pressed.
	        decoder_output = decode(decoder_input)
	        st.text_area(
	            "Decoded Output",
	            value=decoder_output,
	            height=100,
	            disabled=True,
	            key="decoder_output",
	        )

	# Grey horizontal rule separating the tools from the sample section.
	st.markdown("<hr style='border: 1px solid #BDC3C7;'>", unsafe_allow_html=True)

	# Sample Telugu sentences shown at the bottom of the page.
	st.markdown("<div class='subheader'>Sample Texts</div>", unsafe_allow_html=True)

	st.markdown("<br>", unsafe_allow_html=True)

	sample_sentences = (
	    "తెలుగు  భాష  ఒక  ద్రావిడ  భాష.",
	    "మోదీ  మార్కు  రాజకీయం.",
	    "రెండు  విధాలా  ఆలోచిస్తా.",
	)
	for sentence in sample_sentences:
	    st.markdown(
	        f"<div style='margin-bottom: 10px;'> <span style='font-weight: bold;'>{sentence}</span></div>",
	        unsafe_allow_html=True,
	    )

	if __name__ == "__main__":
	    # Informational lines written to the page when this file is the entry script.
	    st.write("Streamlit app is running...")
	    st.write("To view this page in your browser, run the command: `streamlit run app.py` and open the provided local URL.")