import streamlit as st from pathlib import Path from hindi_tokenizer import load_tokenizer, encode_text, decode_text def load_hindi_tokenizer(): """Load the trained Hindi BPE tokenizer""" output_dir = Path(__file__).parent / "output" config_path = output_dir / "hindi_encoder.json" if not config_path.exists(): st.error("Error: Tokenizer configuration not found! Please train the tokenizer first.") st.stop() try: return load_tokenizer(str(config_path)) except Exception as e: st.error(f"Error-1 loading tokenizer: {e}") st.stop() def main(): st.set_page_config( page_title="Hindi BPE Tokenizer", page_icon="🇮🇳", layout="wide" ) st.title("Hindi BPE Tokenizer") st.markdown("A web interface for encoding and decoding Hindi text using BPE tokenization") # Load tokenizer try: tokenizer = load_hindi_tokenizer() except Exception as e: st.error(f"Error loading tokenizer: {e}") st.stop() # Create two columns encode_col, decode_col = st.columns(2) # Encoding Section with encode_col: st.header("Encode Hindi Text") st.markdown("Convert Hindi text into token IDs") input_text = st.text_area( "Enter Hindi Text", placeholder="यहाँ हिंदी टेक्स्ट लिखें...", height=150, key="encode_input" ) if st.button("Encode", key="encode_button"): if input_text.strip(): try: token_ids, tokens = encode_text(tokenizer, input_text) st.subheader("Results:") st.markdown("**Tokens:**") st.write(tokens) st.markdown("**Token IDs:**") st.write(token_ids) # Display as comma-separated string for easy copying st.markdown("**Token IDs (comma-separated):**") st.code(", ".join(map(str, token_ids))) except Exception as e: st.error(f"Error during encoding: {e}") else: st.warning("Please enter some text to encode") # Decoding Section with decode_col: st.header("Decode Token IDs") st.markdown("Convert token IDs back to Hindi text") input_ids = st.text_area( "Enter Token IDs (comma-separated)", placeholder="2517, 2074, 340, 4, 201...", height=150, key="decode_input" ) if st.button("Decode", key="decode_button"): if input_ids.strip(): try: # Convert string of IDs to list of integers token_ids = [int(id.strip()) for id in input_ids.split(",")] decoded_text = decode_text(tokenizer, token_ids) st.subheader("Results:") st.markdown("**Decoded Text:**") st.write(decoded_text) # Display in a box for better visibility st.text_area( "Decoded Text (copyable)", value=decoded_text, height=100, key="decoded_output" ) except ValueError: st.error("Invalid input format. Please enter comma-separated numbers.") except Exception as e: st.error(f"Error during decoding: {e}") else: st.warning("Please enter token IDs to decode") # Add information section at the bottom st.markdown("---") st.markdown("### About the Tokenizer") info_col1, info_col2 = st.columns(2) with info_col1: st.markdown(""" **Tokenizer Details:** - Type: Byte Pair Encoding (BPE) - Vocabulary Size: 4,500 tokens - Special Tokens: ``, ``, ``, `` - Minimum Token Frequency: 2 """) with info_col2: st.markdown(""" **Preprocessing:** - Retains Hindi Unicode (\\u0900-\\u097F) - Removes digits and special characters - Normalizes punctuation - Cleans whitespace """) if __name__ == "__main__": main()