Spaces:

HSinghHuggingFace
/

hindi-tokenizer

Sleeping

File size: 4,617 Bytes

d83c04d

import streamlit as st
from pathlib import Path
from hindi_tokenizer import load_tokenizer, encode_text, decode_text

def load_hindi_tokenizer():
    """Return the Hindi BPE tokenizer stored next to this script.

    The encoder configuration is expected at ``output/hindi_encoder.json``
    relative to this file. If the file is absent, or ``load_tokenizer``
    raises while reading it, an error is shown in the Streamlit page and
    the script run is halted via ``st.stop()``.

    Returns:
        Whatever ``load_tokenizer`` returns for the encoder file.
    """
    encoder_file = Path(__file__).parent.joinpath("output", "hindi_encoder.json")

    missing = not encoder_file.exists()
    if missing:
        st.error("Error: Tokenizer configuration not found! Please train the tokenizer first.")
        st.stop()

    try:
        tokenizer = load_tokenizer(str(encoder_file))
    except Exception as exc:
        # Surface any loader failure in the UI rather than as a traceback.
        st.error(f"Error-1 loading tokenizer: {exc}")
        st.stop()
    else:
        return tokenizer


def _parse_token_ids(raw: str) -> list:
    """Parse user-entered token IDs into a list of integers.

    Commas and any run of whitespace (spaces, tabs, newlines) are accepted
    as separators, so trailing commas and one-ID-per-line input both work.

    Args:
        raw: Text typed or pasted by the user.

    Returns:
        The token IDs in the order they appear in ``raw``.

    Raises:
        ValueError: If ``raw`` contains no IDs at all, or any piece is not
            a valid integer literal.
    """
    # Treat commas as whitespace so a single split() drops empty pieces.
    pieces = raw.replace(",", " ").split()
    if not pieces:
        raise ValueError("no token IDs found")
    return [int(piece) for piece in pieces]


def _render_encode_section(tokenizer):
    """Render the text -> token IDs panel inside the current container."""
    st.header("Encode Hindi Text")
    st.markdown("Convert Hindi text into token IDs")

    input_text = st.text_area(
        "Enter Hindi Text",
        placeholder="यहाँ हिंदी टेक्स्ट लिखें...",
        height=150,
        key="encode_input"
    )

    if not st.button("Encode", key="encode_button"):
        return

    if not input_text.strip():
        st.warning("Please enter some text to encode")
        return

    try:
        token_ids, tokens = encode_text(tokenizer, input_text)

        st.subheader("Results:")
        st.markdown("**Tokens:**")
        st.write(tokens)

        st.markdown("**Token IDs:**")
        st.write(token_ids)

        # Display as comma-separated string for easy copying
        st.markdown("**Token IDs (comma-separated):**")
        st.code(", ".join(map(str, token_ids)))
    except Exception as e:
        # UI boundary: report the failure in the page instead of crashing.
        st.error(f"Error during encoding: {e}")


def _render_decode_section(tokenizer):
    """Render the token IDs -> text panel inside the current container."""
    st.header("Decode Token IDs")
    st.markdown("Convert token IDs back to Hindi text")

    input_ids = st.text_area(
        "Enter Token IDs (comma-separated)",
        placeholder="2517, 2074, 340, 4, 201...",
        height=150,
        key="decode_input"
    )

    if not st.button("Decode", key="decode_button"):
        return

    if not input_ids.strip():
        st.warning("Please enter token IDs to decode")
        return

    # Parse separately from decoding so that a ValueError raised by the
    # tokenizer itself is not misreported as a formatting problem.
    try:
        token_ids = _parse_token_ids(input_ids)
    except ValueError:
        st.error("Invalid input format. Please enter comma-separated numbers.")
        return

    try:
        decoded_text = decode_text(tokenizer, token_ids)

        st.subheader("Results:")
        st.markdown("**Decoded Text:**")
        st.write(decoded_text)

        # Display in a box for better visibility
        st.text_area(
            "Decoded Text (copyable)",
            value=decoded_text,
            height=100,
            key="decoded_output"
        )
    except Exception as e:
        # UI boundary: report the failure in the page instead of crashing.
        st.error(f"Error during decoding: {e}")


def _render_about_section():
    """Render the static "About the Tokenizer" footer."""
    st.markdown("---")
    st.markdown("### About the Tokenizer")

    info_col1, info_col2 = st.columns(2)

    with info_col1:
        st.markdown("""
        **Tokenizer Details:**
        - Type: Byte Pair Encoding (BPE)
        - Vocabulary Size: 4,500 tokens
        - Special Tokens: `<pad>`, `<unk>`, `<s>`, `</s>`
        - Minimum Token Frequency: 2
        """)

    with info_col2:
        st.markdown("""
        **Preprocessing:**
        - Retains Hindi Unicode (\\u0900-\\u097F)
        - Removes digits and special characters
        - Normalizes punctuation
        - Cleans whitespace
        """)


def main():
    """Build the Streamlit page: encode panel, decode panel, and info footer.

    Configures the page, loads the tokenizer (stopping the script with an
    on-page error if that fails), then lays out the encode and decode
    panels side by side, followed by the static information section.
    """
    # Page configuration is issued before any other Streamlit call.
    st.set_page_config(
        page_title="Hindi BPE Tokenizer",
        page_icon="🇮🇳",
        layout="wide"
    )

    st.title("Hindi BPE Tokenizer")
    st.markdown("A web interface for encoding and decoding Hindi text using BPE tokenization")

    # Load tokenizer
    try:
        tokenizer = load_hindi_tokenizer()
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        st.stop()

    # Two side-by-side panels: encoding on the left, decoding on the right.
    encode_col, decode_col = st.columns(2)

    with encode_col:
        _render_encode_section(tokenizer)

    with decode_col:
        _render_decode_section(tokenizer)

    _render_about_section()

# Build the page only when this file is executed as the entry script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()