File size: 4,617 Bytes
d83c04d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import streamlit as st
from pathlib import Path
from hindi_tokenizer import load_tokenizer, encode_text, decode_text

def load_hindi_tokenizer():
    """Return the tokenizer built from ``output/hindi_encoder.json``.

    The encoder file is looked up in the ``output`` directory that sits next
    to this script. If the file is missing, or ``load_tokenizer`` raises, the
    problem is shown with ``st.error`` and the script run is ended with
    ``st.stop()``.
    """
    encoder_file = Path(__file__).parent.joinpath("output", "hindi_encoder.json")

    if encoder_file.exists():
        try:
            tokenizer = load_tokenizer(str(encoder_file))
        except Exception as exc:
            st.error(f"Error-1 loading tokenizer: {exc}")
        else:
            return tokenizer
    else:
        st.error("Error: Tokenizer configuration not found! Please train the tokenizer first.")

    # Every failure path lands here after its message has been displayed.
    st.stop()


def _parse_token_ids(raw_ids):
    """Parse a comma-separated string of token IDs into a list of ints.

    Whitespace around each ID is ignored, and empty fragments (produced by a
    trailing or doubled comma) are skipped.

    Args:
        raw_ids: Text such as ``"2517, 2074, 340"``.

    Returns:
        The IDs as a list of ``int``, in input order.

    Raises:
        ValueError: If a non-empty fragment is not an integer, or if the
            text contains no IDs at all.
    """
    fragments = (piece.strip() for piece in raw_ids.split(","))
    token_ids = [int(piece) for piece in fragments if piece]
    if not token_ids:
        raise ValueError("no token IDs found")
    return token_ids


def _render_encode_section(tokenizer):
    """Draw the text input and, on "Encode", show the tokens and their IDs."""
    st.header("Encode Hindi Text")
    st.markdown("Convert Hindi text into token IDs")

    input_text = st.text_area(
        "Enter Hindi Text",
        placeholder="यहाँ हिंदी टेक्स्ट लिखें...",
        height=150,
        key="encode_input"
    )

    if not st.button("Encode", key="encode_button"):
        return

    if not input_text.strip():
        st.warning("Please enter some text to encode")
        return

    try:
        token_ids, tokens = encode_text(tokenizer, input_text)

        st.subheader("Results:")
        st.markdown("**Tokens:**")
        st.write(tokens)

        st.markdown("**Token IDs:**")
        st.write(token_ids)

        # Display as comma-separated string for easy copying
        st.markdown("**Token IDs (comma-separated):**")
        st.code(", ".join(map(str, token_ids)))
    except Exception as e:
        st.error(f"Error during encoding: {e}")


def _render_decode_section(tokenizer):
    """Draw the ID input and, on "Decode", show the decoded text."""
    st.header("Decode Token IDs")
    st.markdown("Convert token IDs back to Hindi text")

    input_ids = st.text_area(
        "Enter Token IDs (comma-separated)",
        placeholder="2517, 2074, 340, 4, 201...",
        height=150,
        key="decode_input"
    )

    if not st.button("Decode", key="decode_button"):
        return

    if not input_ids.strip():
        st.warning("Please enter token IDs to decode")
        return

    # Parse separately from decoding so that only malformed user input is
    # reported as a format problem; a ValueError raised by the tokenizer
    # itself falls through to the generic decoding error below.
    try:
        token_ids = _parse_token_ids(input_ids)
    except ValueError:
        st.error("Invalid input format. Please enter comma-separated numbers.")
        return

    try:
        decoded_text = decode_text(tokenizer, token_ids)

        st.subheader("Results:")
        st.markdown("**Decoded Text:**")
        st.write(decoded_text)

        # Display in a box for better visibility
        st.text_area(
            "Decoded Text (copyable)",
            value=decoded_text,
            height=100,
            key="decoded_output"
        )
    except Exception as e:
        st.error(f"Error during decoding: {e}")


def main():
    """Build the Streamlit page for the Hindi BPE tokenizer.

    The page consists of a title, an encode column (text -> tokens and IDs),
    a decode column (comma-separated IDs -> text) and a static "About"
    section. If the tokenizer cannot be loaded, an error is shown and the
    script run is stopped before any input widgets are drawn.
    """
    st.set_page_config(
        page_title="Hindi BPE Tokenizer",
        page_icon="🇮🇳",
        layout="wide"
    )

    st.title("Hindi BPE Tokenizer")
    st.markdown("A web interface for encoding and decoding Hindi text using BPE tokenization")

    # Load tokenizer
    try:
        tokenizer = load_hindi_tokenizer()
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        st.stop()

    # Create two columns
    encode_col, decode_col = st.columns(2)

    # Encoding Section
    with encode_col:
        _render_encode_section(tokenizer)

    # Decoding Section
    with decode_col:
        _render_decode_section(tokenizer)

    # Add information section at the bottom
    st.markdown("---")
    st.markdown("### About the Tokenizer")

    info_col1, info_col2 = st.columns(2)

    with info_col1:
        st.markdown("""
        **Tokenizer Details:**
        - Type: Byte Pair Encoding (BPE)
        - Vocabulary Size: 4,500 tokens
        - Special Tokens: `<pad>`, `<unk>`, `<s>`, `</s>`
        - Minimum Token Frequency: 2
        """)

    with info_col2:
        st.markdown("""
        **Preprocessing:**
        - Retains Hindi Unicode (\\u0900-\\u097F)
        - Removes digits and special characters
        - Normalizes punctuation
        - Cleans whitespace
        """)

# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()