# Streamlit web app for the Hindi BPE tokenizer: encode Hindi text to token
# IDs and decode token IDs back to text.
import streamlit as st
from pathlib import Path
from hindi_tokenizer import load_tokenizer, encode_text, decode_text
def load_hindi_tokenizer():
    """Load the trained Hindi BPE tokenizer from the `output` directory.

    Halts the Streamlit app with an error message when the tokenizer
    configuration file is missing or fails to load.
    """
    config_path = Path(__file__).parent / "output" / "hindi_encoder.json"

    # Without a trained config there is nothing to load — stop the app.
    if not config_path.exists():
        st.error("Error: Tokenizer configuration not found! Please train the tokenizer first.")
        st.stop()

    try:
        return load_tokenizer(str(config_path))
    except Exception as e:
        st.error(f"Error-1 loading tokenizer: {e}")
        st.stop()
def main():
    """Run the Streamlit UI: side-by-side encode and decode panels plus an
    informational footer describing the tokenizer.
    """
    st.set_page_config(
        page_title="Hindi BPE Tokenizer",
        page_icon="🇮🇳",
        layout="wide",
    )
    st.title("Hindi BPE Tokenizer")
    st.markdown("A web interface for encoding and decoding Hindi text using BPE tokenization")

    # Load tokenizer. load_hindi_tokenizer() already reports errors and
    # st.stop()s; this outer guard is a defensive last resort.
    try:
        tokenizer = load_hindi_tokenizer()
    except Exception as e:
        st.error(f"Error loading tokenizer: {e}")
        st.stop()

    # Two side-by-side panels: encode on the left, decode on the right.
    encode_col, decode_col = st.columns(2)

    # Encoding Section
    with encode_col:
        st.header("Encode Hindi Text")
        st.markdown("Convert Hindi text into token IDs")
        input_text = st.text_area(
            "Enter Hindi Text",
            placeholder="यहाँ हिंदी टेक्स्ट लिखें...",
            height=150,
            key="encode_input",
        )
        if st.button("Encode", key="encode_button"):
            if input_text.strip():
                try:
                    token_ids, tokens = encode_text(tokenizer, input_text)
                    st.subheader("Results:")
                    st.markdown("**Tokens:**")
                    st.write(tokens)
                    st.markdown("**Token IDs:**")
                    st.write(token_ids)
                    # Display as comma-separated string for easy copying
                    st.markdown("**Token IDs (comma-separated):**")
                    st.code(", ".join(map(str, token_ids)))
                except Exception as e:
                    st.error(f"Error during encoding: {e}")
            else:
                st.warning("Please enter some text to encode")

    # Decoding Section
    with decode_col:
        st.header("Decode Token IDs")
        st.markdown("Convert token IDs back to Hindi text")
        input_ids = st.text_area(
            "Enter Token IDs (comma-separated)",
            placeholder="2517, 2074, 340, 4, 201...",
            height=150,
            key="decode_input",
        )
        if st.button("Decode", key="decode_button"):
            if input_ids.strip():
                try:
                    # Convert string of IDs to list of integers.
                    # NOTE: loop variable renamed from `id`, which shadowed
                    # the builtin of the same name.
                    token_ids = [int(raw_id.strip()) for raw_id in input_ids.split(",")]
                    decoded_text = decode_text(tokenizer, token_ids)
                    st.subheader("Results:")
                    st.markdown("**Decoded Text:**")
                    st.write(decoded_text)
                    # Display in a box for better visibility
                    st.text_area(
                        "Decoded Text (copyable)",
                        value=decoded_text,
                        height=100,
                        key="decoded_output",
                    )
                except ValueError:
                    # int() failed on a non-numeric entry.
                    st.error("Invalid input format. Please enter comma-separated numbers.")
                except Exception as e:
                    st.error(f"Error during decoding: {e}")
            else:
                st.warning("Please enter token IDs to decode")

    # Add information section at the bottom
    st.markdown("---")
    st.markdown("### About the Tokenizer")
    info_col1, info_col2 = st.columns(2)
    with info_col1:
        st.markdown("""
    **Tokenizer Details:**
    - Type: Byte Pair Encoding (BPE)
    - Vocabulary Size: 4,500 tokens
    - Special Tokens: `<pad>`, `<unk>`, `<s>`, `</s>`
    - Minimum Token Frequency: 2
    """)
    with info_col2:
        st.markdown("""
    **Preprocessing:**
    - Retains Hindi Unicode (\\u0900-\\u097F)
    - Removes digits and special characters
    - Normalizes punctuation
    - Cleans whitespace
    """)
# Script entry point. The stray trailing "|" (page-scrape residue fused onto
# the last line) made this file a SyntaxError; it is removed here.
if __name__ == "__main__":
    main()