# Spaces: Sleeping
import json
import re

import gradio as gr
class BPETokenizer:
    """Greedy longest-match tokenizer over a pre-trained vocabulary.

    The vocabulary is loaded from a JSON file. Iterating the loaded object
    must yield the token strings (e.g. a JSON list of tokens, or a JSON
    object whose keys are the tokens).
    """

    def __init__(self, vocab_path):
        """Load the vocabulary and prepare the token matcher.

        Args:
            vocab_path: Path to a UTF-8 encoded JSON vocabulary file.
        """
        # Load pre-trained vocabulary
        with open(vocab_path, 'r', encoding='utf-8') as f:
            self.vocab = json.load(f)
        # Compile once so encode() does not re-sort the vocabulary per call.
        self._pattern = self._build_pattern(self.vocab)

    @staticmethod
    def _build_pattern(vocab):
        """Return a regex matching any vocabulary token, longest first.

        Returns None when the vocabulary holds no non-empty token.
        """
        # Empty tokens are dropped: an empty alternative would match at
        # every position and shatter the text into single characters.
        tokens = sorted((t for t in vocab if t), key=len, reverse=True)
        if not tokens:
            return None
        # Alternation tries branches left to right, so listing longer
        # tokens first yields the longest vocabulary match at each position.
        return re.compile("|".join(re.escape(t) for t in tokens))

    def encode(self, text):
        """Encode a piece of text into BPE tokens.

        Vocabulary tokens are isolated in a single left-to-right pass, so a
        shorter token can never re-split a longer token that has already
        been matched. Characters not covered by the vocabulary are kept as
        their own chunks; whitespace separates tokens and is discarded.

        Args:
            text: The string to tokenize.

        Returns:
            A list of token strings (empty for empty or blank input).
        """
        if self._pattern is None:
            return text.split()
        # Surround every match with spaces, then split on whitespace.
        spaced = self._pattern.sub(lambda m: f" {m.group(0)} ", text)
        return spaced.split()
# Build the shared tokenizer instance from the vocabulary JSON file
vocab_path = "bpe_vocab_5000.json"
bpe_tokenizer = BPETokenizer(vocab_path=vocab_path)
| # Gradio Functions | |
def encode_text(text):
    """Encode user-provided text with the pre-trained tokenizer.

    Args:
        text: The raw input text. ``None`` is treated like blank input.

    Returns:
        The tokens joined with " | ", or a prompt message when there is
        nothing to encode.
    """
    # Check for None/empty first: calling .strip() on None would raise.
    if not text or not text.strip():
        return "Please enter some text to encode."  # Handle empty input
    tokens = bpe_tokenizer.encode(text)
    return " | ".join(tokens)  # Use a separator to display tokens clearly
# Gradio Interface: heading, description, input/output boxes and an Encode button
with gr.Blocks() as demo:
    gr.Markdown(value="# Bengali BPE Tokenizer")
    gr.Markdown(
        value="""
        This app encodes Bengali text into Byte Pair Encoding (BPE) tokens using a pre-trained tokenizer.
        Enter Bengali text below and press "Encode" to view the tokenized output.
        """
    )

    # Input and output boxes share one row
    with gr.Row():
        input_text = gr.TextArea(
            label="Enter Bengali Text to Encode",
            lines=5,
            placeholder="Type Bengali text here...",
        )
        output_tokens = gr.Textbox(
            label="Encoded Tokens",
            lines=5,
            interactive=False,
        )

    # Clicking the button passes the input text to encode_text and shows the result
    encode_button = gr.Button("Encode")
    encode_button.click(fn=encode_text, inputs=input_text, outputs=output_tokens)

# Launch the app
demo.launch(share=True)