# Spaces status: Build error
import gradio as gr
from tokenizer import HindiTokenizer

# File handed to the tokenizer's vocabulary loader at import time.
_BPE_VOCAB_FILE = "hindi_bpe_vocab.model"

# One shared tokenizer instance, used by every handler in this module.
tokenizer = HindiTokenizer()
tokenizer.load_bpe_vocab(_BPE_VOCAB_FILE)
def encode_text(hindi_text):
    """Return whatever the module-level tokenizer's ``encode`` yields for ``hindi_text``."""
    return tokenizer.encode(hindi_text)
def encode_text_with_compression(hindi_text):
    """Encode Hindi text and report its bytes-per-token compression ratio.

    Args:
        hindi_text: The text to encode.

    Returns:
        A ``(token_ids, ratio)`` tuple, where ``token_ids`` is the result of
        ``tokenizer.encode`` and ``ratio`` is the UTF-8 byte length of the
        input divided by the number of token IDs, formatted with two
        decimals. ``ratio`` is ``"0.00"`` when the input is empty or when
        the tokenizer produced no tokens.
    """
    token_ids = tokenizer.encode(hindi_text)

    # Size of the raw input in bytes versus the number of IDs it became.
    text_byte_length = len(hindi_text.encode("utf-8"))
    token_id_length = len(token_ids)

    # The divisor is the token count, so that is what has to be non-zero;
    # checking only the byte length would let a non-empty text that encodes
    # to zero tokens raise ZeroDivisionError.
    if text_byte_length > 0 and token_id_length > 0:
        compression_ratio = text_byte_length / token_id_length
    else:
        compression_ratio = 0  # Nothing to compare for empty input/output

    return token_ids, f"{compression_ratio:.2f}"
def decode_tokens(token_ids):
    """Decode a textual list of token IDs with the module-level tokenizer.

    Args:
        token_ids: A string of comma-separated integers, optionally wrapped
            in square brackets (for example ``"[1, 2, 3]"`` or ``"1,2,3"``).
            Leading and trailing whitespace is ignored.

    Returns:
        The result of ``tokenizer.decode`` for the parsed IDs, or a message
        starting with ``"Error in processing token IDs:"`` when the input
        cannot be parsed into integers.
    """
    try:
        # Trim whitespace before removing the brackets: str.strip("[]") only
        # removes brackets at the very ends, so pasted input with a stray
        # space or newline would otherwise keep its "[" / "]" and fail int().
        cleaned = token_ids.strip().strip("[]")
        parsed_ids = [int(piece) for piece in cleaned.split(",")]
    except (AttributeError, TypeError, ValueError) as e:
        # AttributeError/TypeError cover non-string input; ValueError covers
        # pieces that are not integers (including empty pieces).
        return f"Error in processing token IDs: {e}"

    return tokenizer.decode(parsed_ids)
# Gradio interface: an encoding panel and a decoding panel placed in one row.
with gr.Blocks() as app:
    gr.Markdown("## Hindi Tokenizer Encoder-Decoder")

    with gr.Row():
        # Encoding panel: text in, token IDs and compression ratio out.
        with gr.Column():
            gr.Markdown("### Encode Hindi Text to Token IDs")
            hindi_text_input = gr.Textbox(label="Enter Hindi Text")
            token_ids_output = gr.Textbox(
                label="Token IDs (Encoded)",
                interactive=False,
            )
            compression_ratio_output = gr.Textbox(
                label="Compression Ratio",
                interactive=False,
            )
            encode_button = gr.Button("Encode")

            # Sample inputs offered for the encoder.
            _sample_sentences = [
                "मेरा भारत महान॥",
                "आपका घर कितनी दूर है?",
                "स्वतंत्रता दिवस",
                "द क्विक ब्राउन फॉक्स जम्प्स ओवर ए लेज़ी डॉग।",
            ]
            encode_example = gr.Examples(
                examples=_sample_sentences,
                inputs=hindi_text_input,
                outputs=[token_ids_output, compression_ratio_output],
                fn=encode_text_with_compression,
            )

        # Decoding panel: token IDs in, text out.
        with gr.Column():
            gr.Markdown("### Decode Token IDs to Hindi Text")
            token_ids_input = gr.Textbox(
                label="Enter Token IDs (comma-separated or list)"
            )
            decoded_text_output = gr.Textbox(
                label="Decoded Hindi Text",
                interactive=False,
            )
            decode_button = gr.Button("Decode")

    # Wire each button to its handler.
    encode_button.click(
        fn=encode_text_with_compression,
        inputs=hindi_text_input,
        outputs=[token_ids_output, compression_ratio_output],
    )
    decode_button.click(
        fn=decode_tokens,
        inputs=token_ids_input,
        outputs=decoded_text_output,
    )

app.launch()