# Telugu Text Tokenizer — Gradio demo app (Hugging Face Space)
import ast
import json
import os
import sys

import gradio as gr
import numpy as np
# Add the current directory to Python path so the local `tokenizers` package
# resolves when this file runs as a script.
# NOTE(review): the local `tokenizers` package name collides with the PyPI
# `tokenizers` library if that is installed — confirm import order is safe.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
from tokenizers.basic import BasicTokenizer
def load_tokenizer(model_path, vocab_path):
    """Load a trained BasicTokenizer model plus its JSON vocabulary.

    Args:
        model_path: Path to the trained tokenizer ``.model`` file.
        vocab_path: Path to the JSON file holding ``token_to_id``,
            ``id_to_token`` and ``merges`` mappings.

    Returns:
        A ``BasicTokenizer`` with model and vocabulary mappings populated.

    Raises:
        FileNotFoundError: If either input file is missing.
        Exception: Any other load/parse failure, with the original error
            chained as the cause.
    """
    # Fail fast with precise messages; these checks sit outside the try
    # block so they are not re-wrapped into a generic error below.
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at: {model_path}")
    if not os.path.exists(vocab_path):
        raise FileNotFoundError(f"Vocabulary file not found at: {vocab_path}")

    tokenizer = BasicTokenizer()
    try:
        # Load the trained merge model.
        tokenizer.load(model_path)
        # Load vocabulary mappings.
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab_data = json.load(f)
        # JSON keys are always strings; restore the integer id types the
        # tokenizer expects on both sides of the mappings.
        tokenizer.token_to_id = {k: int(v) for k, v in vocab_data['token_to_id'].items()}
        tokenizer.id_to_token = {int(k): v for k, v in vocab_data['id_to_token'].items()}
        # Merge keys were serialized as "a,b" strings; rebuild (int, int) tuples.
        tokenizer.merges = {tuple(map(int, k.split(','))): int(v)
                            for k, v in vocab_data['merges'].items()}
        return tokenizer
    except Exception as e:
        # Chain the cause so the original traceback survives for debugging.
        raise Exception(f"Error loading tokenizer: {str(e)}") from e
def encode_text(text, tokenizer):
    """Encode Telugu text and return (ids, statistics, visualization).

    Args:
        text: Raw input text from the UI (any ``str`` accepted).
        tokenizer: Trained tokenizer exposing ``encode()`` and a ``vocab``
            mapping of token id -> bytes.

    Returns:
        Tuple of (token-id list as a string, statistics text, list of
        ``(token_text, color)`` pairs for ``gr.HighlightedText``). On any
        failure a human-readable error tuple is returned instead of raising,
        so the UI never crashes.
    """
    if not text.strip():
        return ("Please enter some Telugu text",
                "No statistics available",
                [])
    try:
        # Encode the text
        encoded = tokenizer.encode(text)
        if not encoded:
            # Guard: an empty encoding would divide by zero below.
            return ("[]", "Tokenizer produced no tokens for this input", [])

        # Compression ratio: UTF-8 byte count vs. an assumed 2 bytes per
        # token id (uint16-style estimate).
        original_size = len(text.encode('utf-8'))
        encoded_size = len(encoded) * 2
        compression_ratio = original_size / encoded_size

        # Prepare statistics for display.
        stats = f"""
📊 Encoding Statistics:
• Original text length: {len(text)} characters
• Encoded length: {len(encoded)} tokens
• Compression ratio: {compression_ratio:.2f}X
• Original size: {original_size} bytes
• Encoded size: {encoded_size} bytes
• Space saved: {(1 - encoded_size/original_size) * 100:.1f}%
"""

        # Deterministic per-token colors. A plain arithmetic hash is used
        # instead of hash(str(...)), which is salted per process and would
        # give different colors between cached examples and live runs.
        color_map = {tid: f"#{(tid * 2654435761) % 0xFFFFFF:06x}"
                     for tid in set(encoded)}

        # Build the (token_text, color) list for gr.HighlightedText.
        visualization = []
        for token_id in encoded:
            token_bytes = tokenizer.vocab[token_id]
            # BPE token boundaries may split multi-byte UTF-8 characters;
            # replace undecodable fragments instead of raising.
            token_text = token_bytes.decode('utf-8', errors='replace')
            visualization.append((token_text, color_map[token_id]))

        return (
            str(encoded),
            stats,
            visualization
        )
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return (
            f"Error: {str(e)}",
            "Error occurred during encoding",
            []
        )
def decode_ids(encoded_ids_str):
    """Decode a string like ``"[287, 2206, ...]"`` back to Telugu text.

    Uses the module-level ``tokenizer``. Returns the decoded text, or a
    human-readable error message string on bad input (never raises).
    """
    if not encoded_ids_str.strip():
        return "Please enter encoded IDs"
    try:
        # ast.literal_eval only accepts Python literals, unlike eval(),
        # which would execute arbitrary user-supplied code from the UI.
        encoded_ids = ast.literal_eval(encoded_ids_str)
        if not isinstance(encoded_ids, list):
            return "Invalid input: Please enter a list of integers"
        # Reject lists containing non-integer entries before decoding.
        if not all(isinstance(i, int) for i in encoded_ids):
            return "Invalid input: Please enter a list of integers"
        # Decode the IDs
        decoded_text = tokenizer.decode(encoded_ids)
        return decoded_text
    except Exception as e:
        return f"Error during decoding: {str(e)}"
# Load the tokenizer once at import time; the Gradio callbacks below use
# this module-level `tokenizer`.
try:
    model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
    vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
    print(f"Loading model from: {model_path}")
    print(f"Loading vocabulary from: {vocab_path}")
    tokenizer = load_tokenizer(model_path, vocab_path)
    print("Tokenizer loaded successfully")
except Exception as e:
    # Log and re-raise: the app cannot function without the tokenizer.
    print(f"Error loading tokenizer: {str(e)}")
    raise
# Example inputs for the gr.Examples widgets.
# NOTE(review): each encoder row has two fields but gr.Examples below binds
# a single input component — confirm this Gradio version accepts the second
# (description) column.
encoder_examples = [
    ["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
    ["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
    ["ప్రతి ఒక్కరూ సంతోషంగా ఉండాలి", "Happiness wish example"],
    ["అరణ్యంలో రాముడు అనేక రాక్షసులను సంహరిస్తాడు", "Complex sentence example"],
    ["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"]
]
# Pre-encoded token id strings for the decoder tab.
decoder_examples = [
    ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]", "Basic sentence decoding"],
    ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]", "Multiple tokens decoding"],
]
# Create the Gradio interface. Components are declared inside the Blocks
# context; event wiring (clicks, examples) references them afterwards.
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
    # Page header / feature overview.
    gr.Markdown("""
    # 🔤 Telugu Text Tokenizer
    This tool helps you encode Telugu text into tokens and decode them back.
    It uses a trained BPE (Byte Pair Encoding) tokenizer optimized for Telugu language.
    ## Features:
    - 🔄 Encode Telugu text to token IDs
    - 📊 View compression statistics
    - 🎨 Visualize token segmentation
    - ⚡ Fast and efficient encoding/decoding
    """)

    with gr.Tab("Encoder"):
        with gr.Row():
            with gr.Column():
                # Left column: raw Telugu text input plus the encode trigger.
                input_text = gr.Textbox(
                    label="Enter Telugu Text",
                    placeholder="Type or paste Telugu text here...",
                    lines=5,
                    interactive=True
                )
                encode_btn = gr.Button("🔄 Encode", variant="primary")
            with gr.Column():
                # Right column: read-only encoder outputs.
                with gr.Row():
                    encoded_output = gr.Textbox(
                        label="Encoded Token IDs",
                        lines=5,
                        interactive=False,
                        show_copy_button=True
                    )
                    stats_output = gr.Textbox(
                        label="Statistics",
                        lines=8,
                        interactive=False
                    )
                with gr.Row():
                    # Colored per-token segmentation view fed by encode_text.
                    token_viz = gr.HighlightedText(
                        label="Token Segmentation",
                        show_legend=True,
                        combine_adjacent=True,
                        color_map={}
                    )
        # Encoder button click event; closes over the module-level tokenizer.
        encode_btn.click(
            fn=lambda text: encode_text(text, tokenizer),
            inputs=[input_text],
            outputs=[encoded_output, stats_output, token_viz]
        )
        # Examples for encoder (outputs are pre-computed at startup because
        # cache_examples=True).
        gr.Examples(
            examples=encoder_examples,
            inputs=input_text,
            outputs=[encoded_output, stats_output, token_viz],
            fn=lambda x: encode_text(x, tokenizer),
            cache_examples=True,
            label="Telugu Text Examples"
        )

    with gr.Tab("Decoder"):
        with gr.Row():
            with gr.Column():
                # Left column: token-id string input plus the decode trigger.
                encoded_input = gr.Textbox(
                    label="Enter Encoded Token IDs",
                    placeholder="Paste the encoded token IDs here...",
                    lines=5,
                    interactive=True
                )
                decode_btn = gr.Button("🔄 Decode", variant="primary")
            with gr.Column():
                decoded_output = gr.Textbox(
                    label="Decoded Telugu Text",
                    lines=5,
                    interactive=False
                )
        # Decoder button click event
        decode_btn.click(
            fn=decode_ids,
            inputs=[encoded_input],
            outputs=[decoded_output]
        )
        # Examples for decoder
        gr.Examples(
            examples=decoder_examples,
            inputs=encoded_input,
            outputs=decoded_output,
            fn=decode_ids,
            cache_examples=True,
            label="Token ID Examples"
        )

    # Usage instructions and notes rendered below both tabs.
    gr.Markdown("""
    ### 📝 Instructions:
    1. **Encoding**:
       - Enter Telugu text in the encoder tab
       - Click "Encode" to get token IDs and statistics
       - Try the examples below to see how different texts are encoded
    2. **Decoding**:
       - Copy the encoded IDs from the encoder output
       - Paste them in the decoder tab
       - Click "Decode" to get back the original text
       - Try the example token IDs to see how decoding works
    3. **Visualization**:
       - Each token is highlighted with a unique color
       - Same tokens will have the same color
       - Hover over tokens to see their IDs
    ### 🎯 Example Usage:
    - Try encoding "తెలుగు" to see how basic words are tokenized
    - Use longer sentences to see compression in action
    - Copy encoded IDs and decode them back to verify accuracy
    ### ℹ️ Notes:
    - The tokenizer uses BPE (Byte Pair Encoding) algorithm
    - Compression ratio shows how efficiently the text is encoded
    - Different colors in visualization represent different tokens
    - Typical compression ratios range from 3x to 4x
    """)

    # Footer with version metadata.
    gr.Markdown("""
    ---
    ### 📌 Version Information
    - Model Version: 2.0
    - Vocabulary Size: 4800 tokens
    - Last Updated: 2024
    """)
# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=True,             # also create a public gradio.live link
        debug=True,             # verbose logging; blocks the main thread
        server_name="0.0.0.0",  # listen on all interfaces (needed in containers)
        server_port=7860,       # standard Gradio / HF Spaces port
        show_error=True         # surface Python errors in the UI
    )