Spaces:

kishkath
/

bpe-tokenizer

Sleeping

App Files Files Community

kishkath committed on Jan 15, 2025

Commit

bf87fbf

verified ·

1 Parent(s): 4f5aaf6

Create app.py

Browse files

Files changed (1) hide show

app.py +199 -0

app.py ADDED Viewed

	@@ -0,0 +1,199 @@

+import gradio as gr
+import json
+import os
+from tokenizers.basic import BasicTokenizer
+import numpy as np
def load_tokenizer(model_path, vocab_path):
    """Load the trained tokenizer and attach its vocabulary tables.

    Args:
        model_path: Path handed to ``BasicTokenizer.load``.
        vocab_path: Path to a UTF-8 JSON file containing ``token_to_id``,
            ``id_to_token`` and ``merges`` mappings.

    Returns:
        The ``BasicTokenizer`` instance with its ``token_to_id``,
        ``id_to_token`` and ``merges`` attributes set from the vocabulary file.

    Raises:
        Exception: If loading the model or reading/converting the vocabulary
            fails. The underlying error is chained as ``__cause__`` so the
            original traceback is not lost.
    """
    tokenizer = BasicTokenizer()
    try:
        # Load the trained model
        tokenizer.load(model_path)
        # Read the vocabulary; the file is closed before any conversion work.
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab_data = json.load(f)
        # JSON object keys are always strings, so integer ids are restored here.
        tokenizer.token_to_id = {k: int(v) for k, v in vocab_data['token_to_id'].items()}
        tokenizer.id_to_token = {int(k): v for k, v in vocab_data['id_to_token'].items()}
        # Merge keys are stored as "a,b" strings; rebuild the (a, b) int tuples.
        tokenizer.merges = {
            tuple(map(int, k.split(','))): int(v)
            for k, v in vocab_data['merges'].items()
        }
    except Exception as e:
        raise Exception(f"Error loading tokenizer: {e}") from e
    return tokenizer
def encode_text(text, tokenizer, bytes_per_token=2):
    """Encode text and return the token ids, statistics and visualization data.

    Args:
        text: The text to encode.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence of
            token ids.
        bytes_per_token: Storage size assumed for one token id when estimating
            the encoded size. Defaults to 2, the value previously hard-coded.

    Returns:
        A dict with three keys:
        ``"encoded_ids"`` (string form of the id list, or a message),
        ``"stats"`` (human-readable statistics, or a message) and
        ``"visualization"`` (result of ``visualize_encoding``, or ``None``).
        Failures are reported through the same dict rather than raised, so the
        UI always receives something to display.
    """
    if not text.strip():
        return {
            "encoded_ids": "Please enter some Telugu text",
            "stats": "No statistics available",
            "visualization": None
        }
    try:
        # Encode the text
        encoded = tokenizer.encode(text)
        if not encoded:
            # Without this guard the ratio below fails with an opaque
            # "division by zero" message.
            raise ValueError("Tokenizer produced no tokens")
        # Calculate compression ratio
        original_size = len(text.encode('utf-8'))
        encoded_size = len(encoded) * bytes_per_token
        compression_ratio = original_size / encoded_size
        # Prepare statistics
        stats = f"""
        📊 Encoding Statistics:
        • Original text length: {len(text)} characters
        • Encoded length: {len(encoded)} tokens
        • Compression ratio: {compression_ratio:.2f}X
        • Original size: {original_size} bytes
        • Encoded size: {encoded_size} bytes
        • Space saved: {(1 - encoded_size/original_size) * 100:.1f}%
        """
        # Create token visualization
        viz_data = visualize_encoding(text, encoded, tokenizer)
        return {
            "encoded_ids": str(encoded),
            "stats": stats,
            "visualization": viz_data
        }
    except Exception as e:
        # UI boundary: turn any failure into a displayable message.
        return {
            "encoded_ids": f"Error: {str(e)}",
            "stats": "Error occurred during encoding",
            "visualization": None
        }
def decode_ids(encoded_ids_str, tokenizer):
    """Decode a textual list of token ids back to text.

    Args:
        encoded_ids_str: String form of a Python list of integers, for example
            ``"[12, 300, 7]"`` as produced by ``str(encoded)``.
        tokenizer: Object exposing ``decode(ids)``.

    Returns:
        The decoded text, or a human-readable message when the input is blank,
        is not a list of integers, or decoding fails.
    """
    # Local import so this function carries its own dependency.
    import ast

    if not encoded_ids_str.strip():
        return "Please enter encoded IDs"
    try:
        # SECURITY: this string comes straight from a user-editable textbox.
        # It was previously passed to eval(), which executes arbitrary code;
        # literal_eval only accepts Python literals.
        encoded_ids = ast.literal_eval(encoded_ids_str.strip())
        if not isinstance(encoded_ids, list):
            return "Invalid input: Please enter a list of integers"
        # bool is a subclass of int, so it is excluded explicitly.
        if not all(isinstance(i, int) and not isinstance(i, bool) for i in encoded_ids):
            return "Invalid input: Please enter a list of integers"
        # Decode the IDs
        return tokenizer.decode(encoded_ids)
    except Exception as e:
        return f"Error during decoding: {str(e)}"
def visualize_encoding(text, encoded_ids, tokenizer):
    """Build per-token display data for an encoded sequence.

    Args:
        text: The original text. Unused; kept so existing callers keep working.
        encoded_ids: Sequence of non-negative integer token ids.
        tokenizer: Object whose ``vocab`` maps a token id to its ``bytes``.

    Returns:
        A dict with ``"tokens"`` (each token's bytes decoded as UTF-8, with
        undecodable bytes replaced) and ``"colors"`` (one ``[r, g, b]`` list of
        floats in ``[0, 1)`` per token), both in the order of ``encoded_ids``.
    """
    # Seed the colour from the token id so a given token keeps the same colour
    # on every call, instead of drawing fresh values from the global RNG.
    color_map = {
        token_id: np.random.default_rng(int(token_id)).random(3).tolist()
        for token_id in set(encoded_ids)
    }
    tokens = [
        # A token may hold only part of a multi-byte character, hence 'replace'.
        tokenizer.vocab[token_id].decode('utf-8', errors='replace')
        for token_id in encoded_ids
    ]
    colors = [color_map[token_id] for token_id in encoded_ids]
    return {
        "tokens": tokens,
        "colors": colors
    }
# Load the tokenizer once at import time; every request reuses this instance.
model_path = "models/version_2/checkpoints/telugu_basic.model"
vocab_path = "models/version_2/vocabulary/vocabulary.json"
tokenizer = load_tokenizer(model_path, vocab_path)


def _encode_for_ui(text):
    """Adapt ``encode_text``'s dict result to the three encoder outputs.

    A Gradio handler bound to several output components must return one value
    per component, in order. A dict is only accepted when its keys are the
    components themselves, so the string-keyed dict from ``encode_text``
    cannot be returned directly.

    Returns:
        ``(encoded_ids, stats, highlighted)`` where ``highlighted`` is a list of
        ``(token_text, label)`` pairs for ``gr.HighlightedText``, or ``None``
        when there is nothing to visualize.
    """
    result = encode_text(text, tokenizer)
    viz = result["visualization"]
    highlighted = None
    if viz:
        # Identical token ids share an identical colour entry, so the colour is
        # used as the identity key to give repeated tokens the same label.
        labels = {}
        highlighted = []
        for token_text, color in zip(viz["tokens"], viz["colors"]):
            label = labels.setdefault(tuple(color), f"T{len(labels) + 1}")
            highlighted.append((token_text, label))
    return result["encoded_ids"], result["stats"], highlighted


# Create the Gradio interface
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔤 Telugu Text Tokenizer
    This tool helps you encode Telugu text into tokens and decode them back.
    It uses a trained BPE (Byte Pair Encoding) tokenizer optimized for Telugu language.
    ## Features:
    - 🔄 Encode Telugu text to token IDs
    - 📊 View compression statistics
    - 🎨 Visualize token segmentation
    - ⚡ Fast and efficient encoding/decoding
    """)
    with gr.Tab("Encoder"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Enter Telugu Text",
                    placeholder="Type or paste Telugu text here...",
                    lines=5
                )
                encode_btn = gr.Button("🔄 Encode", variant="primary")
            with gr.Column():
                encoded_output = gr.Textbox(
                    label="Encoded Token IDs",
                    lines=5,
                    interactive=False
                )
                stats_output = gr.Textbox(
                    label="Statistics",
                    lines=8,
                    interactive=False
                )
        with gr.Row():
            gr.Markdown("### Token Visualization")
            token_viz = gr.HighlightedText(
                label="Token Segmentation",
                show_legend=True
            )
    with gr.Tab("Decoder"):
        with gr.Row():
            with gr.Column():
                encoded_input = gr.Textbox(
                    label="Enter Encoded Token IDs",
                    placeholder="Paste the encoded token IDs here...",
                    lines=5
                )
                decode_btn = gr.Button("🔄 Decode", variant="primary")
            with gr.Column():
                decoded_output = gr.Textbox(
                    label="Decoded Telugu Text",
                    lines=5,
                    interactive=False
                )
    # Set up event handlers
    encode_btn.click(
        fn=_encode_for_ui,
        inputs=input_text,
        outputs=[encoded_output, stats_output, token_viz]
    )
    decode_btn.click(
        fn=lambda ids: decode_ids(ids, tokenizer),
        inputs=encoded_input,
        outputs=decoded_output
    )
    gr.Markdown("""
    ### 📝 Instructions:
    1. **Encoding**: Enter Telugu text in the encoder tab and click "Encode"
    2. **Decoding**: Copy the encoded IDs and paste them in the decoder tab
    3. **Visualization**: View token segmentation with color coding
    ### ℹ️ Notes:
    - The tokenizer uses BPE (Byte Pair Encoding) algorithm
    - Compression ratio shows how efficiently the text is encoded
    - Different colors in visualization represent different tokens
    """)
# Launch the app
if __name__ == "__main__":
    demo.launch()