| | from __future__ import annotations |
| |
|
| | import json |
| | import os |
| | from typing import Any, Dict, List |
| |
|
| | import gradio as gr |
| |
|
| | from .highlighter import highlight_tokens |
| | from .metrics import compute_compression_metrics |
| | from .tokenizer import KannadaBPETokenizer |
| |
|
| |
|
class TokenizerApp:
    """Gradio front-end for inspecting Kannada BPE tokenization.

    Wraps a ``KannadaBPETokenizer`` and exposes :meth:`build`, which
    assembles a ``gr.Blocks`` UI showing color-coded token highlights,
    a per-token breakdown table, compression metrics, and the decoded
    (round-tripped) text.
    """

    def __init__(self, tokenizer: KannadaBPETokenizer, css_path: str | None = None):
        """Store the tokenizer and optionally load custom CSS.

        Args:
            tokenizer: Tokenizer used for ``analyze``/``decode`` calls.
            css_path: Optional path to a CSS file. A missing file yields
                empty CSS rather than raising; ``None`` disables custom CSS.
        """
        self.tokenizer = tokenizer
        self.custom_css = self._load_css(css_path) if css_path else None

    @staticmethod
    def _load_css(css_path: str) -> str:
        """Return the contents of *css_path*, or "" if it does not exist."""
        if not os.path.exists(css_path):
            return ""
        with open(css_path, "r", encoding="utf-8") as fh:
            return fh.read()

    def process(self, text: str) -> dict[str, Any]:
        """Tokenize *text* and return display-ready artifacts.

        Args:
            text: Input text; ``None``/blank input yields a placeholder.

        Returns:
            Dict with keys ``highlight`` (HTML string), ``table`` (rows of
            ``[token, token_id, byte_length]``), ``metrics`` (compression
            stats dict), and ``decoded`` (text round-tripped through the
            tokenizer).
        """
        text = text or ""
        if not text.strip():
            # Nothing to tokenize: prompt the user instead of erroring.
            return {
                "highlight": "<em>Enter Kannada text to view tokenization.</em>",
                "table": [],
                "metrics": {},
                "decoded": "",
            }

        analysis = self.tokenizer.analyze(text)
        highlighted = highlight_tokens(analysis.token_strings, analysis.token_ids)
        metrics = compute_compression_metrics(analysis)
        # Escape literal newlines so multi-line tokens render on one table row.
        rows: list[list[Any]] = [
            [token_str.replace("\n", "\\n"), token_id, byte_len]
            for token_str, token_id, byte_len in zip(
                analysis.token_strings, analysis.token_ids, analysis.byte_lengths
            )
        ]
        decoded = self.tokenizer.decode(analysis.token_ids)
        return {
            "highlight": highlighted,
            "table": rows,
            "metrics": metrics,
            "decoded": decoded,
        }

    def build(self) -> gr.Blocks:
        """Assemble and return the Gradio Blocks UI (caller launches it)."""
        with gr.Blocks(theme=gr.themes.Soft(), css=self.custom_css) as demo:
            gr.Markdown(
                """
# Kannada Tokenizer Viewer
Enter Kannada text to inspect byte-pair encoding tokens, visualize
segmentation with color-coded highlights, and view compression
metrics compared to raw UTF-8 bytes.
""".strip()
            )

            with gr.Row():
                text_input = gr.Textbox(
                    label="Kannada Text",
                    lines=6,
                    placeholder="ಕನ್ನಡ ಪಠ್ಯವನ್ನು ಇಲ್ಲಿ ನಮೂದಿಸಿ…",
                )

            run_button = gr.Button("Tokenize", variant="primary")

            with gr.Row():
                highlight_output = gr.HTML(label="Highlighted Tokens")

            with gr.Row():
                table_output = gr.Dataframe(
                    headers=["Token", "Token ID", "Byte Length"],
                    datatype=["str", "number", "number"],
                    col_count=(3, "fixed"),
                    wrap=True,
                    label="Token Breakdown",
                )

            with gr.Row():
                metrics_output = gr.JSON(label="Compression Metrics")
                decoded_output = gr.Textbox(
                    label="Decoded text",
                    interactive=False,
                    lines=4,
                )

            def _handler(text: str) -> tuple:
                # Adapt process()'s dict to the positional outputs below.
                response = self.process(text)
                return (
                    response["highlight"],
                    response["table"],
                    # gr.JSON accepts a JSON string and parses it for display.
                    json.dumps(response["metrics"], ensure_ascii=False, indent=2),
                    response["decoded"],
                )

            run_button.click(
                fn=_handler,
                inputs=[text_input],
                outputs=[highlight_output, table_output, metrics_output, decoded_output],
            )

        return demo
| |
|
| |
|
def build_interface(tokenizer: KannadaBPETokenizer, css_path: str | None = None) -> gr.Blocks:
    """Convenience wrapper: build the Gradio UI for *tokenizer*.

    Args:
        tokenizer: Tokenizer the app will use for analysis and decoding.
        css_path: Optional path to a custom CSS file.

    Returns:
        The assembled ``gr.Blocks`` interface, ready to ``launch()``.
    """
    return TokenizerApp(tokenizer, css_path=css_path).build()
| |
|
| |
|
| |
|