# SiriGannada_Tokenizer / app/interface.py
# Author: Krishnakanth1993 — "Initial commit" (07a2346)
from __future__ import annotations
import json
import os
from typing import Any, Dict, List
import gradio as gr
from .highlighter import highlight_tokens
from .metrics import compute_compression_metrics
from .tokenizer import KannadaBPETokenizer
class TokenizerApp:
    """Gradio front-end for inspecting Kannada BPE tokenization.

    Wraps a :class:`KannadaBPETokenizer` and builds a Blocks UI with a
    color-coded token highlight view, a per-token breakdown table,
    compression metrics, and a round-trip decode of the input text.
    """

    # Escapes applied to token text for the breakdown table so that
    # control-whitespace tokens stay visible instead of rendering blank.
    _DISPLAY_ESCAPES = {"\n": "\\n", "\t": "\\t", "\r": "\\r"}

    def __init__(self, tokenizer: KannadaBPETokenizer, css_path: str | None = None):
        """Store the tokenizer and optionally load custom CSS.

        Args:
            tokenizer: Trained BPE tokenizer used for analyze/decode.
            css_path: Optional path to a CSS file; a missing or empty file
                is treated as "no custom CSS" rather than raising.
        """
        self.tokenizer = tokenizer
        # Normalize "" (missing/empty file) to None so gr.Blocks never
        # receives an empty stylesheet string.
        self.custom_css = (self._load_css(css_path) or None) if css_path else None

    @staticmethod
    def _load_css(css_path: str) -> str:
        """Return the contents of *css_path*, or "" when it does not exist."""
        if not os.path.exists(css_path):
            return ""
        with open(css_path, "r", encoding="utf-8") as fh:
            return fh.read()

    @classmethod
    def _display_token(cls, token_str: str) -> str:
        """Escape newline/tab/carriage-return so tokens render visibly."""
        for raw, escaped in cls._DISPLAY_ESCAPES.items():
            token_str = token_str.replace(raw, escaped)
        return token_str

    def process(self, text: str) -> Dict[str, Any]:
        """Tokenize *text* and assemble everything the UI displays.

        Returns:
            Dict with keys:
                highlight: HTML snippet of color-coded tokens.
                table: rows of [display token, token id, byte length].
                metrics: compression metrics vs. raw UTF-8 bytes.
                decoded: round-trip decode of the produced token ids.
            Blank / whitespace-only input yields a placeholder response
            without invoking the tokenizer.
        """
        text = text or ""
        if not text.strip():
            return {
                "highlight": "<em>Enter Kannada text to view tokenization.</em>",
                "table": [],
                "metrics": {},
                "decoded": "",
            }
        analysis = self.tokenizer.analyze(text)
        highlighted = highlight_tokens(analysis.token_strings, analysis.token_ids)
        metrics = compute_compression_metrics(analysis)
        rows: List[List[Any]] = [
            [self._display_token(token_str), token_id, byte_len]
            for token_str, token_id, byte_len in zip(
                analysis.token_strings, analysis.token_ids, analysis.byte_lengths
            )
        ]
        decoded = self.tokenizer.decode(analysis.token_ids)
        return {
            "highlight": highlighted,
            "table": rows,
            "metrics": metrics,
            "decoded": decoded,
        }

    def build(self) -> gr.Blocks:
        """Construct and return the Gradio Blocks interface (not launched)."""
        with gr.Blocks(theme=gr.themes.Soft(), css=self.custom_css) as demo:
            gr.Markdown(
                """
# Kannada Tokenizer Viewer
Enter Kannada text to inspect byte-pair encoding tokens, visualize
segmentation with color-coded highlights, and view compression
metrics compared to raw UTF-8 bytes.
""".strip()
            )
            with gr.Row():
                text_input = gr.Textbox(
                    label="Kannada Text",
                    lines=6,
                    placeholder="ಕನ್ನಡ ಪಠ್ಯವನ್ನು ಇಲ್ಲಿ ನಮೂದಿಸಿ…",
                )
                run_button = gr.Button("Tokenize", variant="primary")
            with gr.Row():
                highlight_output = gr.HTML(label="Highlighted Tokens")
            with gr.Row():
                table_output = gr.Dataframe(
                    headers=["Token", "Token ID", "Byte Length"],
                    datatype=["str", "number", "number"],
                    col_count=(3, "fixed"),
                    wrap=True,
                    label="Token Breakdown",
                )
            with gr.Row():
                metrics_output = gr.JSON(label="Compression Metrics")
                decoded_output = gr.Textbox(
                    label="Decoded text",
                    interactive=False,
                    lines=4,
                )

            def _handler(text: str) -> tuple:
                # Adapt process()'s dict into the positional outputs gradio
                # expects; metrics are pre-serialized for the JSON component.
                response = self.process(text)
                return (
                    response["highlight"],
                    response["table"],
                    json.dumps(response["metrics"], ensure_ascii=False, indent=2),
                    response["decoded"],
                )

            run_button.click(
                fn=_handler,
                inputs=[text_input],
                outputs=[highlight_output, table_output, metrics_output, decoded_output],
            )
        return demo
def build_interface(tokenizer: KannadaBPETokenizer, css_path: str | None = None) -> gr.Blocks:
    """Convenience entry point: wrap *tokenizer* in a TokenizerApp and return its Blocks UI."""
    return TokenizerApp(tokenizer, css_path=css_path).build()