File size: 3,979 Bytes
07a2346
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
from __future__ import annotations

import json
import os
from typing import Any, Dict, List

import gradio as gr

from .highlighter import highlight_tokens
from .metrics import compute_compression_metrics
from .tokenizer import KannadaBPETokenizer


class TokenizerApp:
    """Gradio front-end for inspecting Kannada BPE tokenization.

    Wraps a :class:`KannadaBPETokenizer` and builds a Blocks UI showing
    color-highlighted tokens, a per-token breakdown table, compression
    metrics versus raw UTF-8 bytes, and a decode round-trip of the text.
    """

    def __init__(self, tokenizer: KannadaBPETokenizer, css_path: str | None = None):
        self.tokenizer = tokenizer
        # Stylesheet is optional; None tells gradio to use its defaults.
        # Note: an empty css_path string is treated the same as None.
        self.custom_css = None
        if css_path:
            self.custom_css = self._load_css(css_path)

    @staticmethod
    def _load_css(css_path: str) -> str:
        """Read a stylesheet from disk; a missing file yields an empty string."""
        if os.path.exists(css_path):
            with open(css_path, "r", encoding="utf-8") as handle:
                return handle.read()
        return ""

    def process(self, text: str) -> Dict[str, Any]:
        """Tokenize *text* and return highlight HTML, table rows, metrics, and decode.

        Returns a dict with keys ``highlight`` (HTML string), ``table``
        (list of [token, id, byte length] rows), ``metrics`` (dict), and
        ``decoded`` (round-tripped text). Blank/None input produces a
        placeholder message with empty table, metrics, and decode.
        """
        text = text or ""
        if not text.strip():
            return {
                "highlight": "<em>Enter Kannada text to view tokenization.</em>",
                "table": [],
                "metrics": {},
                "decoded": "",
            }

        analysis = self.tokenizer.analyze(text)
        # Escape newlines so a multi-line token stays on a single table row.
        rows: List[List[Any]] = [
            [tok.replace("\n", "\\n"), tok_id, n_bytes]
            for tok, tok_id, n_bytes in zip(
                analysis.token_strings, analysis.token_ids, analysis.byte_lengths
            )
        ]
        return {
            "highlight": highlight_tokens(analysis.token_strings, analysis.token_ids),
            "table": rows,
            "metrics": compute_compression_metrics(analysis),
            "decoded": self.tokenizer.decode(analysis.token_ids),
        }

    def build(self) -> gr.Blocks:
        """Assemble and return the Gradio Blocks UI (caller launches it)."""
        with gr.Blocks(theme=gr.themes.Soft(), css=self.custom_css) as demo:
            gr.Markdown(
                """
                # Kannada Tokenizer Viewer
                Enter Kannada text to inspect byte-pair encoding tokens, visualize
                segmentation with color-coded highlights, and view compression
                metrics compared to raw UTF-8 bytes.
                """.strip()
            )

            with gr.Row():
                kannada_input = gr.Textbox(
                    label="Kannada Text",
                    lines=6,
                    placeholder="ಕನ್ನಡ ಪಠ್ಯವನ್ನು ಇಲ್ಲಿ ನಮೂದಿಸಿ…",
                )

            tokenize_button = gr.Button("Tokenize", variant="primary")

            with gr.Row():
                token_html = gr.HTML(label="Highlighted Tokens")

            with gr.Row():
                breakdown_table = gr.Dataframe(
                    headers=["Token", "Token ID", "Byte Length"],
                    datatype=["str", "number", "number"],
                    col_count=(3, "fixed"),
                    wrap=True,
                    label="Token Breakdown",
                )

            with gr.Row():
                metrics_json = gr.JSON(label="Compression Metrics")
                decoded_box = gr.Textbox(
                    label="Decoded text",
                    interactive=False,
                    lines=4,
                )

            # Keep the name `_handler`: gradio derives API endpoint names
            # from fn.__name__, so renaming would be externally visible.
            def _handler(text: str) -> tuple:
                result = self.process(text)
                metrics_text = json.dumps(result["metrics"], ensure_ascii=False, indent=2)
                return result["highlight"], result["table"], metrics_text, result["decoded"]

            tokenize_button.click(
                fn=_handler,
                inputs=[kannada_input],
                outputs=[token_html, breakdown_table, metrics_json, decoded_box],
            )

        return demo


def build_interface(tokenizer: KannadaBPETokenizer, css_path: str | None = None) -> gr.Blocks:
    """Convenience wrapper: construct a :class:`TokenizerApp` and return its UI."""
    return TokenizerApp(tokenizer, css_path=css_path).build()