kishkath committed on
Commit
bf87fbf
·
verified ·
1 Parent(s): 4f5aaf6

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +199 -0
app.py ADDED
@@ -0,0 +1,199 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ import os
4
+ from tokenizers.basic import BasicTokenizer
5
+ import numpy as np
6
+
7
def load_tokenizer(model_path, vocab_path):
    """Load the trained tokenizer and attach its vocabulary tables.

    Args:
        model_path: Path handed to ``BasicTokenizer.load``.
        vocab_path: Path to a UTF-8 JSON file containing the objects
            ``token_to_id``, ``id_to_token`` and ``merges``. ``merges`` keys
            are comma-separated integer pairs such as ``"12,34"``.

    Returns:
        The ``BasicTokenizer`` instance with ``token_to_id``, ``id_to_token``
        and ``merges`` set from the vocabulary file.

    Raises:
        Exception: If the model or vocabulary cannot be loaded or parsed. The
            underlying error is chained as ``__cause__``.
    """
    tokenizer = BasicTokenizer()
    try:
        # Load the trained model
        tokenizer.load(model_path)

        # Load vocabulary
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab_data = json.load(f)

        # JSON object keys are always strings, so ids are converted back to int.
        tokenizer.token_to_id = {
            k: int(v) for k, v in vocab_data['token_to_id'].items()
        }
        tokenizer.id_to_token = {
            int(k): v for k, v in vocab_data['id_to_token'].items()
        }
        tokenizer.merges = {
            tuple(map(int, k.split(','))): int(v)
            for k, v in vocab_data['merges'].items()
        }
    except Exception as e:
        # Chain explicitly so the root cause stays visible in the traceback.
        raise Exception(f"Error loading tokenizer: {e}") from e
    return tokenizer
24
+
25
def encode_text(text, tokenizer):
    """Encode text and return the token ids with compression statistics.

    Args:
        text: The text to encode. ``None``, empty, or whitespace-only input
            yields a placeholder result instead of being encoded.
        tokenizer: Object exposing ``encode(text)`` that returns a sequence of
            token ids.

    Returns:
        A dict with three keys:
        ``"encoded_ids"`` (string form of the id list, or a message),
        ``"stats"`` (human-readable statistics, or a message) and
        ``"visualization"`` (the result of ``visualize_encoding``, or ``None``
        when there is nothing to show or an error occurred).
        Errors are reported in the dict; this function does not raise.
    """
    # Guard against None as well as blank input so the check itself cannot raise.
    if not text or not text.strip():
        return {
            "encoded_ids": "Please enter some Telugu text",
            "stats": "No statistics available",
            "visualization": None
        }

    try:
        # Encode the text
        encoded = tokenizer.encode(text)
        if not encoded:
            # Report this directly instead of an opaque "division by zero".
            raise ValueError("tokenizer produced no tokens")

        # Calculate compression ratio.
        # The encoded size assumes every token id is stored in 2 bytes.
        bytes_per_token = 2
        original_size = len(text.encode('utf-8'))
        encoded_size = len(encoded) * bytes_per_token
        compression_ratio = original_size / encoded_size
        space_saved = (1 - encoded_size / original_size) * 100

        # Prepare statistics
        stats = (
            "\n"
            "📊 Encoding Statistics:\n"
            f"• Original text length: {len(text)} characters\n"
            f"• Encoded length: {len(encoded)} tokens\n"
            f"• Compression ratio: {compression_ratio:.2f}X\n"
            f"• Original size: {original_size} bytes\n"
            f"• Encoded size: {encoded_size} bytes\n"
            f"• Space saved: {space_saved:.1f}%\n"
        )

        # Create token visualization
        viz_data = visualize_encoding(text, encoded, tokenizer)

        return {
            "encoded_ids": str(encoded),
            "stats": stats,
            "visualization": viz_data
        }
    except Exception as e:
        # UI boundary: surface the failure as text rather than crashing the app.
        return {
            "encoded_ids": f"Error: {str(e)}",
            "stats": "Error occurred during encoding",
            "visualization": None
        }
68
+
69
def decode_ids(encoded_ids_str, tokenizer):
    """Decode a textual list of token ids back to text.

    Args:
        encoded_ids_str: String form of a Python list of integers, for example
            ``"[12, 7, 300]"``. ``None``, empty, or blank input yields a prompt
            message.
        tokenizer: Object exposing ``decode(ids)`` that returns the decoded
            text.

    Returns:
        The decoded text, or a human-readable message when the input is
        missing, is not a list of integers, or decoding fails. This function
        does not raise.
    """
    import ast

    if not encoded_ids_str or not encoded_ids_str.strip():
        return "Please enter encoded IDs"

    try:
        # SECURITY: this string comes straight from the web UI. The previous
        # implementation passed it to eval(), which executes arbitrary code.
        # literal_eval only accepts Python literals, so "[1, 2, 3]" still
        # parses while expressions and calls are rejected.
        encoded_ids = ast.literal_eval(encoded_ids_str.strip())
        if not isinstance(encoded_ids, list):
            return "Invalid input: Please enter a list of integers"
        # bool is a subclass of int; reject it so True/False are not read as ids.
        if not all(
            isinstance(item, int) and not isinstance(item, bool)
            for item in encoded_ids
        ):
            return "Invalid input: Please enter a list of integers"

        # Decode the IDs
        return tokenizer.decode(encoded_ids)
    except Exception as e:
        # UI boundary: report parse and decode failures as text.
        return f"Error during decoding: {str(e)}"
85
+
86
def visualize_encoding(text, encoded_ids, tokenizer):
    """Build per-token display data for an encoded sequence.

    Args:
        text: The original text. Currently unused; kept so the signature stays
            stable for callers.
        encoded_ids: Sequence of token ids, each usable as a key into
            ``tokenizer.vocab``.
        tokenizer: Object whose ``vocab`` maps a token id to the token's bytes.

    Returns:
        A dict with ``"tokens"`` (each token's bytes decoded as UTF-8, with
        undecodable bytes replaced) and ``"colors"`` (one ``[r, g, b]`` list of
        floats per token, in the same order as ``"tokens"``).

    Raises:
        KeyError: If an id is missing from ``tokenizer.vocab`` and ``vocab`` is
            a plain dict.
    """
    # One random RGB triple per distinct token id, so repeated ids share a
    # colour. The colours are random, not derived from frequency, and differ
    # from call to call.
    color_map = {
        token_id: np.random.rand(3).tolist() for token_id in set(encoded_ids)
    }

    # A token may be a fragment of a multi-byte character, hence errors='replace'.
    tokens = [
        tokenizer.vocab[token_id].decode('utf-8', errors='replace')
        for token_id in encoded_ids
    ]
    colors = [color_map[token_id] for token_id in encoded_ids]

    return {
        "tokens": tokens,
        "colors": colors
    }
105
+
106
# Load the tokenizer once at import time so every request shares it.
model_path = "models/version_2/checkpoints/telugu_basic.model"
vocab_path = "models/version_2/vocabulary/vocabulary.json"
tokenizer = load_tokenizer(model_path, vocab_path)


def _encode_for_ui(text):
    """Adapt the dict returned by ``encode_text`` to the encoder tab's outputs.

    Gradio expects one return value per output component, in the order given
    to ``outputs=``; a dict keyed by plain strings is not accepted.
    ``HighlightedText`` takes ``(text, label)`` pairs, so each token is
    labelled with its own text and identical tokens share a colour.

    Returns:
        A tuple ``(encoded_ids, stats, highlighted)`` where ``highlighted`` is
        a list of ``(token, label)`` pairs, or ``None`` when there is no
        visualization.
    """
    result = encode_text(text, tokenizer)
    viz = result["visualization"]
    highlighted = [(token, token) for token in viz["tokens"]] if viz else None
    return result["encoded_ids"], result["stats"], highlighted


# Create the Gradio interface
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🔤 Telugu Text Tokenizer

    This tool helps you encode Telugu text into tokens and decode them back.
    It uses a trained BPE (Byte Pair Encoding) tokenizer optimized for Telugu language.

    ## Features:
    - 🔄 Encode Telugu text to token IDs
    - 📊 View compression statistics
    - 🎨 Visualize token segmentation
    - ⚡ Fast and efficient encoding/decoding
    """)

    with gr.Tab("Encoder"):
        with gr.Row():
            with gr.Column():
                input_text = gr.Textbox(
                    label="Enter Telugu Text",
                    placeholder="Type or paste Telugu text here...",
                    lines=5
                )
                encode_btn = gr.Button("🔄 Encode", variant="primary")

            with gr.Column():
                encoded_output = gr.Textbox(
                    label="Encoded Token IDs",
                    lines=5,
                    interactive=False
                )
                stats_output = gr.Textbox(
                    label="Statistics",
                    lines=8,
                    interactive=False
                )

        with gr.Row():
            gr.Markdown("### Token Visualization")
            token_viz = gr.HighlightedText(
                label="Token Segmentation",
                show_legend=True
            )

    with gr.Tab("Decoder"):
        with gr.Row():
            with gr.Column():
                encoded_input = gr.Textbox(
                    label="Enter Encoded Token IDs",
                    placeholder="Paste the encoded token IDs here...",
                    lines=5
                )
                decode_btn = gr.Button("🔄 Decode", variant="primary")

            with gr.Column():
                decoded_output = gr.Textbox(
                    label="Decoded Telugu Text",
                    lines=5,
                    interactive=False
                )

    # Set up event handlers
    encode_btn.click(
        fn=_encode_for_ui,
        inputs=input_text,
        outputs=[encoded_output, stats_output, token_viz]
    )

    decode_btn.click(
        fn=lambda ids: decode_ids(ids, tokenizer),
        inputs=encoded_input,
        outputs=decoded_output
    )

    gr.Markdown("""
    ### 📝 Instructions:
    1. **Encoding**: Enter Telugu text in the encoder tab and click "Encode"
    2. **Decoding**: Copy the encoded IDs and paste them in the decoder tab
    3. **Visualization**: View token segmentation with color coding

    ### ℹ️ Notes:
    - The tokenizer uses BPE (Byte Pair Encoding) algorithm
    - Compression ratio shows how efficiently the text is encoded
    - Different colors in visualization represent different tokens
    """)

# Launch the app
if __name__ == "__main__":
    demo.launch()