malarsaravanan committed on
Commit
765ce6a
·
verified ·
1 Parent(s): 3a7d4d6

Upload 6 files

Browse files
app.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from tokenizers import Tokenizer
3
+ import json
4
+
5
# Load the two trained tokenizers from their serialized JSON definitions.
# NOTE(review): these paths expect each tokenizer in its own subdirectory;
# confirm this matches the deployed file layout.
tamil_tokenizer = Tokenizer.from_file("tamil_bpe_tokenizer/tamil_bpe_tokenizer.json")
hybrid_tokenizer = Tokenizer.from_file("hybrid_tamil_stock_tokenizer/hybrid_tamil_stock_tokenizer.json")

# Load the training summaries whose figures are shown in the statistics panels.
# The encoding is passed explicitly so that reading the JSON files does not
# depend on the platform's locale default.
with open('tamil_bpe_tokenizer/tokenizer_summary.json', 'r', encoding='utf-8') as f:
    tamil_summary = json.load(f)

with open('hybrid_tamil_stock_tokenizer/hybrid_tokenizer_summary.json', 'r', encoding='utf-8') as f:
    hybrid_summary = json.load(f)
15
+
16
+
17
def decode_token_readable(tokenizer, token_id):
    """Decode a single token ID into a display-friendly string.

    Args:
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=...)``.
        token_id: The token ID to decode.

    Returns:
        The decoded text with newlines and tabs rendered as the literal
        escapes ``\\n`` and ``\\t``, or ``'[SPACE]'`` when nothing visible
        remains (empty or blank text).
    """
    decoded = tokenizer.decode([token_id], skip_special_tokens=False)
    # Escape before the blank check: str.strip() also removes newlines and
    # tabs, so a token made only of those characters would otherwise be
    # reported as '[SPACE]' and its escape would never be displayed.
    visible = decoded.replace('\n', '\\n').replace('\t', '\\t')
    if not visible.strip():
        return '[SPACE]'
    return visible
24
+
25
+
26
def tokenize_tamil(text):
    """Tokenize *text* with the Tamil BPE tokenizer for display in the UI.

    Args:
        text: Input string from the textbox. ``None`` and whitespace-only
            input are treated as empty.

    Returns:
        A 4-tuple ``(tokens_display, stats, token_ids, decoded_full)``: a
        numbered per-token listing, a Markdown statistics report, the token
        ID list rendered with ``str()``, and the text obtained by decoding
        all IDs together. For empty input the first element is a prompt
        message and the other three are empty strings.
    """
    if not text or not text.strip():
        return "Please enter some text to tokenize.", "", "", ""

    token_ids = tamil_tokenizer.encode(text).ids

    # Calculate stats. Characters per token is the compression ratio; fall
    # back to 0 so an empty encoding cannot raise ZeroDivisionError in
    # either of the two report lines that show it.
    char_count = len(text)
    token_count = len(token_ids)
    compression = char_count / token_count if token_count > 0 else 0

    # Decode each token on its own for a readable, numbered listing.
    tokens_display = "".join(
        f'{position}. "{decode_token_readable(tamil_tokenizer, token_id)}" (ID: {token_id})\n'
        for position, token_id in enumerate(token_ids, start=1)
    )

    stats = f"""
📊 **Tokenization Statistics**

- **Characters**: {char_count}
- **Tokens**: {token_count}
- **Compression Ratio**: {compression:.2f}x
- **Average chars/token**: {compression:.2f}

🔧 **Tokenizer Info**
- Vocabulary Size: {tamil_summary['vocabulary_size']:,}
- Algorithm: {tamil_summary['algorithm']}
- Overall Compression: {tamil_summary['compression_ratio']:.2f}x

ℹ️ **Display Note**: Tokens shown using UTF-8 decoded format for readability
"""

    # Full decoded text verification
    decoded_full = tamil_tokenizer.decode(token_ids)

    return tokens_display, stats, str(token_ids), decoded_full
66
+
67
+
68
def tokenize_hybrid(text):
    """Tokenize *text* with the hybrid Tamil+Stock BPE tokenizer for the UI.

    Args:
        text: Input string from the textbox. ``None`` and whitespace-only
            input are treated as empty.

    Returns:
        A 4-tuple ``(tokens_display, stats, token_ids, decoded_full)``: a
        numbered per-token listing, a Markdown statistics report (including
        an approximate Tamil/stock/other token breakdown), the token ID list
        rendered with ``str()``, and the text obtained by decoding all IDs
        together. For empty input the first element is a prompt message and
        the other three are empty strings.
    """
    if not text or not text.strip():
        return "Please enter some text to tokenize.", "", "", ""

    token_ids = hybrid_tokenizer.encode(text).ids

    # Calculate stats. Characters per token is the compression ratio; fall
    # back to 0 so an empty encoding cannot raise ZeroDivisionError in
    # either of the two report lines that show it.
    char_count = len(text)
    token_count = len(token_ids)
    compression = char_count / token_count if token_count > 0 else 0

    # Decode every token once and reuse the result for both the listing and
    # the category heuristics.
    decoded_tokens = [decode_token_readable(hybrid_tokenizer, tid) for tid in token_ids]
    tokens_display = "".join(
        f'{position}. "{readable}" (ID: {tid})\n'
        for position, (readable, tid) in enumerate(zip(decoded_tokens, token_ids), start=1)
    )

    # Categorize tokens (approximate). A token is Tamil-like when it contains
    # a character from the Tamil Unicode block (U+0B80-U+0BFF) and stock-like
    # when it contains one of the keywords, compared case-insensitively.
    stock_keywords = ('$', 'stock', 'market', 'price', '%', 'surge', 'fall', 'rise', 'buy', 'sell')
    is_tamil = [any(0x0B80 <= ord(c) <= 0x0BFF for c in t) for t in decoded_tokens]
    is_stock = [
        any(kw in lowered for kw in stock_keywords)
        for lowered in (t.lower() for t in decoded_tokens)
    ]
    tamil_like = sum(is_tamil)
    stock_like = sum(is_stock)
    # Count "other" directly: subtracting both tallies from the total would
    # deduct a token twice when it matches both categories.
    other_count = sum(not (tamil or stock) for tamil, stock in zip(is_tamil, is_stock))

    stats = f"""
📊 **Tokenization Statistics**

- **Characters**: {char_count}
- **Tokens**: {token_count}
- **Compression Ratio**: {compression:.2f}x
- **Average chars/token**: {compression:.2f}

🔍 **Token Analysis (Approximate)**
- Tamil-like tokens: {tamil_like}
- Stock-like tokens: {stock_like}
- Other tokens: {other_count}

🔧 **Tokenizer Info**
- Total Vocabulary: {hybrid_summary['vocabulary_size']:,}
- Tamil Vocab: {hybrid_summary['tamil_vocab_count']:,} ({hybrid_summary['tamil_vocab_percentage']:.1f}%)
- Stock Vocab: {hybrid_summary['stock_vocab_count']:,} ({hybrid_summary['stock_vocab_percentage']:.1f}%)
- Overall Compression: {hybrid_summary['compression_ratio']:.2f}x

ℹ️ **Display Note**: Tokens shown using UTF-8 decoded format for readability
"""

    # Full decoded text verification
    decoded_full = hybrid_tokenizer.decode(token_ids)

    return tokens_display, stats, str(token_ids), decoded_full
120
+
121
+
122
# Sample Tamil sentences offered as ready-made inputs. Each row is a
# one-element list holding the text for a single input component.
tamil_examples = [
    [sentence]
    for sentence in (
        "தமிழ் மொழி இந்தியாவின் பழமையான மொழிகளில் ஒன்று",
        "கணினி அறிவியல் மற்றும் தொழில்நுட்பம் வளர்ந்து வருகிறது",
        "செயற்கை நுண்ணறிவு என்பது மிகவும் சுவாரஸ்யமான துறை",
    )
]
128
+
129
# Hybrid examples: mixed Tamil / English stock-market sentences offered as
# ready-made inputs. Each row is a one-element list holding the text for a
# single input component.
hybrid_examples = [
    # The last word was stored with U+FFFD replacement characters in place
    # of one Tamil letter; restored to "இன்று", the word that also opens the
    # fourth example.
    ["ரிலையன்ஸ் பங்கு $Reliance rose to 2480 +1.2% இன்று"],
    ["$Apple stock surged to 175.50 ஆப்பிள் பங்கு +3.7% on strong revenue"],
    ["TCS stock surged to 3250 டிசிஎஸ் நிறுவனம் வர்த்தகம் 15L பங்குகள்"],
    ["இன்று சந்தையில் $Infosys rose +2.5% $HDFC fell -1.8%"],
    ["பங்கு சந்தை Apple stock opened 172.30 closed 175.50 buy வாங்கலாம்"],
]
137
+
138
# Create Gradio interface with custom CSS for teal theme
# This stylesheet is handed to gr.Blocks(css=...). It recolours elements
# carrying the "teal-button" class (set on both Tokenize buttons via
# elem_classes), bold text, and the selected tab label.
custom_css = """
.teal-button {
background: linear-gradient(to right, #14b8a6, #0d9488) !important;
border: none !important;
}
.teal-button:hover {
background: linear-gradient(to right, #0d9488, #0f766e) !important;
}
/* Change all bold text from purple/violet to teal */
strong, b {
color: #0d9488 !important;
}
/* Change markdown bold text to teal */
.markdown-text strong {
color: #0d9488 !important;
}
/* Change any purple/violet text to teal */
.prose strong {
color: #0d9488 !important;
}
/* Tab labels */
.tabs button.selected {
color: #0d9488 !important;
border-bottom-color: #0d9488 !important;
}
"""
165
+
166
# Assemble the demo UI: an intro banner, one tab per tokenizer, an "About"
# accordion, and the wiring of each Tokenize button to its handler.
# NOTE(review): block nesting was inferred from statement order; confirm that
# the "Advanced Output" accordions belong at tab level rather than inside the
# right-hand column.
with gr.Blocks(title="Tamil & Hybrid BPE Tokenizer Demo", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown("""
# 🔤 Tamil & Hybrid BPE Tokenizer Demo

Test two Byte Pair Encoding (BPE) tokenizers:
1. **Tamil Tokenizer**: Specialized for Tamil language text
2. **Hybrid Tokenizer**: Handles both Tamil language and Stock market terminology

---
""")

    with gr.Tabs():
        # Tamil Tokenizer Tab
        with gr.TabItem("🇮🇳 Tamil Tokenizer"):
            # The figures in this text are hard-coded, not read from
            # tamil_summary.
            gr.Markdown("""
### Tamil Language BPE Tokenizer

- **Vocabulary**: 8,000 tokens
- **Dataset**: 50,000 Tamil Wikipedia articles
- **Compression**: 4.67x average
- **Display**: UTF-8 decoded tokens for readability
""")

            with gr.Row():
                # Left column: input text, trigger button, and example inputs.
                with gr.Column():
                    tamil_input = gr.Textbox(
                        label="Input Text (Tamil)",
                        placeholder="Enter Tamil text here...",
                        lines=5
                    )
                    tamil_button = gr.Button("Tokenize", variant="primary", elem_classes="teal-button")
                    gr.Examples(
                        examples=tamil_examples,
                        inputs=tamil_input,
                        label="Example Tamil Texts"
                    )

                # Right column: per-token listing and the statistics report.
                with gr.Column():
                    tamil_tokens_output = gr.Textbox(
                        label="Token Breakdown",
                        lines=10,
                        max_lines=20
                    )
                    tamil_stats_output = gr.Markdown(label="Statistics")

            # Collapsed by default: raw token IDs and the round-trip decode.
            with gr.Accordion("Advanced Output", open=False):
                with gr.Row():
                    tamil_ids_output = gr.Textbox(label="Token IDs", lines=2)
                    tamil_decoded_output = gr.Textbox(label="Decoded Text", lines=2)

        # Hybrid Tokenizer Tab
        with gr.TabItem("📈 Hybrid Tokenizer (Tamil + Stock)"):
            # The figures in this text are hard-coded, not read from
            # hybrid_summary.
            gr.Markdown("""
### Hybrid Tamil + Stock Market BPE Tokenizer

- **Vocabulary**: 40,000 tokens
- **Dataset**: 30,000 documents (Tamil + Financial news)
- **Tamil**: 35,991 tokens (89.98%), 5.12x compression
- **Stock**: 5,572 tokens (13.93%), 4.90x compression
- **Display**: UTF-8 decoded tokens for readability
""")

            with gr.Row():
                # Left column: input text, trigger button, and example inputs.
                with gr.Column():
                    hybrid_input = gr.Textbox(
                        label="Input Text (Tamil + Stock/English)",
                        placeholder="Enter mixed Tamil and stock market text...",
                        lines=5
                    )
                    hybrid_button = gr.Button("Tokenize", variant="primary", elem_classes="teal-button")
                    gr.Examples(
                        examples=hybrid_examples,
                        inputs=hybrid_input,
                        label="Example Hybrid Texts"
                    )

                # Right column: per-token listing and the statistics report.
                with gr.Column():
                    hybrid_tokens_output = gr.Textbox(
                        label="Token Breakdown",
                        lines=10,
                        max_lines=20
                    )
                    hybrid_stats_output = gr.Markdown(label="Statistics")

            # Collapsed by default: raw token IDs and the round-trip decode.
            with gr.Accordion("Advanced Output", open=False):
                with gr.Row():
                    hybrid_ids_output = gr.Textbox(label="Token IDs", lines=2)
                    hybrid_decoded_output = gr.Textbox(label="Decoded Text", lines=2)

    # About section
    with gr.Accordion("ℹ️ About These Tokenizers", open=False):
        gr.Markdown("""
## Technical Details

### Tamil Tokenizer
- **Vocabulary**: 8,000 tokens
- **Algorithm**: Byte Pair Encoding (BPE) with ByteLevel encoding
- **Dataset**: 50,000 Tamil Wikipedia articles
- **Compression**: 4.67x average

### Hybrid Tokenizer
- **Vocabulary**: 40,000 tokens (35,991 Tamil + 5,572 Stock)
- **Algorithm**: Byte Pair Encoding (BPE) with ByteLevel encoding
- **Dataset**: 30,000 documents (10% Tamil Wikipedia + 90% Financial news)
- **Compression**: 5.78x overall

### Token Display
- **ByteLevel Encoding**: Tokens are encoded at byte level for efficiency
- **Token Decoding**: Each token is decoded using UTF-8 encoding
- **Note**: Due to normalization, some Tamil vowel marks may be altered

### Real-World Applications
- Tamil language NLP
- Tamil financial news processing
- Bilingual trading platforms
- Stock market sentiment analysis in Tamil

---

**Created for NLP coursework** | **License**: MIT
""")

    # Connect buttons
    # Each handler returns four values, matched positionally to the four
    # output components listed here.
    tamil_button.click(
        fn=tokenize_tamil,
        inputs=tamil_input,
        outputs=[tamil_tokens_output, tamil_stats_output, tamil_ids_output, tamil_decoded_output]
    )

    hybrid_button.click(
        fn=tokenize_hybrid,
        inputs=hybrid_input,
        outputs=[hybrid_tokens_output, hybrid_stats_output, hybrid_ids_output, hybrid_decoded_output]
    )
300
+
301
# Launch the app
# Guarded so the interface is launched only when this file is executed as a
# script, not when the module is imported.
if __name__ == "__main__":
    demo.launch()
304
+
hybrid_tamil_stock_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
hybrid_tokenizer_summary.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "type": "Hybrid Tokenizer",
3
+ "domains": [
4
+ "Tamil Language",
5
+ "Stock Market Data"
6
+ ],
7
+ "vocabulary_size": 40000,
8
+ "compression_ratio": 5.7752,
9
+ "tamil_compression": 5.12,
10
+ "tamil_vocab_count": 35991,
11
+ "tamil_vocab_percentage": 89.98,
12
+ "stock_compression": 4.9,
13
+ "stock_vocab_count": 5572,
14
+ "stock_vocab_percentage": 13.93,
15
+ "meets_vocab_requirement": true,
16
+ "meets_compression_requirement": true,
17
+ "meets_tamil_requirement": true,
18
+ "meets_stock_requirement": true,
19
+ "dataset_composition": {
20
+ "tamil": "10%",
21
+ "stock": "90%"
22
+ },
23
+ "total_training_documents": 30000,
24
+ "double_points_attempt": true,
25
+ "note": "Tamil uses byte-encoding (ByteLevel), Stock uses English vocabulary",
26
+ "unique_features": [
27
+ "5000+ stock vocabulary: 1000+ symbols, 4000+ trading/finance terms",
28
+ "Combines Tamil language with comprehensive stock market vocabulary",
29
+ "Real-world application: Tamil financial news + stock data",
30
+ "Tamil: 5.1x compression, Stock: 4.9x compression"
31
+ ]
32
+ }
requirements.txt ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core tokenizer dependencies
2
+ tokenizers==0.22.1
3
+ datasets==4.3.0
4
+ huggingface-hub==1.0.1
5
+
6
+ # Data processing
7
+ numpy==2.3.4
8
+ pandas==2.2.3
9
+
10
+ # Visualization
11
+ matplotlib==3.9.3
12
+
13
+ # Progress bars
14
+ tqdm==4.67.1
15
+
16
+ # Jupyter notebook support
17
+ jupyter==1.1.1
18
+ ipywidgets==8.1.5
19
+ notebook==7.3.2
20
+
21
+ # Web app
22
+ gradio==4.44.0
23
+
24
+ # Additional utilities
25
+ requests==2.32.3
tamil_bpe_tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_summary.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "language": "Tamil",
3
+ "algorithm": "BPE",
4
+ "vocabulary_size": 8000,
5
+ "compression_ratio": 4.6671,
6
+ "meets_vocab_requirement": true,
7
+ "meets_compression_requirement": true,
8
+ "dataset_size": 50000,
9
+ "dataset_source": "HuggingFace (Real Tamil Data)"
10
+ }