| import gradio as gr |
| from tokenizers import Tokenizer |
| import json |
|
|
| |
# Load the two pre-trained BPE tokenizers from their serialized JSON files.
# These paths are relative to the working directory the app is launched from.
tamil_tokenizer = Tokenizer.from_file("tamil_bpe_tokenizer.json")
hybrid_tokenizer = Tokenizer.from_file("hybrid_tamil_stock_tokenizer.json")


# Training summaries (vocabulary size, algorithm, compression ratio, ...)
# displayed in the statistics panels of each tab.
with open('tokenizer_summary.json', 'r') as f:
    tamil_summary = json.load(f)


with open('hybrid_tokenizer_summary.json', 'r') as f:
    hybrid_summary = json.load(f)
|
|
|
|
def decode_token_readable(tokenizer, token_id):
    """Decode a single token ID to readable text.

    Control characters are escaped (newline -> "\\n", tab -> "\\t") and a
    token that decodes to nothing but spaces is labelled "[SPACE]" so it
    stays visible in the numbered token breakdown.

    Args:
        tokenizer: a ``tokenizers.Tokenizer`` (or any object exposing a
            compatible ``decode(ids, skip_special_tokens=...)`` method).
        token_id: integer ID of the single token to decode.

    Returns:
        str: printable representation of the token.
    """
    decoded = tokenizer.decode([token_id], skip_special_tokens=False)
    # Escape control characters BEFORE the emptiness check: the original
    # order stripped "\n"/"\t" tokens to empty first, so they were
    # mislabelled "[SPACE]" and the replacements never applied.
    decoded = decoded.replace('\n', '\\n').replace('\t', '\\t')
    if not decoded.strip():
        return '[SPACE]'
    return decoded
|
|
|
|
def tokenize_tamil(text):
    """Tokenize using Tamil BPE tokenizer and decode tokens to UTF-8.

    Args:
        text: raw input string from the Gradio textbox.

    Returns:
        Tuple of four strings for the Gradio outputs:
        (token breakdown, statistics markdown, token-ID list, decoded text).
    """
    if not text.strip():
        return "Please enter some text to tokenize.", "", "", ""

    encoding = tamil_tokenizer.encode(text)
    token_ids = encoding.ids

    char_count = len(text)
    token_count = len(token_ids)
    # Single guarded ratio. The original recomputed char_count/token_count
    # inside the stats f-string WITHOUT the zero guard, which could raise
    # ZeroDivisionError; chars/token and the compression ratio are the
    # same quantity here, so one guarded value serves both lines.
    compression = char_count / token_count if token_count > 0 else 0

    # Numbered per-token breakdown; join() instead of quadratic "+=".
    lines = [
        f'{i + 1}. "{decode_token_readable(tamil_tokenizer, token_id)}" (ID: {token_id})'
        for i, token_id in enumerate(token_ids)
    ]
    tokens_display = "\n".join(lines) + "\n" if lines else ""

    stats = f"""
📊 **Tokenization Statistics**

- **Characters**: {char_count}
- **Tokens**: {token_count}
- **Compression Ratio**: {compression:.2f}x
- **Average chars/token**: {compression:.2f}

🔧 **Tokenizer Info**
- Vocabulary Size: {tamil_summary['vocabulary_size']:,}
- Algorithm: {tamil_summary['algorithm']}
- Overall Compression: {tamil_summary['compression_ratio']:.2f}x

ℹ️ **Display Note**: Tokens shown using UTF-8 decoded format for readability
"""

    # Round-trip the IDs through the decoder to show the reconstructed text.
    decoded_full = tamil_tokenizer.decode(token_ids)

    return tokens_display, stats, str(token_ids), decoded_full
|
|
|
|
def tokenize_hybrid(text):
    """Tokenize using Hybrid Tamil+Stock BPE tokenizer and decode tokens to UTF-8.

    Args:
        text: raw input string (mixed Tamil / English stock-market text).

    Returns:
        Tuple of four strings for the Gradio outputs:
        (token breakdown, statistics markdown, token-ID list, decoded text).
    """
    if not text.strip():
        return "Please enter some text to tokenize.", "", "", ""

    encoding = hybrid_tokenizer.encode(text)
    token_ids = encoding.ids

    char_count = len(text)
    token_count = len(token_ids)
    # Single guarded ratio. The original recomputed char_count/token_count
    # inside the stats f-string WITHOUT the zero guard (ZeroDivisionError
    # risk); chars/token equals the compression ratio here.
    compression = char_count / token_count if token_count > 0 else 0

    # Decode each token once and reuse for both the breakdown and the
    # heuristic analysis (the original decoded every token twice).
    decoded_tokens = [decode_token_readable(hybrid_tokenizer, tid) for tid in token_ids]

    lines = [
        f'{i + 1}. "{tok}" (ID: {tid})'
        for i, (tok, tid) in enumerate(zip(decoded_tokens, token_ids))
    ]
    tokens_display = "\n".join(lines) + "\n" if lines else ""

    # Heuristic token classification, display-only.
    # Tamil Unicode block is U+0B80..U+0BFF; the original `ord(c) > 2944`
    # excluded U+0B80 (2944) from the range.
    tamil_like = sum(
        1 for t in decoded_tokens
        if any(0x0B80 <= ord(c) <= 0x0BFF for c in t)
    )
    stock_keywords = ['$', 'stock', 'market', 'price', '%', 'surge', 'fall', 'rise', 'buy', 'sell']
    stock_like = sum(1 for t in decoded_tokens if any(kw.lower() in t.lower() for kw in stock_keywords))

    stats = f"""
📊 **Tokenization Statistics**

- **Characters**: {char_count}
- **Tokens**: {token_count}
- **Compression Ratio**: {compression:.2f}x
- **Average chars/token**: {compression:.2f}

🔍 **Token Analysis (Approximate)**
- Tamil-like tokens: {tamil_like}
- Stock-like tokens: {stock_like}
- Other tokens: {token_count - tamil_like - stock_like}

🔧 **Tokenizer Info**
- Total Vocabulary: {hybrid_summary['vocabulary_size']:,}
- Tamil Vocab: {hybrid_summary['tamil_vocab_count']:,} ({hybrid_summary['tamil_vocab_percentage']:.1f}%)
- Stock Vocab: {hybrid_summary['stock_vocab_count']:,} ({hybrid_summary['stock_vocab_percentage']:.1f}%)
- Overall Compression: {hybrid_summary['compression_ratio']:.2f}x

ℹ️ **Display Note**: Tokens shown using UTF-8 decoded format for readability
"""

    # Round-trip the IDs through the decoder to show the reconstructed text.
    decoded_full = hybrid_tokenizer.decode(token_ids)

    return tokens_display, stats, str(token_ids), decoded_full
|
|
|
|
| |
# Sample Tamil-only inputs surfaced via gr.Examples in the Tamil tab.
tamil_examples = [
    ["தமிழ் மொழி இந்தியாவின் பழமையான மொழிகளில் ஒன்று"],
    ["கணினி அறிவியல் மற்றும் தொழில்நுட்பம் வளர்ந்து வருகிறது"],
    ["செயற்கை நுண்ணறிவு என்பது மிகவும் சுவாரஸ்யமான துறை"],
]


# Mixed Tamil + English stock-market samples for the hybrid tokenizer tab.
hybrid_examples = [
    ["ரிலையன்ஸ் பங்கு $Reliance rose to 2480 +1.2% இன்று"],
    ["$Apple stock surged to 175.50 ஆப்பிள் பங்கு +3.7% on strong revenue"],
    ["TCS stock surged to 3250 டிசிஎஸ் நிறுவனம் வர்த்தகம் 15L பங்குகள்"],
    ["இன்று சந்தையில் $Infosys rose +2.5% $HDFC fell -1.8%"],
    ["பங்கு சந்தை Apple stock opened 172.30 closed 175.50 buy வாங்கலாம்"],
]
|
|
| |
# Custom CSS injected into the Gradio page: restyles the primary buttons and
# bold/tab text from the theme's default accent colour to a teal palette.
custom_css = """
.teal-button {
    background: linear-gradient(to right, #14b8a6, #0d9488) !important;
    border: none !important;
}
.teal-button:hover {
    background: linear-gradient(to right, #0d9488, #0f766e) !important;
}
/* Change all bold text from purple/violet to teal */
strong, b {
    color: #0d9488 !important;
}
/* Change markdown bold text to teal */
.markdown-text strong {
    color: #0d9488 !important;
}
/* Change any purple/violet text to teal */
.prose strong {
    color: #0d9488 !important;
}
/* Tab labels */
.tabs button.selected {
    color: #0d9488 !important;
    border-bottom-color: #0d9488 !important;
}
"""
|
|
# ---------------------------------------------------------------------------
# Gradio UI: two tabs (Tamil-only tokenizer, hybrid Tamil+Stock tokenizer).
# Each tab has an input textbox, a "Tokenize" button, example inputs, and
# outputs for the token breakdown, statistics, raw IDs and decoded text.
# Event wiring happens at the bottom, inside the Blocks context.
# ---------------------------------------------------------------------------
with gr.Blocks(title="Tamil & Hybrid BPE Tokenizer Demo", theme=gr.themes.Soft(), css=custom_css) as demo:
    gr.Markdown("""
    # 🔤 Tamil & Hybrid BPE Tokenizer Demo

    Test two Byte Pair Encoding (BPE) tokenizers:
    1. **Tamil Tokenizer**: Specialized for Tamil language text
    2. **Hybrid Tokenizer**: Handles both Tamil language and Stock market terminology

    ---
    """)

    with gr.Tabs():
        # ---- Tab 1: Tamil-only tokenizer ----
        with gr.TabItem("🇮🇳 Tamil Tokenizer"):
            gr.Markdown("""
            ### Tamil Language BPE Tokenizer

            - **Vocabulary**: 8,000 tokens
            - **Dataset**: 50,000 Tamil Wikipedia articles
            - **Compression**: 4.67x average
            - **Display**: UTF-8 decoded tokens for readability
            """)

            with gr.Row():
                # Left column: input, action button and canned examples.
                with gr.Column():
                    tamil_input = gr.Textbox(
                        label="Input Text (Tamil)",
                        placeholder="Enter Tamil text here...",
                        lines=5
                    )
                    tamil_button = gr.Button("Tokenize", variant="primary", elem_classes="teal-button")
                    gr.Examples(
                        examples=tamil_examples,
                        inputs=tamil_input,
                        label="Example Tamil Texts"
                    )

                # Right column: tokenization results.
                with gr.Column():
                    tamil_tokens_output = gr.Textbox(
                        label="Token Breakdown",
                        lines=10,
                        max_lines=20
                    )
                    tamil_stats_output = gr.Markdown(label="Statistics")

            # Collapsed by default: raw token IDs and round-trip decode.
            with gr.Accordion("Advanced Output", open=False):
                with gr.Row():
                    tamil_ids_output = gr.Textbox(label="Token IDs", lines=2)
                    tamil_decoded_output = gr.Textbox(label="Decoded Text", lines=2)

        # ---- Tab 2: hybrid Tamil + stock-market tokenizer ----
        with gr.TabItem("📈 Hybrid Tokenizer (Tamil + Stock)"):
            gr.Markdown("""
            ### Hybrid Tamil + Stock Market BPE Tokenizer

            - **Vocabulary**: 40,000 tokens
            - **Dataset**: 30,000 documents (Tamil + Financial news)
            - **Tamil**: 35,991 tokens (89.98%), 5.12x compression
            - **Stock**: 5,572 tokens (13.93%), 4.90x compression
            - **Display**: UTF-8 decoded tokens for readability
            """)

            with gr.Row():
                # Left column: input, action button and canned examples.
                with gr.Column():
                    hybrid_input = gr.Textbox(
                        label="Input Text (Tamil + Stock/English)",
                        placeholder="Enter mixed Tamil and stock market text...",
                        lines=5
                    )
                    hybrid_button = gr.Button("Tokenize", variant="primary", elem_classes="teal-button")
                    gr.Examples(
                        examples=hybrid_examples,
                        inputs=hybrid_input,
                        label="Example Hybrid Texts"
                    )

                # Right column: tokenization results.
                with gr.Column():
                    hybrid_tokens_output = gr.Textbox(
                        label="Token Breakdown",
                        lines=10,
                        max_lines=20
                    )
                    hybrid_stats_output = gr.Markdown(label="Statistics")

            # Collapsed by default: raw token IDs and round-trip decode.
            with gr.Accordion("Advanced Output", open=False):
                with gr.Row():
                    hybrid_ids_output = gr.Textbox(label="Token IDs", lines=2)
                    hybrid_decoded_output = gr.Textbox(label="Decoded Text", lines=2)

    # Static reference section shared by both tabs.
    with gr.Accordion("ℹ️ About These Tokenizers", open=False):
        gr.Markdown("""
        ## Technical Details

        ### Tamil Tokenizer
        - **Vocabulary**: 8,000 tokens
        - **Algorithm**: Byte Pair Encoding (BPE) with ByteLevel encoding
        - **Dataset**: 50,000 Tamil Wikipedia articles
        - **Compression**: 4.67x average

        ### Hybrid Tokenizer
        - **Vocabulary**: 40,000 tokens (35,991 Tamil + 5,572 Stock)
        - **Algorithm**: Byte Pair Encoding (BPE) with ByteLevel encoding
        - **Dataset**: 30,000 documents (10% Tamil Wikipedia + 90% Financial news)
        - **Compression**: 5.78x overall

        ### Token Display
        - **ByteLevel Encoding**: Tokens are encoded at byte level for efficiency
        - **Token Decoding**: Each token is decoded using UTF-8 encoding
        - **Note**: Due to normalization, some Tamil vowel marks may be altered

        ### Real-World Applications
        - Tamil language NLP
        - Tamil financial news processing
        - Bilingual trading platforms
        - Stock market sentiment analysis in Tamil

        ---

        **Created for NLP coursework** | **License**: MIT
        """)

    # Wire each Tokenize button to its handler; outputs map positionally to
    # the 4-tuple returned by tokenize_tamil / tokenize_hybrid.
    tamil_button.click(
        fn=tokenize_tamil,
        inputs=tamil_input,
        outputs=[tamil_tokens_output, tamil_stats_output, tamil_ids_output, tamil_decoded_output]
    )

    hybrid_button.click(
        fn=tokenize_hybrid,
        inputs=hybrid_input,
        outputs=[hybrid_tokens_output, hybrid_stats_output, hybrid_ids_output, hybrid_decoded_output]
    )
|
|
| |
# Start the Gradio server only when executed as a script (not on import).
if __name__ == "__main__":
    demo.launch()
|
|
|
|