Spaces:

malarsaravanan
/

indic_language_stock_tokenizer

Sleeping

App Files Files Community

malarsaravanan committed on Nov 8, 2025

Commit

765ce6a

verified ·

1 Parent(s): 3a7d4d6

Upload 6 files

Browse files

Files changed (6) hide show

app.py +304 -0
hybrid_tamil_stock_tokenizer.json +0 -0
hybrid_tokenizer_summary.json +32 -0
requirements.txt +25 -0
tamil_bpe_tokenizer.json +0 -0
tokenizer_summary.json +10 -0

app.py ADDED Viewed

	@@ -0,0 +1,304 @@

+import json
+from pathlib import Path
+import gradio as gr
+from tokenizers import Tokenizer
+# Load tokenizers
+tamil_tokenizer = Tokenizer.from_file("tamil_bpe_tokenizer/tamil_bpe_tokenizer.json")
+hybrid_tokenizer = Tokenizer.from_file("hybrid_tamil_stock_tokenizer/hybrid_tamil_stock_tokenizer.json")
+# Load summaries
+with open('tamil_bpe_tokenizer/tokenizer_summary.json', 'r') as f:
+    tamil_summary = json.load(f)
+with open('hybrid_tamil_stock_tokenizer/hybrid_tokenizer_summary.json', 'r') as f:
+    hybrid_summary = json.load(f)
+def decode_token_readable(tokenizer, token_id):
+    """Decode a single token ID to readable text."""
+    decoded = tokenizer.decode([token_id], skip_special_tokens=False)
+    # Clean up for display
+    if not decoded.strip():
+        return '[SPACE]'
+    return decoded.replace('\n', '\\n').replace('\t', '\\t')
+def tokenize_tamil(text):
+    """Tokenize using Tamil BPE tokenizer and decode tokens to UTF-8."""
+    if not text.strip():
+        return "Please enter some text to tokenize.", "", "", ""
+    encoding = tamil_tokenizer.encode(text)
+    tokens = encoding.tokens
+    token_ids = encoding.ids
+    # Calculate stats
+    char_count = len(text)
+    token_count = len(tokens)
+    compression = char_count / token_count if token_count > 0 else 0
+    # Decode each token for readable display
+    tokens_display = ""
+    for i, token_id in enumerate(token_ids):
+        readable_token = decode_token_readable(tamil_tokenizer, token_id)
+        tokens_display += f"{i+1}. \"{readable_token}\" (ID: {token_id})\n"
+    stats = f"""
+📊 **Tokenization Statistics**
+- **Characters**: {char_count}
+- **Tokens**: {token_count}
+- **Compression Ratio**: {compression:.2f}x
+- **Average chars/token**: {char_count/token_count:.2f}
+🔧 **Tokenizer Info**
+- Vocabulary Size: {tamil_summary['vocabulary_size']:,}
+- Algorithm: {tamil_summary['algorithm']}
+- Overall Compression: {tamil_summary['compression_ratio']:.2f}x
+ℹ️ **Display Note**: Tokens shown using UTF-8 decoded format for readability
+"""
+    # Full decoded text verification
+    decoded_full = tamil_tokenizer.decode(token_ids)
+    return tokens_display, stats, str(token_ids), decoded_full
+def tokenize_hybrid(text):
+    """Tokenize using Hybrid Tamil+Stock BPE tokenizer and decode tokens to UTF-8."""
+    if not text.strip():
+        return "Please enter some text to tokenize.", "", "", ""
+    encoding = hybrid_tokenizer.encode(text)
+    tokens = encoding.tokens
+    token_ids = encoding.ids
+    # Calculate stats
+    char_count = len(text)
+    token_count = len(tokens)
+    compression = char_count / token_count if token_count > 0 else 0
+    # Decode each token for readable display
+    tokens_display = ""
+    for i, token_id in enumerate(token_ids):
+        readable_token = decode_token_readable(hybrid_tokenizer, token_id)
+        tokens_display += f"{i+1}. \"{readable_token}\" (ID: {token_id})\n"
+    # Categorize tokens (approximate)
+    decoded_tokens = [decode_token_readable(hybrid_tokenizer, tid) for tid in token_ids]
+    tamil_like = sum(1 for t in decoded_tokens if any(ord(c) > 2944 and ord(c) < 3072 for c in t))
+    stock_keywords = ['$', 'stock', 'market', 'price', '%', 'surge', 'fall', 'rise', 'buy', 'sell']
+    stock_like = sum(1 for t in decoded_tokens if any(kw.lower() in t.lower() for kw in stock_keywords))
+    stats = f"""
+📊 **Tokenization Statistics**
+- **Characters**: {char_count}
+- **Tokens**: {token_count}
+- **Compression Ratio**: {compression:.2f}x
+- **Average chars/token**: {char_count/token_count:.2f}
+🔍 **Token Analysis (Approximate)**
+- Tamil-like tokens: {tamil_like}
+- Stock-like tokens: {stock_like}
+- Other tokens: {token_count - tamil_like - stock_like}
+🔧 **Tokenizer Info**
+- Total Vocabulary: {hybrid_summary['vocabulary_size']:,}
+- Tamil Vocab: {hybrid_summary['tamil_vocab_count']:,} ({hybrid_summary['tamil_vocab_percentage']:.1f}%)
+- Stock Vocab: {hybrid_summary['stock_vocab_count']:,} ({hybrid_summary['stock_vocab_percentage']:.1f}%)
+- Overall Compression: {hybrid_summary['compression_ratio']:.2f}x
+ℹ️ **Display Note**: Tokens shown using UTF-8 decoded format for readability
+"""
+    # Full decoded text verification
+    decoded_full = hybrid_tokenizer.decode(token_ids)
+    return tokens_display, stats, str(token_ids), decoded_full
+# Tamil examples
+tamil_examples = [
+    ["தமிழ் மொழி இந்தியாவின் பழமையான மொழிகளில் ஒன்று"],
+    ["கணினி அறிவியல் மற்றும் தொழில்நுட்பம் வளர்ந்து வருகிறது"],
+    ["செயற்கை நுண்ணறிவு என்பது மிகவும் சுவாரஸ்யமான துறை"],
+]
+# Hybrid examples
+hybrid_examples = [
+    ["ரிலையன்ஸ் பங்கு $Reliance rose to 2480 +1.2% இன்���ு"],
+    ["$Apple stock surged to 175.50 ஆப்பிள் பங்கு +3.7% on strong revenue"],
+    ["TCS stock surged to 3250 டிசிஎஸ் நிறுவனம் வர்த்தகம் 15L பங்குகள்"],
+    ["இன்று சந்தையில் $Infosys rose +2.5% $HDFC fell -1.8%"],
+    ["பங்கு சந்தை Apple stock opened 172.30 closed 175.50 buy வாங்கலாம்"],
+]
+# Custom CSS handed to gr.Blocks(css=...): teal gradient for elements with class "teal-button", teal bold text, teal selected-tab label
+custom_css = """
+.teal-button {
+    background: linear-gradient(to right, #14b8a6, #0d9488) !important;
+    border: none !important;
+}
+.teal-button:hover {
+    background: linear-gradient(to right, #0d9488, #0f766e) !important;
+}
+/* Change all bold text from purple/violet to teal */
+strong, b {
+    color: #0d9488 !important;
+}
+/* Change markdown bold text to teal */
+.markdown-text strong {
+    color: #0d9488 !important;
+}
+/* Change any purple/violet text to teal */
+.prose strong {
+    color: #0d9488 !important;
+}
+/* Tab labels */
+.tabs button.selected {
+    color: #0d9488 !important;
+    border-bottom-color: #0d9488 !important;
+}
+"""  # NOTE(review): every declaration is !important, presumably to win over the Soft theme's own styles; confirm
+with gr.Blocks(title="Tamil & Hybrid BPE Tokenizer Demo", theme=gr.themes.Soft(), css=custom_css) as demo:  # `demo` is launched by the __main__ guard at the end of the file
+    gr.Markdown("""
+    # 🔤 Tamil & Hybrid BPE Tokenizer Demo
+    Test two Byte Pair Encoding (BPE) tokenizers:
+    1. **Tamil Tokenizer**: Specialized for Tamil language text
+    2. **Hybrid Tokenizer**: Handles both Tamil language and Stock market terminology
+    ---
+    """)
+    with gr.Tabs():  # one tab per tokenizer
+        # Tamil tokenizer tab: input + examples on the left, token listing + statistics on the right
+        with gr.TabItem("🇮🇳 Tamil Tokenizer"):  # NOTE: the figures in the Markdown below are hard-coded text, not read from tamil_summary
+            gr.Markdown("""
+            ### Tamil Language BPE Tokenizer
+            - **Vocabulary**: 8,000 tokens
+            - **Dataset**: 50,000 Tamil Wikipedia articles
+            - **Compression**: 4.67x average
+            - **Display**: UTF-8 decoded tokens for readability
+            """)
+            with gr.Row():
+                with gr.Column():  # input side
+                    tamil_input = gr.Textbox(
+                        label="Input Text (Tamil)",
+                        placeholder="Enter Tamil text here...",
+                        lines=5
+                    )
+                    tamil_button = gr.Button("Tokenize", variant="primary", elem_classes="teal-button")  # "teal-button" is styled in custom_css
+                    gr.Examples(
+                        examples=tamil_examples,
+                        inputs=tamil_input,
+                        label="Example Tamil Texts"
+                    )
+                with gr.Column():  # result side
+                    tamil_tokens_output = gr.Textbox(
+                        label="Token Breakdown",
+                        lines=10,
+                        max_lines=20
+                    )
+                    tamil_stats_output = gr.Markdown(label="Statistics")
+            with gr.Accordion("Advanced Output", open=False):  # collapsed by default
+                with gr.Row():
+                    tamil_ids_output = gr.Textbox(label="Token IDs", lines=2)
+                    tamil_decoded_output = gr.Textbox(label="Decoded Text", lines=2)
+        # Hybrid tokenizer tab: same layout as the Tamil tab, wired to the hybrid tokenizer
+        with gr.TabItem("📈 Hybrid Tokenizer (Tamil + Stock)"):  # NOTE(review): hard-coded figures below; Tamil 89.98% + Stock 13.93% exceed 100% - confirm whether the two vocab sets overlap
+            gr.Markdown("""
+            ### Hybrid Tamil + Stock Market BPE Tokenizer
+            - **Vocabulary**: 40,000 tokens
+            - **Dataset**: 30,000 documents (Tamil + Financial news)
+            - **Tamil**: 35,991 tokens (89.98%), 5.12x compression
+            - **Stock**: 5,572 tokens (13.93%), 4.90x compression
+            - **Display**: UTF-8 decoded tokens for readability
+            """)
+            with gr.Row():
+                with gr.Column():  # input side
+                    hybrid_input = gr.Textbox(
+                        label="Input Text (Tamil + Stock/English)",
+                        placeholder="Enter mixed Tamil and stock market text...",
+                        lines=5
+                    )
+                    hybrid_button = gr.Button("Tokenize", variant="primary", elem_classes="teal-button")  # "teal-button" is styled in custom_css
+                    gr.Examples(
+                        examples=hybrid_examples,
+                        inputs=hybrid_input,
+                        label="Example Hybrid Texts"
+                    )
+                with gr.Column():  # result side
+                    hybrid_tokens_output = gr.Textbox(
+                        label="Token Breakdown",
+                        lines=10,
+                        max_lines=20
+                    )
+                    hybrid_stats_output = gr.Markdown(label="Statistics")
+            with gr.Accordion("Advanced Output", open=False):  # collapsed by default
+                with gr.Row():
+                    hybrid_ids_output = gr.Textbox(label="Token IDs", lines=2)
+                    hybrid_decoded_output = gr.Textbox(label="Decoded Text", lines=2)
+    # About section: static, hard-coded description shown below the tabs (collapsed by default)
+    with gr.Accordion("ℹ️ About These Tokenizers", open=False):
+        gr.Markdown("""
+        ## Technical Details
+        ### Tamil Tokenizer
+        - **Vocabulary**: 8,000 tokens
+        - **Algorithm**: Byte Pair Encoding (BPE) with ByteLevel encoding
+        - **Dataset**: 50,000 Tamil Wikipedia articles
+        - **Compression**: 4.67x average
+        ### Hybrid Tokenizer
+        - **Vocabulary**: 40,000 tokens (35,991 Tamil + 5,572 Stock)
+        - **Algorithm**: Byte Pair Encoding (BPE) with ByteLevel encoding
+        - **Dataset**: 30,000 documents (10% Tamil Wikipedia + 90% Financial news)
+        - **Compression**: 5.78x overall
+        ### Token Display
+        - **ByteLevel Encoding**: Tokens are encoded at byte level for efficiency
+        - **Token Decoding**: Each token is decoded using UTF-8 encoding
+        - **Note**: Due to normalization, some Tamil vowel marks may be altered
+        ### Real-World Applications
+        - Tamil language NLP
+        - Tamil financial news processing
+        - Bilingual trading platforms
+        - Stock market sentiment analysis in Tamil
+        ---
+        **Created for NLP coursework** | **License**: MIT
+        """)
+    # Wire each "Tokenize" button to its handler; each handler returns a 4-tuple that fills the four outputs in the order listed
+    tamil_button.click(
+        fn=tokenize_tamil,
+        inputs=tamil_input,
+        outputs=[tamil_tokens_output, tamil_stats_output, tamil_ids_output, tamil_decoded_output]  # listing, stats, IDs, decoded text
+    )
+    hybrid_button.click(
+        fn=tokenize_hybrid,
+        inputs=hybrid_input,
+        outputs=[hybrid_tokens_output, hybrid_stats_output, hybrid_ids_output, hybrid_decoded_output]  # listing, stats, IDs, decoded text
+    )
+# Start the Gradio server only when this file is executed as a script, not when it is imported
+if __name__ == "__main__":
+    demo.launch()

hybrid_tamil_stock_tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

hybrid_tokenizer_summary.json ADDED Viewed

	@@ -0,0 +1,32 @@

+{
+  "type": "Hybrid Tokenizer",
+  "domains": [
+    "Tamil Language",
+    "Stock Market Data"
+  ],
+  "vocabulary_size": 40000,
+  "compression_ratio": 5.7752,
+  "tamil_compression": 5.12,
+  "tamil_vocab_count": 35991,
+  "tamil_vocab_percentage": 89.98,
+  "stock_compression": 4.9,
+  "stock_vocab_count": 5572,
+  "stock_vocab_percentage": 13.93,
+  "meets_vocab_requirement": true,
+  "meets_compression_requirement": true,
+  "meets_tamil_requirement": true,
+  "meets_stock_requirement": true,
+  "dataset_composition": {
+    "tamil": "10%",
+    "stock": "90%"
+  },
+  "total_training_documents": 30000,
+  "double_points_attempt": true,
+  "note": "Tamil uses byte-encoding (ByteLevel), Stock uses English vocabulary",
+  "unique_features": [
+    "5000+ stock vocabulary: 1000+ symbols, 4000+ trading/finance terms",
+    "Combines Tamil language with comprehensive stock market vocabulary",
+    "Real-world application: Tamil financial news + stock data",
+    "Tamil: 5.1x compression, Stock: 4.9x compression"
+  ]
+}

requirements.txt ADDED Viewed

	@@ -0,0 +1,25 @@

+# Core tokenizer dependencies
+tokenizers==0.22.1
+datasets==4.3.0
+huggingface-hub==1.0.1
+# Data processing
+numpy==2.3.4
+pandas==2.2.3
+# Visualization
+matplotlib==3.9.3
+# Progress bars
+tqdm==4.67.1
+# Jupyter notebook support
+jupyter==1.1.1
+ipywidgets==8.1.5
+notebook==7.3.2
+# Web app
+gradio==4.44.0
+# Additional utilities
+requests==2.32.3

tamil_bpe_tokenizer.json ADDED Viewed

The diff for this file is too large to render. See raw diff

tokenizer_summary.json ADDED Viewed

	@@ -0,0 +1,10 @@

+{
+  "language": "Tamil",
+  "algorithm": "BPE",
+  "vocabulary_size": 8000,
+  "compression_ratio": 4.6671,
+  "meets_vocab_requirement": true,
+  "meets_compression_requirement": true,
+  "dataset_size": 50000,
+  "dataset_source": "HuggingFace (Real Tamil Data)"
+}