# app.py — uploaded by malarsaravanan ("Upload app.py", commit c7ca2eb, verified)
import gradio as gr
from tokenizers import Tokenizer
import json
# Load both tokenizers from their serialized JSON files.
tamil_tokenizer = Tokenizer.from_file("tamil_bpe_tokenizer.json")
hybrid_tokenizer = Tokenizer.from_file("hybrid_tamil_stock_tokenizer.json")

# Load the summary dictionaries that the statistics reports read from.
# The encoding is given explicitly so reading the JSON does not depend on the
# platform's default locale encoding.
with open('tokenizer_summary.json', 'r', encoding='utf-8') as f:
    tamil_summary = json.load(f)
with open('hybrid_tokenizer_summary.json', 'r', encoding='utf-8') as f:
    hybrid_summary = json.load(f)
def decode_token_readable(tokenizer, token_id):
    """Decode a single token ID into a display-friendly string.

    Args:
        tokenizer: Object exposing ``decode(ids, skip_special_tokens=...)``.
        token_id: The token ID to decode.

    Returns:
        The decoded text with newlines and tabs replaced by the literal
        two-character sequences ``\\n`` and ``\\t``, or ``'[SPACE]'`` when
        nothing visible remains (empty text or other whitespace only).
    """
    decoded = tokenizer.decode([token_id], skip_special_tokens=False)
    # Escape before the blank check: a token that consists only of a newline
    # or tab must be shown as its escape sequence instead of being folded
    # into the generic '[SPACE]' placeholder.
    escaped = decoded.replace('\n', '\\n').replace('\t', '\\t')
    if not escaped.strip():
        return '[SPACE]'
    return escaped
def tokenize_tamil(text):
    """Tokenize ``text`` with the Tamil BPE tokenizer and format the results.

    Args:
        text: Raw input text to tokenize.

    Returns:
        A 4-tuple ``(tokens_display, stats, token_ids, decoded_text)``:
        a numbered listing of the decoded tokens with their IDs, a Markdown
        statistics report, ``str()`` of the token ID list, and the text
        obtained by decoding all IDs together. For missing, empty, or
        whitespace-only input the tuple is a prompt message followed by
        three empty strings.
    """
    if not text or not text.strip():
        return "Please enter some text to tokenize.", "", "", ""

    encoding = tamil_tokenizer.encode(text)
    token_ids = encoding.ids

    # Calculate stats
    char_count = len(text)
    token_count = len(token_ids)
    # Characters per token. Guarded so an encoding that yields no tokens
    # reports 0 instead of raising ZeroDivisionError; the same value is used
    # for both ratio lines of the report.
    compression = char_count / token_count if token_count > 0 else 0

    # Decode each token for readable display
    listing = []
    for position, token_id in enumerate(token_ids, start=1):
        readable_token = decode_token_readable(tamil_tokenizer, token_id)
        listing.append(f"{position}. \"{readable_token}\" (ID: {token_id})\n")
    tokens_display = "".join(listing)

    # The template is kept flush-left so that no leading whitespace ends up
    # in the emitted Markdown.
    stats = f"""
📊 **Tokenization Statistics**
- **Characters**: {char_count}
- **Tokens**: {token_count}
- **Compression Ratio**: {compression:.2f}x
- **Average chars/token**: {compression:.2f}
🔧 **Tokenizer Info**
- Vocabulary Size: {tamil_summary['vocabulary_size']:,}
- Algorithm: {tamil_summary['algorithm']}
- Overall Compression: {tamil_summary['compression_ratio']:.2f}x
ℹ️ **Display Note**: Tokens shown using UTF-8 decoded format for readability
"""

    # Full decoded text verification
    decoded_full = tamil_tokenizer.decode(token_ids)
    return tokens_display, stats, str(token_ids), decoded_full
def tokenize_hybrid(text):
    """Tokenize ``text`` with the hybrid Tamil+Stock BPE tokenizer.

    Args:
        text: Raw input text to tokenize.

    Returns:
        A 4-tuple ``(tokens_display, stats, token_ids, decoded_text)``:
        a numbered listing of the decoded tokens with their IDs, a Markdown
        statistics report (including an approximate Tamil/stock/other token
        breakdown), ``str()`` of the token ID list, and the text obtained by
        decoding all IDs together. For missing, empty, or whitespace-only
        input the tuple is a prompt message followed by three empty strings.
    """
    if not text or not text.strip():
        return "Please enter some text to tokenize.", "", "", ""

    encoding = hybrid_tokenizer.encode(text)
    token_ids = encoding.ids

    # Calculate stats
    char_count = len(text)
    token_count = len(token_ids)
    # Characters per token. Guarded so an encoding that yields no tokens
    # reports 0 instead of raising ZeroDivisionError; the same value is used
    # for both ratio lines of the report.
    compression = char_count / token_count if token_count > 0 else 0

    # Decode every token once; the same strings feed both the listing and
    # the category counts.
    decoded_tokens = [decode_token_readable(hybrid_tokenizer, tid) for tid in token_ids]
    tokens_display = "".join(
        f"{position}. \"{readable_token}\" (ID: {token_id})\n"
        for position, (token_id, readable_token)
        in enumerate(zip(token_ids, decoded_tokens), start=1)
    )

    # Categorize tokens (approximate). A token may match both categories, so
    # "other" is counted directly as "matches neither" rather than derived by
    # subtraction, which could undercount or go negative.
    stock_keywords = ('$', 'stock', 'market', 'price', '%', 'surge', 'fall', 'rise', 'buy', 'sell')
    tamil_like = 0
    stock_like = 0
    other_count = 0
    for token in decoded_tokens:
        # Unicode Tamil block: U+0B80..U+0BFF inclusive.
        is_tamil = any(0x0B80 <= ord(ch) <= 0x0BFF for ch in token)
        lowered = token.lower()
        is_stock = any(keyword in lowered for keyword in stock_keywords)
        if is_tamil:
            tamil_like += 1
        if is_stock:
            stock_like += 1
        if not (is_tamil or is_stock):
            other_count += 1

    # The template is kept flush-left so that no leading whitespace ends up
    # in the emitted Markdown.
    stats = f"""
📊 **Tokenization Statistics**
- **Characters**: {char_count}
- **Tokens**: {token_count}
- **Compression Ratio**: {compression:.2f}x
- **Average chars/token**: {compression:.2f}
🔍 **Token Analysis (Approximate)**
- Tamil-like tokens: {tamil_like}
- Stock-like tokens: {stock_like}
- Other tokens: {other_count}
🔧 **Tokenizer Info**
- Total Vocabulary: {hybrid_summary['vocabulary_size']:,}
- Tamil Vocab: {hybrid_summary['tamil_vocab_count']:,} ({hybrid_summary['tamil_vocab_percentage']:.1f}%)
- Stock Vocab: {hybrid_summary['stock_vocab_count']:,} ({hybrid_summary['stock_vocab_percentage']:.1f}%)
- Overall Compression: {hybrid_summary['compression_ratio']:.2f}x
ℹ️ **Display Note**: Tokens shown using UTF-8 decoded format for readability
"""

    # Full decoded text verification
    decoded_full = hybrid_tokenizer.decode(token_ids)
    return tokens_display, stats, str(token_ids), decoded_full
# Sample inputs for the Tamil tab. Each sentence is wrapped in its own
# one-element list.
tamil_examples = [
    [sentence]
    for sentence in (
        "தமிழ் மொழி இந்தியாவின் பழமையான மொழிகளில் ஒன்று",
        "கணினி அறிவியல் மற்றும் தொழில்நுட்பம் வளர்ந்து வருகிறது",
        "செயற்கை நுண்ணறிவு என்பது மிகவும் சுவாரஸ்யமான துறை",
    )
]

# Sample inputs for the hybrid tab, mixing Tamil text with stock-market
# phrases; wrapped the same way, one sentence per one-element list.
hybrid_examples = [
    [sentence]
    for sentence in (
        "ரிலையன்ஸ் பங்கு $Reliance rose to 2480 +1.2% இன்று",
        "$Apple stock surged to 175.50 ஆப்பிள் பங்கு +3.7% on strong revenue",
        "TCS stock surged to 3250 டிசிஎஸ் நிறுவனம் வர்த்தகம் 15L பங்குகள்",
        "இன்று சந்தையில் $Infosys rose +2.5% $HDFC fell -1.8%",
        "பங்கு சந்தை Apple stock opened 172.30 closed 175.50 buy வாங்கலாம்",
    )
]
# Custom CSS handed to gr.Blocks(css=...) to give the interface a teal theme.
# - .teal-button is the class attached to the "Tokenize" buttons through
#   elem_classes; it sets their gradient and hover gradient.
# - The strong/b, .markdown-text strong and .prose strong rules all force the
#   same teal color onto bold text.
# - .tabs button.selected colors the selected tab label and its underline.
# Every declaration uses !important.
# NOTE(review): presumably needed to win over the theme's own rules — confirm.
custom_css = """
.teal-button {
background: linear-gradient(to right, #14b8a6, #0d9488) !important;
border: none !important;
}
.teal-button:hover {
background: linear-gradient(to right, #0d9488, #0f766e) !important;
}
/* Change all bold text from purple/violet to teal */
strong, b {
color: #0d9488 !important;
}
/* Change markdown bold text to teal */
.markdown-text strong {
color: #0d9488 !important;
}
/* Change any purple/violet text to teal */
.prose strong {
color: #0d9488 !important;
}
/* Tab labels */
.tabs button.selected {
color: #0d9488 !important;
border-bottom-color: #0d9488 !important;
}
"""
# Assemble the demo UI: a header, one tab per tokenizer, an "About" panel,
# and the click handlers that connect each button to its tokenize function.
# NOTE(review): the container nesting below was reconstructed from statement
# order — confirm the Row/Column/Accordion placement against the running app.
# NOTE(review): the figures inside the Markdown texts are hard-coded and are
# not read from tamil_summary / hybrid_summary — verify they still match.
with gr.Blocks(title="Tamil & Hybrid BPE Tokenizer Demo", theme=gr.themes.Soft(), css=custom_css) as demo:
    # Page header
    gr.Markdown("""
# 🔤 Tamil & Hybrid BPE Tokenizer Demo
Test two Byte Pair Encoding (BPE) tokenizers:
1. **Tamil Tokenizer**: Specialized for Tamil language text
2. **Hybrid Tokenizer**: Handles both Tamil language and Stock market terminology
---
""")
    with gr.Tabs():
        # Tamil Tokenizer Tab
        with gr.TabItem("🇮🇳 Tamil Tokenizer"):
            gr.Markdown("""
### Tamil Language BPE Tokenizer
- **Vocabulary**: 8,000 tokens
- **Dataset**: 50,000 Tamil Wikipedia articles
- **Compression**: 4.67x average
- **Display**: UTF-8 decoded tokens for readability
""")
            with gr.Row():
                # Left column: input text, trigger button, and examples
                with gr.Column():
                    tamil_input = gr.Textbox(
                        label="Input Text (Tamil)",
                        placeholder="Enter Tamil text here...",
                        lines=5
                    )
                    tamil_button = gr.Button("Tokenize", variant="primary", elem_classes="teal-button")
                    gr.Examples(
                        examples=tamil_examples,
                        inputs=tamil_input,
                        label="Example Tamil Texts"
                    )
                # Right column: token listing and statistics report
                with gr.Column():
                    tamil_tokens_output = gr.Textbox(
                        label="Token Breakdown",
                        lines=10,
                        max_lines=20
                    )
                    tamil_stats_output = gr.Markdown(label="Statistics")
            # Collapsed by default: raw token IDs and the round-trip text
            with gr.Accordion("Advanced Output", open=False):
                with gr.Row():
                    tamil_ids_output = gr.Textbox(label="Token IDs", lines=2)
                    tamil_decoded_output = gr.Textbox(label="Decoded Text", lines=2)
        # Hybrid Tokenizer Tab
        with gr.TabItem("📈 Hybrid Tokenizer (Tamil + Stock)"):
            gr.Markdown("""
### Hybrid Tamil + Stock Market BPE Tokenizer
- **Vocabulary**: 40,000 tokens
- **Dataset**: 30,000 documents (Tamil + Financial news)
- **Tamil**: 35,991 tokens (89.98%), 5.12x compression
- **Stock**: 5,572 tokens (13.93%), 4.90x compression
- **Display**: UTF-8 decoded tokens for readability
""")
            with gr.Row():
                # Left column: input text, trigger button, and examples
                with gr.Column():
                    hybrid_input = gr.Textbox(
                        label="Input Text (Tamil + Stock/English)",
                        placeholder="Enter mixed Tamil and stock market text...",
                        lines=5
                    )
                    hybrid_button = gr.Button("Tokenize", variant="primary", elem_classes="teal-button")
                    gr.Examples(
                        examples=hybrid_examples,
                        inputs=hybrid_input,
                        label="Example Hybrid Texts"
                    )
                # Right column: token listing and statistics report
                with gr.Column():
                    hybrid_tokens_output = gr.Textbox(
                        label="Token Breakdown",
                        lines=10,
                        max_lines=20
                    )
                    hybrid_stats_output = gr.Markdown(label="Statistics")
            # Collapsed by default: raw token IDs and the round-trip text
            with gr.Accordion("Advanced Output", open=False):
                with gr.Row():
                    hybrid_ids_output = gr.Textbox(label="Token IDs", lines=2)
                    hybrid_decoded_output = gr.Textbox(label="Decoded Text", lines=2)
    # About section
    with gr.Accordion("ℹ️ About These Tokenizers", open=False):
        gr.Markdown("""
## Technical Details
### Tamil Tokenizer
- **Vocabulary**: 8,000 tokens
- **Algorithm**: Byte Pair Encoding (BPE) with ByteLevel encoding
- **Dataset**: 50,000 Tamil Wikipedia articles
- **Compression**: 4.67x average
### Hybrid Tokenizer
- **Vocabulary**: 40,000 tokens (35,991 Tamil + 5,572 Stock)
- **Algorithm**: Byte Pair Encoding (BPE) with ByteLevel encoding
- **Dataset**: 30,000 documents (10% Tamil Wikipedia + 90% Financial news)
- **Compression**: 5.78x overall
### Token Display
- **ByteLevel Encoding**: Tokens are encoded at byte level for efficiency
- **Token Decoding**: Each token is decoded using UTF-8 encoding
- **Note**: Due to normalization, some Tamil vowel marks may be altered
### Real-World Applications
- Tamil language NLP
- Tamil financial news processing
- Bilingual trading platforms
- Stock market sentiment analysis in Tamil
---
**Created for NLP coursework** | **License**: MIT
""")
    # Connect buttons: each click passes its tab's input text to the matching
    # tokenize function and routes the four returned values to that tab's
    # four output components, in order.
    tamil_button.click(
        fn=tokenize_tamil,
        inputs=tamil_input,
        outputs=[tamil_tokens_output, tamil_stats_output, tamil_ids_output, tamil_decoded_output]
    )
    hybrid_button.click(
        fn=tokenize_hybrid,
        inputs=hybrid_input,
        outputs=[hybrid_tokens_output, hybrid_stats_output, hybrid_ids_output, hybrid_decoded_output]
    )
# Launch the app only when this file is executed as a script, so importing
# the module does not call demo.launch().
if __name__ == "__main__":
    demo.launch()