Spaces:

malarsaravanan
/

indic_language_stock_tokenizer

Sleeping

App Files Files Community

indic_language_stock_tokenizer / app.py

malarsaravanan

Upload app.py

c7ca2eb verified 6 months ago

raw

history blame contribute delete

11.5 kB

	import gradio as gr
	from tokenizers import Tokenizer
	import json

	# Load the two serialized tokenizers. The paths are relative, so they are
	# resolved against the process's current working directory.
	tamil_tokenizer = Tokenizer.from_file("tamil_bpe_tokenizer.json")
	hybrid_tokenizer = Tokenizer.from_file("hybrid_tamil_stock_tokenizer.json")

	# Load the summary metadata that the statistics text reads from.
	# JSON is UTF-8 by specification; pass the encoding explicitly so parsing
	# does not depend on the platform's default locale encoding.
	with open('tokenizer_summary.json', 'r', encoding='utf-8') as f:
	    tamil_summary = json.load(f)

	with open('hybrid_tokenizer_summary.json', 'r', encoding='utf-8') as f:
	    hybrid_summary = json.load(f)


	def decode_token_readable(tokenizer, token_id):
	    """Decode a single token ID into a display-friendly string.

	    Args:
	        tokenizer: Object exposing ``decode(ids, skip_special_tokens=...)``.
	        token_id: ID of the token to decode.

	    Returns:
	        The decoded text with newline and tab characters replaced by the
	        two-character escapes ``\\n`` and ``\\t``, or ``'[SPACE]'`` when the
	        result is empty or still consists only of whitespace.
	    """
	    decoded = tokenizer.decode([token_id], skip_special_tokens=False)
	    # Escape before the blank check: otherwise a newline-only or tab-only
	    # token is reported as '[SPACE]' and the escaping never applies to it.
	    escaped = decoded.replace('\n', '\\n').replace('\t', '\\t')
	    if not escaped.strip():
	        return '[SPACE]'
	    return escaped


	def tokenize_tamil(text):
	    """Tokenize text with the Tamil BPE tokenizer and build the display outputs.

	    Args:
	        text: Input text from the UI. ``None`` is treated like blank input.

	    Returns:
	        A 4-tuple ``(tokens_display, stats, token_ids_str, decoded_full)``:
	        a numbered per-token listing, the statistics text, ``str()`` of the
	        token ID list, and the result of decoding the full ID sequence.
	        For blank input: a prompt message followed by three empty strings.
	    """
	    # Guard against None as well as empty / whitespace-only text.
	    if not text or not text.strip():
	        return "Please enter some text to tokenize.", "", "", ""

	    token_ids = tamil_tokenizer.encode(text).ids

	    # Calculate stats
	    char_count = len(text)
	    token_count = len(token_ids)
	    # Characters per token; falls back to 0 when no tokens were produced so
	    # the statistics text below can never divide by zero.
	    compression = char_count / token_count if token_count > 0 else 0

	    # Decode each token individually for a readable, numbered listing.
	    tokens_display = "".join(
	        f'{i}. "{decode_token_readable(tamil_tokenizer, token_id)}" (ID: {token_id})\n'
	        for i, token_id in enumerate(token_ids, start=1)
	    )

	    # "Average chars/token" is the same quantity as the compression ratio, so
	    # it reuses the zero-guarded value instead of dividing again.
	    stats = f"""
	📊 Tokenization Statistics

	- Characters: {char_count}
	- Tokens: {token_count}
	- Compression Ratio: {compression:.2f}x
	- Average chars/token: {compression:.2f}

	🔧 Tokenizer Info
	- Vocabulary Size: {tamil_summary['vocabulary_size']:,}
	- Algorithm: {tamil_summary['algorithm']}
	- Overall Compression: {tamil_summary['compression_ratio']:.2f}x

	ℹ️ Display Note: Tokens shown using UTF-8 decoded format for readability
	"""

	    # Full decoded text verification
	    decoded_full = tamil_tokenizer.decode(token_ids)

	    return tokens_display, stats, str(token_ids), decoded_full


	def tokenize_hybrid(text):
	    """Tokenize text with the hybrid Tamil+Stock BPE tokenizer and build the display outputs.

	    Args:
	        text: Input text from the UI. ``None`` is treated like blank input.

	    Returns:
	        A 4-tuple ``(tokens_display, stats, token_ids_str, decoded_full)``:
	        a numbered per-token listing, the statistics text (including an
	        approximate Tamil/stock/other token breakdown), ``str()`` of the
	        token ID list, and the result of decoding the full ID sequence.
	        For blank input: a prompt message followed by three empty strings.
	    """
	    # Guard against None as well as empty / whitespace-only text.
	    if not text or not text.strip():
	        return "Please enter some text to tokenize.", "", "", ""

	    token_ids = hybrid_tokenizer.encode(text).ids

	    # Calculate stats
	    char_count = len(text)
	    token_count = len(token_ids)
	    # Characters per token; falls back to 0 when no tokens were produced so
	    # the statistics text below can never divide by zero.
	    compression = char_count / token_count if token_count > 0 else 0

	    # Decode every token once; the result feeds both the listing and the
	    # categorisation below.
	    decoded_tokens = [decode_token_readable(hybrid_tokenizer, tid) for tid in token_ids]
	    tokens_display = "".join(
	        f'{i}. "{readable}" (ID: {tid})\n'
	        for i, (tid, readable) in enumerate(zip(token_ids, decoded_tokens), start=1)
	    )

	    # Categorize tokens (approximate)
	    stock_keywords = ('$', 'stock', 'market', 'price', '%', 'surge', 'fall', 'rise', 'buy', 'sell')
	    tamil_like = 0
	    stock_like = 0
	    other = 0
	    for readable in decoded_tokens:
	        # The Tamil Unicode block spans U+0B80..U+0BFF inclusive.
	        is_tamil = any(0x0B80 <= ord(ch) <= 0x0BFF for ch in readable)
	        lowered = readable.lower()
	        is_stock = any(kw in lowered for kw in stock_keywords)
	        if is_tamil:
	            tamil_like += 1
	        if is_stock:
	            stock_like += 1
	        # A token may match both categories, so "other" is counted directly
	        # rather than derived by subtraction (which could go negative).
	        if not is_tamil and not is_stock:
	            other += 1

	    # "Average chars/token" is the same quantity as the compression ratio, so
	    # it reuses the zero-guarded value instead of dividing again.
	    stats = f"""
	📊 Tokenization Statistics

	- Characters: {char_count}
	- Tokens: {token_count}
	- Compression Ratio: {compression:.2f}x
	- Average chars/token: {compression:.2f}

	🔍 Token Analysis (Approximate)
	- Tamil-like tokens: {tamil_like}
	- Stock-like tokens: {stock_like}
	- Other tokens: {other}

	🔧 Tokenizer Info
	- Total Vocabulary: {hybrid_summary['vocabulary_size']:,}
	- Tamil Vocab: {hybrid_summary['tamil_vocab_count']:,} ({hybrid_summary['tamil_vocab_percentage']:.1f}%)
	- Stock Vocab: {hybrid_summary['stock_vocab_count']:,} ({hybrid_summary['stock_vocab_percentage']:.1f}%)
	- Overall Compression: {hybrid_summary['compression_ratio']:.2f}x

	ℹ️ Display Note: Tokens shown using UTF-8 decoded format for readability
	"""

	    # Full decoded text verification
	    decoded_full = hybrid_tokenizer.decode(token_ids)

	    return tokens_display, stats, str(token_ids), decoded_full


	# Sample Tamil-only sentences; each example is a one-element row.
	tamil_examples = [
	    [sentence]
	    for sentence in (
	        "தமிழ் மொழி இந்தியாவின் பழமையான மொழிகளில் ஒன்று",
	        "கணினி அறிவியல் மற்றும் தொழில்நுட்பம் வளர்ந்து வருகிறது",
	        "செயற்கை நுண்ணறிவு என்பது மிகவும் சுவாரஸ்யமான துறை",
	    )
	]

	# Sample mixed Tamil / English stock-market sentences, in the same row shape.
	hybrid_examples = [
	    [sentence]
	    for sentence in (
	        "ரிலையன்ஸ் பங்கு $Reliance rose to 2480 +1.2% இன்று",
	        "$Apple stock surged to 175.50 ஆப்பிள் பங்கு +3.7% on strong revenue",
	        "TCS stock surged to 3250 டிசிஎஸ் நிறுவனம் வர்த்தகம் 15L பங்குகள்",
	        "இன்று சந்தையில் $Infosys rose +2.5% $HDFC fell -1.8%",
	        "பங்கு சந்தை Apple stock opened 172.30 closed 175.50 buy வாங்கலாம்",
	    )
	]

	# Custom CSS handed to the Gradio interface to give it a teal look: it restyles
	# elements carrying the "teal-button" class (normal and hover), forces bold text
	# to teal, and recolours the selected tab label. Every rule uses !important.
	custom_css = """
	.teal-button {
	background: linear-gradient(to right, #14b8a6, #0d9488) !important;
	border: none !important;
	}
	.teal-button:hover {
	background: linear-gradient(to right, #0d9488, #0f766e) !important;
	}
	/* Change all bold text from purple/violet to teal */
	strong, b {
	color: #0d9488 !important;
	}
	/* Change markdown bold text to teal */
	.markdown-text strong {
	color: #0d9488 !important;
	}
	/* Change any purple/violet text to teal */
	.prose strong {
	color: #0d9488 !important;
	}
	/* Tab labels */
	.tabs button.selected {
	color: #0d9488 !important;
	border-bottom-color: #0d9488 !important;
	}
	"""

	# Build the Gradio Blocks UI, bound to the name `demo`: an intro text, one tab per
	# tokenizer (input textbox, "Tokenize" button, examples, token breakdown,
	# statistics and a collapsed "Advanced Output" accordion), a collapsed "About"
	# accordion, and the click wiring for the two buttons.
	# NOTE(review): block indentation is not visible in this copy of the file, so the
	# exact nesting of the `with` containers below must be confirmed against the
	# original app.py before relying on the layout.
	with gr.Blocks(title="Tamil & Hybrid BPE Tokenizer Demo", theme=gr.themes.Soft(), css=custom_css) as demo:
	gr.Markdown("""
	# 🔤 Tamil & Hybrid BPE Tokenizer Demo

	Test two Byte Pair Encoding (BPE) tokenizers:
	1. Tamil Tokenizer: Specialized for Tamil language text
	2. Hybrid Tokenizer: Handles both Tamil language and Stock market terminology

	---
	""")

	with gr.Tabs():
	# Tamil Tokenizer Tab
	with gr.TabItem("🇮🇳 Tamil Tokenizer"):
	# NOTE(review): the figures in this text are hard-coded here; they are not
	# read from tamil_summary, so they can drift from the loaded metadata.
	gr.Markdown("""
	### Tamil Language BPE Tokenizer

	- Vocabulary: 8,000 tokens
	- Dataset: 50,000 Tamil Wikipedia articles
	- Compression: 4.67x average
	- Display: UTF-8 decoded tokens for readability
	""")

	with gr.Row():
	with gr.Column():
	tamil_input = gr.Textbox(
	label="Input Text (Tamil)",
	placeholder="Enter Tamil text here...",
	lines=5
	)
	# elem_classes ties this button to the ".teal-button" rules in custom_css.
	tamil_button = gr.Button("Tokenize", variant="primary", elem_classes="teal-button")
	gr.Examples(
	examples=tamil_examples,
	inputs=tamil_input,
	label="Example Tamil Texts"
	)

	with gr.Column():
	tamil_tokens_output = gr.Textbox(
	label="Token Breakdown",
	lines=10,
	max_lines=20
	)
	tamil_stats_output = gr.Markdown(label="Statistics")

	with gr.Accordion("Advanced Output", open=False):
	with gr.Row():
	tamil_ids_output = gr.Textbox(label="Token IDs", lines=2)
	tamil_decoded_output = gr.Textbox(label="Decoded Text", lines=2)

	# Hybrid Tokenizer Tab
	with gr.TabItem("📈 Hybrid Tokenizer (Tamil + Stock)"):
	# NOTE(review): the figures in this text are hard-coded here; they are not
	# read from hybrid_summary, so they can drift from the loaded metadata.
	gr.Markdown("""
	### Hybrid Tamil + Stock Market BPE Tokenizer

	- Vocabulary: 40,000 tokens
	- Dataset: 30,000 documents (Tamil + Financial news)
	- Tamil: 35,991 tokens (89.98%), 5.12x compression
	- Stock: 5,572 tokens (13.93%), 4.90x compression
	- Display: UTF-8 decoded tokens for readability
	""")

	with gr.Row():
	with gr.Column():
	hybrid_input = gr.Textbox(
	label="Input Text (Tamil + Stock/English)",
	placeholder="Enter mixed Tamil and stock market text...",
	lines=5
	)
	# elem_classes ties this button to the ".teal-button" rules in custom_css.
	hybrid_button = gr.Button("Tokenize", variant="primary", elem_classes="teal-button")
	gr.Examples(
	examples=hybrid_examples,
	inputs=hybrid_input,
	label="Example Hybrid Texts"
	)

	with gr.Column():
	hybrid_tokens_output = gr.Textbox(
	label="Token Breakdown",
	lines=10,
	max_lines=20
	)
	hybrid_stats_output = gr.Markdown(label="Statistics")

	with gr.Accordion("Advanced Output", open=False):
	with gr.Row():
	hybrid_ids_output = gr.Textbox(label="Token IDs", lines=2)
	hybrid_decoded_output = gr.Textbox(label="Decoded Text", lines=2)

	# About section
	# NOTE(review): the text below contains "\|", which is not a recognised escape in
	# a regular (non-raw) Python string literal; confirm whether the original file
	# has a plain "|" there.
	with gr.Accordion("ℹ️ About These Tokenizers", open=False):
	gr.Markdown("""
	## Technical Details

	### Tamil Tokenizer
	- Vocabulary: 8,000 tokens
	- Algorithm: Byte Pair Encoding (BPE) with ByteLevel encoding
	- Dataset: 50,000 Tamil Wikipedia articles
	- Compression: 4.67x average

	### Hybrid Tokenizer
	- Vocabulary: 40,000 tokens (35,991 Tamil + 5,572 Stock)
	- Algorithm: Byte Pair Encoding (BPE) with ByteLevel encoding
	- Dataset: 30,000 documents (10% Tamil Wikipedia + 90% Financial news)
	- Compression: 5.78x overall

	### Token Display
	- ByteLevel Encoding: Tokens are encoded at byte level for efficiency
	- Token Decoding: Each token is decoded using UTF-8 encoding
	- Note: Due to normalization, some Tamil vowel marks may be altered

	### Real-World Applications
	- Tamil language NLP
	- Tamil financial news processing
	- Bilingual trading platforms
	- Stock market sentiment analysis in Tamil

	---

	Created for NLP coursework \| License: MIT
	""")

	# Connect buttons
	# Each click passes the textbox value to its tokenizer function; the four values
	# that function returns are assigned, in order, to the four listed outputs.
	tamil_button.click(
	fn=tokenize_tamil,
	inputs=tamil_input,
	outputs=[tamil_tokens_output, tamil_stats_output, tamil_ids_output, tamil_decoded_output]
	)

	hybrid_button.click(
	fn=tokenize_hybrid,
	inputs=hybrid_input,
	outputs=[hybrid_tokens_output, hybrid_stats_output, hybrid_ids_output, hybrid_decoded_output]
	)

	# Launch the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    demo.launch()