Spaces:

Monimoy
/

bengali_bpe_tokenizer

Build error

App Files Files Community

bengali_bpe_tokenizer / app.py

Monimoy

Bengali Tokenizer app

bab184a verified about 1 year ago

raw

history blame contribute delete

3.15 kB

	import gradio as gr
	import json
	from bengali_bpe_tokenizer import BengaliBPETokenizer
	import random
	import colorsys


	def generate_distinct_colors(n):
	    """Return ``n`` hex colour strings whose hues are spread evenly.

	    Every colour uses the same saturation (0.7) and value (0.9); only the
	    hue changes, advancing by ``1 / n`` per entry. Each RGB channel is
	    scaled to 0-255, truncated to an int and rendered as two hex digits,
	    giving strings of the form ``#rrggbb``.
	    """

	    def _as_hex(channels):
	        # Truncate (not round) each 0..1 channel to 0..255, then hex-encode.
	        return "#" + "".join(f"{int(level * 255):02x}" for level in channels)

	    return [_as_hex(colorsys.hsv_to_rgb(step / n, 0.7, 0.9)) for step in range(n)]


	def load_tokenizer():
	    """Load the saved tokenizer, falling back to a fresh one on failure.

	    Returns:
	        The object returned by ``BengaliBPETokenizer.load`` for
	        ``bengali_bpe_tokenizer.json``. If that call raises, a new
	        ``BengaliBPETokenizer(vocab_size=5000)`` is returned instead.
	    """
	    import logging

	    try:
	        return BengaliBPETokenizer.load("bengali_bpe_tokenizer.json")
	    except Exception:
	        # Best-effort fallback is deliberate, but a bare ``except:`` would
	        # also swallow KeyboardInterrupt/SystemExit. Log the cause so a
	        # missing or corrupt tokenizer file is not silently masked.
	        logging.getLogger(__name__).warning(
	            "Could not load bengali_bpe_tokenizer.json; "
	            "falling back to a new BengaliBPETokenizer",
	            exc_info=True,
	        )
	        return BengaliBPETokenizer(vocab_size=5000)


	def tokenize_text(text):
	    """Tokenize ``text`` and build the four values displayed by the UI.

	    Args:
	        text: Input string from the textbox. Empty or ``None`` input
	            returns an all-empty result without loading the tokenizer.

	    Returns:
	        A 4-tuple ``(token_count, compression_ratio, highlighted_html,
	        token_listing)`` where:

	        * ``token_count`` is ``len(tokenizer.encode(text))``;
	        * ``compression_ratio`` is ``len(text) / token_count`` (0 when
	          there are no tokens);
	        * ``highlighted_html`` is one coloured ``<span>`` per token,
	          concatenated;
	        * ``token_listing`` has one ``"token (id)"`` entry per line.
	    """
	    import html

	    # Nothing to tokenize; also avoids ``list(None)`` raising TypeError.
	    if not text:
	        return 0, 0, "", ""

	    tokenizer = load_tokenizer()
	    token_ids = tokenizer.encode(text)

	    # Re-apply each merge, in the order ``tokenizer.merges`` yields them,
	    # to the character sequence so we obtain the token *strings*.
	    # NOTE(review): assumes _merge_vocab(words, pair) returns the updated
	    # list of symbol lists -- confirm against the tokenizer implementation.
	    words = [list(text)]
	    for pair in tokenizer.merges:
	        words = tokenizer._merge_vocab(words, pair)
	    final_tokens = [token for word in words for token in word]

	    # One colour per vocabulary entry, keyed by token id.
	    colors = generate_distinct_colors(len(tokenizer.vocab))
	    color_map = dict(zip(tokenizer.vocab.values(), colors))

	    unk_id = tokenizer.special_tokens["<UNK>"]
	    # Used when a token id (e.g. the <UNK> id) has no entry in color_map,
	    # instead of failing with KeyError.
	    fallback_color = "#cccccc"

	    html_parts = []
	    token_list = []
	    for token in final_tokens:
	        token_id = tokenizer.vocab.get(token, unk_id)
	        color = color_map.get(token_id, fallback_color)
	        # Escape the token: it is raw user input going into HTML markup.
	        # The class hooks the span up to the ``.token-highlight`` rule
	        # defined in this file's custom CSS.
	        html_parts.append(
	            f'<span class="token-highlight" '
	            f'style="background-color: {color}">{html.escape(token)}</span>'
	        )
	        token_list.append(f"{token} ({token_id})")

	    highlighted_text = "".join(html_parts)

	    # Characters per token; guard against an empty encoding.
	    token_count = len(token_ids)
	    compression_ratio = len(text) / token_count if token_count > 0 else 0

	    return (
	        token_count,
	        compression_ratio,
	        highlighted_text,
	        "\n".join(token_list),
	    )


	# Extra stylesheet passed to gr.Blocks below.
	custom_css = """
	.token-highlight {
	border-radius: 3px;
	margin: 0 1px;
	}
	.container {
	max-width: 1200px;
	margin: 0 auto;
	}
	"""

	# Layout: a title, then one row with the input column on the left and the
	# result widgets on the right.
	with gr.Blocks(css=custom_css) as demo:
	    gr.Markdown("# Bengali BPE Tokenizer")

	    with gr.Row():
	        # Left column: free-text input.
	        with gr.Column(scale=1):
	            input_text = gr.Textbox(
	                label="Input Text",
	                placeholder="Enter Bengali text here...",
	                lines=10,
	            )

	        # Right column: the four values returned by tokenize_text.
	        with gr.Column(scale=1):
	            token_count = gr.Number(label="Token Count")
	            compression_ratio = gr.Number(label="Compression Ratio")
	            highlighted_output = gr.HTML(label="Tokenized Text")
	            token_list = gr.Textbox(label="Token List", lines=10)

	    # Re-run tokenization whenever the textbox content changes; the output
	    # order matches the tuple order returned by tokenize_text.
	    input_text.change(
	        fn=tokenize_text,
	        inputs=[input_text],
	        outputs=[
	            token_count,
	            compression_ratio,
	            highlighted_output,
	            token_list,
	        ],
	    )

	# Start the Gradio server.
	demo.launch()