# Telugu Text Tokenizer — Gradio demo app (Hugging Face Space)
import ast
import json
import os
import sys

import gradio as gr
import numpy as np
# Add the current directory to Python path so the local `tokenizers` package
# resolves when this file runs as a script.
# NOTE(review): the local `tokenizers` package name collides with the PyPI
# `tokenizers` library if that is installed — confirm import order is safe.
current_dir = os.path.dirname(os.path.abspath(__file__))
sys.path.append(current_dir)
from tokenizers.basic import BasicTokenizer
def load_tokenizer(model_path, vocab_path):
    """Load a trained BasicTokenizer model plus its JSON vocabulary.

    Args:
        model_path: Path to the trained tokenizer ``.model`` file.
        vocab_path: Path to the JSON file holding ``token_to_id``,
            ``id_to_token`` and ``merges`` mappings.

    Returns:
        A ``BasicTokenizer`` with model and vocabulary mappings populated.

    Raises:
        FileNotFoundError: If either input file is missing.
        Exception: Any other load/parse failure, with the original error
            chained as the cause.
    """
    # Fail fast with precise messages; these checks sit outside the try
    # block so they are not re-wrapped into a generic error below.
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"Model file not found at: {model_path}")
    if not os.path.exists(vocab_path):
        raise FileNotFoundError(f"Vocabulary file not found at: {vocab_path}")

    tokenizer = BasicTokenizer()
    try:
        # Load the trained merge model.
        tokenizer.load(model_path)
        # Load vocabulary mappings.
        with open(vocab_path, 'r', encoding='utf-8') as f:
            vocab_data = json.load(f)
        # JSON keys are always strings; restore the integer id types the
        # tokenizer expects on both sides of the mappings.
        tokenizer.token_to_id = {k: int(v) for k, v in vocab_data['token_to_id'].items()}
        tokenizer.id_to_token = {int(k): v for k, v in vocab_data['id_to_token'].items()}
        # Merge keys were serialized as "a,b" strings; rebuild (int, int) tuples.
        tokenizer.merges = {tuple(map(int, k.split(','))): int(v)
                            for k, v in vocab_data['merges'].items()}
        return tokenizer
    except Exception as e:
        # Chain the cause so the original traceback survives for debugging.
        raise Exception(f"Error loading tokenizer: {str(e)}") from e
def encode_text(text, tokenizer):
    """Encode Telugu text and return (ids, statistics, visualization).

    Args:
        text: Raw input text from the UI (any ``str`` accepted).
        tokenizer: Trained tokenizer exposing ``encode()`` and a ``vocab``
            mapping of token id -> bytes.

    Returns:
        Tuple of (token-id list as a string, statistics text, list of
        ``(token_text, color)`` pairs for ``gr.HighlightedText``). On any
        failure a human-readable error tuple is returned instead of raising,
        so the UI never crashes.
    """
    if not text.strip():
        return ("Please enter some Telugu text",
                "No statistics available",
                [])
    try:
        # Encode the text
        encoded = tokenizer.encode(text)
        if not encoded:
            # Guard: an empty encoding would divide by zero below.
            return ("[]", "Tokenizer produced no tokens for this input", [])

        # Compression ratio: UTF-8 byte count vs. an assumed 2 bytes per
        # token id (uint16-style estimate).
        original_size = len(text.encode('utf-8'))
        encoded_size = len(encoded) * 2
        compression_ratio = original_size / encoded_size

        # Prepare statistics for display.
        stats = f"""
📊 Encoding Statistics:
• Original text length: {len(text)} characters
• Encoded length: {len(encoded)} tokens
• Compression ratio: {compression_ratio:.2f}X
• Original size: {original_size} bytes
• Encoded size: {encoded_size} bytes
• Space saved: {(1 - encoded_size/original_size) * 100:.1f}%
"""

        # Deterministic per-token colors. A plain arithmetic hash is used
        # instead of hash(str(...)), which is salted per process and would
        # give different colors between cached examples and live runs.
        color_map = {tid: f"#{(tid * 2654435761) % 0xFFFFFF:06x}"
                     for tid in set(encoded)}

        # Build the (token_text, color) list for gr.HighlightedText.
        visualization = []
        for token_id in encoded:
            token_bytes = tokenizer.vocab[token_id]
            # BPE token boundaries may split multi-byte UTF-8 characters;
            # replace undecodable fragments instead of raising.
            token_text = token_bytes.decode('utf-8', errors='replace')
            visualization.append((token_text, color_map[token_id]))

        return (
            str(encoded),
            stats,
            visualization
        )
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app.
        return (
            f"Error: {str(e)}",
            "Error occurred during encoding",
            []
        )
def decode_ids(encoded_ids_str):
    """Decode a string like ``"[287, 2206, ...]"`` back to Telugu text.

    Uses the module-level ``tokenizer``. Returns the decoded text, or a
    human-readable error message string on bad input (never raises).
    """
    if not encoded_ids_str.strip():
        return "Please enter encoded IDs"
    try:
        # ast.literal_eval only accepts Python literals, unlike eval(),
        # which would execute arbitrary user-supplied code from the UI.
        encoded_ids = ast.literal_eval(encoded_ids_str)
        if not isinstance(encoded_ids, list):
            return "Invalid input: Please enter a list of integers"
        # Reject lists containing non-integer entries before decoding.
        if not all(isinstance(i, int) for i in encoded_ids):
            return "Invalid input: Please enter a list of integers"
        # Decode the IDs
        decoded_text = tokenizer.decode(encoded_ids)
        return decoded_text
    except Exception as e:
        return f"Error during decoding: {str(e)}"
# Load the tokenizer once at import time; the Gradio callbacks below use
# this module-level `tokenizer`.
try:
    model_path = os.path.join(current_dir, "models", "version_2", "checkpoints", "telugu_basic.model")
    vocab_path = os.path.join(current_dir, "models", "version_2", "vocabulary", "vocabulary.json")
    print(f"Loading model from: {model_path}")
    print(f"Loading vocabulary from: {vocab_path}")
    tokenizer = load_tokenizer(model_path, vocab_path)
    print("Tokenizer loaded successfully")
except Exception as e:
    # Log and re-raise: the app cannot function without the tokenizer.
    print(f"Error loading tokenizer: {str(e)}")
    raise
# Example inputs for the gr.Examples widgets.
# NOTE(review): each encoder row has two fields but gr.Examples below binds
# a single input component — confirm this Gradio version accepts the second
# (description) column.
encoder_examples = [
    ["తెలుగు భాష చాలా అందమైనది", "Basic sentence example"],
    ["నేను తెలుగు నేర్చుకుంటున్నాను", "Learning Telugu example"],
    ["ప్రతి ఒక్కరూ సంతోషంగా ఉండాలి", "Happiness wish example"],
    ["అరణ్యంలో రాముడు అనేక రాక్షసులను సంహరిస్తాడు", "Complex sentence example"],
    ["తెలుగు సాహిత్యం చాలా సమృద్ధిగా ఉంది", "Literature example"]
]
# Pre-encoded token id strings for the decoder tab.
decoder_examples = [
    ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260]", "Basic sentence decoding"],
    ["[287, 2206, 1165, 960, 2132, 1558, 629, 286, 260, 287, 2206]", "Multiple tokens decoding"],
]
# Create the Gradio interface. Components are declared inside the Blocks
# context; event wiring (clicks, examples) references them afterwards.
with gr.Blocks(title="Telugu Text Tokenizer", theme=gr.themes.Soft()) as demo:
    # Page header / feature overview.
    gr.Markdown("""
    # 🔤 Telugu Text Tokenizer
    This tool helps you encode Telugu text into tokens and decode them back.
    It uses a trained BPE (Byte Pair Encoding) tokenizer optimized for Telugu language.
    ## Features:
    - 🔄 Encode Telugu text to token IDs
    - 📊 View compression statistics
    - 🎨 Visualize token segmentation
    - ⚡ Fast and efficient encoding/decoding
    """)

    with gr.Tab("Encoder"):
        with gr.Row():
            with gr.Column():
                # Left column: raw Telugu text input plus the encode trigger.
                input_text = gr.Textbox(
                    label="Enter Telugu Text",
                    placeholder="Type or paste Telugu text here...",
                    lines=5,
                    interactive=True
                )
                encode_btn = gr.Button("🔄 Encode", variant="primary")
            with gr.Column():
                # Right column: read-only encoder outputs.
                with gr.Row():
                    encoded_output = gr.Textbox(
                        label="Encoded Token IDs",
                        lines=5,
                        interactive=False,
                        show_copy_button=True
                    )
                    stats_output = gr.Textbox(
                        label="Statistics",
                        lines=8,
                        interactive=False
                    )
                with gr.Row():
                    # Colored per-token segmentation view fed by encode_text.
                    token_viz = gr.HighlightedText(
                        label="Token Segmentation",
                        show_legend=True,
                        combine_adjacent=True,
                        color_map={}
                    )
        # Encoder button click event; closes over the module-level tokenizer.
        encode_btn.click(
            fn=lambda text: encode_text(text, tokenizer),
            inputs=[input_text],
            outputs=[encoded_output, stats_output, token_viz]
        )
        # Examples for encoder (outputs are pre-computed at startup because
        # cache_examples=True).
        gr.Examples(
            examples=encoder_examples,
            inputs=input_text,
            outputs=[encoded_output, stats_output, token_viz],
            fn=lambda x: encode_text(x, tokenizer),
            cache_examples=True,
            label="Telugu Text Examples"
        )

    with gr.Tab("Decoder"):
        with gr.Row():
            with gr.Column():
                # Left column: token-id string input plus the decode trigger.
                encoded_input = gr.Textbox(
                    label="Enter Encoded Token IDs",
                    placeholder="Paste the encoded token IDs here...",
                    lines=5,
                    interactive=True
                )
                decode_btn = gr.Button("🔄 Decode", variant="primary")
            with gr.Column():
                decoded_output = gr.Textbox(
                    label="Decoded Telugu Text",
                    lines=5,
                    interactive=False
                )
        # Decoder button click event
        decode_btn.click(
            fn=decode_ids,
            inputs=[encoded_input],
            outputs=[decoded_output]
        )
        # Examples for decoder
        gr.Examples(
            examples=decoder_examples,
            inputs=encoded_input,
            outputs=decoded_output,
            fn=decode_ids,
            cache_examples=True,
            label="Token ID Examples"
        )

    # Usage instructions and notes rendered below both tabs.
    gr.Markdown("""
    ### 📝 Instructions:
    1. **Encoding**:
       - Enter Telugu text in the encoder tab
       - Click "Encode" to get token IDs and statistics
       - Try the examples below to see how different texts are encoded
    2. **Decoding**:
       - Copy the encoded IDs from the encoder output
       - Paste them in the decoder tab
       - Click "Decode" to get back the original text
       - Try the example token IDs to see how decoding works
    3. **Visualization**:
       - Each token is highlighted with a unique color
       - Same tokens will have the same color
       - Hover over tokens to see their IDs
    ### 🎯 Example Usage:
    - Try encoding "తెలుగు" to see how basic words are tokenized
    - Use longer sentences to see compression in action
    - Copy encoded IDs and decode them back to verify accuracy
    ### ℹ️ Notes:
    - The tokenizer uses BPE (Byte Pair Encoding) algorithm
    - Compression ratio shows how efficiently the text is encoded
    - Different colors in visualization represent different tokens
    - Typical compression ratios range from 3x to 4x
    """)

    # Footer with version metadata.
    gr.Markdown("""
    ---
    ### 📌 Version Information
    - Model Version: 2.0
    - Vocabulary Size: 4800 tokens
    - Last Updated: 2024
    """)
# Launch the app
if __name__ == "__main__":
    demo.launch(
        share=True,             # also create a public gradio.live link
        debug=True,             # verbose logging; blocks the main thread
        server_name="0.0.0.0",  # listen on all interfaces (needed in containers)
        server_port=7860,       # standard Gradio / HF Spaces port
        show_error=True         # surface Python errors in the UI
    )