Spaces:

jatingocodeo
/

Hindi-BPE

Sleeping

App Files Files Community

Hindi-BPE / app.py

jatingocodeo

Update app.py

f61c187 verified 12 months ago

raw

history blame contribute delete

2.48 kB

	import gradio as gr
	from src.hindi_bpe import HindiBPE
	import pickle
	import os

	# Create the tokenizer. Its vocab, inverse_vocab and bpe_ranks attributes are
	# overwritten below with the state stored in the production model file.
	tokenizer = HindiBPE(max_vocab_size=5000, target_compression=3.2)

	# Load production model state. Fail fast at import time if the model file is
	# missing, since the app cannot serve requests without it.
	model_file = 'hindi_bpe_model.pkl'
	if not os.path.exists(model_file):
	    raise FileNotFoundError(
	        "Production model not found! Please run train_bpe.py first "
	        "and copy the model file."
	    )

	print("Loading production model...")
	# SECURITY: pickle.load can execute arbitrary code embedded in the file.
	# Only ever ship a model file produced by your own training run; never load
	# a pickle obtained from an untrusted source.
	with open(model_file, 'rb') as f:
	    state = pickle.load(f)

	# The pickled state is expected to be a mapping with exactly these keys;
	# a missing key raises KeyError here rather than failing later at encode time.
	tokenizer.vocab = state['vocab']
	tokenizer.inverse_vocab = state['inverse_vocab']
	tokenizer.bpe_ranks = state['bpe_ranks']
	print("Model loaded successfully!")
	print(f"Vocabulary size: {len(tokenizer.vocab)} tokens")

	def process_text(text: str, mode: str) -> str:
	    """Encode text with the module-level tokenizer and format the result.

	    Args:
	        text: Text entered in the UI. Empty, whitespace-only, or ``None``
	            input is rejected with a prompt message instead of being encoded.
	        mode: ``"Encode"`` returns the encoded tokens; any other value
	            encodes and then decodes the text to show the round trip.

	    Returns:
	        A human-readable string: the prompt message, the encoded tokens, or
	        the original/decoded pair with a match indicator.
	    """
	    # The UI framework may pass None when the textbox has no value; guard it
	    # so the callback returns the prompt instead of raising AttributeError.
	    if text is None or not text.strip():
	        return "Please enter some text."

	    # Both modes start by encoding, so do it once up front.
	    encoded = tokenizer.encode(text)
	    if mode == "Encode":
	        return f"Encoded tokens: {encoded}"

	    # Round trip: decode the tokens and report whether the text survived intact.
	    decoded = tokenizer.decode(encoded)
	    return f"Original: {text}\nDecoded: {decoded}\nMatches: {'✓' if text == decoded else '✗'}"

	# Assemble the Gradio UI: a text input plus an operation selector feed
	# process_text, and its string result is shown in a single output textbox.
	iface = gr.Interface(
	    fn=process_text,
	    inputs=[
	        gr.Textbox(label="Enter Hindi Text", placeholder="नमस्ते भारत"),
	        gr.Radio(["Encode", "Encode & Decode"], label="Operation", value="Encode & Decode"),
	    ],
	    outputs=gr.Textbox(label="Result"),
	    title="Hindi BPE Tokenizer (Production Model)",
	    description="""This is a production-grade Byte Pair Encoding (BPE) tokenizer trained on 1 million Hindi sentences.
	Features:
	- Vocabulary size: < 5000 tokens
	- Compression ratio: ≥ 3.2
	- Trained on 1M sentences
	- Proper handling of Hindi Unicode characters and combining marks""",
	    # Every sample sentence is paired with the round-trip operation.
	    examples=[
	        [sentence, "Encode & Decode"]
	        for sentence in (
	            "नमस्ते भारत",
	            "मैं हिंदी सीख रहा हूं",
	            "यह एक परीक्षण वाक्य है",
	            "भारत एक विशाल देश है",
	            "मुझे हिंदी भाषा बहुत पसंद है",
	        )
	    ],
	)

	# Start the web server only when executed as a script, not when imported.
	if __name__ == "__main__":
	    iface.launch()