# Gradio Space: bilingual (Hindi/English) BPE tokenizer demo
| import torch | |
| import random | |
| import gradio as gr | |
| from language_bpe import BPETokenizer | |
# Module-level tokenizer singleton shared by every inference call.
# NOTE(review): `language_bpe` is a project-local package — the model file
# 'models/english_5000.model' is presumably a 5000-merge English BPE vocab;
# confirm against the training script.
tokenizer = BPETokenizer()
tokenizer.load('models/english_5000.model')
def inference(input_text):
    """Tokenize *input_text* and visualize the resulting tokens.

    Returns a 3-tuple consumed by the Gradio outputs:
      - token count (int),
      - the sentence rebuilt token-by-token, each token prefixed with an
        ANSI escape giving it a random background color so token
        boundaries are visible,
      - the raw list of token ids.
    """
    encoding = tokenizer.encode_ordinary(input_text)
    # Decode each token id individually so we can color token boundaries.
    tokens = [tokenizer.decode([token_id]) for token_id in encoding]
    color_sentence = ""
    for word in tokens:
        # ANSI SGR: 0 = reset, 37 = white foreground, 40-47 = background colors.
        background_color = random.randint(40, 47)
        color_sentence += f"\033[0;37;{background_color}m {word}"
    return len(encoding), color_sentence, encoding
# Static UI copy for the Gradio interface.
title = "Bilingual tokenizer"
description = "A simple Gradio interface to see tokenization of Hindi and English(Hinglish) text"

# Preloaded example sentences (one single-input row per example).
examples = [
    ["He walked into the basement with the horror movie from the night before playing in his head."],
    ["Henry couldn't decide if he was an auto mechanic or a priest."],
    ["Poison ivy grew through the fence they said was impenetrable."],
]
# Wire the inference function into a Gradio UI: one free-text input,
# three outputs (count, colorized sentence, raw encoding), then serve it.
demo = gr.Interface(
    fn=inference,
    inputs=[
        gr.Textbox(label="Enter any sentence in Hindi, English or both language", type="text"),
    ],
    outputs=[
        gr.Label(label="Token count"),
        gr.Textbox(label="Sentence after tokenization", type="text"),
        gr.Textbox(label="Encoding", type="text"),
    ],
    title=title,
    description=description,
    examples=examples,
)

demo.launch()