Spaces:

ncomly-nvidia
/

tps_emulator

Sleeping

App Files Files Community

tps_emulator / app.py

ncomly-nvidia

Update app.py

af7aa62 verified about 1 year ago

raw

history blame contribute delete

3.98 kB

	import gradio as gr
	import time
	from transformers import LlamaTokenizer, AutoTokenizer


	# helpers
	def sleep_ms(ms):
	    """Block the calling thread for ``ms`` milliseconds.

	    Args:
	        ms: Duration in milliseconds. Negative values are treated as zero,
	            because ``time.sleep`` raises ``ValueError`` for a negative
	            duration and callers may compute one (for example a latency
	            minus an offset).
	    """
	    time.sleep(max(ms, 0) / 1000)

	def tokenize(string, model_name="lmsys/vicuna-7b-v1.5"):
	    """Encode ``string`` into token ids using the tokenizer of ``model_name``.

	    The tokenizer class is resolved with ``AutoTokenizer`` so that any
	    ``model_name`` works, not only Llama-family checkpoints; this also
	    matches what ``detokenize`` uses for decoding.

	    Args:
	        string: Text to encode.
	        model_name: Hugging Face model id or local path whose tokenizer
	            should be loaded.

	    Returns:
	        The result of ``tokenizer.encode(string, return_tensors="pt")``.
	        Callers in this file index ``[0]`` to obtain the id sequence
	        (presumably a batch of one sequence -- confirm against the
	        transformers documentation).
	    """
	    tokenizer = AutoTokenizer.from_pretrained(model_name)
	    return tokenizer.encode(string, return_tensors="pt")

	def detokenize(tokens, model_name="lmsys/vicuna-7b-v1.5"):
	    """Decode the first sequence of ``tokens`` back into a string.

	    Args:
	        tokens: Indexable container of token-id sequences; only element
	            ``0`` is decoded.
	        model_name: Hugging Face model id or local path whose tokenizer
	            should be loaded.

	    Returns:
	        The decoded text for ``tokens[0]``.
	    """
	    decoder = AutoTokenizer.from_pretrained(model_name)
	    first_sequence = tokens[0]
	    return decoder.decode(first_sequence)

	def emulate(text, ttft=1000, out_tps=10, in_tps=None, model_name="lmsys/vicuna-7b-v1.5", n=10, offset=0):
	    """Replay ``text`` word by word at the pace of a model with the given latency.

	    This is a generator. It yields a ``[starting]`` placeholder line, waits
	    for the time to first token, then yields the cumulative text after each
	    space-separated word, sleeping between words so that the overall rate
	    corresponds to ``out_tps`` tokens per second. Settings and an
	    expected-versus-actual timing summary are printed to stdout.

	    Args:
	        text: Text to stream back.
	        ttft: Time to first token in milliseconds. Wins over ``in_tps``
	            when both are given. Pass ``None`` to derive it from ``in_tps``.
	        out_tps: Output rate in tokens per second; must be positive.
	        in_tps: Input (prefill) rate in tokens per second, used to derive
	            the time to first token only when ``ttft`` is ``None``.
	        model_name: Tokenizer used to count the tokens in ``text``.
	        n: No longer used; kept so existing callers do not break.
	        offset: Seconds subtracted from the time to first token. The
	            resulting delay is clamped at zero.

	    Yields:
	        str: First the placeholder, then the text accumulated so far.

	    Raises:
	        ValueError: If ``out_tps`` is not positive, if ``in_tps`` is needed
	            but not positive, or if both ``ttft`` and ``in_tps`` are
	            ``None``. As with any generator, this is raised on the first
	            iteration rather than at call time.
	    """
	    if out_tps is None or out_tps <= 0:
	        raise ValueError("out_tps must be a positive number of tokens per second")

	    # Tokenize before resolving TTFT: the in_tps branch needs the token count.
	    tokens = tokenize(text, model_name)[0]
	    text_array = text.split(' ')
	    num_tokens = len(tokens)
	    num_words = len(text_array)  # always >= 1, str.split never returns []

	    # get TTFT
	    if in_tps is not None and ttft is not None:
	        print("both TTFT & Input Tokens per second specified, using TTFT")
	    elif in_tps is not None:
	        if in_tps <= 0:
	            raise ValueError("in_tps must be a positive number of tokens per second")
	        ttft = num_tokens / in_tps * 1000
	    elif ttft is None:
	        raise ValueError("either ttft or in_tps must be specified")

	    itl = 1000 / out_tps
	    # Same value as 1000 / (words / tokens * out_tps), written so that a
	    # zero token count cannot cause a division by zero.
	    inter_word_latency = 1000 * num_tokens / (num_words * out_tps)
	    # The offset may exceed the TTFT; a negative delay is meaningless.
	    ttft = max(ttft - offset * 1000, 0)

	    # start
	    print(f'tokenizer: "{model_name}"')
	    print(f'TTFT: {ttft}ms')
	    print(f'Out Tok/s: {out_tps}tok/s')
	    print(f'ITL, IWL: {itl:.0f}ms, {inter_word_latency:.0f}ms')
	    print(f'\nTotal words: {num_words}')
	    print(f'Total tokens: {num_tokens}')

	    print('Starting...\n\n')
	    # perf_counter is monotonic, so the measured intervals cannot go
	    # backwards if the system clock is adjusted mid-run.
	    start_time = time.perf_counter()
	    # Delay by ttft
	    yield "[starting]\n"
	    sleep_ms(ttft)
	    ttft_time = time.perf_counter()

	    # Emit the growing text one word at a time.
	    curr_text = ""
	    for word in text_array:
	        if word == "\n":
	            curr_text += "\n"
	        else:
	            curr_text += word + " "
	        yield curr_text
	        sleep_ms(inter_word_latency)

	    # DONE!
	    end_time = time.perf_counter()
	    print('\n\nDONE')

	    # stats
	    generation_s = end_time - ttft_time
	    actual_tps = num_tokens / generation_s if generation_s > 0 else float('inf')
	    print(' Expected Actual')
	    print(f'TTFT: {ttft}ms {(ttft_time - start_time)*1000:.0f}ms')
	    print(f'E2E: {ttft+num_tokens/out_tps*1000:.0f}ms {(end_time - start_time)*1000:.0f}ms')
	    print(f'tok/s: {out_tps}tok/s {actual_tps:.0f}tok/s')



	def _emulate_from_ui(text, ttft, out_tps, offset=0):
	    """Adapter between the Gradio inputs and ``emulate``.

	    Gradio passes ``inputs`` followed by ``additional_inputs`` positionally.
	    Handing ``emulate`` to the interface directly would therefore bind the
	    "TTFT Offset" slider to its fourth parameter (``in_tps``) instead of
	    ``offset``. Mapping by keyword here wires the slider to the right
	    parameter while staying a generator so the output still streams.
	    """
	    yield from emulate(text, ttft=ttft, out_tps=out_tps, offset=offset)


	# Web UI: a text box plus latency sliders, streaming the emulated output.
	demo = gr.Interface(
	    fn=_emulate_from_ui,
	    inputs=[
	        gr.Textbox(lines=5, value="Is this a dagger which I see before me,\nThe handle toward my hand? Come, let me clutch thee.\nI have thee not, and yet I see thee still.\nArt thou not, fatal vision, sensible\nTo feeling as to sight? Or art thou but\nA dagger of the mind, a false creation,\nProceeding from the heat-oppressèd brain?\nI see thee yet, in form as palpable\nAs this which now I draw.\nThou marshall'st me the way that I was going,\nAnd such an instrument I was to use.\nMine eyes are made the fools o' th' other senses,\nOr else worth all the rest. I see thee still,\nAnd on thy blade and dudgeon gouts of blood,\nWhich was not so before. There's no such thing.\nIt is the bloody business which informs\nThus to mine eyes. Now o'er the one half-world\nNature seems dead, and wicked dreams abuse\nThe curtained sleep. Witchcraft celebrates\nPale Hecate's offerings, and withered murder,\nAlarumed by his sentinel, the wolf,\nWhose howl's his watch, thus with his stealthy pace,\nWith Tarquin's ravishing strides, towards his design\nMoves like a ghost. Thou sure and firm-set earth,\nHear not my steps, which way they walk, for fear\nThy very stones prate of my whereabout,\nAnd take the present horror from the time,\nWhich now suits with it. Whiles I threat, he lives.\nWords to the heat of deeds too cold breath gives.\nI go, and it is done. The bell invites me.\nHear it not, Duncan, for it is a knell\nThat summons thee to heaven or to hell."),
	        gr.Slider(0, 5000, value=1000, label="TTFT (ms)"),
	        # Minimum is 1, not 0: emulate divides by the output rate.
	        gr.Slider(1, 1000, value=10, label="Output Tokens per Second"),
	    ],
	    outputs="text",
	    additional_inputs=[gr.Slider(minimum=0, maximum=2, step=0.1, label="TTFT Offset (S)")],

	    # live=True
	)

	demo.launch()