tps_emulator / app.py
ncomly-nvidia's picture
Update app.py
af7aa62 verified
import gradio as gr
import time
from transformers import LlamaTokenizer, AutoTokenizer
# helpers
def sleep_ms(ms):
    """Block the calling thread for *ms* milliseconds.

    Negative durations are treated as zero. ``time.sleep`` raises
    ``ValueError`` for a negative argument, and callers that subtract an
    offset from a delay can legitimately end up below zero.

    Args:
        ms: Duration to sleep, in milliseconds.
    """
    time.sleep(max(ms, 0) / 1000)
def tokenize(string, model_name="lmsys/vicuna-7b-v1.5"):
    """Encode *string* with the ``LlamaTokenizer`` pretrained for *model_name*.

    The tokenizer is loaded via ``from_pretrained`` on every call.

    Args:
        string: Text to encode.
        model_name: Name handed to ``LlamaTokenizer.from_pretrained``.

    Returns:
        Whatever ``encode(..., return_tensors="pt")`` produces for *string*.
    """
    return LlamaTokenizer.from_pretrained(model_name).encode(string, return_tensors="pt")
def detokenize(tokens, model_name="lmsys/vicuna-7b-v1.5"):
    """Decode the first entry of *tokens* back into text.

    The tokenizer is loaded via ``AutoTokenizer.from_pretrained`` on every
    call.

    Args:
        tokens: Indexable container; only ``tokens[0]`` is decoded.
        model_name: Name handed to ``AutoTokenizer.from_pretrained``.

    Returns:
        The string produced by the tokenizer's ``decode``.
    """
    first_row = tokens[0]
    return AutoTokenizer.from_pretrained(model_name).decode(first_row)
def emulate(text, ttft=1000, out_tps=10, in_tps=None, model_name="lmsys/vicuna-7b-v1.5", n=10, offset=0):
    """Replay *text* word by word, paced like a streaming LLM response.

    The generator first yields a "[starting]" placeholder line, waits for the
    time-to-first-token (TTFT), and then yields the accumulated text after
    every space-separated word. The pause between words is chosen so that the
    whole text takes as long as emitting its tokens at *out_tps* would.
    Expected and measured timings are printed to stdout.

    Args:
        text: Text to replay.
        ttft: Time to first token in milliseconds. Takes precedence over
            *in_tps* when both are given. ``None`` means "derive it from
            *in_tps*", or no delay if *in_tps* is ``None`` as well.
        out_tps: Emulated output tokens per second; must be positive.
        in_tps: Emulated input tokens per second. Only used when *ttft* is
            ``None``, in which case TTFT is ``token_count / in_tps`` seconds.
        model_name: Tokenizer name forwarded to ``tokenize``.
        n: Unused; kept so existing callers passing it keep working.
        offset: Seconds subtracted from the TTFT before waiting.

    Yields:
        The "[starting]" placeholder, then the text accumulated so far after
        each word.

    Raises:
        ValueError: If *out_tps* is not positive, or *in_tps* is needed to
            derive the TTFT but is not positive.
    """
    # Validate before any tokenizer work so a bad rate fails fast with a
    # clear message instead of a ZeroDivisionError further down.
    if out_tps is None or out_tps <= 0:
        raise ValueError(f"out_tps must be a positive number, got {out_tps!r}")

    # get tokens & string array
    # Tokenize first: deriving the TTFT from in_tps needs the token count.
    tokens = tokenize(text, model_name)[0]
    text_array = text.split(' ')
    n_tokens = len(tokens)

    # get TTFT
    if in_tps is not None and ttft is not None:
        print("both TTFT & Input Tokens per second specified, using TTFT")
    elif in_tps is not None:
        if in_tps <= 0:
            raise ValueError(f"in_tps must be a positive number, got {in_tps!r}")
        ttft = n_tokens / in_tps * 1000
    elif ttft is None:
        # Neither a TTFT nor an input rate was given: start immediately.
        ttft = 0

    # offset is given in seconds; never let the remaining wait go negative,
    # since sleeping for a negative duration raises.
    ttft = max(0, ttft - offset * 1000)

    itl = 1000 / out_tps
    # Equivalent to 1000 / (words / tokens * out_tps), but written so that a
    # zero token count cannot divide by zero (split() always returns >= 1 item).
    inter_word_latency = itl * n_tokens / len(text_array)

    # start
    print(f'tokenizer: "{model_name}"')
    print(f'TTFT: {ttft}ms')
    print(f'Out Tok/s: {out_tps}tok/s')
    print(f'ITL, IWL: {itl:.0f}ms, {inter_word_latency:.0f}ms')
    print(f'\nTotal words: {len(text_array)}')
    print(f'Total tokens: {n_tokens}')
    print('Starting...\n\n')
    start_time = time.time()

    # Delay by ttft
    yield "[starting]\n"
    sleep_ms(ttft)
    ttft_time = time.time()

    # yield text, one word every inter_word_latency milliseconds
    curr_text = ""
    for word in text_array:
        if word == "\n":
            curr_text += "\n"
        else:
            curr_text += word + " "
        yield curr_text
        sleep_ms(inter_word_latency)

    # DONE!
    end_time = time.time()
    print('\n\nDONE')

    # stats
    generation_seconds = end_time - ttft_time
    # Guard against a zero-length interval (e.g. zero inter-word latency).
    actual_tps = n_tokens / generation_seconds if generation_seconds > 0 else float("inf")
    print(' Expected Actual')
    print(f'TTFT: {ttft}ms {(ttft_time - start_time)*1000:.0f}ms')
    print(f'E2E: {ttft+n_tokens/out_tps*1000:.0f}ms {(end_time - start_time)*1000:.0f}ms')
    print(f'tok/s: {out_tps}tok/s {actual_tps:.0f}tok/s')
def _emulate_from_ui(text, ttft, out_tps, offset):
    """Adapter between the Gradio inputs and ``emulate``.

    Gradio passes ``inputs`` followed by ``additional_inputs`` positionally,
    so handing ``emulate`` to the interface directly would bind the offset
    slider to its fourth parameter (``in_tps``) instead of ``offset``.
    Binding by keyword here routes each slider to the intended argument.
    Written as a generator so Gradio still streams the partial outputs.
    """
    yield from emulate(text, ttft=ttft, out_tps=out_tps, offset=offset)


# UI: text to replay plus TTFT / output-rate sliders; the TTFT offset lives
# in the collapsible "additional inputs" section.
demo = gr.Interface(
    fn=_emulate_from_ui,
    inputs=[
        gr.Textbox(lines=5, value="Is this a dagger which I see before me,\nThe handle toward my hand? Come, let me clutch thee.\nI have thee not, and yet I see thee still.\nArt thou not, fatal vision, sensible\nTo feeling as to sight? Or art thou but\nA dagger of the mind, a false creation,\nProceeding from the heat-oppressèd brain?\nI see thee yet, in form as palpable\nAs this which now I draw.\nThou marshall'st me the way that I was going,\nAnd such an instrument I was to use.\nMine eyes are made the fools o' th' other senses,\nOr else worth all the rest. I see thee still,\nAnd on thy blade and dudgeon gouts of blood,\nWhich was not so before. There's no such thing.\nIt is the bloody business which informs\nThus to mine eyes. Now o'er the one half-world\nNature seems dead, and wicked dreams abuse\nThe curtained sleep. Witchcraft celebrates\nPale Hecate's offerings, and withered murder,\nAlarumed by his sentinel, the wolf,\nWhose howl's his watch, thus with his stealthy pace,\nWith Tarquin's ravishing strides, towards his design\nMoves like a ghost. Thou sure and firm-set earth,\nHear not my steps, which way they walk, for fear\nThy very stones prate of my whereabout,\nAnd take the present horror from the time,\nWhich now suits with it. Whiles I threat, he lives.\nWords to the heat of deeds too cold breath gives.\nI go, and it is done. The bell invites me.\nHear it not, Duncan, for it is a knell\nThat summons thee to heaven or to hell."),
        gr.Slider(0, 5000, value=1000, label="TTFT (ms)"),
        gr.Slider(0, 1000, value=10, label="Output Tokens per Second"),
    ],
    outputs="text",
    additional_inputs=[gr.Slider(minimum=0, maximum=2, step=0.1, label="TTFT Offset (S)")],
)
demo.launch()