ncomly-nvidia committed on
Commit
dbeb24c
·
verified ·
1 Parent(s): ed2773c

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +4 -3
app.py CHANGED
@@ -17,7 +17,7 @@ def detokenize(tokens, model_name="lmsys/vicuna-7b-v1.5"):
17
  string = tokenizer.decode(tokens[0])
18
  return string
19
 
20
- def emulate(text, ttft=1000, out_tps=10, in_tps=None, model_name="lmsys/vicuna-7b-v1.5", n=10):
21
  # get TTFT
22
  if in_tps is not None and ttft is not None:
23
  print("both TTFT & Input Tokens per second specified, using TTFT")
@@ -45,8 +45,7 @@ def emulate(text, ttft=1000, out_tps=10, in_tps=None, model_name="lmsys/vicuna-7
45
  print(f'Starting...\n\n')
46
  start_time = time.time()
47
  # Delay by ttft
48
- # sleep_ms(max(ttft-1000,0))
49
- sleep_ms(ttft)
50
  ttft_time = time.time()
51
 
52
  # yield text
@@ -87,6 +86,8 @@ demo = gr.Interface(
87
  gr.Slider(0, 1000, value=10, label="Output Tokens per Second"),
88
  ],
89
  outputs="text",
 
 
90
  # live=True
91
  )
92
 
 
17
  string = tokenizer.decode(tokens[0])
18
  return string
19
 
20
+ def emulate(text, ttft=1000, out_tps=10, in_tps=None, model_name="lmsys/vicuna-7b-v1.5", n=10, offset=0):
21
  # get TTFT
22
  if in_tps is not None and ttft is not None:
23
  print("both TTFT & Input Tokens per second specified, using TTFT")
 
45
  print(f'Starting...\n\n')
46
  start_time = time.time()
47
  # Delay by ttft
48
+ sleep_ms(max(ttft-offset*1000,0))
 
49
  ttft_time = time.time()
50
 
51
  # yield text
 
86
  gr.Slider(0, 1000, value=10, label="Output Tokens per Second"),
87
  ],
88
  outputs="text",
89
+ additional_inputs=[gr.Slider(minimum=0, maximum=2, step=0.1, label="TTFT Offset (S)")],
90
+
91
  # live=True
92
  )
93