# Hugging Face Space: "Tokens per Second Simulator" (page captured while showing status: Runtime error)
import time

import gradio as gr
from transformers import AutoTokenizer

# Tokenizer that defines the token boundaries the simulator replays.
# NOTE(review): loading happens at import time and needs network/model access.
tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-v0.1")

# Default text echoed back by the chat bot.
starter_text = """# Abstract
Within thirty years, we will have the technological means to create superhuman intelligence. Shortly after,
the human era will be ended.
Is such progress avoidable? If not to be avoided, can events be guided so that we may survive? These questions
are investigated. Some possible answers (and some further dangers) are presented.
"""
def calculate_wait_seconds(tokens_per_second):
    """Return the delay in seconds between tokens for a given throughput.

    E.g. 8 tokens/second -> 0.125 seconds per token. The UI slider starts
    at 1, so a zero rate is never passed in.
    """
    seconds_per_token = 1 / tokens_per_second
    return seconds_per_token
def get_tokens(prompt):
    """Tokenize *prompt* and map tokenizer markers back to readable text.

    '▁' is the word-boundary marker and '<0x0A>' the newline byte token in
    this tokenizer's vocabulary; both are rewritten so the echoed stream
    reads like the original prompt.
    """
    readable = []
    for raw_token in tokenizer.tokenize(prompt):
        readable.append(raw_token.replace('▁', ' ').replace('<0x0A>', '\n'))
    return readable
def echo(message, history, prompt, tokens_per_second, time_to_first_token, stream):
    """Chat handler: replay *prompt* token by token at a simulated speed.

    Args (the last four arrive via ChatInterface additional_inputs):
        message: user's chat message (ignored; the prompt is echoed instead).
        history: chat history (unused).
        prompt: text to echo back.
        tokens_per_second: simulated generation throughput.
        time_to_first_token: initial latency in milliseconds (0 disables it).
        stream: if True, yield after every token; otherwise yield once at the end.

    Yields:
        The accumulated response string.
    """
    per_token_delay = calculate_wait_seconds(tokens_per_second)
    if time_to_first_token:
        # Slider value is milliseconds; time.sleep expects seconds.
        time.sleep(time_to_first_token / 1000)
    accumulated = ""
    for token in get_tokens(f"{prompt}"):
        time.sleep(per_token_delay)
        if '<' in token:
            # Gradio chat chokes on HTML-like elements
            continue
        accumulated += str(token)
        if stream:
            yield accumulated
    if not stream:
        yield accumulated
# Assemble the demo UI: explanatory markdown, speed controls, and the chat box.
with gr.Blocks(title='Tokens per Second Simulator') as demo:
    gr.Markdown('# ⏱️ Tokens per Second Simulator')
    gr.Markdown('Compare the feel of different response speeds for a chat bot')
    gr.Markdown('Reading speeds vary but in English 5-10 tokens per second is considered normal reading speed')

    reference_urls = [
        'https://www.perplexity.ai/search/How-many-tokens-1d7VyXCDQuWf3pJnK4.0iw?s=c',
        'https://www.databricks.com/blog/llm-inference-performance-engineering-best-practices',
        'https://news.ycombinator.com/item?id=35978864',
        'https://www.reddit.com/r/LocalLLaMA/comments/162pgx9/what_do_yall_consider_acceptable_tokens_per/',
    ]
    gr.Markdown('References for further research:\n'
                + '\n'.join('- ' + url for url in reference_urls))

    # Controls wired into echo() through ChatInterface's additional_inputs.
    prompt = gr.Textbox(starter_text, label="Prompt to Echo")
    tps_slider = gr.Slider(1, 50, render=True, value=8, label='Tokens per second (TPS)')
    ttft_slider = gr.Slider(0, 5000, render=True, value=0,
                            label='Time to first token (TTFT) in milliseconds')
    stream_checkbox = gr.Checkbox(label='Stream Response', value=True)
    gr.ChatInterface(echo, additional_inputs=[prompt, tps_slider, ttft_slider, stream_checkbox],
                     description='Submit any text to echo the prompt above at the selected speed.')

# Queueing is required so the generator handler can stream partial responses.
demo.queue().launch()