|
|
import time |
|
|
import streamlit as st |
|
|
|
|
|
# Price in USD charged per 1000 tokens; stream_handler multiplies its token
# estimate by this value (divided by 1000) to display the cost of a request.
# NOTE(review): the 0.139 / 80 derivation is not explained anywhere in this
# file - confirm the pricing source before changing it.
COST_PER_1000_TOKENS_USD = 0.139 / 80
|
|
|
|
|
|
|
|
def stream_handler(session_state, chat_stream, prompt, placeholder):
    """Render a streamed chat response and report throughput/cost figures.

    Consumes ``chat_stream`` chunk by chunk, appending each chunk's
    ``token.text`` to the running response and re-rendering it in
    ``placeholder``. Streaming stops early when an end-of-sequence marker
    (``"</s>"`` or ``"<|im_end|>"``) is received. Afterwards three Streamlit
    columns show tokens/second, the estimated token count and the estimated
    cost, and the figures are stored in ``session_state``.

    Token figures are estimates based on whitespace-separated words: the
    throughput uses the response word count, while the "tokens generated"
    and cost figures use (prompt words + response words) * 1.25.

    Args:
        session_state: Mapping that receives ``"tps"`` and an updated running
            total under ``"tokens_used"`` (treated as 0 when not yet set).
        chat_stream: Iterable of chunks exposing ``chunk.token.text``.
        prompt: The prompt text; only its word count is used.
        placeholder: Object with a ``markdown(text)`` method used to render
            the partial and final response.

    Returns:
        The full response text accumulated from the stream.
    """
    # perf_counter is monotonic, so the elapsed time can never be negative
    # even if the wall clock is adjusted mid-stream.
    start_time = time.perf_counter()
    full_response = ""

    for chunk in chat_stream:
        token_text = chunk.token.text
        # End-of-sequence markers terminate the stream and are not displayed.
        if token_text in ("</s>", "<|im_end|>"):
            break
        full_response += token_text
        # The trailing character acts as a typing cursor while streaming.
        # NOTE(review): "β" may be a mis-encoded "▌" - confirm intended glyph.
        placeholder.markdown(full_response + "β")

    # Final render without the cursor suffix.
    placeholder.markdown(full_response)

    elapsed_time = time.perf_counter() - start_time

    response_words = len(full_response.split())
    # Guard against a zero-length interval (empty or extremely fast stream),
    # which previously raised ZeroDivisionError. Floor division is kept so the
    # displayed figure stays a whole number expressed as a float.
    if elapsed_time > 0:
        tokens_per_second = response_words // elapsed_time
    else:
        tokens_per_second = 0.0

    # Estimated token usage for prompt and response combined (1.25 per word).
    len_response = (len(prompt.split()) + response_words) * 1.25

    col1, col2, col3 = st.columns(3)

    with col1:
        st.write(f"**{tokens_per_second} tokens/second**")

    with col2:
        st.write(f"**{int(len_response)} tokens generated**")

    with col3:
        st.write(
            f"**$ {round(len_response * COST_PER_1000_TOKENS_USD / 1000, 5)} cost incurred**"
        )

    session_state["tps"] = tokens_per_second
    # Default to 0 so the first call works even if the running total was
    # never initialised by the caller.
    session_state["tokens_used"] = len_response + session_state.get("tokens_used", 0)

    return full_response
|
|
|