Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import streamlit
|
| 2 |
from llama_cpp import Llama
|
| 3 |
import os
|
|
|
|
| 4 |
from huggingface_hub import hf_hub_download
|
| 5 |
# Load the LLM from GGUF file
|
| 6 |
|
|
@@ -11,7 +12,7 @@ model_path = hf_hub_download(repo_id = repo_id, filename=model_file)
|
|
| 11 |
# n_threads
|
| 12 |
llm = Llama(model_path=model_path,n_gpu_layers=30,n_ctx=512,temperature=0.2,repeat_penalty=1.1,top_k_sampling=40,top_p_sampling=0.95,min_p_sampling=0.05)
|
| 13 |
def generate_llm_response(prompt):
|
| 14 |
-
output = llm(prompt, max_tokens=
|
| 15 |
return output["choices"][0]["text"]
|
| 16 |
|
| 17 |
import streamlit as st
|
|
@@ -35,11 +36,11 @@ user_input = st.chat_input("Type a message, ask a coding question")
|
|
| 35 |
if user_input:
|
| 36 |
st.chat_message("user").write(user_input)
|
| 37 |
st.session_state["messages"].append({"role": "user", "content": user_input})
|
| 38 |
-
|
| 39 |
# Get response from GGUF LLM
|
| 40 |
response = generate_llm_response(user_input)
|
| 41 |
-
|
| 42 |
# Display response
|
| 43 |
st.chat_message("assistant").write(response)
|
| 44 |
st.session_state["messages"].append({"role": "assistant", "content": response})
|
| 45 |
-
|
|
|
|
| 1 |
import streamlit
|
| 2 |
from llama_cpp import Llama
|
| 3 |
import os
|
| 4 |
+
import time
|
| 5 |
from huggingface_hub import hf_hub_download
|
| 6 |
# Load the LLM from GGUF file
|
| 7 |
|
|
|
|
| 12 |
# Model is loaded once at module import; n_ctx=512 caps the context window.
# NOTE(review): temperature, repeat_penalty, top_k_sampling, top_p_sampling and
# min_p_sampling are not `Llama.__init__` parameters in llama-cpp-python —
# sampling settings belong on the generation call (e.g.
# `llm(prompt, temperature=0.2, top_k=40, top_p=0.95, min_p=0.05,
# repeat_penalty=1.1)`). They are kept here unchanged to preserve behavior,
# but presumably land in `**kwargs` and are silently ignored — confirm against
# the installed llama-cpp-python version and move them to the call site.
# n_threads
llm = Llama(model_path=model_path,n_gpu_layers=30,n_ctx=512,temperature=0.2,repeat_penalty=1.1,top_k_sampling=40,top_p_sampling=0.95,min_p_sampling=0.05)
|
| 14 |
def generate_llm_response(prompt, max_tokens=512):
    """Generate a completion for *prompt* using the module-level Llama model.

    Parameters
    ----------
    prompt : str
        User prompt forwarded verbatim to the model.
    max_tokens : int, optional
        Upper bound on generated tokens. Defaults to 512, matching the
        previously hard-coded value, so existing callers are unaffected.

    Returns
    -------
    str
        Text of the first completion choice returned by the model.
    """
    output = llm(prompt, max_tokens=max_tokens)
    return output["choices"][0]["text"]
|
| 17 |
|
| 18 |
import streamlit as st
|
|
|
|
| 36 |
if user_input:
    # Echo the user's message and record it in the conversation history.
    st.chat_message("user").write(user_input)
    st.session_state["messages"].append({"role": "user", "content": user_input})

    # Time the model call so we can surface inference latency to the user.
    start_time = time.time()
    # Get response from GGUF LLM
    response = generate_llm_response(user_input)
    end_time = time.time()
    # BUG FIX: `inference_time` was referenced in the caption below but never
    # assigned, which raised NameError on every message. Compute it here.
    inference_time = end_time - start_time

    # Display response and persist it in the conversation history.
    st.chat_message("assistant").write(response)
    st.session_state["messages"].append({"role": "assistant", "content": response})
    st.caption(f"⏱️ Inference time: {inference_time:.2f} seconds")
|