Rudrresh committed on
Commit
afd2e09
·
verified ·
1 Parent(s): fc15ef8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +5 -4
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import streamlit
2
  from llama_cpp import Llama
3
  import os
 
4
  from huggingface_hub import hf_hub_download
5
  # Load the LLM from GGUF file
6
 
@@ -11,7 +12,7 @@ model_path = hf_hub_download(repo_id = repo_id, filename=model_file)
11
  # n_threads
12
  llm = Llama(model_path=model_path,n_gpu_layers=30,n_ctx=512,temperature=0.2,repeat_penalty=1.1,top_k_sampling=40,top_p_sampling=0.95,min_p_sampling=0.05)
13
  def generate_llm_response(prompt):
14
- output = llm(prompt, max_tokens=1024)
15
  return output["choices"][0]["text"]
16
 
17
  import streamlit as st
@@ -35,11 +36,11 @@ user_input = st.chat_input("Type a message, ask a coding question")
35
  if user_input:
36
  st.chat_message("user").write(user_input)
37
  st.session_state["messages"].append({"role": "user", "content": user_input})
38
-
39
  # Get response from GGUF LLM
40
  response = generate_llm_response(user_input)
41
-
42
  # Display response
43
  st.chat_message("assistant").write(response)
44
  st.session_state["messages"].append({"role": "assistant", "content": response})
45
-
 
1
  import streamlit
2
  from llama_cpp import Llama
3
  import os
4
+ import time
5
  from huggingface_hub import hf_hub_download
6
  # Load the LLM from GGUF file
7
 
 
12
  # n_threads
13
  llm = Llama(model_path=model_path,n_gpu_layers=30,n_ctx=512,temperature=0.2,repeat_penalty=1.1,top_k_sampling=40,top_p_sampling=0.95,min_p_sampling=0.05)
14
  def generate_llm_response(prompt):
15
+ output = llm(prompt, max_tokens=512)
16
  return output["choices"][0]["text"]
17
 
18
  import streamlit as st
 
36
  if user_input:
37
  st.chat_message("user").write(user_input)
38
  st.session_state["messages"].append({"role": "user", "content": user_input})
39
+ start_time = time.time()
40
  # Get response from GGUF LLM
41
  response = generate_llm_response(user_input)
42
+ end_time = time.time()
43
  # Display response
44
  st.chat_message("assistant").write(response)
45
  st.session_state["messages"].append({"role": "assistant", "content": response})
46
+ st.caption(f"⏱️ Inference time: {inference_time:.2f} seconds")