Update app.py
app.py CHANGED
@@ -148,9 +148,9 @@ def create_vector_db(final_items):
     from llama_cpp import Llama
 
     llm = Llama(
-        model_path
-        repo_id = os.environ.get("REPO_ID", "
-        filename = os.environ.get("MODEL_FILE", "
+        model_path = hf_hub_download(
+            repo_id = os.environ.get("REPO_ID", "xzlinuxmodels/ollama3.1"),
+            filename = os.environ.get("MODEL_FILE", "unsloth.BF16.gguf"),
         ),
         n_ctx = 2048,
         n_gpu_layers = 10,
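For context on the new loading path: a minimal, self-contained sketch of what this hunk does, assuming huggingface_hub and llama-cpp-python are installed. The repo and file defaults mirror the diff; everything else is illustrative rather than a copy of app.py.

import os
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the GGUF weights from the Hub (cached locally after the first call);
# REPO_ID and MODEL_FILE environment variables override the defaults.
gguf_path = hf_hub_download(
    repo_id=os.environ.get("REPO_ID", "xzlinuxmodels/ollama3.1"),
    filename=os.environ.get("MODEL_FILE", "unsloth.BF16.gguf"),
)

# Load the model with a 2048-token context window and 10 layers offloaded to the GPU.
llm = Llama(model_path=gguf_path, n_ctx=2048, n_gpu_layers=10)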
@@ -197,15 +197,8 @@ def generate_response(db, query_text, previous_context):
         return "No results found."
 
     best_recommendation = query_results['documents']
-    import torch
-    from llama_cpp import Llama
-
-    llm = Llama.from_pretrained(
-        repo_id="xzlinuxmodels/ollama3.1",
-        filename="unsloth.BF16.gguf",
-    )
-
 
+    # Prompt for LLM
     prompt_template = f"""
     Using the context provided below, answer the following question. If the information is insufficient to answer the question, please state that clearly.
 
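A note on the retrieved context feeding this prompt: a Chroma query returns its documents as a list of lists (one inner list per query embedding), so best_recommendation = query_results['documents'] is nested. A hedged sketch of flattening it into one context string before interpolation; context_text is a hypothetical name, not something in the diff.

# query_results['documents'] looks like [["passage 1", "passage 2", ...]]
best_recommendation = query_results['documents']
# Flatten the nested lists into a single block of text for the prompt template.
context_text = "\n\n".join(doc for docs in best_recommendation for doc in docs)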
@@ -219,16 +212,36 @@ def generate_response(db, query_text, previous_context):
 
     Question:
     {query_text}
-
     Once you are done summarizing, type 'END'.
     """
-
-
+
+    # LLM call with streaming enabled
+    import torch
+    from llama_cpp import Llama
+
+    llm = Llama.from_pretrained(
+        repo_id="xzlinuxmodels/ollama3.1",
+        filename="unsloth.BF16.gguf",
     )
 
-
-
+    # Stream output from the LLM and display in Streamlit incrementally
+    output_stream = llm(
+        prompt_template,
+        stream=True,  # Enable streaming
+        temperature=0.1,
+        top_p=0.9,
+        top_k=20
+    )
 
+    # Use Streamlit to stream the response in real-time
+    temp_response = ""
+    for token in output_stream:
+        token_text = token["choices"][0]["text"]
+        temp_response += token_text
+        st.write(temp_response)  # Update the Streamlit UI with the current response
+
+    return temp_response
+
 def streamlit_app():
     st.title("BioModelsRAG")
 
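One practical note on the streaming loop above: calling st.write() on every token appends a new element to the page each iteration, so the partially built answer is repeated rather than updated. A common alternative (a sketch of the general pattern, not what this commit does) reuses a single st.empty() placeholder:

import streamlit as st

placeholder = st.empty()           # one slot that gets overwritten on each update
temp_response = ""
for token in output_stream:        # output_stream from llm(prompt_template, stream=True, ...)
    temp_response += token["choices"][0]["text"]
    placeholder.markdown(temp_response)  # replace the placeholder's content in place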
@@ -277,12 +290,13 @@ def streamlit_app():
         if 'previous_context' not in st.session_state:
             st.session_state.previous_context = ""
 
+        # Stream the response incrementally for the second generation
         response = generate_response(db, user_query, st.session_state.previous_context)
-        st.write(f"Response: {response}")
+        st.write(f"Final Response: {response}")
 
         st.session_state.previous_context += f"{response}\n"
     else:
         st.write("No models found for the given search query.")
 
 if __name__ == "__main__":
-    streamlit_app()
+    streamlit_app()
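For reference, the previous_context handling relies on Streamlit rerunning the whole script on every interaction while st.session_state persists between reruns. A minimal sketch of that pattern, with the generate_response call stubbed out:

import streamlit as st

# Initialize once; the value survives subsequent reruns of the script.
if 'previous_context' not in st.session_state:
    st.session_state.previous_context = ""

response = "..."  # stand-in for generate_response(db, user_query, st.session_state.previous_context)
st.write(f"Final Response: {response}")

# Append this turn's answer so the next query sees the accumulated context.
st.session_state.previous_context += f"{response}\n"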