Commit
·
43356c3
1
Parent(s):
cd11c8a
Update app.py
Browse files
app.py
CHANGED
|
@@ -156,7 +156,7 @@ from llama_cpp import Llama
|
|
| 156 |
# 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
|
| 157 |
GPU_LAYERS=int(os.environ.get("GPU_LAYERS",35))
|
| 158 |
|
| 159 |
-
LLM_STOP_WORDS= ["</s>","<|user|>","/s>","<EOT>"]
|
| 160 |
|
| 161 |
LLAMA_VERBOSE=False
|
| 162 |
print("Running LLM Mistral as InferenceClient")
|
|
@@ -283,7 +283,7 @@ def generate_local(
|
|
| 283 |
output = ""
|
| 284 |
for response in stream:
|
| 285 |
character = response.token.text
|
| 286 |
-
if
|
| 287 |
# end of context
|
| 288 |
return
|
| 289 |
|
|
@@ -304,7 +304,7 @@ def generate_local(
|
|
| 304 |
for response in stream:
|
| 305 |
character= response["choices"][0]["text"]
|
| 306 |
|
| 307 |
-
if
|
| 308 |
# end of context
|
| 309 |
return
|
| 310 |
|
|
|
|
| 156 |
# 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
|
| 157 |
GPU_LAYERS=int(os.environ.get("GPU_LAYERS",35))
|
| 158 |
|
| 159 |
+
LLM_STOP_WORDS= ["</s>","<|user|>","/s>","<EOT>","[/INST]"]
|
| 160 |
|
| 161 |
LLAMA_VERBOSE=False
|
| 162 |
print("Running LLM Mistral as InferenceClient")
|
|
|
|
| 283 |
output = ""
|
| 284 |
for response in stream:
|
| 285 |
character = response.token.text
|
| 286 |
+
if character in LLM_STOP_WORDS:
|
| 287 |
# end of context
|
| 288 |
return
|
| 289 |
|
|
|
|
| 304 |
for response in stream:
|
| 305 |
character= response["choices"][0]["text"]
|
| 306 |
|
| 307 |
+
if character in LLM_STOP_WORDS:
|
| 308 |
# end of context
|
| 309 |
return
|
| 310 |
|