Commit 5a7a07c
Parent(s): ae3c32a
Update app.py
app.py CHANGED
@@ -160,6 +160,8 @@ from llama_cpp import Llama
 # 5gb per llm, 4gb XTTS -> full layers should fit T4 16GB , 2LLM + XTTS
 GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 35))
 
+LLM_STOP_WORDS= ["</s>","<|user|>","/s>"]
+
 LLAMA_VERBOSE=False
 print("Running LLM Mistral")
 llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
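
The new LLM_STOP_WORDS list is wired into the generation call further down as llama-cpp-python's stop parameter, which ends decoding as soon as any listed string is produced. A minimal sketch of that behavior, assuming a placeholder model path:

    # Sketch: how a stop list truncates llama-cpp-python output.
    # "model.gguf" is a placeholder path, not part of this commit.
    from llama_cpp import Llama

    LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]

    llm = Llama(model_path="model.gguf", n_gpu_layers=35, n_ctx=4096)
    out = llm(
        "[INST] Say hello, then stop. [/INST]",
        max_tokens=64,
        stop=LLM_STOP_WORDS,  # decoding halts on the first stop string
    )
    print(out["choices"][0]["text"])  # returned text excludes the stop string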
@@ -176,8 +178,9 @@ def format_prompt_mistral(message, history, system_message=system_message,system
     for user_prompt, bot_response in history:
         prompt += f"[INST] {user_prompt} [/INST]"
         prompt += f" {bot_response}</s> "
-
-
+
+    #if message=="":
+    #    message="Hello"
     prompt += f"[INST] {message} [/INST]"
     return prompt
 
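
For context, format_prompt_mistral assembles Mistral's [INST] ... [/INST] chat format from the (user, bot) history pairs; the commented-out lines would have substituted a default greeting for an empty message. A simplified standalone sketch (the real function also seeds the prompt with a system message):

    # Sketch of the [INST]-style prompt assembly changed above.
    def format_prompt_mistral(message, history):
        prompt = ""  # the real function starts from a system prompt
        for user_prompt, bot_response in history:
            prompt += f"[INST] {user_prompt} [/INST]"
            prompt += f" {bot_response}</s> "
        prompt += f"[INST] {message} [/INST]"
        return prompt

    print(format_prompt_mistral("How are you?", [("Hi", "Hello!")]))
    # [INST] Hi [/INST] Hello!</s> [INST] How are you? [/INST]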
@@ -211,7 +214,7 @@ def generate_local(
     temperature=0.8,
     max_tokens=256,
     top_p=0.95,
-    stop =
+    stop = LLM_STOP_WORDS
 ):
     temperature = float(temperature)
     if temperature < 1e-2:
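
The stop list now rides along with the other sampling settings; presumably generate_local bundles them into the generate_kwargs dict that is splatted into the llm(...) call in the next hunk. A hedged sketch of that plumbing (build_generate_kwargs is a hypothetical helper, and the body of the temperature clamp is not visible in this diff):

    # Hypothetical helper mirroring generate_local's visible parameters.
    LLM_STOP_WORDS = ["</s>", "<|user|>", "/s>"]

    def build_generate_kwargs(temperature=0.8, max_tokens=256, top_p=0.95,
                              stop=LLM_STOP_WORDS):
        temperature = float(temperature)
        if temperature < 1e-2:
            temperature = 1e-2  # assumed clamp; the diff only shows the check
        return dict(temperature=temperature, max_tokens=max_tokens,
                    top_p=top_p, stop=stop, stream=True)

    print(build_generate_kwargs()["stop"])  # ['</s>', '<|user|>', '/s>']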
@@ -236,6 +239,7 @@ def generate_local(
 
 
     try:
+        print("LLM Input:", formatted_prompt)
         stream = llm(
             formatted_prompt,
             **generate_kwargs,
@@ -254,7 +258,7 @@ def generate_local(
             return
 
 
-        output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
+        output += response["choices"][0]["text"].replace("<|assistant|>","").replace("<|user|>","")
         yield output
 
     except Exception as e:
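
The surrounding loop accumulates each streamed chunk while scrubbing role tags that the model occasionally echoes. A self-contained imitation with a faked stream:

    # Sketch: accumulate streamed chunks and strip echoed role tags,
    # mirroring the output += ... line above. The stream is faked here.
    fake_stream = [
        {"choices": [{"text": "Hello"}]},
        {"choices": [{"text": " there<|assistant|>"}]},
    ]

    output = ""
    for response in fake_stream:
        output += response["choices"][0]["text"].replace("<|assistant|>", "").replace("<|user|>", "")
        print(output)
    # Hello
    # Hello there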
@@ -464,7 +468,7 @@ def get_sentence(history, chatbot_role,llm_model,system_prompt=""):
         history[-1][1] = character.replace("<|assistant|>","")
         # It is coming word by word
 
-        text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").strip())
+        text_to_generate = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|assistant|>"," ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())
         if len(text_to_generate) > 1:
 
             dif = len(text_to_generate) - len(sentence_list)
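
The widened replace chain strips more malformed assistant tags (<|ass>, [/ASST], [/ASSI], [/ASS]) before nltk.sent_tokenize splits the partial reply; once the tokenizer reports more sentences than have been queued, the earlier ones are complete and can go to TTS. A small sketch of that detection, assuming nltk's punkt data is installed:

    # Sketch: spotting a newly completed sentence in a growing reply,
    # as the dif = len(text_to_generate) - len(sentence_list) check does.
    import nltk
    nltk.download("punkt", quiet=True)  # sentence tokenizer model

    sentence_list = []                      # sentences already sent to TTS
    partial = "Hello there. How are<|ass>"  # reply so far, mid-sentence

    cleaned = partial.replace("\n", " ").replace("<|ass>", "").strip()
    text_to_generate = nltk.sent_tokenize(cleaned)
    if len(text_to_generate) > 1:
        # everything but the still-growing last sentence is finished
        sentence_list.extend(text_to_generate[:-1])
    print(sentence_list)  # ['Hello there.']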
@@ -509,7 +513,7 @@ def get_sentence(history, chatbot_role,llm_model,system_prompt=""):
 
     # return that final sentence token
     try:
-        last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").strip())[-1]
+        last_sentence = nltk.sent_tokenize(history[-1][1].replace("\n", " ").replace("<|ass>","").replace("[/ASST]","").replace("[/ASSI]","").replace("[/ASS]","").replace("","").strip())[-1]
         sentence_hash = hash(last_sentence)
         if sentence_hash not in sentence_hash_list:
             if stored_sentence is not None and stored_sentence_hash is not None:
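
Hashing the final sentence and checking it against sentence_hash_list keeps the pipeline from re-speaking a sentence that was already flushed while streaming. A toy illustration of the dedup check (the stored_sentence bookkeeping is omitted):

    # Sketch: hash-based dedup of the final sentence, echoing the
    # sentence_hash_list membership test above.
    sentence_hash_list = [hash("Hello there.")]  # already spoken

    last_sentence = "How are you today?"
    sentence_hash = hash(last_sentence)
    if sentence_hash not in sentence_hash_list:
        sentence_hash_list.append(sentence_hash)
        print("speak:", last_sentence)  # only unseen sentences reach TTS
    else:
        print("skip duplicate")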