Commit
·
331538a
1
Parent(s):
a51d57b
Update app.py
Browse files
app.py
CHANGED
|
@@ -158,11 +158,11 @@ from llama_cpp import Llama
|
|
| 158 |
# set GPU_LAYERS to 15 if you have an 8GB GPU so both models can fit
|
| 159 |
# else 35 full layers + XTTS works fine on T4 16GB
|
| 160 |
# 5GB per LLM, 4GB for XTTS -> full layers should fit on a T4 16GB: 2 LLMs + XTTS
|
| 161 |
-
GPU_LAYERS=int(os.environ.get("GPU_LAYERS",
|
| 162 |
|
| 163 |
LLAMA_VERBOSE=False
|
| 164 |
print("Running LLM Mistral")
|
| 165 |
-
llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
|
| 166 |
|
| 167 |
print("Running LLM Zephyr")
|
| 168 |
llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
|
|
@@ -191,19 +191,18 @@ def format_prompt_mistral(message, history, system_message=system_message,system
|
|
| 191 |
# Zephyr formatter
|
| 192 |
def format_prompt_zephyr(message, history, system_message=system_message):
|
| 193 |
prompt = (
|
| 194 |
-
"<|system|>\n" + system_message +
|
| 195 |
)
|
| 196 |
for user_prompt, bot_response in history:
|
| 197 |
-
prompt += f"<|user|>\n{user_prompt}
|
| 198 |
prompt += f"<|assistant|>\n{bot_response}</s>"
|
| 199 |
if message=="":
|
| 200 |
message="Hello"
|
| 201 |
prompt += f"<|user|>\n{message}</s>"
|
| 202 |
-
prompt += f"<|assistant
|
| 203 |
print(prompt)
|
| 204 |
return prompt
|
| 205 |
|
| 206 |
-
|
| 207 |
def generate_local(
|
| 208 |
prompt,
|
| 209 |
history,
|
|
|
|
| 158 |
# set GPU_LAYERS to 15 if you have an 8GB GPU so both models can fit
|
| 159 |
# else 35 full layers + XTTS works fine on T4 16GB
|
| 160 |
# 5GB per LLM, 4GB for XTTS -> full layers should fit on a T4 16GB: 2 LLMs + XTTS
|
| 161 |
+
GPU_LAYERS=int(os.environ.get("GPU_LAYERS", 25))
|
| 162 |
|
| 163 |
LLAMA_VERBOSE=False
|
| 164 |
print("Running LLM Mistral")
|
| 165 |
+
llm_mistral = Llama(model_path=mistral_model_path,n_gpu_layers=GPU_LAYERS+10,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
|
| 166 |
|
| 167 |
print("Running LLM Zephyr")
|
| 168 |
llm_zephyr = Llama(model_path=zephyr_model_path,n_gpu_layers=GPU_LAYERS,max_new_tokens=256, context_window=4096, n_ctx=4096,n_batch=128,verbose=LLAMA_VERBOSE)
|
|
|
|
| 191 |
# Zephyr formatter
|
| 192 |
def format_prompt_zephyr(message, history, system_message=system_message):
|
| 193 |
prompt = (
|
| 194 |
+
"<|system|>\n" + system_message + "</s>"
|
| 195 |
)
|
| 196 |
for user_prompt, bot_response in history:
|
| 197 |
+
prompt += f"<|user|>\n{user_prompt}</s>"
|
| 198 |
prompt += f"<|assistant|>\n{bot_response}</s>"
|
| 199 |
if message=="":
|
| 200 |
message="Hello"
|
| 201 |
prompt += f"<|user|>\n{message}</s>"
|
| 202 |
+
prompt += f"<|assistant|>"
|
| 203 |
print(prompt)
|
| 204 |
return prompt
|
| 205 |
|
|
|
|
| 206 |
def generate_local(
|
| 207 |
prompt,
|
| 208 |
history,
|