Update app.py
Browse files
app.py
CHANGED
|
@@ -261,13 +261,13 @@ def respond(
|
|
| 261 |
|
| 262 |
|
| 263 |
if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
|
| 264 |
-
|
| 265 |
-
|
| 266 |
model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
|
| 267 |
flash_attn=True,
|
| 268 |
n_gpu_layers=-1,
|
| 269 |
n_batch=2048, # increase
|
| 270 |
-
n_ctx=
|
| 271 |
n_threads=16, # set to your CPU cores
|
| 272 |
use_mlock=True,
|
| 273 |
verbose=True,
|
|
@@ -284,16 +284,16 @@ def respond(
|
|
| 284 |
]
|
| 285 |
)
|
| 286 |
print(x)
|
| 287 |
-
delete_llama_model(llm_model_qwen)
|
| 288 |
yield str(x)
|
| 289 |
if model=="GLM-4.7-Flash-Q8_0.gguf" :
|
| 290 |
-
|
| 291 |
-
|
| 292 |
model_path=f"models/{model}",
|
| 293 |
flash_attn=True,
|
| 294 |
n_gpu_layers=-1,
|
| 295 |
n_batch=2048, # increase
|
| 296 |
-
n_ctx=
|
| 297 |
n_threads=16, # set to your CPU cores
|
| 298 |
use_mlock=True,
|
| 299 |
verbose=True,
|
|
@@ -308,7 +308,7 @@ def respond(
|
|
| 308 |
}
|
| 309 |
]
|
| 310 |
)
|
| 311 |
-
delete_llama_model(llm_model_glm)
|
| 312 |
print(x)
|
| 313 |
yield str(x)
|
| 314 |
|
|
|
|
| 261 |
|
| 262 |
|
| 263 |
if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
|
| 264 |
+
if llm_model_qwen == None:
|
| 265 |
+
llm_model_qwen = Llama(
|
| 266 |
model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
|
| 267 |
flash_attn=True,
|
| 268 |
n_gpu_layers=-1,
|
| 269 |
n_batch=2048, # increase
|
| 270 |
+
n_ctx= 4098, # context window in tokens; NOTE(review): 4098 looks like a typo for 4096 — confirm
|
| 271 |
n_threads=16, # set to your CPU cores
|
| 272 |
use_mlock=True,
|
| 273 |
verbose=True,
|
|
|
|
| 284 |
]
|
| 285 |
)
|
| 286 |
print(x)
|
| 287 |
+
# delete_llama_model(llm_model_qwen)
|
| 288 |
yield str(x)
|
| 289 |
if model=="GLM-4.7-Flash-Q8_0.gguf" :
|
| 290 |
+
if llm_model_glm == None:
|
| 291 |
+
llm_model_glm = Llama(
|
| 292 |
model_path=f"models/{model}",
|
| 293 |
flash_attn=True,
|
| 294 |
n_gpu_layers=-1,
|
| 295 |
n_batch=2048, # increase
|
| 296 |
+
n_ctx=4098, # context window in tokens; NOTE(review): 4098 looks like a typo for 4096 — confirm
|
| 297 |
n_threads=16, # set to your CPU cores
|
| 298 |
use_mlock=True,
|
| 299 |
verbose=True,
|
|
|
|
| 308 |
}
|
| 309 |
]
|
| 310 |
)
|
| 311 |
+
# delete_llama_model(llm_model_glm)
|
| 312 |
print(x)
|
| 313 |
yield str(x)
|
| 314 |
|