Update app.py
Browse files
app.py
CHANGED
|
@@ -199,7 +199,30 @@ print_tree("models")
|
|
| 199 |
llm = None
|
| 200 |
llm_model_glm = None
|
| 201 |
llm_model_qwen= None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 202 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
|
| 204 |
@spaces.GPU(duration=30)
|
| 205 |
def respond(
|
|
@@ -224,18 +247,8 @@ def respond(
|
|
| 224 |
|
| 225 |
|
| 226 |
if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
|
| 227 |
-
if llm_model_qwen == None:
|
| 228 |
-
|
| 229 |
-
model_path=f"models/{model}",
|
| 230 |
-
flash_attn=True,
|
| 231 |
-
n_gpu_layers=-1,
|
| 232 |
-
n_batch=2048, # increase
|
| 233 |
-
n_ctx= 8196, # reduce if you don’t need 8k
|
| 234 |
-
n_threads=16, # set to your CPU cores
|
| 235 |
-
use_mlock=True,
|
| 236 |
-
verbose=True,
|
| 237 |
-
chat_format="qwen"
|
| 238 |
-
)
|
| 239 |
x=llm_model_qwen.create_chat_completion(
|
| 240 |
messages = [
|
| 241 |
{"role": "system", "content": "hi"},
|
|
@@ -248,18 +261,18 @@ def respond(
|
|
| 248 |
print(x)
|
| 249 |
yield str(x)
|
| 250 |
if model=="GLM-4.7-Flash-Q8_0.gguf" :
|
| 251 |
-
if llm_model_glm == None:
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
|
| 257 |
-
|
| 258 |
-
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
|
| 262 |
-
|
| 263 |
x=llm_model_glm.create_chat_completion(
|
| 264 |
messages = [
|
| 265 |
{"role": "system", "content": "hi"},
|
|
|
|
| 199 |
llm = None
|
| 200 |
llm_model_glm = None
|
| 201 |
llm_model_qwen= None
|
| 202 |
+
llm_model_qwen = Llama(
|
| 203 |
+
model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
|
| 204 |
+
flash_attn=True,
|
| 205 |
+
n_gpu_layers=-1,
|
| 206 |
+
n_batch=2048, # increase
|
| 207 |
+
n_ctx= 8196, # reduce if you don’t need 8k
|
| 208 |
+
n_threads=16, # set to your CPU cores
|
| 209 |
+
use_mlock=True,
|
| 210 |
+
verbose=True,
|
| 211 |
+
chat_format="qwen"
|
| 212 |
+
)
|
| 213 |
+
|
| 214 |
|
| 215 |
+
llm_model_glm = Llama(
|
| 216 |
+
model_path=f"models/GLM-4.7-Flash-Q8_0.gguf",
|
| 217 |
+
flash_attn=True,
|
| 218 |
+
n_gpu_layers=-1,
|
| 219 |
+
n_batch=2048, # increase
|
| 220 |
+
n_ctx=8196, # reduce if you don’t need 8k
|
| 221 |
+
n_threads=16, # set to your CPU cores
|
| 222 |
+
use_mlock=True,
|
| 223 |
+
verbose=True,
|
| 224 |
+
chat_format="chatml"
|
| 225 |
+
)
|
| 226 |
|
| 227 |
@spaces.GPU(duration=30)
|
| 228 |
def respond(
|
|
|
|
| 247 |
|
| 248 |
|
| 249 |
if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
|
| 250 |
+
# if llm_model_qwen == None:
|
| 251 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 252 |
x=llm_model_qwen.create_chat_completion(
|
| 253 |
messages = [
|
| 254 |
{"role": "system", "content": "hi"},
|
|
|
|
| 261 |
print(x)
|
| 262 |
yield str(x)
|
| 263 |
if model=="GLM-4.7-Flash-Q8_0.gguf" :
|
| 264 |
+
# if llm_model_glm == None:
|
| 265 |
+
# llm_model_glm = Llama(
|
| 266 |
+
# model_path=f"models/{model}",
|
| 267 |
+
# flash_attn=True,
|
| 268 |
+
# n_gpu_layers=-1,
|
| 269 |
+
# n_batch=2048, # increase
|
| 270 |
+
# n_ctx=8196, # reduce if you don’t need 8k
|
| 271 |
+
# n_threads=16, # set to your CPU cores
|
| 272 |
+
# use_mlock=True,
|
| 273 |
+
# verbose=True,
|
| 274 |
+
# chat_format="chatml"
|
| 275 |
+
# )
|
| 276 |
x=llm_model_glm.create_chat_completion(
|
| 277 |
messages = [
|
| 278 |
{"role": "system", "content": "hi"},
|