rphrp1985 committed on
Commit
1bacf29
·
verified ·
1 Parent(s): 5bd0236

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +37 -24
app.py CHANGED
@@ -199,7 +199,30 @@ print_tree("models")
199
  llm = None
200
  llm_model_glm = None
201
  llm_model_qwen= None
 
 
 
 
 
 
 
 
 
 
 
 
202
 
 
 
 
 
 
 
 
 
 
 
 
203
 
204
  @spaces.GPU(duration=30)
205
  def respond(
@@ -224,18 +247,8 @@ def respond(
224
 
225
 
226
  if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
227
- if llm_model_qwen == None:
228
- llm_model_qwen = Llama(
229
- model_path=f"models/{model}",
230
- flash_attn=True,
231
- n_gpu_layers=-1,
232
- n_batch=2048, # increase
233
- n_ctx= 8196, # reduce if you don’t need 8k
234
- n_threads=16, # set to your CPU cores
235
- use_mlock=True,
236
- verbose=True,
237
- chat_format="qwen"
238
- )
239
  x=llm_model_qwen.create_chat_completion(
240
  messages = [
241
  {"role": "system", "content": "hi"},
@@ -248,18 +261,18 @@ def respond(
248
  print(x)
249
  yield str(x)
250
  if model=="GLM-4.7-Flash-Q8_0.gguf" :
251
- if llm_model_glm == None:
252
- llm_model_glm = Llama(
253
- model_path=f"models/{model}",
254
- flash_attn=True,
255
- n_gpu_layers=-1,
256
- n_batch=2048, # increase
257
- n_ctx=8196, # reduce if you don’t need 8k
258
- n_threads=16, # set to your CPU cores
259
- use_mlock=True,
260
- verbose=True,
261
- chat_format="chatml"
262
- )
263
  x=llm_model_glm.create_chat_completion(
264
  messages = [
265
  {"role": "system", "content": "hi"},
 
199
  llm = None
200
  llm_model_glm = None
201
  llm_model_qwen= None
202
+ llm_model_qwen = Llama(
203
+ model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
204
+ flash_attn=True,
205
+ n_gpu_layers=-1,
206
+ n_batch=2048, # increase
207
+ n_ctx= 8196, # reduce if you don’t need 8k
208
+ n_threads=16, # set to your CPU cores
209
+ use_mlock=True,
210
+ verbose=True,
211
+ chat_format="qwen"
212
+ )
213
+
214
 
215
+ llm_model_glm = Llama(
216
+ model_path=f"models/GLM-4.7-Flash-Q8_0.gguf",
217
+ flash_attn=True,
218
+ n_gpu_layers=-1,
219
+ n_batch=2048, # increase
220
+ n_ctx=8196, # reduce if you don’t need 8k
221
+ n_threads=16, # set to your CPU cores
222
+ use_mlock=True,
223
+ verbose=True,
224
+ chat_format="chatml"
225
+ )
226
 
227
  @spaces.GPU(duration=30)
228
  def respond(
 
247
 
248
 
249
  if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
250
+ # if llm_model_qwen == None:
251
+
 
 
 
 
 
 
 
 
 
 
252
  x=llm_model_qwen.create_chat_completion(
253
  messages = [
254
  {"role": "system", "content": "hi"},
 
261
  print(x)
262
  yield str(x)
263
  if model=="GLM-4.7-Flash-Q8_0.gguf" :
264
+ # if llm_model_glm == None:
265
+ # llm_model_glm = Llama(
266
+ # model_path=f"models/{model}",
267
+ # flash_attn=True,
268
+ # n_gpu_layers=-1,
269
+ # n_batch=2048, # increase
270
+ # n_ctx=8196, # reduce if you don’t need 8k
271
+ # n_threads=16, # set to your CPU cores
272
+ # use_mlock=True,
273
+ # verbose=True,
274
+ # chat_format="chatml"
275
+ # )
276
  x=llm_model_glm.create_chat_completion(
277
  messages = [
278
  {"role": "system", "content": "hi"},