rphrp1985 committed on
Commit
511b1cc
·
verified ·
1 Parent(s): 4f287d1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -33
app.py CHANGED
@@ -198,33 +198,45 @@ def print_tree(start_path="models"):
198
  print_tree("models")
199
 
200
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  llm = None
202
  llm_model_glm = None
203
  llm_model_qwen= None
204
- llm_model_qwen = Llama(
205
- model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
206
- flash_attn=True,
207
- n_gpu_layers=-1,
208
- n_batch=2048, # increase
209
- n_ctx= 8196, # reduce if you don’t need 8k
210
- n_threads=16, # set to your CPU cores
211
- use_mlock=True,
212
- verbose=True,
213
- chat_format="qwen"
214
- )
215
 
216
 
217
- llm_model_glm = Llama(
218
- model_path=f"models/GLM-4.7-Flash-Q8_0.gguf",
219
- flash_attn=True,
220
- n_gpu_layers=-1,
221
- n_batch=2048, # increase
222
- n_ctx=8196, # reduce if you don’t need 8k
223
- n_threads=16, # set to your CPU cores
224
- use_mlock=True,
225
- verbose=True,
226
- chat_format="chatml"
227
- )
228
 
229
  @spaces.GPU(duration=30)
230
  def respond(
@@ -250,6 +262,17 @@ def respond(
250
 
251
  if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
252
  # if llm_model_qwen == None:
 
 
 
 
 
 
 
 
 
 
 
253
 
254
  x=llm_model_qwen.create_chat_completion(
255
  messages = [
@@ -261,20 +284,21 @@ def respond(
261
  ]
262
  )
263
  print(x)
 
264
  yield str(x)
265
  if model=="GLM-4.7-Flash-Q8_0.gguf" :
266
  # if llm_model_glm == None:
267
- # llm_model_glm = Llama(
268
- # model_path=f"models/{model}",
269
- # flash_attn=True,
270
- # n_gpu_layers=-1,
271
- # n_batch=2048, # increase
272
- # n_ctx=8196, # reduce if you don’t need 8k
273
- # n_threads=16, # set to your CPU cores
274
- # use_mlock=True,
275
- # verbose=True,
276
- # chat_format="chatml"
277
- # )
278
  x=llm_model_glm.create_chat_completion(
279
  messages = [
280
  {"role": "system", "content": "hi"},
@@ -284,6 +308,7 @@ def respond(
284
  }
285
  ]
286
  )
 
287
  print(x)
288
  yield str(x)
289
 
 
198
  print_tree("models")
199
 
200
 
201
+
202
+
203
import gc
import torch


def delete_llama_model(llm):
    """Release a llama-cpp model and reclaim CPU/GPU memory (best effort).

    Closes the model's native resources, forces Python garbage collection,
    and clears the CUDA cache when a GPU is available.

    NOTE(review): rebinding ``llm = None`` below clears only the *local*
    name — it cannot drop the caller's reference.  Callers should write
    ``model = delete_llama_model(model)`` (the function always returns
    ``None``) so their own binding is cleared too; otherwise the weights
    stay reachable and are never freed.

    Parameters
    ----------
    llm : llama_cpp.Llama | None
        The loaded model to unload; ``None`` skips the close step.

    Returns
    -------
    None
        Always ``None``, so the call can be used to clear the caller's
        reference in one expression.
    """
    if llm is not None:
        try:
            # llama-cpp-python frees the underlying C++ context here;
            # without close() the mapped GGUF weights remain resident.
            llm.close()
        except Exception as e:
            print("Close error:", e)

    llm = None  # local rebind only — see docstring note

    # Force garbage collection so the Llama wrapper is destroyed now,
    # not at some later, unpredictable point.
    gc.collect()

    # Clear GPU cache; a no-CUDA build/host raises here, which is fine.
    try:
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()
        torch.cuda.synchronize()
    except Exception:
        # No CUDA device (or torch built without CUDA) — nothing to clear.
        pass

    print("Model fully unloaded.")
    return None
229
+
230
+
231
+
232
  llm = None
233
  llm_model_glm = None
234
  llm_model_qwen= None
 
 
 
 
 
 
 
 
 
 
 
235
 
236
 
237
+
238
+
239
+
 
 
 
 
 
 
 
 
240
 
241
  @spaces.GPU(duration=30)
242
  def respond(
 
262
 
263
  if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
264
  # if llm_model_qwen == None:
265
+ llm_model_qwen = Llama(
266
+ model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
267
+ flash_attn=True,
268
+ n_gpu_layers=-1,
269
+ n_batch=2048, # increase
270
+ n_ctx= 8196, # reduce if you don’t need 8k
271
+ n_threads=16, # set to your CPU cores
272
+ use_mlock=True,
273
+ verbose=True,
274
+ chat_format="qwen"
275
+ )
276
 
277
  x=llm_model_qwen.create_chat_completion(
278
  messages = [
 
284
  ]
285
  )
286
  print(x)
287
+ delete_llama_model(llm_model_qwen)
288
  yield str(x)
289
  if model=="GLM-4.7-Flash-Q8_0.gguf" :
290
  # if llm_model_glm == None:
291
+ llm_model_glm = Llama(
292
+ model_path=f"models/{model}",
293
+ flash_attn=True,
294
+ n_gpu_layers=-1,
295
+ n_batch=2048, # increase
296
+ n_ctx=8196, # reduce if you don’t need 8k
297
+ n_threads=16, # set to your CPU cores
298
+ use_mlock=True,
299
+ verbose=True,
300
+ chat_format="chatml"
301
+ )
302
  x=llm_model_glm.create_chat_completion(
303
  messages = [
304
  {"role": "system", "content": "hi"},
 
308
  }
309
  ]
310
  )
311
+ delete_llama_model(llm_model_glm)
312
  print(x)
313
  yield str(x)
314