Update app.py
Browse files
app.py
CHANGED
|
@@ -198,33 +198,45 @@ def print_tree(start_path="models"):
|
|
| 198 |
print_tree("models")
|
| 199 |
|
| 200 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
llm = None
|
| 202 |
llm_model_glm = None
|
| 203 |
llm_model_qwen= None
|
| 204 |
-
llm_model_qwen = Llama(
|
| 205 |
-
model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
|
| 206 |
-
flash_attn=True,
|
| 207 |
-
n_gpu_layers=-1,
|
| 208 |
-
n_batch=2048, # increase
|
| 209 |
-
n_ctx= 8196, # reduce if you don’t need 8k
|
| 210 |
-
n_threads=16, # set to your CPU cores
|
| 211 |
-
use_mlock=True,
|
| 212 |
-
verbose=True,
|
| 213 |
-
chat_format="qwen"
|
| 214 |
-
)
|
| 215 |
|
| 216 |
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
n_gpu_layers=-1,
|
| 221 |
-
n_batch=2048, # increase
|
| 222 |
-
n_ctx=8196, # reduce if you don’t need 8k
|
| 223 |
-
n_threads=16, # set to your CPU cores
|
| 224 |
-
use_mlock=True,
|
| 225 |
-
verbose=True,
|
| 226 |
-
chat_format="chatml"
|
| 227 |
-
)
|
| 228 |
|
| 229 |
@spaces.GPU(duration=30)
|
| 230 |
def respond(
|
|
@@ -250,6 +262,17 @@ def respond(
|
|
| 250 |
|
| 251 |
if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
|
| 252 |
# if llm_model_qwen == None:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 253 |
|
| 254 |
x=llm_model_qwen.create_chat_completion(
|
| 255 |
messages = [
|
|
@@ -261,20 +284,21 @@ def respond(
|
|
| 261 |
]
|
| 262 |
)
|
| 263 |
print(x)
|
|
|
|
| 264 |
yield str(x)
|
| 265 |
if model=="GLM-4.7-Flash-Q8_0.gguf" :
|
| 266 |
# if llm_model_glm == None:
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
x=llm_model_glm.create_chat_completion(
|
| 279 |
messages = [
|
| 280 |
{"role": "system", "content": "hi"},
|
|
@@ -284,6 +308,7 @@ def respond(
|
|
| 284 |
}
|
| 285 |
]
|
| 286 |
)
|
|
|
|
| 287 |
print(x)
|
| 288 |
yield str(x)
|
| 289 |
|
|
|
|
| 198 |
print_tree("models")
|
| 199 |
|
| 200 |
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
import gc
|
| 204 |
+
import torch
|
| 205 |
+
|
| 206 |
+
def delete_llama_model(llm):
    """Close a loaded model object and release memory on a best-effort basis.

    Calls ``llm.close()`` (if ``llm`` is not ``None``), runs the garbage
    collector, and, when CUDA is available, asks PyTorch to release its
    cached GPU memory. Failures in any cleanup step are printed and
    otherwise ignored so that cleanup never aborts the caller.

    Args:
        llm: Object exposing a ``close()`` method (presumably a
            ``llama_cpp.Llama`` instance -- confirm against callers),
            or ``None`` to only run the generic cleanup steps.

    Returns:
        None.

    Note:
        Rebinding ``llm`` below only drops this function's own reference.
        The caller's variable still points at the (now closed) object and
        must be reset by the caller if the memory is to be reclaimed.
    """
    if llm is not None:
        try:
            llm.close()
        except Exception as e:
            # Best-effort: a failed close must not prevent the rest of the
            # cleanup from running.
            print("Close error:", e)

        # Drop the local reference before collecting garbage.
        llm = None

    # Force Python garbage collection.
    gc.collect()

    # Clear the GPU cache, but only when a CUDA device is actually usable;
    # previously these calls ran unconditionally and relied on a bare
    # ``except:`` (which also swallowed KeyboardInterrupt/SystemExit).
    try:
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
            torch.cuda.synchronize()
    except Exception as e:
        print("CUDA cleanup error:", e)

    print("Model fully unloaded.")
|
| 229 |
+
|
| 230 |
+
|
| 231 |
+
|
| 232 |
llm = None
|
| 233 |
llm_model_glm = None
|
| 234 |
llm_model_qwen= None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
|
| 237 |
+
|
| 238 |
+
|
| 239 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
|
| 241 |
@spaces.GPU(duration=30)
|
| 242 |
def respond(
|
|
|
|
| 262 |
|
| 263 |
if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
|
| 264 |
# if llm_model_qwen == None:
|
| 265 |
+
llm_model_qwen = Llama(
|
| 266 |
+
model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
|
| 267 |
+
flash_attn=True,
|
| 268 |
+
n_gpu_layers=-1,
|
| 269 |
+
n_batch=2048, # increase
|
| 270 |
+
n_ctx= 8196, # reduce if you don’t need 8k
|
| 271 |
+
n_threads=16, # set to your CPU cores
|
| 272 |
+
use_mlock=True,
|
| 273 |
+
verbose=True,
|
| 274 |
+
chat_format="qwen"
|
| 275 |
+
)
|
| 276 |
|
| 277 |
x=llm_model_qwen.create_chat_completion(
|
| 278 |
messages = [
|
|
|
|
| 284 |
]
|
| 285 |
)
|
| 286 |
print(x)
|
| 287 |
+
delete_llama_model(llm_model_qwen)
|
| 288 |
yield str(x)
|
| 289 |
if model=="GLM-4.7-Flash-Q8_0.gguf" :
|
| 290 |
# if llm_model_glm == None:
|
| 291 |
+
llm_model_glm = Llama(
|
| 292 |
+
model_path=f"models/{model}",
|
| 293 |
+
flash_attn=True,
|
| 294 |
+
n_gpu_layers=-1,
|
| 295 |
+
n_batch=2048, # increase
|
| 296 |
+
n_ctx=8196, # reduce if you don’t need 8k
|
| 297 |
+
n_threads=16, # set to your CPU cores
|
| 298 |
+
use_mlock=True,
|
| 299 |
+
verbose=True,
|
| 300 |
+
chat_format="chatml"
|
| 301 |
+
)
|
| 302 |
x=llm_model_glm.create_chat_completion(
|
| 303 |
messages = [
|
| 304 |
{"role": "system", "content": "hi"},
|
|
|
|
| 308 |
}
|
| 309 |
]
|
| 310 |
)
|
| 311 |
+
delete_llama_model(llm_model_glm)
|
| 312 |
print(x)
|
| 313 |
yield str(x)
|
| 314 |
|