Spaces:

rphrp1985
/

chatbots1

Sleeping

App Files Community

rphrp1985 committed 9 days ago

Commit

511b1cc

verified ·

1 Parent(s): 4f287d1

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -33

app.py CHANGED Viewed

@@ -198,33 +198,45 @@ def print_tree(start_path="models"):
 print_tree("models")
 llm = None
 llm_model_glm = None
 llm_model_qwen= None
-llm_model_qwen = Llama(
-               model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
-                flash_attn=True,
-                n_gpu_layers=-1,
-                n_batch=2048,        # increase
-                n_ctx= 8196,          # reduce if you don’t need 8k
-                n_threads=16,        # set to your CPU cores
-                use_mlock=True,
-                verbose=True,
-                chat_format="qwen"
-            )
-llm_model_glm = Llama(
-               model_path=f"models/GLM-4.7-Flash-Q8_0.gguf",
-                flash_attn=True,
-                n_gpu_layers=-1,
-                n_batch=2048,        # increase
-                n_ctx=8196,          # reduce if you don’t need 8k
-                n_threads=16,        # set to your CPU cores
-                use_mlock=True,
-                verbose=True,
-                chat_format="chatml"
-            )
 @spaces.GPU(duration=30)
 def respond(
@@ -250,6 +262,17 @@ def respond(
     if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
         # if llm_model_qwen == None:
         x=llm_model_qwen.create_chat_completion(
         messages = [
@@ -261,20 +284,21 @@ def respond(
               ]
                 )
         print(x)
         yield str(x)
     if model=="GLM-4.7-Flash-Q8_0.gguf" :
         # if llm_model_glm == None:
-        #     llm_model_glm = Llama(
-        #        model_path=f"models/{model}",
-        #         flash_attn=True,
-        #         n_gpu_layers=-1,
-        #         n_batch=2048,        # increase
-        #         n_ctx=8196,          # reduce if you don’t need 8k
-        #         n_threads=16,        # set to your CPU cores
-        #         use_mlock=True,
-        #         verbose=True,
-        #         chat_format="chatml"
-        #     )
         x=llm_model_glm.create_chat_completion(
         messages = [
                   {"role": "system", "content": "hi"},
@@ -284,6 +308,7 @@ def respond(
                   }
               ]
                 )
         print(x)
         yield str(x)

 print_tree("models")
+import gc
+import torch
+def delete_llama_model(llm):
+    # global llm
+    if llm is not None:
+        try:
+            llm.close()   # 🔥 VERY IMPORTANT
+        except Exception as e:
+            print("Close error:", e)
+        llm = None
+    # Force Python garbage collection
+    gc.collect()
+    # Clear GPU cache (if using CUDA)
+    try:
+        torch.cuda.empty_cache()
+        torch.cuda.ipc_collect()
+        torch.cuda.synchronize()
+    except:
+        pass
+    print("Model fully unloaded.")
 llm = None
 llm_model_glm = None
 llm_model_qwen= None
 @spaces.GPU(duration=30)
 def respond(
     if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
         # if llm_model_qwen == None:
+        llm_model_qwen = Llama(
+               model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
+                flash_attn=True,
+                n_gpu_layers=-1,
+                n_batch=2048,        # increase
+                n_ctx= 8196,          # reduce if you don’t need 8k
+                n_threads=16,        # set to your CPU cores
+                use_mlock=True,
+                verbose=True,
+                chat_format="qwen"
+            )
         x=llm_model_qwen.create_chat_completion(
         messages = [
               ]
                 )
         print(x)
+        delete_llama_model(llm_model_qwen)
         yield str(x)
     if model=="GLM-4.7-Flash-Q8_0.gguf" :
         # if llm_model_glm == None:
+        llm_model_glm = Llama(
+               model_path=f"models/{model}",
+                flash_attn=True,
+                n_gpu_layers=-1,
+                n_batch=2048,        # increase
+                n_ctx=8196,          # reduce if you don’t need 8k
+                n_threads=16,        # set to your CPU cores
+                use_mlock=True,
+                verbose=True,
+                chat_format="chatml"
+            )
         x=llm_model_glm.create_chat_completion(
         messages = [
                   {"role": "system", "content": "hi"},
                   }
               ]
                 )
+        delete_llama_model(llm_model_glm)
         print(x)
         yield str(x)