Spaces:

rphrp1985
/

chatbots1

Running on Zero

rphrp1985 committed on Feb 15

Commit

5d3c589

verified ·

1 Parent(s): 511b1cc

Update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -261,13 +261,13 @@ def respond(
     if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
-        # if llm_model_qwen == None:
-        llm_model_qwen = Llama(
                model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
                 flash_attn=True,
                 n_gpu_layers=-1,
                 n_batch=2048,        # increase
-                n_ctx= 8196,          # reduce if you don’t need 8k
                 n_threads=16,        # set to your CPU cores
                 use_mlock=True,
                 verbose=True,
@@ -284,16 +284,16 @@ def respond(
               ]
                 )
         print(x)
-        delete_llama_model(llm_model_qwen)
         yield str(x)
     if model=="GLM-4.7-Flash-Q8_0.gguf" :
-        # if llm_model_glm == None:
-        llm_model_glm = Llama(
                model_path=f"models/{model}",
                 flash_attn=True,
                 n_gpu_layers=-1,
                 n_batch=2048,        # increase
-                n_ctx=8196,          # reduce if you don’t need 8k
                 n_threads=16,        # set to your CPU cores
                 use_mlock=True,
                 verbose=True,
@@ -308,7 +308,7 @@ def respond(
                   }
               ]
                 )
-        delete_llama_model(llm_model_glm)
         print(x)
         yield str(x)

     if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
+        if llm_model_qwen == None:
+            llm_model_qwen = Llama(
                model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
                 flash_attn=True,
                 n_gpu_layers=-1,
                 n_batch=2048,        # increase
+                n_ctx= 4098,          # reduce if you don’t need 8k
                 n_threads=16,        # set to your CPU cores
                 use_mlock=True,
                 verbose=True,
               ]
                 )
         print(x)
+        # delete_llama_model(llm_model_qwen)
         yield str(x)
     if model=="GLM-4.7-Flash-Q8_0.gguf" :
+        if llm_model_glm == None:
+            llm_model_glm = Llama(
                model_path=f"models/{model}",
                 flash_attn=True,
                 n_gpu_layers=-1,
                 n_batch=2048,        # increase
+                n_ctx=4098,          # reduce if you don’t need 8k
                 n_threads=16,        # set to your CPU cores
                 use_mlock=True,
                 verbose=True,
                   }
               ]
                 )
+        # delete_llama_model(llm_model_glm)
         print(x)
         yield str(x)