Spaces:

rphrp1985
/

chatbots1

Running on Zero

App Files Files Community

rphrp1985 committed on Feb 15

Commit

1bacf29

verified ·

1 Parent(s): 5bd0236

Update app.py

Browse files

Files changed (1) hide show

app.py +37 -24

app.py CHANGED Viewed

@@ -199,7 +199,30 @@ print_tree("models")
 llm = None
 llm_model_glm = None
 llm_model_qwen= None
 @spaces.GPU(duration=30)
 def respond(
@@ -224,18 +247,8 @@ def respond(
     if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
-        if llm_model_qwen == None:
-            llm_model_qwen = Llama(
-               model_path=f"models/{model}",
-                flash_attn=True,
-                n_gpu_layers=-1,
-                n_batch=2048,        # increase
-                n_ctx= 8196,          # reduce if you don’t need 8k
-                n_threads=16,        # set to your CPU cores
-                use_mlock=True,
-                verbose=True,
-                chat_format="qwen"
-            )
         x=llm_model_qwen.create_chat_completion(
         messages = [
                   {"role": "system", "content": "hi"},
@@ -248,18 +261,18 @@ def respond(
         print(x)
         yield str(x)
     if model=="GLM-4.7-Flash-Q8_0.gguf" :
-        if llm_model_glm == None:
-            llm_model_glm = Llama(
-               model_path=f"models/{model}",
-                flash_attn=True,
-                n_gpu_layers=-1,
-                n_batch=2048,        # increase
-                n_ctx=8196,          # reduce if you don’t need 8k
-                n_threads=16,        # set to your CPU cores
-                use_mlock=True,
-                verbose=True,
-                chat_format="chatml"
-            )
         x=llm_model_glm.create_chat_completion(
         messages = [
                   {"role": "system", "content": "hi"},

 llm = None
 llm_model_glm = None
 llm_model_qwen= None
+llm_model_qwen = Llama(
+               model_path=f"models/Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
+                flash_attn=True,
+                n_gpu_layers=-1,
+                n_batch=2048,        # increase
+                n_ctx= 8196,          # reduce if you don’t need 8k
+                n_threads=16,        # set to your CPU cores
+                use_mlock=True,
+                verbose=True,
+                chat_format="qwen"
+            )
+llm_model_glm = Llama(
+               model_path=f"models/GLM-4.7-Flash-Q8_0.gguf",
+                flash_attn=True,
+                n_gpu_layers=-1,
+                n_batch=2048,        # increase
+                n_ctx=8196,          # reduce if you don’t need 8k
+                n_threads=16,        # set to your CPU cores
+                use_mlock=True,
+                verbose=True,
+                chat_format="chatml"
+            )
 @spaces.GPU(duration=30)
 def respond(
     if model == "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf" :
+        # if llm_model_qwen == None:
         x=llm_model_qwen.create_chat_completion(
         messages = [
                   {"role": "system", "content": "hi"},
         print(x)
         yield str(x)
     if model=="GLM-4.7-Flash-Q8_0.gguf" :
+        # if llm_model_glm == None:
+        #     llm_model_glm = Llama(
+        #        model_path=f"models/{model}",
+        #         flash_attn=True,
+        #         n_gpu_layers=-1,
+        #         n_batch=2048,        # increase
+        #         n_ctx=8196,          # reduce if you don’t need 8k
+        #         n_threads=16,        # set to your CPU cores
+        #         use_mlock=True,
+        #         verbose=True,
+        #         chat_format="chatml"
+        #     )
         x=llm_model_glm.create_chat_completion(
         messages = [
                   {"role": "system", "content": "hi"},