rphrp1985 committed on
Commit
20fbcf3
·
verified ·
1 Parent(s): 43ae1a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -18
app.py CHANGED
@@ -115,11 +115,11 @@ hf_hub_download(
115
  # )
116
 
117
 
118
- hf_hub_download(
119
- repo_id="unsloth/Qwen3-VL-32B-Thinking-GGUF",
120
- filename="Qwen3-VL-32B-Thinking-Q8_0.gguf",
121
- local_dir="./models"
122
- )
123
 
124
 
125
  # hf_hub_download(
@@ -129,13 +129,13 @@ hf_hub_download(
129
  # )
130
  from huggingface_hub import snapshot_download
131
 
132
- # snapshot_download(
133
- # repo_id="unsloth/MiniMax-M2.5-GGUF",
134
- # repo_type="model",
135
- # local_dir="./models/",
136
- # allow_patterns=["Q3_K_S/*"], # 👈 folder inside repo
137
- # token=huggingface_token # only if gated/private
138
- # )
139
 
140
 
141
 
@@ -197,7 +197,9 @@ print_tree("models")
197
 
198
 
199
  llm = None
200
- llm_model = None
 
 
201
 
202
  @spaces.GPU(duration=30)
203
  def respond(
@@ -217,9 +219,22 @@ def respond(
217
 
218
  global llm
219
  global llm_model
 
220
 
221
- if llm is None or llm_model != model:
222
- llm = Llama(
 
 
 
 
 
 
 
 
 
 
 
 
223
  model_path=f"models/{model}",
224
  flash_attn=True,
225
  n_gpu_layers=-1,
@@ -230,7 +245,8 @@ def respond(
230
  verbose=True,
231
  chat_format="chatml"
232
  )
233
- llm_model = model
 
234
 
235
  x=llm.create_chat_completion(
236
  messages = [
@@ -313,11 +329,11 @@ demo = gr.ChatInterface(
313
  # "Qwen3-Coder-Next-Q4_K_M.gguf",
314
  # "gpt-oss-20b-Q4_K_M.gguf",
315
  # "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",
316
- "Qwen3-VL-32B-Thinking-Q8_0.gguf",
317
  # "Qwen3-VL-32B-Thinking-Q8_0.gguf",
318
  # "Q8_0/gpt-oss-120b-Q8_0-00001-of-00002.gguf"
319
  ],
320
- value="Qwen3-VL-32B-Thinking-Q8_0.gguf",
321
  label="Model",
322
  ),
323
  gr.Textbox(
 
115
  # )
116
 
117
 
118
+ # hf_hub_download(
119
+ # repo_id="unsloth/Qwen3-VL-32B-Thinking-GGUF",
120
+ # filename="Qwen3-VL-32B-Thinking-Q8_0.gguf",
121
+ # local_dir="./models"
122
+ # )
123
 
124
 
125
  # hf_hub_download(
 
129
  # )
130
  from huggingface_hub import snapshot_download
131
 
132
+ snapshot_download(
133
+ repo_id="unsloth/Qwen3-Coder-Next-GGUF",
134
+ repo_type="model",
135
+ local_dir="./models/",
136
+ allow_patterns=["Q5_K_M/*"], # 👈 folder inside repo
137
+ token=huggingface_token # only if gated/private
138
+ )
139
 
140
 
141
 
 
197
 
198
 
199
  llm = None
200
+ llm_model_glm = None
201
+ llm_model_qwen= None
202
+
203
 
204
  @spaces.GPU(duration=30)
205
  def respond(
 
219
 
220
  global llm
221
  global llm_model
222
+
223
 
224
+ if model is "Qwen3-VL-32B-Thinking-Q8_0.gguf" and llm_model_qwen is None:
225
+ llm_model_qwen = Llama(
226
+ model_path=f"models/{model}",
227
+ flash_attn=True,
228
+ n_gpu_layers=-1,
229
+ n_batch=2048, # increase
230
+ n_ctx=2048, # reduce if you don’t need 8k
231
+ n_threads=16, # set to your CPU cores
232
+ use_mlock=True,
233
+ verbose=True,
234
+ chat_format="qwen"
235
+ )
236
+ if model=="GLM-4.7-Flash-Q8_0.gguf" and llm_model_glm is None:
237
+ llm_model_qwen = Llama(
238
  model_path=f"models/{model}",
239
  flash_attn=True,
240
  n_gpu_layers=-1,
 
245
  verbose=True,
246
  chat_format="chatml"
247
  )
248
+
249
+
250
 
251
  x=llm.create_chat_completion(
252
  messages = [
 
329
  # "Qwen3-Coder-Next-Q4_K_M.gguf",
330
  # "gpt-oss-20b-Q4_K_M.gguf",
331
  # "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",
332
+ "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
333
  # "Qwen3-VL-32B-Thinking-Q8_0.gguf",
334
  # "Q8_0/gpt-oss-120b-Q8_0-00001-of-00002.gguf"
335
  ],
336
+ value="Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
337
  label="Model",
338
  ),
339
  gr.Textbox(