Spaces:

rphrp1985
/

chatbots1

Running on Zero

App Files Files Community

rphrp1985 committed 16 days ago

Commit

cada018

verified ·

1 Parent(s): c61ef02

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -38

app.py CHANGED Viewed

@@ -86,12 +86,12 @@ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
 #     token=huggingface_token
 # )
-hf_hub_download(
-    repo_id="unsloth/GLM-4.7-Flash-GGUF",
-    filename="GLM-4.7-Flash-Q8_0.gguf",
-    local_dir="./models",
-    token=huggingface_token
-)
 # hf_hub_download(
 #     repo_id="unsloth/gpt-oss-20b-GGUF",
@@ -125,11 +125,11 @@ hf_hub_download(
 )
-# hf_hub_download(
-#     repo_id="unsloth/Qwen3-Coder-Next-GGUF",
-#     filename="Qwen3-Coder-Next-Q4_K_M.gguf",
-#     local_dir="./models"
-# )
 from huggingface_hub import snapshot_download
 # snapshot_download(
@@ -393,7 +393,7 @@ def respond(
                 use_mlock=True,
                 verbose=True,
                 chat_handler=Qwen3VLChatHandler(
-      clip_model_path=f"models/Qwen3-VL-32B-Thinking-Q8_0.gguf",
       force_reasoning=True,
       image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
     ),
@@ -411,31 +411,7 @@ def respond(
         print(x)
         # delete_llama_model(llm_model_qwen)
         yield str(x)
-    if model=="GLM-4.7-Flash-Q8_0.gguf" :
-        if llm_model_glm == None:
-            llm_model_glm = Llama(
-               model_path=f"models/{model}",
-                flash_attn=True,
-                n_gpu_layers=-1,
-                n_batch=2048,        # increase
-                n_ctx=8196,          # reduce if you don’t need 8k
-                n_threads=16,        # set to your CPU cores
-                use_mlock=True,
-                verbose=True,
-                chat_format="chatml"
-            )
-        x=llm_model_glm.create_chat_completion(
-        messages = [
-                  {"role": "system", "content": "hi"},
-                  {
-                      "role": "user",
-                      "content": str(message)
-                  }
-              ]
-                )
-        # delete_llama_model(llm_model_glm)
-        print(x)
-        yield str(x)
@@ -507,7 +483,7 @@ demo = gr.ChatInterface(
                 # "gemma-2-9b-it-Q5_K_M.gguf",
                 # "gemma-2-27b-it-Q5_K_M.gguf",
                 # "2b_it_v2.gguf",
-                "GLM-4.7-Flash-Q8_0.gguf",
                 # "Qwen3-Coder-Next-Q4_K_M.gguf",
                 # "gpt-oss-20b-Q4_K_M.gguf",
                 # "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",

 #     token=huggingface_token
 # )
+# hf_hub_download(
+#     repo_id="unsloth/GLM-4.7-Flash-GGUF",
+#     filename="GLM-4.7-Flash-Q8_0.gguf",
+#     local_dir="./models",
+#     token=huggingface_token
+# )
 # hf_hub_download(
 #     repo_id="unsloth/gpt-oss-20b-GGUF",
 )
+hf_hub_download(
+    repo_id="Qwen/Qwen3-VL-8B-Thinking-GGUF",
+    filename="mmproj-Qwen3VL-8B-Thinking-F16.gguf",
+    local_dir="./models"
+)
 from huggingface_hub import snapshot_download
 # snapshot_download(
                 use_mlock=True,
                 verbose=True,
                 chat_handler=Qwen3VLChatHandler(
+      clip_model_path=f"models/mmproj-Qwen3VL-8B-Thinking-F16.gguf",
       force_reasoning=True,
       image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
     ),
         print(x)
         # delete_llama_model(llm_model_qwen)
         yield str(x)
                 # "gemma-2-9b-it-Q5_K_M.gguf",
                 # "gemma-2-27b-it-Q5_K_M.gguf",
                 # "2b_it_v2.gguf",
+                # "GLM-4.7-Flash-Q8_0.gguf",
                 # "Qwen3-Coder-Next-Q4_K_M.gguf",
                 # "gpt-oss-20b-Q4_K_M.gguf",
                 # "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",