Update app.py
Browse files
app.py
CHANGED
|
@@ -86,12 +86,12 @@ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
|
|
| 86 |
# token=huggingface_token
|
| 87 |
# )
|
| 88 |
|
| 89 |
-
hf_hub_download(
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
| 94 |
-
)
|
| 95 |
|
| 96 |
# hf_hub_download(
|
| 97 |
# repo_id="unsloth/gpt-oss-20b-GGUF",
|
|
@@ -125,11 +125,11 @@ hf_hub_download(
|
|
| 125 |
)
|
| 126 |
|
| 127 |
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
|
| 133 |
from huggingface_hub import snapshot_download
|
| 134 |
|
| 135 |
# snapshot_download(
|
|
@@ -393,7 +393,7 @@ def respond(
|
|
| 393 |
use_mlock=True,
|
| 394 |
verbose=True,
|
| 395 |
chat_handler=Qwen3VLChatHandler(
|
| 396 |
-
clip_model_path=f"models/
|
| 397 |
force_reasoning=True,
|
| 398 |
image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
|
| 399 |
),
|
|
@@ -411,31 +411,7 @@ def respond(
|
|
| 411 |
print(x)
|
| 412 |
# delete_llama_model(llm_model_qwen)
|
| 413 |
yield str(x)
|
| 414 |
-
|
| 415 |
-
if llm_model_glm == None:
|
| 416 |
-
llm_model_glm = Llama(
|
| 417 |
-
model_path=f"models/{model}",
|
| 418 |
-
flash_attn=True,
|
| 419 |
-
n_gpu_layers=-1,
|
| 420 |
-
n_batch=2048, # increase
|
| 421 |
-
n_ctx=8196, # reduce if you don’t need 8k
|
| 422 |
-
n_threads=16, # set to your CPU cores
|
| 423 |
-
use_mlock=True,
|
| 424 |
-
verbose=True,
|
| 425 |
-
chat_format="chatml"
|
| 426 |
-
)
|
| 427 |
-
x=llm_model_glm.create_chat_completion(
|
| 428 |
-
messages = [
|
| 429 |
-
{"role": "system", "content": "hi"},
|
| 430 |
-
{
|
| 431 |
-
"role": "user",
|
| 432 |
-
"content": str(message)
|
| 433 |
-
}
|
| 434 |
-
]
|
| 435 |
-
)
|
| 436 |
-
# delete_llama_model(llm_model_glm)
|
| 437 |
-
print(x)
|
| 438 |
-
yield str(x)
|
| 439 |
|
| 440 |
|
| 441 |
|
|
@@ -507,7 +483,7 @@ demo = gr.ChatInterface(
|
|
| 507 |
# "gemma-2-9b-it-Q5_K_M.gguf",
|
| 508 |
# "gemma-2-27b-it-Q5_K_M.gguf",
|
| 509 |
# "2b_it_v2.gguf",
|
| 510 |
-
"GLM-4.7-Flash-Q8_0.gguf",
|
| 511 |
# "Qwen3-Coder-Next-Q4_K_M.gguf",
|
| 512 |
# "gpt-oss-20b-Q4_K_M.gguf",
|
| 513 |
# "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",
|
|
|
|
| 86 |
# token=huggingface_token
|
| 87 |
# )
|
| 88 |
|
| 89 |
+
# hf_hub_download(
|
| 90 |
+
# repo_id="unsloth/GLM-4.7-Flash-GGUF",
|
| 91 |
+
# filename="GLM-4.7-Flash-Q8_0.gguf",
|
| 92 |
+
# local_dir="./models",
|
| 93 |
+
# token=huggingface_token
|
| 94 |
+
# )
|
| 95 |
|
| 96 |
# hf_hub_download(
|
| 97 |
# repo_id="unsloth/gpt-oss-20b-GGUF",
|
|
|
|
| 125 |
)
|
| 126 |
|
| 127 |
|
| 128 |
+
hf_hub_download(
|
| 129 |
+
repo_id="Qwen/Qwen3-VL-8B-Thinking-GGUF",
|
| 130 |
+
filename="mmproj-Qwen3VL-8B-Thinking-F16.gguf",
|
| 131 |
+
local_dir="./models"
|
| 132 |
+
)
|
| 133 |
from huggingface_hub import snapshot_download
|
| 134 |
|
| 135 |
# snapshot_download(
|
|
|
|
| 393 |
use_mlock=True,
|
| 394 |
verbose=True,
|
| 395 |
chat_handler=Qwen3VLChatHandler(
|
| 396 |
+
clip_model_path=f"models/mmproj-Qwen3VL-8B-Thinking-F16.gguf",
|
| 397 |
force_reasoning=True,
|
| 398 |
image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
|
| 399 |
),
|
|
|
|
| 411 |
print(x)
|
| 412 |
# delete_llama_model(llm_model_qwen)
|
| 413 |
yield str(x)
|
| 414 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 415 |
|
| 416 |
|
| 417 |
|
|
|
|
| 483 |
# "gemma-2-9b-it-Q5_K_M.gguf",
|
| 484 |
# "gemma-2-27b-it-Q5_K_M.gguf",
|
| 485 |
# "2b_it_v2.gguf",
|
| 486 |
+
# "GLM-4.7-Flash-Q8_0.gguf",
|
| 487 |
# "Qwen3-Coder-Next-Q4_K_M.gguf",
|
| 488 |
# "gpt-oss-20b-Q4_K_M.gguf",
|
| 489 |
# "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",
|