Update app.py
Browse files
app.py
CHANGED
|
@@ -115,11 +115,11 @@ hf_hub_download(
|
|
| 115 |
# )
|
| 116 |
|
| 117 |
|
| 118 |
-
hf_hub_download(
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
|
| 122 |
-
)
|
| 123 |
|
| 124 |
|
| 125 |
# hf_hub_download(
|
|
@@ -129,13 +129,13 @@ hf_hub_download(
|
|
| 129 |
# )
|
| 130 |
from huggingface_hub import snapshot_download
|
| 131 |
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
|
| 140 |
|
| 141 |
|
|
@@ -197,7 +197,9 @@ print_tree("models")
|
|
| 197 |
|
| 198 |
|
| 199 |
llm = None
|
| 200 |
-
|
|
|
|
|
|
|
| 201 |
|
| 202 |
@spaces.GPU(duration=30)
|
| 203 |
def respond(
|
|
@@ -217,9 +219,22 @@ def respond(
|
|
| 217 |
|
| 218 |
global llm
|
| 219 |
global llm_model
|
|
|
|
| 220 |
|
| 221 |
-
if
|
| 222 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 223 |
model_path=f"models/{model}",
|
| 224 |
flash_attn=True,
|
| 225 |
n_gpu_layers=-1,
|
|
@@ -230,7 +245,8 @@ def respond(
|
|
| 230 |
verbose=True,
|
| 231 |
chat_format="chatml"
|
| 232 |
)
|
| 233 |
-
|
|
|
|
| 234 |
|
| 235 |
x=llm.create_chat_completion(
|
| 236 |
messages = [
|
|
@@ -313,11 +329,11 @@ demo = gr.ChatInterface(
|
|
| 313 |
# "Qwen3-Coder-Next-Q4_K_M.gguf",
|
| 314 |
# "gpt-oss-20b-Q4_K_M.gguf",
|
| 315 |
# "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",
|
| 316 |
-
"Qwen3-
|
| 317 |
# "Qwen3-VL-32B-Thinking-Q8_0.gguf",
|
| 318 |
# "Q8_0/gpt-oss-120b-Q8_0-00001-of-00002.gguf"
|
| 319 |
],
|
| 320 |
-
value="Qwen3-
|
| 321 |
label="Model",
|
| 322 |
),
|
| 323 |
gr.Textbox(
|
|
|
|
| 115 |
# )
|
| 116 |
|
| 117 |
|
| 118 |
+
# hf_hub_download(
|
| 119 |
+
# repo_id="unsloth/Qwen3-VL-32B-Thinking-GGUF",
|
| 120 |
+
# filename="Qwen3-VL-32B-Thinking-Q8_0.gguf",
|
| 121 |
+
# local_dir="./models"
|
| 122 |
+
# )
|
| 123 |
|
| 124 |
|
| 125 |
# hf_hub_download(
|
|
|
|
| 129 |
# )
|
| 130 |
from huggingface_hub import snapshot_download
|
| 131 |
|
| 132 |
+
snapshot_download(
|
| 133 |
+
repo_id="unsloth/Qwen3-Coder-Next-GGUF",
|
| 134 |
+
repo_type="model",
|
| 135 |
+
local_dir="./models/",
|
| 136 |
+
allow_patterns=["Q5_K_M/*"], # 👈 folder inside repo
|
| 137 |
+
token=huggingface_token # only if gated/private
|
| 138 |
+
)
|
| 139 |
|
| 140 |
|
| 141 |
|
|
|
|
| 197 |
|
| 198 |
|
| 199 |
llm = None
|
| 200 |
+
llm_model_glm = None
|
| 201 |
+
llm_model_qwen= None
|
| 202 |
+
|
| 203 |
|
| 204 |
@spaces.GPU(duration=30)
|
| 205 |
def respond(
|
|
|
|
| 219 |
|
| 220 |
global llm
|
| 221 |
global llm_model
|
| 222 |
+
|
| 223 |
|
| 224 |
+
if model is "Qwen3-VL-32B-Thinking-Q8_0.gguf" and llm_model_qwen is None:
|
| 225 |
+
llm_model_qwen = Llama(
|
| 226 |
+
model_path=f"models/{model}",
|
| 227 |
+
flash_attn=True,
|
| 228 |
+
n_gpu_layers=-1,
|
| 229 |
+
n_batch=2048, # increase
|
| 230 |
+
n_ctx=2048, # reduce if you don’t need 8k
|
| 231 |
+
n_threads=16, # set to your CPU cores
|
| 232 |
+
use_mlock=True,
|
| 233 |
+
verbose=True,
|
| 234 |
+
chat_format="qwen"
|
| 235 |
+
)
|
| 236 |
+
if model=="GLM-4.7-Flash-Q8_0.gguf" and llm_model_glm is None:
|
| 237 |
+
llm_model_qwen = Llama(
|
| 238 |
model_path=f"models/{model}",
|
| 239 |
flash_attn=True,
|
| 240 |
n_gpu_layers=-1,
|
|
|
|
| 245 |
verbose=True,
|
| 246 |
chat_format="chatml"
|
| 247 |
)
|
| 248 |
+
|
| 249 |
+
|
| 250 |
|
| 251 |
x=llm.create_chat_completion(
|
| 252 |
messages = [
|
|
|
|
| 329 |
# "Qwen3-Coder-Next-Q4_K_M.gguf",
|
| 330 |
# "gpt-oss-20b-Q4_K_M.gguf",
|
| 331 |
# "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",
|
| 332 |
+
"Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
|
| 333 |
# "Qwen3-VL-32B-Thinking-Q8_0.gguf",
|
| 334 |
# "Q8_0/gpt-oss-120b-Q8_0-00001-of-00002.gguf"
|
| 335 |
],
|
| 336 |
+
value="Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
|
| 337 |
label="Model",
|
| 338 |
),
|
| 339 |
gr.Textbox(
|