rphrp1985 committed on
Commit
20fbcf3
·
verified ·
1 Parent(s): 43ae1a4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +34 -18
app.py CHANGED
@@ -115,11 +115,11 @@ hf_hub_download(
115
  # )
116
 
117
 
118
- hf_hub_download(
119
- repo_id="unsloth/Qwen3-VL-32B-Thinking-GGUF",
120
- filename="Qwen3-VL-32B-Thinking-Q8_0.gguf",
121
- local_dir="./models"
122
- )
123
 
124
 
125
  # hf_hub_download(
@@ -129,13 +129,13 @@ hf_hub_download(
129
  # )
130
  from huggingface_hub import snapshot_download
131
 
132
- # snapshot_download(
133
- # repo_id="unsloth/MiniMax-M2.5-GGUF",
134
- # repo_type="model",
135
- # local_dir="./models/",
136
- # allow_patterns=["Q3_K_S/*"], # 👈 folder inside repo
137
- # token=huggingface_token # only if gated/private
138
- # )
139
 
140
 
141
 
@@ -197,7 +197,9 @@ print_tree("models")
197
 
198
 
199
  llm = None
200
- llm_model = None
 
 
201
 
202
  @spaces.GPU(duration=30)
203
  def respond(
@@ -217,9 +219,22 @@ def respond(
217
 
218
  global llm
219
  global llm_model
 
220
 
221
- if llm is None or llm_model != model:
222
- llm = Llama(
 
 
 
 
 
 
 
 
 
 
 
 
223
  model_path=f"models/{model}",
224
  flash_attn=True,
225
  n_gpu_layers=-1,
@@ -230,7 +245,8 @@ def respond(
230
  verbose=True,
231
  chat_format="chatml"
232
  )
233
- llm_model = model
 
234
 
235
  x=llm.create_chat_completion(
236
  messages = [
@@ -313,11 +329,11 @@ demo = gr.ChatInterface(
313
  # "Qwen3-Coder-Next-Q4_K_M.gguf",
314
  # "gpt-oss-20b-Q4_K_M.gguf",
315
  # "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",
316
- "Qwen3-VL-32B-Thinking-Q8_0.gguf",
317
  # "Qwen3-VL-32B-Thinking-Q8_0.gguf",
318
  # "Q8_0/gpt-oss-120b-Q8_0-00001-of-00002.gguf"
319
  ],
320
- value="Qwen3-VL-32B-Thinking-Q8_0.gguf",
321
  label="Model",
322
  ),
323
  gr.Textbox(
 
115
  # )
116
 
117
 
118
+ # hf_hub_download(
119
+ # repo_id="unsloth/Qwen3-VL-32B-Thinking-GGUF",
120
+ # filename="Qwen3-VL-32B-Thinking-Q8_0.gguf",
121
+ # local_dir="./models"
122
+ # )
123
 
124
 
125
  # hf_hub_download(
 
129
  # )
130
  from huggingface_hub import snapshot_download
131
 
132
+ snapshot_download(
133
+ repo_id="unsloth/Qwen3-Coder-Next-GGUF",
134
+ repo_type="model",
135
+ local_dir="./models/",
136
+ allow_patterns=["Q5_K_M/*"], # 👈 folder inside repo
137
+ token=huggingface_token # only if gated/private
138
+ )
139
 
140
 
141
 
 
197
 
198
 
199
  llm = None
200
+ llm_model_glm = None
201
+ llm_model_qwen= None
202
+
203
 
204
  @spaces.GPU(duration=30)
205
  def respond(
 
219
 
220
  global llm
221
  global llm_model
222
+
223
 
224
+ if model is "Qwen3-VL-32B-Thinking-Q8_0.gguf" and llm_model_qwen is None:
225
+ llm_model_qwen = Llama(
226
+ model_path=f"models/{model}",
227
+ flash_attn=True,
228
+ n_gpu_layers=-1,
229
+ n_batch=2048, # increase
230
+ n_ctx=2048, # reduce if you don’t need 8k
231
+ n_threads=16, # set to your CPU cores
232
+ use_mlock=True,
233
+ verbose=True,
234
+ chat_format="qwen"
235
+ )
236
+ if model=="GLM-4.7-Flash-Q8_0.gguf" and llm_model_glm is None:
237
+ llm_model_qwen = Llama(
238
  model_path=f"models/{model}",
239
  flash_attn=True,
240
  n_gpu_layers=-1,
 
245
  verbose=True,
246
  chat_format="chatml"
247
  )
248
+
249
+
250
 
251
  x=llm.create_chat_completion(
252
  messages = [
 
329
  # "Qwen3-Coder-Next-Q4_K_M.gguf",
330
  # "gpt-oss-20b-Q4_K_M.gguf",
331
  # "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",
332
+ "Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
333
  # "Qwen3-VL-32B-Thinking-Q8_0.gguf",
334
  # "Q8_0/gpt-oss-120b-Q8_0-00001-of-00002.gguf"
335
  ],
336
+ value="Q5_K_M/Qwen3-Coder-Next-Q5_K_M-00001-of-00003.gguf",
337
  label="Model",
338
  ),
339
  gr.Textbox(