rphrp1985 committed on
Commit
cada018
·
verified ·
1 Parent(s): c61ef02

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -38
app.py CHANGED
@@ -86,12 +86,12 @@ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
86
  # token=huggingface_token
87
  # )
88
 
89
- hf_hub_download(
90
- repo_id="unsloth/GLM-4.7-Flash-GGUF",
91
- filename="GLM-4.7-Flash-Q8_0.gguf",
92
- local_dir="./models",
93
- token=huggingface_token
94
- )
95
 
96
  # hf_hub_download(
97
  # repo_id="unsloth/gpt-oss-20b-GGUF",
@@ -125,11 +125,11 @@ hf_hub_download(
125
  )
126
 
127
 
128
- # hf_hub_download(
129
- # repo_id="unsloth/Qwen3-Coder-Next-GGUF",
130
- # filename="Qwen3-Coder-Next-Q4_K_M.gguf",
131
- # local_dir="./models"
132
- # )
133
  from huggingface_hub import snapshot_download
134
 
135
  # snapshot_download(
@@ -393,7 +393,7 @@ def respond(
393
  use_mlock=True,
394
  verbose=True,
395
  chat_handler=Qwen3VLChatHandler(
396
- clip_model_path=f"models/Qwen3-VL-32B-Thinking-Q8_0.gguf",
397
  force_reasoning=True,
398
  image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
399
  ),
@@ -411,31 +411,7 @@ def respond(
411
  print(x)
412
  # delete_llama_model(llm_model_qwen)
413
  yield str(x)
414
- if model=="GLM-4.7-Flash-Q8_0.gguf" :
415
- if llm_model_glm == None:
416
- llm_model_glm = Llama(
417
- model_path=f"models/{model}",
418
- flash_attn=True,
419
- n_gpu_layers=-1,
420
- n_batch=2048, # increase
421
- n_ctx=8196, # reduce if you don’t need 8k
422
- n_threads=16, # set to your CPU cores
423
- use_mlock=True,
424
- verbose=True,
425
- chat_format="chatml"
426
- )
427
- x=llm_model_glm.create_chat_completion(
428
- messages = [
429
- {"role": "system", "content": "hi"},
430
- {
431
- "role": "user",
432
- "content": str(message)
433
- }
434
- ]
435
- )
436
- # delete_llama_model(llm_model_glm)
437
- print(x)
438
- yield str(x)
439
 
440
 
441
 
@@ -507,7 +483,7 @@ demo = gr.ChatInterface(
507
  # "gemma-2-9b-it-Q5_K_M.gguf",
508
  # "gemma-2-27b-it-Q5_K_M.gguf",
509
  # "2b_it_v2.gguf",
510
- "GLM-4.7-Flash-Q8_0.gguf",
511
  # "Qwen3-Coder-Next-Q4_K_M.gguf",
512
  # "gpt-oss-20b-Q4_K_M.gguf",
513
  # "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",
 
86
  # token=huggingface_token
87
  # )
88
 
89
+ # hf_hub_download(
90
+ # repo_id="unsloth/GLM-4.7-Flash-GGUF",
91
+ # filename="GLM-4.7-Flash-Q8_0.gguf",
92
+ # local_dir="./models",
93
+ # token=huggingface_token
94
+ # )
95
 
96
  # hf_hub_download(
97
  # repo_id="unsloth/gpt-oss-20b-GGUF",
 
125
  )
126
 
127
 
128
+ hf_hub_download(
129
+ repo_id="Qwen/Qwen3-VL-8B-Thinking-GGUF",
130
+ filename="mmproj-Qwen3VL-8B-Thinking-F16.gguf",
131
+ local_dir="./models"
132
+ )
133
  from huggingface_hub import snapshot_download
134
 
135
  # snapshot_download(
 
393
  use_mlock=True,
394
  verbose=True,
395
  chat_handler=Qwen3VLChatHandler(
396
+ clip_model_path=f"models/mmproj-Qwen3VL-8B-Thinking-F16.gguf",
397
  force_reasoning=True,
398
  image_min_tokens=1024, # Note: Qwen-VL models require at minimum 1024 image tokens to function correctly on bbox grounding tasks
399
  ),
 
411
  print(x)
412
  # delete_llama_model(llm_model_qwen)
413
  yield str(x)
414
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
415
 
416
 
417
 
 
483
  # "gemma-2-9b-it-Q5_K_M.gguf",
484
  # "gemma-2-27b-it-Q5_K_M.gguf",
485
  # "2b_it_v2.gguf",
486
+ # "GLM-4.7-Flash-Q8_0.gguf",
487
  # "Qwen3-Coder-Next-Q4_K_M.gguf",
488
  # "gpt-oss-20b-Q4_K_M.gguf",
489
  # "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",