import spaces
import os

# These flags are placed in the environment before llama_cpp is imported
# further down; keep this ordering.
os.environ["LLAMA_CUBLAS"] = "1"
os.environ["GGML_CUDA_FORCE_DMMV"] = "1"

import json
import subprocess

### monkey patch
import llama_cpp._internals as internals
from llama_cpp.llama_chat_format import Qwen3VLChatHandler

# Step 2: monkey patch BEFORE creating Llama()
_original_close = internals.LlamaModel.close


def safe_close(self):
    """Replacement for ``LlamaModel.close`` that never raises.

    The original ``close`` is only invoked when the instance has a
    ``sampler`` attribute whose value is not ``None``; in every other case
    this is a no-op. Any exception raised along the way is swallowed.

    Returns:
        Whatever the original ``close`` returns, or ``None`` when it was
        skipped or failed.
    """
    # NOTE(review): when ``sampler`` is missing or None the original close is
    # skipped entirely, so whatever cleanup it performs does not run --
    # confirm against the installed llama_cpp version that this is intended.
    try:
        sampler = getattr(self, "sampler", None)
        if sampler is None:
            return None
        return _original_close(self)
    except Exception:
        return None


internals.LlamaModel.close = safe_close


def safe_del(self):
    """Replacement for ``LlamaModel.__del__``: close and ignore any error."""
    try:
        self.close()
    except Exception:
        return


internals.LlamaModel.__del__ = safe_del

##### final verdict
# GLM 4.7 Flash: fast inference
# Qwen 3 VL
# MiniMax 2.5
# Qwen 3 Coder Next
# gpt-oss 120B

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download
import llama_cpp

# Show which llama_cpp build was actually picked up.
print(llama_cpp.__file__)
print(llama_cpp.__version__)

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# hf_hub_download(
#     repo_id="bartowski/gemma-2-9b-it-GGUF",
#     filename="gemma-2-9b-it-Q5_K_M.gguf",
#     local_dir="./models"
# )

# hf_hub_download(
#     repo_id="bartowski/gemma-2-27b-it-GGUF",
#     filename="gemma-2-27b-it-Q5_K_M.gguf",
#     local_dir="./models"
# )

# hf_hub_download(
#     repo_id="google/gemma-2-2b-it-GGUF",
#     filename="2b_it_v2.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )

# hf_hub_download(
#     repo_id="unsloth/GLM-4.7-Flash-GGUF",
#     filename="GLM-4.7-Flash-Q8_0.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )

# hf_hub_download(
#     repo_id="unsloth/gpt-oss-20b-GGUF",
#     filename="gpt-oss-20b-Q4_K_M.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )

# hf_hub_download(
#     repo_id="unsloth/gpt-oss-20b-GGUF",
#     filename="gpt-oss-20b-Q4_K_M.gguf",
#     local_dir="./models",
token=huggingface_token # ) # hf_hub_download( # repo_id="unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF", # filename="Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf", # local_dir="./models", # token=huggingface_token # ) # hf_hub_download( # repo_id="unsloth/Qwen3-VL-32B-Thinking-GGUF", # filename="Qwen3-VL-32B-Thinking-Q8_0.gguf", # local_dir="./models" # ) # hf_hub_download( # repo_id="unsloth/Qwen3-Coder-Next-GGUF", # filename="Qwen3-Coder-Next-Q4_K_M.gguf", # local_dir="./models" # ) from huggingface_hub import snapshot_download # snapshot_download( # repo_id="unsloth/Qwen3-Coder-Next-GGUF", # repo_type="model", # local_dir="./models/", # allow_patterns=["Q5_K_M/*"], # π folder inside repo # token=huggingface_token # only if gated/private # ) #### Deploy Minimax 2.5 insplace of gpt oss 120b its larger . and better and more recet leeases snapshot_download( repo_id="unsloth/Qwen3-Coder-Next-GGUF", repo_type="model", local_dir="./models/", allow_patterns=["Q6_K/*"], # π folder inside repo token=huggingface_token # only if gated/private ) # llm = Llama.from_pretrained( # repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S", # # ALWAYS first shard only here # filename="step3p5_flash_Q4_K_S-00001-of-00012.gguf", # # Download all shards # additional_files=[ # f"step3p5_flash_Q4_K_S-{i:05d}-of-00012.gguf" # for i in range(2, 13) # ], # local_dir="./models", # # Performance settings # flash_attn=True, # n_gpu_layers=-1, # use full GPU (if you have enough VRAM) # n_batch=2048, # n_ctx=4096, # 8000 is heavy unless needed # ) # llm = Llama.from_pretrained( # repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S", # filename="step3p5_flash_Q4_K_S-00001-of-00012.gguf", # allow_patterns=["UD-TQ1_0/*.gguf"], # verbose=False # ) import os def print_tree(start_path="models"): for root, dirs, files in os.walk(start_path): level = root.replace(start_path, "").count(os.sep) indent = "β " * level print(f"{indent}βββ {os.path.basename(root)}/") subindent = "β " * (level + 1) for f in files: 
print(f"{subindent}βββ {f}") print_tree("models") import gc import torch def delete_llama_model(llm): # global llm if llm is not None: try: llm.close() # π₯ VERY IMPORTANT except Exception as e: print("Close error:", e) llm = None # Force Python garbage collection gc.collect() # Clear GPU cache (if using CUDA) try: torch.cuda.empty_cache() torch.cuda.ipc_collect() torch.cuda.synchronize() except: pass print("Model fully unloaded.") llm = None llm_model_glm = None llm_model_qwen= None @spaces.GPU(duration=120) def respond( message, history: list[tuple[str, str]], model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty, ): # chat_template = MessagesFormatterType.GEMMA_2 chat_template = MessagesFormatterType.CHATML # chat_template = global llm global llm_model global llm_model_glm global llm_model_qwen if llm_model_glm == None: llm_model_glm = Llama( model_path=f"models/{model}", flash_attn=True, n_gpu_layers=-1, n_batch=2048, # increase n_ctx=8196, # reduce if you donβt need 8k n_threads=16, # set to your CPU cores use_mlock=True, verbose=True, chat_format="chatml" ) x=llm_model_glm.create_chat_completion( messages = [ {"role": "system", "content": "hi"}, { "role": "user", "content": str(message) } ] ) # delete_llama_model(llm_model_glm) print(x) yield str(x) # provider = LlamaCppPythonProvider(llm) # agent = LlamaCppAgent( # provider, # system_prompt=f"{system_message}", # predefined_messages_formatter_type=chat_template, # debug_output=False # ) # settings = provider.get_provider_default_settings() # settings.temperature = temperature # settings.top_k = top_k # settings.top_p = top_p # settings.max_tokens = max_tokens # settings.repeat_penalty = repeat_penalty # # settings.stream = True # # settings.reasoning_effort ="low" # messages = BasicChatHistory() # for msn in history: # user = { # 'role': Roles.user, # 'content': msn[0] # } # assistant = { # 'role': Roles.assistant, # 'content': msn[1] # } # messages.add_message(user) # 
messages.add_message(assistant) # stream = agent.get_chat_response( # message, # # llm_sampling_settings=settings, # chat_history=messages, # # returns_streaming_generator=True, # print_output=False # ) # outputs = "" # for output in stream: # outputs += output # yield outputs description = """
Defaults to 2B (you can switch to 9B or 27B from additional inputs)