import spaces
import os
import json
import subprocess

# --- Monkey patch ------------------------------------------------------------
# Must be installed BEFORE any Llama() instance is created.
import llama_cpp._internals as internals

_original_close = internals.LlamaModel.close


def safe_close(self):
    """Best-effort replacement for ``LlamaModel.close``.

    Delegates to the original ``close`` only when the instance has a
    ``sampler`` attribute that is not ``None``; otherwise it does nothing.
    Any ``Exception`` raised along the way is swallowed.

    Returns:
        Whatever the original ``close`` returns, or ``None`` when the call is
        skipped or fails.
    """
    try:
        if getattr(self, "sampler", None) is None:
            # NOTE(review): the original close() is not run on this path, so
            # whatever it would release stays allocated -- confirm this is
            # intended.
            return None
        return _original_close(self)
    except Exception:
        return None


internals.LlamaModel.close = safe_close


def safe_del(self):
    """Finalizer for ``LlamaModel``: call ``close()`` and ignore any error."""
    try:
        self.close()
    except Exception:
        return


internals.LlamaModel.__del__ = safe_del

# --- Final verdict (models considered) ----------------------------------------
# - GLM 4.7 Flash (fast inference)
# - Qwen 3 VL
# - MiniMax 2.5
# - Qwen 3 Coder Next
# - gpt-oss 120B

from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download
import llama_cpp

# Show which llama_cpp build is actually loaded.
print(llama_cpp.__file__)
print(llama_cpp.__version__)

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# Earlier single-file model downloads, kept for reference.

# hf_hub_download(
#     repo_id="bartowski/gemma-2-9b-it-GGUF",
#     filename="gemma-2-9b-it-Q5_K_M.gguf",
#     local_dir="./models"
# )

# hf_hub_download(
#     repo_id="bartowski/gemma-2-27b-it-GGUF",
#     filename="gemma-2-27b-it-Q5_K_M.gguf",
#     local_dir="./models"
# )

# hf_hub_download(
#     repo_id="google/gemma-2-2b-it-GGUF",
#     filename="2b_it_v2.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )

# hf_hub_download(
#     repo_id="unsloth/GLM-4.7-Flash-GGUF",
#     filename="GLM-4.7-Flash-UD-Q8_K_XL.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )

# hf_hub_download(
#     repo_id="unsloth/gpt-oss-20b-GGUF",
#     filename="gpt-oss-20b-Q4_K_M.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )

# hf_hub_download(
#     repo_id="unsloth/gpt-oss-20b-GGUF",
#     filename="gpt-oss-20b-Q4_K_M.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )

# hf_hub_download(
#     repo_id="unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF",
filename="Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf", # local_dir="./models", # token=huggingface_token # ) # hf_hub_download( # repo_id="unsloth/Qwen3-VL-32B-Thinking-GGUF", # filename="Qwen3-VL-32B-Thinking-Q8_0.gguf", # local_dir="./models" # ) # hf_hub_download( # repo_id="unsloth/Qwen3-Coder-Next-GGUF", # filename="Qwen3-Coder-Next-Q4_K_M.gguf", # local_dir="./models" # ) from huggingface_hub import snapshot_download snapshot_download( repo_id="unsloth/MiniMax-M2.5-GGUF", repo_type="model", local_dir="./models/minmax", allow_patterns=["UD-IQ1_M/*"], # π folder inside repo token=huggingface_token # only if gated/private ) #### Deploy Minimax 2.5 insplace of gpt oss 120b its larger . and better and more recet leeases # snapshot_download( # repo_id="unsloth/gpt-oss-120b-GGUF", # repo_type="model", # local_dir="./models/", # allow_patterns=["Q8_0/*"], # π folder inside repo # token=huggingface_token # only if gated/private # ) # llm = Llama.from_pretrained( # repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S", # # ALWAYS first shard only here # filename="step3p5_flash_Q4_K_S-00001-of-00012.gguf", # # Download all shards # additional_files=[ # f"step3p5_flash_Q4_K_S-{i:05d}-of-00012.gguf" # for i in range(2, 13) # ], # local_dir="./models", # # Performance settings # flash_attn=True, # n_gpu_layers=-1, # use full GPU (if you have enough VRAM) # n_batch=2048, # n_ctx=4096, # 8000 is heavy unless needed # ) # llm = Llama.from_pretrained( # repo_id="stepfun-ai/Step-3.5-Flash-GGUF-Q4_K_S", # filename="step3p5_flash_Q4_K_S-00001-of-00012.gguf", # allow_patterns=["UD-TQ1_0/*.gguf"], # verbose=False # ) import os def print_tree(start_path="models"): for root, dirs, files in os.walk(start_path): level = root.replace(start_path, "").count(os.sep) indent = "β " * level print(f"{indent}βββ {os.path.basename(root)}/") subindent = "β " * (level + 1) for f in files: print(f"{subindent}βββ {f}") print_tree("models") llm = None llm_model = None @spaces.GPU(duration=120) def respond( 
message, history: list[tuple[str, str]], model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty, ): # chat_template = MessagesFormatterType.GEMMA_2 chat_template = MessagesFormatterType.CHATML # chat_template = global llm global llm_model if llm is None or llm_model != model: llm = Llama( model_path=f"models/{model}", flash_attn=True, n_gpu_layers=-1, n_batch=8196, # increase n_ctx=2048, # reduce if you donβt need 8k n_threads=16, # set to your CPU cores use_mlock=True, verbose=True, chat_format="chatml" ) llm_model = model x=llm.create_chat_completion( messages = [ {"role": "system", "content": "hi"}, { "role": "user", "content": str(message) } ] ) print(x) yield str(x) # provider = LlamaCppPythonProvider(llm) # agent = LlamaCppAgent( # provider, # system_prompt=f"{system_message}", # predefined_messages_formatter_type=chat_template, # debug_output=False # ) # settings = provider.get_provider_default_settings() # settings.temperature = temperature # settings.top_k = top_k # settings.top_p = top_p # settings.max_tokens = max_tokens # settings.repeat_penalty = repeat_penalty # # settings.stream = True # # settings.reasoning_effort ="low" # messages = BasicChatHistory() # for msn in history: # user = { # 'role': Roles.user, # 'content': msn[0] # } # assistant = { # 'role': Roles.assistant, # 'content': msn[1] # } # messages.add_message(user) # messages.add_message(assistant) # stream = agent.get_chat_response( # message, # # llm_sampling_settings=settings, # chat_history=messages, # # returns_streaming_generator=True, # print_output=False # ) # outputs = "" # for output in stream: # outputs += output # yield outputs description = """
Defaults to 2B (you can switch to 9B or 27B from additional inputs)