import spaces
import os
import json
import subprocess

### Monkey patch: make llama_cpp model teardown best-effort so that GC /
### interpreter shutdown never raises from inside the native bindings.
import llama_cpp._internals as internals

# Patch BEFORE any Llama() instance is created.
_original_close = internals.LlamaModel.close


def safe_close(self):
    # Delegate to the real close only when a sampler is attached; swallow
    # every teardown error — a failed close at exit is harmless here.
    try:
        if hasattr(self, "sampler") and self.sampler is not None:
            return _original_close(self)
    except Exception:
        pass


internals.LlamaModel.close = safe_close


def safe_del(self):
    # Best-effort finalizer: __del__ must never let an exception escape.
    try:
        self.close()
    except Exception:
        pass


internals.LlamaModel.__del__ = safe_del

##### final verdict — candidates considered:
#   GLM 4.7 flash (fast inference), Qwen 3 VL, MiniMax 2.5,
#   Qwen 3 coder next, gpt-oss 120B
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download
import llama_cpp

print(llama_cpp.__file__)
print(llama_cpp.__version__)

huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# Previously-used single-file downloads, kept for reference:
# hf_hub_download(repo_id="bartowski/gemma-2-9b-it-GGUF", filename="gemma-2-9b-it-Q5_K_M.gguf", local_dir="./models")
# hf_hub_download(repo_id="bartowski/gemma-2-27b-it-GGUF", filename="gemma-2-27b-it-Q5_K_M.gguf", local_dir="./models")
# hf_hub_download(repo_id="google/gemma-2-2b-it-GGUF", filename="2b_it_v2.gguf", local_dir="./models", token=huggingface_token)
# hf_hub_download(repo_id="unsloth/GLM-4.7-Flash-GGUF", filename="GLM-4.7-Flash-UD-Q8_K_XL.gguf", local_dir="./models", token=huggingface_token)
# hf_hub_download(repo_id="unsloth/gpt-oss-20b-GGUF", filename="gpt-oss-20b-Q4_K_M.gguf", local_dir="./models", token=huggingface_token)
# hf_hub_download(
#     repo_id="unsloth/Qwen3-Next-80B-A3B-Instruct-GGUF",
#     filename="Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf",
#     local_dir="./models",
#     token=huggingface_token,
# )
# hf_hub_download(repo_id="unsloth/Qwen3-VL-32B-Thinking-GGUF", filename="Qwen3-VL-32B-Thinking-Q8_0.gguf", local_dir="./models")
# hf_hub_download(repo_id="unsloth/Qwen3-Coder-Next-GGUF", filename="Qwen3-Coder-Next-Q4_K_M.gguf", local_dir="./models")

from huggingface_hub import snapshot_download

# Fetch only the UD-IQ1_M quantization folder of the (multi-shard) MiniMax
# M2.5 GGUF repo into ./models/minmax.
snapshot_download(
    repo_id="unsloth/MiniMax-M2.5-GGUF",
    repo_type="model",
    local_dir="./models/minmax",
    allow_patterns=["UD-IQ1_M/*"],  # folder inside the repo
    token=huggingface_token,        # only needed if gated/private
)

# MiniMax 2.5 deployed in place of gpt-oss-120b: larger, better, more recent.
# (The gpt-oss-120b snapshot_download and the sharded Step-3.5-Flash loaders
# that used to live here were commented out; recover them from git history
# if needed.)


def print_tree(start_path: str = "models") -> None:
    """Print an ASCII tree of *start_path* (directories and files) for debugging.

    Uses ``os.path.relpath`` to compute depth: the previous
    ``root.replace(start_path, "")`` replaced EVERY occurrence of the prefix,
    mis-indenting any path that repeats it (e.g. ``models/models``).
    """
    for root, dirs, files in os.walk(start_path):
        rel = os.path.relpath(root, start_path)
        level = 0 if rel == "." else rel.count(os.sep) + 1
        indent = "│   " * level
        print(f"{indent}├── {os.path.basename(root)}/")
        subindent = "│   " * (level + 1)
        for f in files:
            print(f"{subindent}├── {f}")


print_tree("models")

# Lazily-created Llama handle and the model filename it was built from;
# populated inside respond() on first use.
llm = None
llm_model = None
@spaces.GPU(duration=120) def respond( message, history: list[tuple[str, str]], model, system_message, max_tokens, temperature, top_p, top_k, repeat_penalty, ): # chat_template = MessagesFormatterType.GEMMA_2 chat_template = MessagesFormatterType.CHATML # chat_template = global llm global llm_model if llm is None or llm_model != model: llm = Llama( model_path=f"models/{model}", flash_attn=True, n_gpu_layers=-1, n_batch=8196, # increase n_ctx=2048, # reduce if you don’t need 8k n_threads=16, # set to your CPU cores use_mlock=True, verbose=True, chat_format="chatml" ) llm_model = model x=llm.create_chat_completion( messages = [ {"role": "system", "content": "hi"}, { "role": "user", "content": str(message) } ] ) print(x) yield str(x) # provider = LlamaCppPythonProvider(llm) # agent = LlamaCppAgent( # provider, # system_prompt=f"{system_message}", # predefined_messages_formatter_type=chat_template, # debug_output=False # ) # settings = provider.get_provider_default_settings() # settings.temperature = temperature # settings.top_k = top_k # settings.top_p = top_p # settings.max_tokens = max_tokens # settings.repeat_penalty = repeat_penalty # # settings.stream = True # # settings.reasoning_effort ="low" # messages = BasicChatHistory() # for msn in history: # user = { # 'role': Roles.user, # 'content': msn[0] # } # assistant = { # 'role': Roles.assistant, # 'content': msn[1] # } # messages.add_message(user) # messages.add_message(assistant) # stream = agent.get_chat_response( # message, # # llm_sampling_settings=settings, # chat_history=messages, # # returns_streaming_generator=True, # print_output=False # ) # outputs = "" # for output in stream: # outputs += output # yield outputs description = """

Defaults to 2B (you can switch to 9B or 27B from additional inputs)

[27B it Model] [9B it Model] [2B it Model] [27B it Model GGUF] [9B it Model GGUF] [2B it Model GGUF]

""" import gradio as gr demo = gr.ChatInterface( fn=respond, additional_inputs=[ gr.Dropdown( [ # "gemma-2-9b-it-Q5_K_M.gguf", # "gemma-2-27b-it-Q5_K_M.gguf", # # "2b_it_v2.gguf", # "GLM-4.7-Flash-UD-Q8_K_XL.gguf", # "Qwen3-Coder-Next-Q4_K_M.gguf", # "gpt-oss-20b-Q4_K_M.gguf", # "Qwen3-Next-80B-A3B-Instruct-Q4_K_M.gguf", "minmax/UD-IQ1_M/MiniMax-M2.5-UD-IQ1_M-00001-of-00003.gguf", # "Qwen3-VL-32B-Thinking-Q8_0.gguf", # "Q8_0/gpt-oss-120b-Q8_0-00001-of-00002.gguf" ], value= "minmax/UD-IQ1_M/MiniMax-M2.5-UD-IQ1_M-00001-of-00003.gguf", label="Model", ), gr.Textbox( value="You are a helpful assistant.", label="System message", ), gr.Slider(1, 4096, value=2048, step=1, label="Max tokens"), gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"), gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"), gr.Slider(0, 100, value=40, step=1, label="Top-k"), gr.Slider(0.0, 2.0, value=1.1, step=0.1, label="Repetition penalty"), ], title="Chat with Gemma 2 using llama.cpp", description=description, ) # demo.launch() if __name__ == "__main__": demo.launch()