| import spaces | |
| import os | |
| import json | |
| import subprocess | |
# --- Monkey patch: make LlamaModel teardown crash-proof. ---
# Must be installed BEFORE the first Llama() is constructed.
import llama_cpp._internals as internals

_original_close = internals.LlamaModel.close


def safe_close(self):
    """Close the model via the original close(), but never raise.

    The original close() is only invoked when a live ``sampler`` attribute
    is present; any error during teardown is swallowed so garbage collection
    and interpreter shutdown cannot blow up.
    """
    try:
        if hasattr(self, "sampler") and self.sampler is not None:
            return _original_close(self)
    except Exception:
        pass


def safe_del(self):
    """Best-effort destructor: delegate to close(), suppressing all errors."""
    try:
        self.close()
    except Exception:
        pass


internals.LlamaModel.close = safe_close
internals.LlamaModel.__del__ = safe_del
##### Final verdict — candidate models considered:
# GLM-4.7-Flash (fast inference)
# Qwen3-VL
# MiniMax-M2.5
# Qwen3-Coder-Next
# gpt-oss-120B
# Third-party imports: model runtime, agent helpers, UI, and hub downloads.
from llama_cpp import Llama
from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
from llama_cpp_agent.providers import LlamaCppPythonProvider
from llama_cpp_agent.chat_history import BasicChatHistory
from llama_cpp_agent.chat_history.messages import Roles
import gradio as gr
from huggingface_hub import hf_hub_download
import llama_cpp

# Diagnostics: confirm which llama_cpp build is actually loaded.
for _info in (llama_cpp.__file__, llama_cpp.__version__):
    print(_info)
# Hub auth token; None is fine for public repos (only needed if gated/private).
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")

# History: earlier revisions downloaded Gemma-2 (2B/9B/27B), GLM-4.7-Flash,
# gpt-oss-20b/120b, Qwen3-Next-80B, Qwen3-VL-32B, Qwen3-Coder-Next, and
# Step-3.5-Flash. They were retired in favour of MiniMax-M2.5, which is
# larger, stronger, and a more recent release.
from huggingface_hub import snapshot_download

# Fetch only the UD-IQ1_M quant folder from the MiniMax-M2.5 GGUF repo.
snapshot_download(
    repo_id="unsloth/MiniMax-M2.5-GGUF",
    repo_type="model",
    local_dir="./models/minmax",
    allow_patterns=["UD-IQ1_M/*"],  # restrict download to this sub-folder
    token=huggingface_token,
)
import os


def print_tree(start_path="models"):
    """Print a tree-style listing of *start_path* to stdout.

    Directories are walked top-down; each directory and file is printed with
    a box-drawing prefix whose depth reflects its level below *start_path*.
    Prints nothing if *start_path* does not exist (os.walk yields nothing).
    """
    for root, _dirs, files in os.walk(start_path):
        # Depth via relpath: the previous `root.replace(start_path, "")`
        # trick miscounts whenever the start_path string recurs deeper in
        # the tree (e.g. "models/models").
        rel = os.path.relpath(root, start_path)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = "│ " * level
        print(f"{indent}├── {os.path.basename(root)}/")
        subindent = "│ " * (level + 1)
        for name in files:
            print(f"{subindent}├── {name}")


print_tree("models")
llm = None        # cached llama_cpp.Llama instance, reused across requests
llm_model = None  # model filename (relative to ./models) currently loaded


def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Generate one chat reply with llama.cpp (generator, as Gradio expects).

    Parameters mirror the ChatInterface additional_inputs: the model file,
    the system prompt, and the sampling sliders. Yields the assistant's
    reply text once the completion finishes.
    """
    global llm, llm_model

    # (Re)load the GGUF model only when the dropdown selection changes.
    if llm is None or llm_model != model:
        llm = Llama(
            model_path=f"models/{model}",
            flash_attn=True,
            n_gpu_layers=-1,  # offload all layers to GPU
            n_batch=8192,     # was 8196 — batch sizes are conventionally powers of two
            n_ctx=2048,
            n_threads=16,     # assumes a 16-core host — TODO confirm
            use_mlock=True,
            verbose=True,
            chat_format="chatml",
        )
        llm_model = model

    # Build the full conversation. Previously the system prompt was
    # hard-coded to "hi" and history plus every sampling slider were
    # silently ignored; all are now wired through.
    messages = [{"role": "system", "content": str(system_message)}]
    for user_turn, assistant_turn in history:
        if user_turn:
            messages.append({"role": "user", "content": str(user_turn)})
        if assistant_turn:
            messages.append({"role": "assistant", "content": str(assistant_turn)})
    messages.append({"role": "user", "content": str(message)})

    completion = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=repeat_penalty,
    )
    print(completion)  # keep a server-side trace of the raw response
    yield completion["choices"][0]["message"]["content"]
# HTML blurb rendered beneath the ChatInterface title.
# NOTE(review): this copy still advertises the Gemma-2 2B/9B/27B checkpoints,
# but the model dropdown below only offers MiniMax-M2.5 — confirm and update.
description = """<p align="center">Defaults to 2B (you can switch to 9B or 27B from additional inputs)</p>
<p><center>
<a href="https://huggingface.co/google/gemma-2-27b-it" target="_blank">[27B it Model]</a>
<a href="https://huggingface.co/google/gemma-2-9b-it" target="_blank">[9B it Model]</a>
<a href="https://huggingface.co/google/gemma-2-2b-it" target="_blank">[2B it Model]</a>
<a href="https://huggingface.co/bartowski/gemma-2-27b-it-GGUF" target="_blank">[27B it Model GGUF]</a>
<a href="https://huggingface.co/bartowski/gemma-2-9b-it-GGUF" target="_blank">[9B it Model GGUF]</a>
<a href="https://huggingface.co/google/gemma-2-2b-it-GGUF" target="_blank">[2B it Model GGUF]</a>
</center></p>
"""
# `gr` is already imported at the top of the file; the duplicate
# `import gradio as gr` that used to sit here was removed.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        # Only MiniMax-M2.5 is currently downloaded; the retired Gemma-2 /
        # GLM / gpt-oss / Qwen3 entries were dropped from the choice list.
        gr.Dropdown(
            [
                "minmax/UD-IQ1_M/MiniMax-M2.5-UD-IQ1_M-00001-of-00003.gguf",
            ],
            value="minmax/UD-IQ1_M/MiniMax-M2.5-UD-IQ1_M-00001-of-00003.gguf",
            label="Model",
        ),
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System message",
        ),
        gr.Slider(1, 4096, value=2048, step=1, label="Max tokens"),
        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
        gr.Slider(0, 100, value=40, step=1, label="Top-k"),
        gr.Slider(0.0, 2.0, value=1.1, step=0.1, label="Repetition penalty"),
    ],
    # Title previously said "Gemma 2", contradicting the served model.
    title="Chat with MiniMax-M2.5 using llama.cpp",
    description=description,
)

if __name__ == "__main__":
    demo.launch()