| | import spaces |
| | import os |
| | import json |
| | import subprocess |
| |
|
| | |
| |
|
| | import llama_cpp._internals as internals |
| |
|
| |
|
| | |
| |
|
# Keep a reference to the original LlamaModel.close so the patched
# replacement installed below can still delegate to it.
_original_close = internals.LlamaModel.close
| |
|
def safe_close(self):
    """Close the model only when a sampler is still attached.

    Workaround for llama-cpp-python teardown crashes: the original
    ``LlamaModel.close`` runs only if ``self.sampler`` exists and is not
    ``None``; any exception raised while closing is swallowed so interpreter
    shutdown never fails here.
    """
    try:
        if getattr(self, "sampler", None) is not None:
            return _original_close(self)
    except Exception:
        # Best-effort cleanup — never let close() propagate.
        pass
| |
|
# Install the guarded close in place of the library's implementation.
internals.LlamaModel.close = safe_close
| |
|
| |
|
def safe_del(self):
    """Finalizer that closes the model while suppressing every error.

    Replaces ``LlamaModel.__del__`` so that exceptions raised while the
    interpreter (or the llama.cpp backend) is tearing down never surface.
    """
    try:
        self.close()
    except Exception:
        # __del__ must never raise; swallow all cleanup failures.
        pass


# Install the guarded finalizer on the llama-cpp-python model class.
internals.LlamaModel.__del__ = safe_del
| |
|
| |
|
| |
|
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| |
|
| | from llama_cpp import Llama |
| | from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType |
| | from llama_cpp_agent.providers import LlamaCppPythonProvider |
| | from llama_cpp_agent.chat_history import BasicChatHistory |
| | from llama_cpp_agent.chat_history.messages import Roles |
| | import gradio as gr |
| | from huggingface_hub import hf_hub_download |
| |
|
import llama_cpp
# Debug output: show which llama_cpp build/version is actually in use.
print(llama_cpp.__file__)
print(llama_cpp.__version__)

# Token for downloading gated/private files from the Hugging Face Hub.
# NOTE(review): may be None when the env var is unset; downloads then
# proceed anonymously.
huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| | from huggingface_hub import snapshot_download |
| |
|
| |
|
| |
|
# Download only the Q6_K split-GGUF shards of Qwen3-Next-80B-A3B-Thinking
# into ./models/ (files already present locally are reused).
snapshot_download(
    repo_id="unsloth/Qwen3-Next-80B-A3B-Thinking-GGUF",
    repo_type="model",
    local_dir="./models/",
    allow_patterns=["Q6_K/*"],  # restrict to the Q6_K quantization folder
    token=huggingface_token
)
| |
|
| |
|
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| |
|
| |
|
| | |
| | |
| |
|
| | |
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| |
|
| | |
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
import os


def print_tree(start_path="models"):
    """Print a box-drawing tree view of *start_path* for debugging.

    Walks the directory with ``os.walk`` and prints one line per directory
    and per file, indented by depth. A nonexistent path prints nothing
    (``os.walk`` yields no entries for it).
    """
    for root, dirs, files in os.walk(start_path):
        # Fix: the previous str.replace-based depth count miscounts whenever
        # the root's name reappears deeper in a path; use relpath instead.
        rel = os.path.relpath(root, start_path)
        level = 0 if rel == os.curdir else rel.count(os.sep) + 1
        indent = "│   " * level
        print(f"{indent}├── {os.path.basename(root)}/")

        subindent = "│   " * (level + 1)
        for f in files:
            print(f"{subindent}├── {f}")


# Show what was actually downloaded into ./models/.
print_tree("models")
| |
|
| |
|
# Cache slots for a loaded model and its filename.
# NOTE(review): respond() below rebuilds a fresh Llama instance on every call
# and never reads or writes these globals — confirm whether caching was
# intended here.
llm = None
llm_model = None
| |
|
| |
|
| | import json |
| |
|
def str_to_json(str_obj):
    """Deserialize the JSON document in *str_obj* and return the result.

    Raises ``json.JSONDecodeError`` (and ``TypeError`` for non-string input)
    exactly as ``json.loads`` does.
    """
    return json.loads(str_obj)
| |
|
| |
|
| |
|
@spaces.GPU(duration=100)
def respond(
    message,
    history: list[tuple[str, str]],
    model,
    system_message,
    max_tokens,
    temperature,
    top_p,
    top_k,
    repeat_penalty,
):
    """Generate one chat completion for *message* and yield it as a string.

    *message* may be either:

    * a JSON document ``{"messages": [...], "max_token": int}`` — the
      messages list is forwarded verbatim and ``max_token`` sizes the model
      context window, or
    * plain text, which is wrapped as a single user message with a fallback
      context size of 8196 tokens.

    A fresh ``Llama`` instance is built per request because ``n_ctx`` depends
    on the request, then a single non-streaming completion is produced.

    Fix over the previous revision: the UI sampling controls (*max_tokens*,
    *temperature*, *top_p*, *top_k*, *repeat_penalty*) were accepted but
    silently ignored; they are now forwarded to ``create_chat_completion``.
    *history* and *system_message* remain unused because the JSON protocol
    carries the full conversation.
    """
    try:
        payload = str_to_json(message)
        messages = payload["messages"]
        max_token = payload["max_token"]
    except Exception as e:
        # Not the JSON protocol: treat the raw text as a single user turn.
        print(e)
        print("not valid json")
        # NOTE(review): 8196 looks like a typo for 8192 — confirm intent.
        max_token = 8196
        messages = [
            {
                "role": "user",
                "content": str(message),
            }
        ]

    llm = Llama(
        model_path=f"models/{model}",
        flash_attn=False,
        n_gpu_layers=-1,          # offload every layer to the GPU
        n_batch=2048,
        n_ctx=max_token,          # context window sized per request
        n_threads=os.cpu_count(),
        verbose=True,
        chat_format="qwen",
    )

    completion = llm.create_chat_completion(
        messages=messages,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=top_k,
        repeat_penalty=repeat_penalty,
    )
    print(completion)
    yield str(completion["choices"][0]["message"]["content"])
| |
|
| | |
| |
|
# UI blurb shown under the chat title.
# Fix: the previous text still advertised the old Gemma 2 demo, but this
# Space downloads and serves Qwen3-Next-80B-A3B-Thinking (Q6_K GGUF).
description = """<p align="center">Qwen3-Next-80B-A3B-Thinking (Q6_K GGUF) served with llama.cpp</p>
<p><center>
<a href="https://huggingface.co/Qwen/Qwen3-Next-80B-A3B-Thinking" target="_blank">[Original Model]</a>
<a href="https://huggingface.co/unsloth/Qwen3-Next-80B-A3B-Thinking-GGUF" target="_blank">[GGUF Quantizations]</a>
</center></p>
"""
| | import gradio as gr |
| |
|
# Gradio chat UI. The dropdown lists the first shard of the split Q6_K GGUF;
# llama.cpp discovers the remaining -0000N-of-0000M shards automatically.
demo = gr.ChatInterface(
    fn=respond,
    additional_inputs=[
        gr.Dropdown(
            [
                "Q6_K/Qwen3-Next-80B-A3B-Thinking-Q6_K-00001-of-00002.gguf",
            ],
            value="Q6_K/Qwen3-Next-80B-A3B-Thinking-Q6_K-00001-of-00002.gguf",
            label="Model",
        ),
        gr.Textbox(
            value="You are a helpful assistant.",
            label="System message",
        ),
        gr.Slider(1, 4096, value=2048, step=1, label="Max tokens"),
        gr.Slider(0.1, 4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(0.1, 1.0, value=0.95, step=0.05, label="Top-p"),
        gr.Slider(0, 100, value=40, step=1, label="Top-k"),
        gr.Slider(0.0, 2.0, value=1.1, step=0.1, label="Repetition penalty"),
    ],
    # Fix: title previously said "Gemma 2" although the Space serves Qwen.
    title="Chat with Qwen3-Next-80B-A3B-Thinking using llama.cpp",
    description=description,
)
| |
|
| |
|
| | |
| |
|
# Start the Gradio server only when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()