import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the GGUF model from the HF Hub
model_path = hf_hub_download(
    repo_id="astegaras/lora_merged",
    filename="llama-3.2-3b-instruct.Q2_K.gguf"
)

# Load the GGUF model with settings sized for free-tier Space hardware
llm = Llama(
    model_path=model_path,
    n_ctx=4096,        # context window in tokens
    n_threads=4,       # match the Space's vCPU count
    n_batch=64,        # small batch keeps peak RAM down
    n_gpu_layers=0,    # IMPORTANT: CPU-only, basic Spaces have no GPU
    use_mmap=False,    # IMPORTANT: read the file into RAM instead of memory-mapping it
    use_mlock=False,   # IMPORTANT: don't pin pages; mlock is typically restricted in containers
    low_vram=True,     # IMPORTANT: legacy option from older llama-cpp-python; harmless if ignored
    verbose=False
)

def chat_fn(message, history):
    # Rebuild Gradio's tuple-style history as llama.cpp chat messages
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=256,    # cap the reply length
        temperature=0.2,   # low temperature for mostly deterministic answers
        top_p=0.5
    )
    reply = output["choices"][0]["message"]["content"]
    return reply
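
# Quick sanity check while developing (hypothetical prompt; uncomment to try
# chat_fn directly before wiring up the UI):
# print(chat_fn("Hello, what model are you?", []))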

# Gradio UI
chatbot = gr.ChatInterface(
    fn=chat_fn,
    title="Merged Kaggle Model (GGUF)",
    description="Running llama.cpp inference on a GGUF model",
)
chatbot.launch()
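
For the Space to build, the repo also needs a requirements.txt next to app.py. A minimal sketch (versions unpinned; note that llama-cpp-python compiles from source when no prebuilt wheel matches the Space's environment, so the first build can take a while):

gradio
llama-cpp-python
huggingface_hub

Because gr.ChatInterface also accepts generator functions, chat_fn can stream tokens as they are produced instead of blocking until the full reply is ready. A minimal streaming variant, assuming the same llm and message-building code as above (llama-cpp-python's stream=True yields OpenAI-style chunks with incremental "delta" objects):

def chat_fn_streaming(message, history):
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})
    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=256,
        temperature=0.2,
        top_p=0.5,
        stream=True,       # yield chunks as they are generated
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial  # Gradio re-renders the growing reply on each yield

Passing chat_fn_streaming as fn to gr.ChatInterface is enough to enable streaming; no other UI changes are needed.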