"""Gradio chat UI running llama.cpp inference on a GGUF model from the HF Hub."""

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Download the quantized GGUF model file from the Hugging Face Hub
# (cached locally after the first run).
model_path = hf_hub_download(
    repo_id="astegaras/lora_merged",
    filename="llama-3.2-3b-instruct.Q2_K.gguf",
)

# Load the GGUF model with conservative, CPU-only settings suitable for
# constrained hosts (e.g. free-tier HF Spaces).
llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=4,
    n_batch=64,
    n_gpu_layers=0,   # CPU-only inference
    use_mmap=False,   # avoid mmap on hosts where it causes OOM kills
    use_mlock=False,  # do not pin model pages in RAM
    # NOTE(review): `low_vram` was removed from recent llama-cpp-python
    # releases; it is passed through **kwargs and ignored there. Confirm
    # against the pinned llama-cpp-python version before relying on it.
    low_vram=True,
    verbose=False,
)


def chat_fn(message, history):
    """Generate one assistant reply for the Gradio ChatInterface.

    Args:
        message: The new user message (str).
        history: Prior turns. Supports both Gradio history shapes:
            a list of (user, assistant) pairs, or a list of
            {"role": ..., "content": ...} dicts (``type="messages"``).

    Returns:
        The assistant's reply text (str).
    """
    # Rebuild the conversation in OpenAI-style messages for llama.cpp's
    # chat-template handling.
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # Already in role/content form — pass through unchanged.
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # (user, assistant) tuple/list pair.
            user, assistant = turn
            messages.append({"role": "user", "content": user})
            messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=256,
        temperature=0.2,
        top_p=0.5,
    )
    return output["choices"][0]["message"]["content"]


# Gradio UI
chatbot = gr.ChatInterface(
    fn=chat_fn,
    title="Merged Kaggle Model (GGUF)",
    description="Running llama.cpp inference on GGUF model",
)

if __name__ == "__main__":
    # Guard so importing this module (e.g. for tests) doesn't start a server.
    chatbot.launch()