Spaces:
Sleeping
Sleeping
File size: 1,256 Bytes
b7adb02 251dafb e104971 485a33e e104971 23a8ec8 89d0feb 251dafb af8d9d1 e104971 af8d9d1 e104971 e497580 e104971 e497580 485a33e e497580 485a33e e104971 bbe2c8f e104971 62f86f8 e104971 e497580 805934c e497580 e104971 251dafb e497580 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Download your GGUF model from HF Hub.
# hf_hub_download caches the file locally (under ~/.cache/huggingface by
# default) and returns its absolute path; repeat runs reuse the cache.
model_path = hf_hub_download(
repo_id="astegaras/lora_merged",
filename="llama-3.2-3b-instruct.Q2_K.gguf"
)
# Load GGUF with safe HF settings.
# NOTE(review): the "IMPORTANT" flags force CPU-only inference with no
# mmap/mlock — presumably tuned for the constrained free HF Spaces tier;
# confirm before changing any of them.
llm = Llama(
model_path=model_path,
n_ctx=4096,  # context window size in tokens
n_threads=4,  # CPU threads used for generation
n_batch=64,  # prompt-eval batch size
n_gpu_layers=0, # IMPORTANT — 0 layers offloaded: pure CPU inference
use_mmap=False, # IMPORTANT — load weights into RAM instead of memory-mapping
use_mlock=False, # IMPORTANT — don't pin weights in RAM
low_vram=True, # IMPORTANT — reduce memory footprint
verbose=False
)
def chat_fn(message, history):
# Reformat history for llama.cpp chat template
messages = []
for user, assistant in history:
messages.append({"role": "user", "content": user})
messages.append({"role": "assistant", "content": assistant})
messages.append({"role": "user", "content": message})
output = llm.create_chat_completion(
messages=messages,
max_tokens=256,
temperature=0.2,
top_p=0.5
)
reply = output["choices"][0]["message"]["content"]
return reply
# Gradio UI: wires chat_fn(message, history) -> reply into a chat widget.
# launch() starts the web server and blocks; on HF Spaces this is the
# app entry point.
chatbot = gr.ChatInterface(
fn=chat_fn,
title="Merged Kaggle Model (GGUF)",
description="Running llama.cpp inference on GGUF model",
)
chatbot.launch()
|