Spaces:
Sleeping
Sleeping
File size: 1,256 Bytes
b7adb02 251dafb e104971 485a33e e104971 23a8ec8 89d0feb 251dafb af8d9d1 e104971 af8d9d1 e104971 e497580 e104971 e497580 485a33e e497580 485a33e e104971 bbe2c8f e104971 62f86f8 e104971 e497580 805934c e497580 e104971 251dafb e497580 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 |
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Download your GGUF model from HF Hub.
# hf_hub_download caches the file locally (under ~/.cache/huggingface by
# default) and returns its absolute path; repeat runs reuse the cache.
model_path = hf_hub_download(
repo_id="astegaras/lora_merged",
filename="llama-3.2-3b-instruct.Q2_K.gguf"
)
# Load GGUF with safe HF settings.
# NOTE(review): the "IMPORTANT" flags force CPU-only inference with no
# mmap/mlock — presumably tuned for the constrained free HF Spaces tier;
# confirm before changing any of them.
llm = Llama(
model_path=model_path,
n_ctx=4096,  # context window size in tokens
n_threads=4,  # CPU threads used for generation
n_batch=64,  # prompt-eval batch size
n_gpu_layers=0, # IMPORTANT — 0 layers offloaded: pure CPU inference
use_mmap=False, # IMPORTANT — load weights into RAM instead of memory-mapping
use_mlock=False, # IMPORTANT — don't pin weights in RAM
low_vram=True, # IMPORTANT — reduce memory footprint
verbose=False
)
def chat_fn(message, history):
# Reformat history for llama.cpp chat template
messages = []
for user, assistant in history:
messages.append({"role": "user", "content": user})
messages.append({"role": "assistant", "content": assistant})
messages.append({"role": "user", "content": message})
output = llm.create_chat_completion(
messages=messages,
max_tokens=256,
temperature=0.2,
top_p=0.5
)
reply = output["choices"][0]["message"]["content"]
return reply
# Gradio UI: wires chat_fn(message, history) -> reply into a chat widget.
# launch() starts the web server and blocks; on HF Spaces this is the
# app entry point.
chatbot = gr.ChatInterface(
fn=chat_fn,
title="Merged Kaggle Model (GGUF)",
description="Running llama.cpp inference on GGUF model",
)
chatbot.launch()
|