hermes3-llama-cpp

Running

File size: 948 Bytes

cbf8005
e9ddae9
cbf8005
398f222
 
689f1fc
e9ddae9
 
 
863eb49
e9ddae9
398f222
4323878
689f1fc
9df24f1
e9ddae9
9df24f1
 
 
 
 
398f222
9df24f1
398f222
de96a1d
e9ddae9
 
e6b8d52
689f1fc
 
 
de96a1d
807809c
 
 
 
 
689f1fc
e9ddae9
807809c

import gradio as gr
from ctransformers import AutoModelForCausalLM

# Model repository id and weights file name handed to the loader below.
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"

# Loader settings gathered in one place so they read as configuration.
_LOAD_OPTIONS = {
    "model_file": MODEL_FILE,
    "model_type": "llama",
    "gpu_layers": 0,
    # NOTE(review): the TinyLlama model card lists a 2048-token training
    # context -- confirm that 4096 is intended here.
    "context_length": 4096,
}

# Loaded once at import time; `respond` reuses this single instance.
llm = AutoModelForCausalLM.from_pretrained(MODEL_REPO, **_LOAD_OPTIONS)

# Role markers and end-of-turn token of the Zephyr-style chat template that
# the TinyLlama-1.1B-Chat-v1.0 model card documents.
_ROLE_TAGS = {
    "system": "<|system|>",
    "user": "<|user|>",
    "assistant": "<|assistant|>",
}
_END_OF_TURN = "</s>"


def _as_text(content) -> str:
    """Flatten one chat payload (str, dict, list of parts, or None) to text."""
    if content is None:
        return ""
    if isinstance(content, str):
        return content
    if isinstance(content, dict):
        # Only textual parts are kept; dicts without "text" contribute nothing.
        return str(content.get("text") or "")
    if isinstance(content, (list, tuple)):
        return "\n".join(filter(None, (_as_text(part) for part in content)))
    return str(content)


def _format_turn(role, content) -> str:
    """Render one finished turn, or "" for an unknown role or empty text."""
    tag = _ROLE_TAGS.get(role)
    text = _as_text(content)
    if tag is None or not text:
        return ""
    return f"{tag}\n{text}{_END_OF_TURN}\n"


def _build_prompt(message, history) -> str:
    """Assemble the full prompt from earlier turns plus the new message."""
    pieces = []
    for turn in history or ():
        if isinstance(turn, dict):
            # Role/content layout: {"role": ..., "content": ...}.
            pieces.append(_format_turn(turn.get("role"), turn.get("content")))
        elif isinstance(turn, (list, tuple)) and len(turn) >= 2:
            # Pair layout: (user_message, assistant_message). Either side may
            # be None, in which case that side is left out.
            pieces.append(_format_turn("user", turn[0]))
            pieces.append(_format_turn("assistant", turn[1]))
        # Anything else is not a recognisable turn and is ignored.

    # The new user message is always emitted, followed by an open assistant
    # tag so that the model writes the reply.
    pieces.append(f"{_ROLE_TAGS['user']}\n{_as_text(message)}{_END_OF_TURN}\n")
    pieces.append(f"{_ROLE_TAGS['assistant']}\n")
    return "".join(pieces)


def respond(message: str, history):
    """Generate the assistant's reply to ``message`` given the chat so far.

    Args:
        message: The newest user message.
        history: Earlier turns. Each turn may be a ``(user, assistant)`` pair
            or a ``{"role": ..., "content": ...}`` dict; turns in any other
            form are skipped. ``None`` or an empty sequence means no history.

    Returns:
        The generated reply with surrounding whitespace removed.
    """
    prompt = _build_prompt(message, history)

    out = llm(
        prompt,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
        # Stop as soon as the model closes its turn or starts a new user turn,
        # so the reply cannot run on into an invented conversation.
        stop=[_END_OF_TURN, _ROLE_TAGS["user"]],
    )

    # Accept both a plain value and a {"text": ...} mapping from the backend.
    if isinstance(out, dict) and "text" in out:
        reply = str(out["text"])
    else:
        reply = str(out)
    return reply.strip()

# Chat UI whose replies are produced by `respond`.
demo = gr.ChatInterface(fn=respond)

# Start the app only when this file is executed directly, not when imported.
if __name__ == "__main__":
    demo.launch()