File size: 948 Bytes
cbf8005
e9ddae9
cbf8005
398f222
 
689f1fc
e9ddae9
 
 
863eb49
e9ddae9
398f222
4323878
689f1fc
9df24f1
e9ddae9
9df24f1
 
 
 
 
398f222
9df24f1
398f222
de96a1d
e9ddae9
 
e6b8d52
689f1fc
 
 
de96a1d
807809c
 
 
 
 
689f1fc
e9ddae9
807809c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import gradio as gr
from ctransformers import AutoModelForCausalLM

# Hugging Face Hub repository and the quantized GGUF weight file to load from it.
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"

# TinyLlama-1.1B was trained with a 2048-token window; asking the runtime for
# a larger context lets prompts grow past what the model can attend to
# coherently, so the window is capped at the trained size.
CONTEXT_LENGTH = 2048

# Loaded once at import time and shared by every chat request.
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    model_file=MODEL_FILE,
    model_type="llama",
    gpu_layers=0,  # no layers offloaded to a GPU
    context_length=CONTEXT_LENGTH,
)

def _iter_history(history):
    """Yield ``(role, text)`` pairs from a Gradio chat history.

    Two history layouts are accepted:

    * "messages" style: each item is a dict with ``role`` and ``content`` keys;
    * pair style: each item is a ``[user_message, assistant_message]``
      list or tuple.

    Items with any other shape, and contents that are not plain strings
    (for example a ``None`` reply), are skipped.
    """
    for turn in history or ():
        if isinstance(turn, dict):
            role, content = turn.get("role"), turn.get("content")
            if role in ("system", "user", "assistant") and isinstance(content, str):
                yield role, content
        elif isinstance(turn, (list, tuple)) and len(turn) >= 2:
            # zip() stops after the first two entries: user, then assistant.
            for role, content in zip(("user", "assistant"), turn):
                if isinstance(content, str):
                    yield role, content


def respond(message: str, history):
    """Generate the assistant's reply to ``message`` given the prior chat.

    The prompt follows the Zephyr-style chat template that
    TinyLlama-1.1B-Chat-v1.0 was fine-tuned on: every turn is written as
    ``<|role|>\\n{text}</s>\\n`` and the prompt ends with an open
    ``<|assistant|>`` turn for the model to complete.

    Args:
        message: The user's newest message.
        history: Earlier turns as supplied by ``gr.ChatInterface``; see
            ``_iter_history`` for the accepted layouts.

    Returns:
        The generated reply text.
    """
    segments = [
        f"<|{role}|>\n{text}</s>\n" for role, text in _iter_history(history)
    ]
    segments.append(f"<|user|>\n{message}</s>\n<|assistant|>\n")
    prompt = "".join(segments)

    out = llm(
        prompt,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
    )

    # Defensive: tolerate a backend that wraps the completion in a dict.
    if isinstance(out, dict) and "text" in out:
        return out["text"]
    return str(out)

# Chat UI whose replies are produced by ``respond``.
demo = gr.ChatInterface(fn=respond)

# Start the web server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()