Spaces:
Running
Running
File size: 948 Bytes
cbf8005 e9ddae9 cbf8005 398f222 689f1fc e9ddae9 863eb49 e9ddae9 398f222 4323878 689f1fc 9df24f1 e9ddae9 9df24f1 398f222 9df24f1 398f222 de96a1d e9ddae9 e6b8d52 689f1fc de96a1d 807809c 689f1fc e9ddae9 807809c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | import gradio as gr
from ctransformers import AutoModelForCausalLM
# Hugging Face Hub repository and the quantized GGUF weight file inside it.
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"

# TinyLlama-1.1B was trained with a 2048-token sequence length. Requesting a
# larger window only costs memory and lets prompts grow into a range where
# the model's output degrades.
CONTEXT_LENGTH = 2048

# Loaded once at import time; the resulting object is called with a prompt
# string to generate text.
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    model_file=MODEL_FILE,
    model_type="llama",
    gpu_layers=0,  # no layers offloaded to a GPU
    context_length=CONTEXT_LENGTH,
)
def _text_of(content) -> str:
    """Return the plain text carried by one chat-history content value.

    Strings are returned as-is. Anything without recognisable text
    (``None``, file payloads, unknown objects) yields ``""``.
    """
    if isinstance(content, str):
        return content
    if isinstance(content, (list, tuple)):
        # NOTE(review): presumably a list of content parts shaped like
        # {"type": "text", "text": ...} -- confirm against the installed
        # Gradio version.
        return "".join(
            part["text"]
            for part in content
            if isinstance(part, dict) and isinstance(part.get("text"), str)
        )
    return ""


def _history_to_messages(history) -> list:
    """Flatten a chat history into ``(role, text)`` pairs.

    Two layouts are accepted: ``[user, bot]`` pairs, and dicts carrying
    ``"role"`` / ``"content"`` keys. Turns of any other shape, roles other
    than user/assistant, and turns with no text are skipped.
    """
    messages = []
    for turn in history or ():
        if isinstance(turn, dict):
            candidates = [(turn.get("role"), turn.get("content"))]
        elif isinstance(turn, (list, tuple)) and len(turn) >= 2:
            candidates = [("user", turn[0]), ("assistant", turn[1])]
        else:
            continue
        for role, content in candidates:
            text = _text_of(content)
            # An empty turn (e.g. a reply that is still pending) would only
            # add an empty section to the prompt.
            if role in ("user", "assistant") and text:
                messages.append((role, text))
    return messages


def respond(message: str, history):
    """Generate the assistant's reply to ``message`` given prior turns.

    Args:
        message: The latest user message.
        history: Earlier turns of the conversation, either as
            ``[user, bot]`` pairs or as ``{"role": ..., "content": ...}``
            dicts. May be empty or ``None``.

    Returns:
        The generated reply text with surrounding whitespace removed.
    """
    # TinyLlama-1.1B-Chat-v1.0 is tuned on the Zephyr chat template
    # (<|user|> / <|assistant|> sections closed by </s>), not on the
    # Llama-2 [INST] markers.
    sections = [
        f"<|{role}|>\n{text}</s>\n"
        for role, text in _history_to_messages(history)
    ]
    # The trailing open assistant header cues the model to write the reply.
    sections.append(f"<|user|>\n{message}</s>\n<|assistant|>\n")
    prompt = "".join(sections)

    out = llm(
        prompt,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
    )
    # Tolerate a dict-shaped result as well as a plain string.
    if isinstance(out, dict) and "text" in out:
        reply = str(out["text"])
    else:
        reply = str(out)
    return reply.strip()
# Chat UI that calls `respond` to produce each reply.
demo = gr.ChatInterface(fn=respond)

# Launch the app only when this file is executed directly, so importing the
# module does not start it.
if __name__ == "__main__":
    demo.launch()
|