# Spaces: Running
| import gradio as gr | |
| from ctransformers import AutoModelForCausalLM | |
# Hugging Face Hub repository and the specific quantized GGUF file to load.
MODEL_REPO = "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF"
MODEL_FILE = "tinyllama-1.1b-chat-v1.0.Q4_K_M.gguf"

# The model is loaded once at import time and shared by every chat request.
# NOTE(review): the TinyLlama model card appears to list a 2048-token
# context window; confirm that context_length=4096 is intended.
llm = AutoModelForCausalLM.from_pretrained(
    MODEL_REPO,
    model_type="llama",
    model_file=MODEL_FILE,
    context_length=4096,
    gpu_layers=0,  # zero layers offloaded to GPU
)
def _build_prompt(message, history):
    """Render the conversation in the Zephyr-style chat template.

    TinyLlama-1.1B-Chat-v1.0 is tuned on ``<|user|>`` / ``<|assistant|>``
    turns terminated by ``</s>`` (see its model card), not on the
    Llama-2 ``[INST]`` markers.

    Args:
        message: The new user message to answer.
        history: Earlier turns, either as ``(user, assistant)`` pairs or as
            ``{"role": ..., "content": ...}`` dicts (Gradio's "messages"
            format). ``None`` or empty means no prior context.

    Returns:
        The full prompt string, ending with an open assistant turn.
    """
    parts = []
    for turn in history or ():
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            # Only plain-text user/assistant turns can be rendered; other
            # roles and non-text payloads are skipped.
            if role in ("user", "assistant") and isinstance(content, str):
                parts.append(f"<|{role}|>\n{content}</s>\n")
        elif isinstance(turn, (list, tuple)) and len(turn) >= 2:
            user_msg, bot_msg = turn[0], turn[1]
            # A missing half of a pair is left out instead of being
            # rendered as the literal text "None".
            if user_msg is not None:
                parts.append(f"<|user|>\n{user_msg}</s>\n")
            if bot_msg is not None:
                parts.append(f"<|assistant|>\n{bot_msg}</s>\n")
        # Any other turn shape is ignored, as before.
    parts.append(f"<|user|>\n{message}</s>\n<|assistant|>\n")
    return "".join(parts)


def respond(message: str, history):
    """Generate the assistant's reply for a Gradio chat turn.

    Args:
        message: The latest user message.
        history: Prior conversation supplied by Gradio, in either the
            pair-list or the role/content-dict format.

    Returns:
        The generated reply text.
    """
    prompt = _build_prompt(message, history)
    out = llm(
        prompt,
        max_new_tokens=64,
        temperature=0.7,
        top_p=0.9,
    )
    # Defensive: accept a dict-shaped result carrying a "text" field;
    # anything else is coerced to a string.
    if isinstance(out, dict) and "text" in out:
        return out["text"]
    return str(out)
# Chat UI whose replies are produced by `respond`.
demo = gr.ChatInterface(fn=respond)

if __name__ == "__main__":
    # Start the web app only when this file is executed as a script.
    demo.launch()