| from llama_cpp import Llama | |
| import gradio as gr | |
| import time | |
# Load the 4-bit (Q4_K_M) quantized Zephyr-7B-beta model from a local GGUF file.
# NOTE(review): path is hard-coded relative to the current working directory —
# confirm it matches the deployment layout.
llm = Llama(model_path="zephyr-7B-beta-GGUF/zephyr-7b-beta.Q4_K_M.gguf")
def predict(prompt, history):
    """Generate a completion for *prompt* and stream it to the chat UI.

    Called by ``gr.ChatInterface`` with the user's message and the chat
    history. ``history`` is accepted to satisfy that contract but is not
    fed back to the model here, so each turn is answered independently
    of previous ones.

    Yields:
        Successively longer prefixes of the model's response text, so the
        interface renders a typewriter-style stream.
    """
    output = llm(prompt)
    response = output['choices'][0]['text']
    # Yield cumulative prefixes (1..len); the sleep paces the animation.
    # An empty response yields nothing, which ChatInterface tolerates.
    for end in range(1, len(response) + 1):
        time.sleep(0.05)
        yield response[:end]
# Serve the model behind Gradio's chat UI; queue() enables streaming of the
# generator's partial responses to the browser.
chat_ui = gr.ChatInterface(predict)
chat_ui.queue().launch()