# HuggingFace Spaces page residue (scraped status banner): "Spaces: Sleeping"
import gradio as gr
from llama_cpp import Llama

# Load the Gemma-2B (Ukrainian) GGUF model via llama.cpp bindings.
# Thread counts are kept low — presumably sized for a small Spaces CPU; confirm.
llm = Llama(
    model_path="gemma-2b-uk.gguf",
    n_threads=2,        # generation threads
    n_threads_batch=2,  # prompt-processing (batch) threads
)
def convert_history(message, history):
    """Build the Gemma chat-template prompt from history plus the new message.

    Only the most recent (user, assistant) pair from *history* is included —
    presumably to bound prompt length for the 2B model; confirm intent.
    Returns the prompt string ending with an open assistant turn.
    """
    turns = [
        f"<|user|>\n{user_msg}<eos>\n<|assistant|>\n{bot_msg}<eos>\n"
        for user_msg, bot_msg in history[-1:]
    ]
    turns.append(f"<|user|>\n{message}<eos>\n<|assistant|>\n")
    return "".join(turns)
def ask(message, history):
    """Stream a model completion for *message*, yielding the growing reply.

    Gradio's ChatInterface calls this with the current user message and the
    chat history; each yielded string replaces the partial assistant message
    shown in the UI.

    Fix: removed the leftover debug ``print(delta)`` that wrote every streamed
    token to stdout.
    """
    prompt = convert_history(message, history)
    chunks = llm(
        prompt,
        temperature=0.2,
        top_p=0.9,
        stream=True,          # yield incremental completion chunks
        repeat_penalty=1.05,
        max_tokens=128,
    )
    response = ""
    for chunk in chunks:
        # Each streamed chunk carries one incremental text delta.
        response += chunk["choices"][0]["text"]
        yield response
# Wire the streaming handler into Gradio's standard chat UI.
demo = gr.ChatInterface(ask)

if __name__ == "__main__":
    # queue() is needed so generator (streaming) handlers render incrementally.
    demo.queue().launch()