"""Gradio chat UI backed by a local quantized GGUF model via llama-cpp-python."""

import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Fetch the quantized model file from the Hugging Face Hub (cached locally
# after the first download).
model_path = hf_hub_download(
    repo_id="stevendhasoi/phi_2223",
    filename="model_q4_k_m.gguf",
)

# Load the GGUF model once at startup; n_ctx bounds the prompt window and
# n_threads the CPU parallelism used for inference.
llm = Llama(
    model_path=model_path,
    n_ctx=2048,
    n_threads=4,
)


def chat_fn(message, history):
    """Generate one assistant reply for a Gradio ChatInterface turn.

    Args:
        message: The latest user message (str).
        history: Prior turns as (user, assistant) string pairs —
            the tuple-style history Gradio's ChatInterface passes.

    Returns:
        The model's reply text with surrounding whitespace stripped.
    """
    # Replay the conversation as a flat "User:/Assistant:" transcript,
    # joined once instead of repeated string concatenation.
    turns = [f"User: {user}\nAssistant: {bot}\n" for user, bot in history]
    prompt = "".join(turns) + f"User: {message}\nAssistant:"

    output = llm(
        prompt,
        max_tokens=256,
        stop=["User:"],  # cut generation before the model invents the next user turn
        echo=False,
    )
    return output["choices"][0]["text"].strip()


if __name__ == "__main__":
    # Launch the web UI only when executed as a script, not on import.
    gr.ChatInterface(chat_fn).launch()