import gradio as gr
from llama_cpp import Llama

# Load the model locally. The first startup downloads the GGUF weights, which
# can take several minutes; afterwards they are served from the local cache.
llm = Llama.from_pretrained(
    repo_id="bartowski/Llama-3-8B-Lexi-Uncensored-GGUF",
    filename="*Q4_K_M.gguf",  # 4-bit quantization for CPU
    n_ctx=4096,       # context window in tokens
    n_threads=4,
    n_gpu_layers=0,   # CPU-only inference
    verbose=False,
)


def chat(message, history):
    """Generate one assistant reply for the Gradio ChatInterface.

    Parameters
    ----------
    message : str
        The user's latest message.
    history : list
        Prior turns as supplied by Gradio. Depending on the Gradio version
        this is either a list of ``(user, assistant)`` tuples ("tuples"
        format) or a list of ``{"role": ..., "content": ...}`` dicts
        ("messages" format, the default in Gradio 5+). Both are accepted.

    Returns
    -------
    str
        The model's reply, or an ``"Error: ..."`` string if inference fails.
    """
    messages = []
    for turn in history:
        if isinstance(turn, dict):
            # Gradio "messages" format: already {"role": ..., "content": ...}.
            messages.append({"role": turn["role"], "content": turn["content"]})
        else:
            # Gradio "tuples" format: (user_text, assistant_text) pairs.
            user_text, assistant_text = turn
            messages.append({"role": "user", "content": user_text})
            messages.append({"role": "assistant", "content": assistant_text})
    messages.append({"role": "user", "content": message})
    try:
        response = llm.create_chat_completion(
            messages=messages,
            max_tokens=512,
            temperature=0.7,
        )
        return response["choices"][0]["message"]["content"]
    except Exception as e:
        # Top-level boundary: surface the failure in the chat UI rather
        # than crashing the Gradio worker.
        return f"Error: {str(e)}"


demo = gr.ChatInterface(
    chat,
    title="AI Chat",
    description="Context maintained during session, resets on refresh",
)

if __name__ == "__main__":
    demo.launch()