import os

import gradio as gr
from llama_cpp import Llama

# Path to the quantized Mistral model in GGUF format.
MODEL_PATH = "./models/mistral.gguf"

# Load the model once at import time; the single instance is reused for
# every chat request.
llm = Llama(
    model_path=MODEL_PATH,
    n_ctx=2048,                     # context window, in tokens
    n_threads=os.cpu_count() or 4,  # was hard-coded 9; use the available cores
    n_batch=128,
    use_mlock=True,                 # lock model pages in RAM to avoid swapping
    use_mmap=True,
    verbose=False,
)


def generate_response(message, history):
    """Stream a model reply for *message*, yielding the growing partial text.

    Gradio's ChatInterface invokes its fn as fn(message, history); the
    original single-parameter signature raised TypeError on every turn.
    *history* is accepted but unused — each turn is answered independently.

    Yields:
        str: the accumulated response text so far (Gradio re-renders the
        chat bubble with each yield to produce the streaming effect).
    """
    stream = llm(
        # Mistral-Instruct prompt template.
        prompt=f"[INST] {message.strip()} [/INST]",
        max_tokens=512,
        # Was stop=[""]: an empty stop string matches everywhere and can
        # truncate generation immediately. "</s>" is Mistral's end token.
        stop=["</s>"],
        stream=True,
    )
    partial = ""
    for chunk in stream:
        partial += chunk["choices"][0]["text"]
        yield partial


if __name__ == "__main__":
    gr.ChatInterface(
        fn=generate_response,
        title="Leo9 AI Tutor",
        description="An AI chatbot that answers any question.",
    ).launch()