import gradio as gr
from llama_cpp import Llama

# Phrases that usually indicate the model is starting an internal monologue
# ("reasoning out loud") instead of answering the user directly.
_FORBIDDEN_START = ("Okay", "I think", "First", "Let me", "The user")
# Once the accumulated reply is longer than the longest forbidden phrase and
# clean, its prefix can never change — no further checks are needed.
_MAX_FORBIDDEN_LEN = max(len(p) for p in _FORBIDDEN_START)

# verbose=False stops llama.cpp from printing its internal logs.
llm = Llama(
    model_path="llama3.2-1b-Uncensored-Q4_K_M.gguf",
    n_ctx=512,
    n_threads=4,
    verbose=False,
)


def stream_chat(message, history):
    """Stream a single-turn reply, killing generation on meta-commentary.

    Args:
        message: The user's latest message.
        history: Prior chat turns (supplied by gr.ChatInterface; unused —
            each turn is answered from a minimalist single-turn prompt to
            prevent the model from entering "test/quiz" mode).

    Yields:
        The stripped reply accumulated so far, one update per token.
    """
    prompt = f"User: {message}\nAssistant:"
    stream = llm(
        prompt,
        max_tokens=256,
        stop=["User:", "Assistant:", "\n"],
        stream=True,
        temperature=0,  # greedy search: prevents "wandering" into reasoning
        repeat_penalty=1.2,
    )

    partial_text = ""
    for output in stream:
        token = output["choices"][0]["text"]
        combined = partial_text + token

        # Only the first _MAX_FORBIDDEN_LEN characters can ever match a
        # forbidden opener, so skip the check entirely once the reply is
        # past that point (avoids re-scanning the whole text every token).
        if len(partial_text) <= _MAX_FORBIDDEN_LEN:
            # Kill generation if the reply starts with meta-commentary.
            if any(combined.startswith(p) for p in _FORBIDDEN_START):
                break
            # Hold back tokens while the text could still grow into a
            # forbidden opener (e.g. "Ok" before "ay" arrives), so a
            # fragment of a banned phrase is never flashed to the user
            # just before generation is killed.
            if any(p.startswith(combined) for p in _FORBIDDEN_START):
                partial_text = combined
                continue

        partial_text = combined
        yield partial_text.strip()


with gr.Blocks() as demo:
    gr.ChatInterface(
        fn=stream_chat,
        title="DIRECT-LLAMA-MAX",
        description="Fast, direct, and uncensored.",
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)