import gradio as gr
from llama_cpp import Llama

# Phrases that usually indicate the model is starting an internal monologue
# ("reasoning out loud") instead of answering the user directly.
_FORBIDDEN_START = ("Okay", "I think", "First", "Let me", "The user")
# Once the accumulated reply is longer than the longest forbidden phrase and
# clean, its prefix can never change — no further checks are needed.
_MAX_FORBIDDEN_LEN = max(len(p) for p in _FORBIDDEN_START)

# verbose=False stops llama.cpp from printing its internal logs.
llm = Llama(
    model_path="llama3.2-1b-Uncensored-Q4_K_M.gguf",
    n_ctx=512,
    n_threads=4,
    verbose=False,
)


def stream_chat(message, history):
    """Stream a single-turn reply, killing generation on meta-commentary.

    Args:
        message: The user's latest message.
        history: Prior chat turns (supplied by gr.ChatInterface; unused —
            each turn is answered from a minimalist single-turn prompt to
            prevent the model from entering "test/quiz" mode).

    Yields:
        The stripped reply accumulated so far, one update per token.
    """
    prompt = f"User: {message}\nAssistant:"
    stream = llm(
        prompt,
        max_tokens=256,
        stop=["User:", "Assistant:", "\n"],
        stream=True,
        temperature=0,  # greedy search: prevents "wandering" into reasoning
        repeat_penalty=1.2,
    )

    partial_text = ""
    for output in stream:
        token = output["choices"][0]["text"]
        combined = partial_text + token

        # Only the first _MAX_FORBIDDEN_LEN characters can ever match a
        # forbidden opener, so skip the check entirely once the reply is
        # past that point (avoids re-scanning the whole text every token).
        if len(partial_text) <= _MAX_FORBIDDEN_LEN:
            # Kill generation if the reply starts with meta-commentary.
            if any(combined.startswith(p) for p in _FORBIDDEN_START):
                break
            # Hold back tokens while the text could still grow into a
            # forbidden opener (e.g. "Ok" before "ay" arrives), so a
            # fragment of a banned phrase is never flashed to the user
            # just before generation is killed.
            if any(p.startswith(combined) for p in _FORBIDDEN_START):
                partial_text = combined
                continue

        partial_text = combined
        yield partial_text.strip()


with gr.Blocks() as demo:
    gr.ChatInterface(
        fn=stream_chat,
        title="DIRECT-LLAMA-MAX",
        description="Fast, direct, and uncensored.",
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)