# FASTEST / app.py
# (Hugging Face Space page residue, preserved as comments so the file parses)
# shuarya2011's picture
# Update app.py
# 94820f0 verified
import gradio as gr
from llama_cpp import Llama
# Module-level model handle shared by stream_chat below.
# verbose=False stops the model from printing its internal logic to the logs.
llm = Llama(
    # NOTE(review): path is relative to the working directory — the .gguf file
    # must be present at launch or this raises at import time.
    model_path="llama3.2-1b-Uncensored-Q4_K_M.gguf",
    n_ctx=512,      # small context window: single-turn prompts only (see stream_chat)
    n_threads=4,
    verbose=False
)
def stream_chat(message, history):
    """Stream a single-turn reply for *message*, suppressing meta-commentary.

    Parameters:
        message: The user's latest chat message (str).
        history: Prior turns supplied by gr.ChatInterface — intentionally
            unused; the prompt is single-turn to keep the model terse.

    Yields:
        str: The accumulated, stripped response text so far. Yields "" to
        clear the display if generation is aborted mid-phrase.
    """
    # Minimalist prompt to prevent the model from entering "test/quiz" mode
    prompt = f"User: {message}\nAssistant:"
    stream = llm(
        prompt,
        max_tokens=256,
        stop=["User:", "Assistant:", "\n"],
        stream=True,
        temperature=0,  # Greedy search: prevents "wandering" into reasoning
        repeat_penalty=1.2
    )
    partial_text = ""
    # Phrases that usually indicate the model is starting a monologue
    forbidden_start = ["Okay", "I think", "First", "Let me", "The user"]
    for output in stream:
        token = output["choices"][0]["text"]
        combined = partial_text + token
        # Kill generation if it starts meta-commentary.
        if any(combined.startswith(phrase) for phrase in forbidden_start):
            # BUGFIX: a forbidden phrase can complete across token boundaries
            # (e.g. "Ok" then "ay"). By then the earlier fragment has already
            # been yielded and would stay stuck on screen — yield "" to clear
            # it before stopping.
            yield ""
            break
        partial_text += token
        yield partial_text.strip()
# Assemble the UI: a single ChatInterface wrapped in a Blocks layout.
with gr.Blocks() as chat_app:
    gr.ChatInterface(
        fn=stream_chat,
        title="DIRECT-LLAMA-MAX",
        description="Fast, direct, and uncensored."
    )

if __name__ == "__main__":
    # Bind on all interfaces at the standard HF Spaces port.
    chat_app.launch(server_name="0.0.0.0", server_port=7860)