# Spaces:
# Build error
# Build error
# File size: 1,395 Bytes
# e1914a0 cedfec7 94820f0 cedfec7 94820f0 e3ec7c7 a9995a4 94820f0 cedfec7 e3ec7c7 94820f0 cedfec7 94820f0 cedfec7 a9995a4 94820f0 e3ec7c7 cedfec7 94820f0 e3ec7c7 94820f0 e3ec7c7 cedfec7 e3ec7c7 e1914a0 6d39b27 d2a3695 94820f0 d2a3695 e1914a0 e3ec7c7 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 |
import gradio as gr
from llama_cpp import Llama
# Construct the model once at module load. The settings are collected in a
# mapping and unpacked into the constructor; verbose=False turns off the
# library's verbose diagnostic output so it does not clutter the logs.
_llama_settings = {
    "model_path": "llama3.2-1b-Uncensored-Q4_K_M.gguf",
    "n_ctx": 512,
    "n_threads": 4,
    "verbose": False,
}
llm = Llama(**_llama_settings)
def stream_chat(message, history):
    """Stream a single-turn reply to ``message`` from the module-level model.

    This is a generator: each yielded value is the complete reply
    accumulated so far (not just the newest chunk), with surrounding
    whitespace stripped.

    Args:
        message: The user's latest message; interpolated into the prompt.
        history: Earlier turns supplied by the caller. Accepted so the
            function fits a ``(message, history)`` chat callback, but it is
            not used -- every reply is generated from ``message`` alone.

    Yields:
        str: The reply text generated so far. Nothing is yielded when the
        reply is rejected on its very first chunk.
    """
    # Minimalist prompt to prevent the model from entering "test/quiz" mode.
    prompt = f"User: {message}\nAssistant:"
    stream = llm(
        prompt,
        max_tokens=256,
        # Stopping on "\n" keeps the reply to a single line; the role labels
        # stop the model from continuing the dialogue by itself.
        stop=["User:", "Assistant:", "\n"],
        stream=True,
        temperature=0,  # Greedy search: prevents "wandering" into reasoning
        repeat_penalty=1.2,
    )

    # Openers that usually indicate the model is starting a monologue.
    # Kept as a tuple so a single str.startswith call tests all of them.
    forbidden_start = ("Okay", "I think", "First", "Let me", "The user")

    partial_text = ""
    for output in stream:
        combined = partial_text + output["choices"][0]["text"]
        # Kill generation if it starts meta-commentary. The generated text
        # may begin with whitespace (it is stripped before being yielded for
        # the same reason), so compare against the left-stripped text;
        # otherwise a leading space would defeat the filter.
        if combined.lstrip().startswith(forbidden_start):
            break
        partial_text = combined
        yield partial_text.strip()
# Assemble the web UI: a Blocks container holding a single chat interface
# whose replies come from stream_chat.
with gr.Blocks() as demo:
    gr.ChatInterface(
        stream_chat,
        description="Fast, direct, and uncensored.",
        title="DIRECT-LLAMA-MAX",
    )
# Start the web server only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    # "0.0.0.0" listens on all network interfaces rather than only
    # localhost; the server is bound to port 7860.
    demo.launch(server_name="0.0.0.0", server_port=7860)