# FASTEST / app.py
# (Hugging Face Space page residue, preserved as comments so the file parses)
# shuarya2011's picture
# Update app.py
# 94820f0 verified
import gradio as gr
from llama_cpp import Llama
# Module-level model handle shared by stream_chat below.
# verbose=False stops the model from printing its internal logic to the logs.
llm = Llama(
    # NOTE(review): path is relative to the working directory — the .gguf file
    # must be present at launch or this raises at import time.
    model_path="llama3.2-1b-Uncensored-Q4_K_M.gguf",
    n_ctx=512,      # small context window: single-turn prompts only (see stream_chat)
    n_threads=4,
    verbose=False
)
def stream_chat(message, history):
    """Stream a single-turn reply for *message*, suppressing meta-commentary.

    Parameters:
        message: The user's latest chat message (str).
        history: Prior turns supplied by gr.ChatInterface — intentionally
            unused; the prompt is single-turn to keep the model terse.

    Yields:
        str: The accumulated, stripped response text so far. Yields "" to
        clear the display if generation is aborted mid-phrase.
    """
    # Minimalist prompt to prevent the model from entering "test/quiz" mode
    prompt = f"User: {message}\nAssistant:"
    stream = llm(
        prompt,
        max_tokens=256,
        stop=["User:", "Assistant:", "\n"],
        stream=True,
        temperature=0,  # Greedy search: prevents "wandering" into reasoning
        repeat_penalty=1.2
    )
    partial_text = ""
    # Phrases that usually indicate the model is starting a monologue
    forbidden_start = ["Okay", "I think", "First", "Let me", "The user"]
    for output in stream:
        token = output["choices"][0]["text"]
        combined = partial_text + token
        # Kill generation if it starts meta-commentary.
        if any(combined.startswith(phrase) for phrase in forbidden_start):
            # BUGFIX: a forbidden phrase can complete across token boundaries
            # (e.g. "Ok" then "ay"). By then the earlier fragment has already
            # been yielded and would stay stuck on screen — yield "" to clear
            # it before stopping.
            yield ""
            break
        partial_text += token
        yield partial_text.strip()
# Assemble the UI: a single ChatInterface wrapped in a Blocks layout.
with gr.Blocks() as chat_app:
    gr.ChatInterface(
        fn=stream_chat,
        title="DIRECT-LLAMA-MAX",
        description="Fast, direct, and uncensored."
    )

if __name__ == "__main__":
    # Bind on all interfaces at the standard HF Spaces port.
    chat_app.launch(server_name="0.0.0.0", server_port=7860)