Spaces:

shuarya2011
/

fast

Build error

fast / app.py

Update app.py

8d9e69c verified 27 days ago

1.25 kB

	import gradio as gr
	from llama_cpp import Llama

	# Load the quantized GGUF model once at import time.
	# n_threads=2 mirrors the Free Tier vCPU allocation, and
	# n_gpu_layers=0 keeps every layer on the CPU so no GPU is looked for.
	llm = Llama.from_pretrained(
	    repo_id="mradermacher/llama3.2-1b-Uncensored-GGUF",
	    filename="llama3.2-1b-Uncensored.Q4_K_M.gguf",
	    n_ctx=2048,
	    n_threads=2,
	    n_gpu_layers=0,
	    verbose=False,
	)

	def _history_turns(history):
	    """Flatten Gradio chat history into a list of ``(speaker, text)`` pairs.

	    Accepts both history shapes Gradio may pass: a list of
	    ``[user, assistant]`` pairs, or a list of ``{"role", "content"}`` dicts.
	    Entries whose content is not plain text are skipped.
	    """
	    turns = []
	    for entry in history or []:
	        if isinstance(entry, dict):
	            role = entry.get("role")
	            content = entry.get("content")
	            if isinstance(content, str) and role in ("user", "assistant"):
	                speaker = "User" if role == "user" else "Assistant"
	                turns.append((speaker, content))
	        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
	            user_text, assistant_text = entry
	            if isinstance(user_text, str):
	                turns.append(("User", user_text))
	            if isinstance(assistant_text, str):
	                turns.append(("Assistant", assistant_text))
	    return turns

	def _build_prompt(message, turns):
	    """Render prior turns plus the new message in the User/Assistant template."""
	    lines = [f"{speaker}: {text}" for speaker, text in turns]
	    lines.append(f"User: {message}")
	    return "\n".join(lines) + "\nAssistant: "

	def stream_chat(message, history):
	    """Stream the model's reply to ``message`` for a Gradio chat UI.

	    Args:
	        message: The latest user message.
	        history: Earlier conversation turns as supplied by Gradio. They are
	            replayed in the prompt so the model sees the conversation so far;
	            with an empty history the prompt is just the single user turn.

	    Yields:
	        The reply text accumulated so far, once per generated chunk.
	    """
	    max_tokens = 512
	    turns = _history_turns(history)
	    prompt = _build_prompt(message, turns)

	    # Keep room for the reply: drop the oldest turns until the prompt fits
	    # within the context window minus the generation budget.
	    prompt_budget = llm.n_ctx() - max_tokens
	    while turns and len(llm.tokenize(prompt.encode("utf-8"))) > prompt_budget:
	        turns = turns[1:]
	        prompt = _build_prompt(message, turns)

	    # NOTE(review): the "\n" stop string ends generation at the first line
	    # break, so replies are limited to a single line - confirm this is intended.
	    stream = llm(
	        prompt,
	        max_tokens=max_tokens,
	        stop=["User:", "\n"],
	        stream=True,  # Enable token-by-token output
	        temperature=0.8,
	        top_p=0.95,
	    )

	    partial_text = ""
	    for chunk in stream:
	        # Each chunk carries the newly generated text fragment.
	        partial_text += chunk["choices"][0]["text"]
	        # Yielding the growing string updates the Gradio UI in real time.
	        yield partial_text

	# Build the chat UI around the streaming generator.
	demo = gr.ChatInterface(
	    fn=stream_chat,
	    title="Llama 3.2 1B Uncensored",
	    description="Smart, uncensored, and fast word-by-word streaming on CPU.",
	)

	# Start the app only when this file is executed directly.
	if __name__ == "__main__":
	    demo.launch()