"""Gradio chat UI that streams replies from a local Llama 3.2 1B GGUF model.

The model is loaded once at import time through llama-cpp-python and runs on
CPU only. Each reply is streamed to the browser as it is generated.
"""

import gradio as gr
from llama_cpp import Llama

# Context window, in tokens, given to the model and used for the prompt budget.
N_CTX = 2048
# Upper bound on the number of tokens generated for a single reply.
MAX_NEW_TOKENS = 512

# Initialize the model
# We set n_threads=2 to match the Free Tier vCPU allocation
# n_gpu_layers=0 ensures we don't look for a non-existent GPU
llm = Llama.from_pretrained(
    repo_id="mradermacher/llama3.2-1b-Uncensored-GGUF",
    filename="llama3.2-1b-Uncensored.Q4_K_M.gguf",
    n_ctx=N_CTX,
    n_threads=2,
    n_gpu_layers=0,
    verbose=False,
)


def _count_tokens(text):
    """Return how many tokens the loaded model's tokenizer produces for *text*."""
    return len(llm.tokenize(text.encode("utf-8"), add_bos=False))


def _history_turns(history):
    """Flatten Gradio chat history into ``(speaker, text)`` pairs, oldest first.

    Two history shapes are accepted: a list of ``[user, assistant]`` pairs and
    a list of ``{"role": ..., "content": ...}`` dicts. Entries whose content is
    not a non-empty string (for example ``None`` placeholders) are skipped.
    """
    speakers_by_role = {"user": "User", "assistant": "Assistant"}
    turns = []
    for entry in history or []:
        if isinstance(entry, dict):
            speaker = speakers_by_role.get(entry.get("role"))
            content = entry.get("content")
            if speaker and isinstance(content, str) and content:
                turns.append((speaker, content))
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            for speaker, content in zip(("User", "Assistant"), entry):
                if isinstance(content, str) and content:
                    turns.append((speaker, content))
    return turns


def _build_prompt(message, history):
    """Build the ``User:/Assistant:`` prompt, keeping as much history as fits.

    The newest message is always included. Earlier turns are added from newest
    to oldest until the prompt would no longer leave room for
    ``MAX_NEW_TOKENS`` generated tokens inside the ``N_CTX`` window.
    """
    tail = f"User: {message}\nAssistant: "
    # Reserve room for the reply and one extra token for the BOS marker.
    remaining = N_CTX - MAX_NEW_TOKENS - 1 - _count_tokens(tail)

    kept = []
    for speaker, text in reversed(_history_turns(history)):
        line = f"{speaker}: {text}\n"
        remaining -= _count_tokens(line)
        if remaining < 0:
            break
        kept.append(line)
    kept.reverse()
    return "".join(kept) + tail


def stream_chat(message, history):
    """Stream the assistant's reply to *message*, one growing string at a time.

    Args:
        message: The text the user just submitted.
        history: Earlier turns of the conversation as supplied by
            ``gr.ChatInterface``; they are folded into the prompt so the model
            can see the conversation so far.

    Yields:
        The reply accumulated so far, after each newly generated chunk.
    """
    # Prepare the prompt template
    prompt = _build_prompt(message, history)

    # Create the generation stream. Generation stops at the next "User:" marker
    # or at the first newline, so every reply is a single line.
    stream = llm(
        prompt,
        max_tokens=MAX_NEW_TOKENS,
        stop=["User:", "\n"],
        stream=True,  # Enable token-by-token output
        temperature=0.8,
        top_p=0.95,
    )

    partial_text = ""
    for chunk in stream:
        # Extract the new token text
        partial_text += chunk["choices"][0]["text"]
        # Yielding the string updates the Gradio UI in real-time
        yield partial_text


# Set up the Gradio interface
demo = gr.ChatInterface(
    fn=stream_chat,
    title="Llama 3.2 1B Uncensored",
    description="Smart, uncensored, and fast word-by-word streaming on CPU.",
)

if __name__ == "__main__":
    demo.launch()