Spaces:
Build error
Build error
| import gradio as gr | |
| from llama_cpp import Llama | |
# Load the quantized GGUF model once at import time so every chat request
# reuses the same instance.
_LLAMA_OPTIONS = {
    "repo_id": "mradermacher/llama3.2-1b-Uncensored-GGUF",
    "filename": "llama3.2-1b-Uncensored.Q4_K_M.gguf",
    # Context window size, in tokens.
    "n_ctx": 2048,
    # Two worker threads, matching the Free Tier vCPU allocation.
    "n_threads": 2,
    # Zero offloaded layers: stay on CPU instead of probing for a GPU that
    # is not there.
    "n_gpu_layers": 0,
    "verbose": False,
}

llm = Llama.from_pretrained(**_LLAMA_OPTIONS)
# Upper bound on generated tokens per reply; the same amount is kept free in
# the context window when deciding how much history fits into the prompt.
_MAX_NEW_TOKENS = 512

# Maps message-style history roles onto the speaker labels used in the prompt.
_ROLE_LABELS = {"user": "User", "assistant": "Assistant"}


def _history_to_turns(history):
    """Normalize chat history into a list of ``(speaker, text)`` pairs.

    Two history shapes are accepted:

    * a sequence of ``[user_text, assistant_text]`` pairs, and
    * a sequence of ``{"role": ..., "content": ...}`` dicts.

    Entries that are empty, non-textual, or use an unknown role are skipped.
    """
    turns = []
    for entry in history or []:
        if isinstance(entry, dict):
            speaker = _ROLE_LABELS.get(entry.get("role"))
            text = entry.get("content")
            if speaker and isinstance(text, str) and text:
                turns.append((speaker, text))
        elif isinstance(entry, (list, tuple)) and len(entry) == 2:
            for speaker, text in zip(("User", "Assistant"), entry):
                if isinstance(text, str) and text:
                    turns.append((speaker, text))
    return turns


def _build_prompt(message, turns):
    """Render prior turns plus the new message as a User/Assistant transcript."""
    lines = [f"{speaker}: {text}" for speaker, text in turns]
    lines.append(f"User: {message}")
    # With no prior turns this is exactly "User: {message}\nAssistant: ".
    return "\n".join(lines) + "\nAssistant: "


def stream_chat(message, history):
    """Stream the model's reply to ``message``, taking ``history`` into account.

    Args:
        message: The latest user message.
        history: Earlier conversation turns as supplied by the chat UI
            (pairs or role/content dicts); may be empty or ``None``.

    Yields:
        The reply accumulated so far, once per generated chunk, so the UI can
        refresh the message as it grows.
    """
    turns = _history_to_turns(history)

    # Drop the oldest turns until the prompt leaves room for the reply in the
    # model's context window. The current message itself is never dropped.
    prompt_budget = llm.n_ctx() - _MAX_NEW_TOKENS
    prompt = _build_prompt(message, turns)
    while turns and len(llm.tokenize(prompt.encode("utf-8"))) > prompt_budget:
        del turns[0]
        prompt = _build_prompt(message, turns)

    stream = llm(
        prompt,
        max_tokens=_MAX_NEW_TOKENS,
        # Generation ends when the model starts a new "User:" turn or emits a
        # newline, so each reply is confined to a single line.
        stop=["User:", "\n"],
        stream=True,  # Enable token-by-token output
        temperature=0.8,
        top_p=0.95,
    )

    partial_text = ""
    for chunk in stream:
        partial_text += chunk["choices"][0]["text"]
        # Yield the cumulative text, not just the newest piece.
        yield partial_text
# Text shown at the top of the chat page.
_UI_TEXT = {
    "title": "Llama 3.2 1B Uncensored",
    "description": "Smart, uncensored, and fast word-by-word streaming on CPU.",
}

# Wire the chat UI to `stream_chat`, which is used as the response callback.
demo = gr.ChatInterface(stream_chat, **_UI_TEXT)

# Start the web app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()